From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/Makeasm.am | 118 ++ gmp-6.3.0/mpn/Makefile | 772 +++++++++ gmp-6.3.0/mpn/Makefile.am | 59 + gmp-6.3.0/mpn/Makefile.in | 772 +++++++++ gmp-6.3.0/mpn/README | 44 + gmp-6.3.0/mpn/add.c | 1 + gmp-6.3.0/mpn/add_1.c | 1 + gmp-6.3.0/mpn/add_err1_n.c | 1 + gmp-6.3.0/mpn/add_err2_n.c | 1 + gmp-6.3.0/mpn/add_err3_n.c | 1 + gmp-6.3.0/mpn/add_n.asm | 1 + gmp-6.3.0/mpn/add_n_sub_n.c | 1 + gmp-6.3.0/mpn/addmul_1.asm | 1 + gmp-6.3.0/mpn/alpha/README | 208 +++ gmp-6.3.0/mpn/alpha/add_n.asm | 164 ++ gmp-6.3.0/mpn/alpha/addmul_1.asm | 99 ++ gmp-6.3.0/mpn/alpha/alpha-defs.m4 | 107 ++ gmp-6.3.0/mpn/alpha/aorslsh1_n.asm | 164 ++ gmp-6.3.0/mpn/alpha/aorslsh2_n.asm | 167 ++ gmp-6.3.0/mpn/alpha/bdiv_dbm1c.asm | 282 ++++ gmp-6.3.0/mpn/alpha/cntlz.asm | 55 + gmp-6.3.0/mpn/alpha/com.asm | 176 ++ gmp-6.3.0/mpn/alpha/copyd.asm | 88 + gmp-6.3.0/mpn/alpha/copyi.asm | 86 + gmp-6.3.0/mpn/alpha/default.m4 | 127 ++ gmp-6.3.0/mpn/alpha/dive_1.c | 114 ++ gmp-6.3.0/mpn/alpha/divrem_2.asm | 177 ++ gmp-6.3.0/mpn/alpha/ev5/diveby3.asm | 332 ++++ gmp-6.3.0/mpn/alpha/ev5/gmp-mparam.h | 191 +++ gmp-6.3.0/mpn/alpha/ev6/add_n.asm | 283 ++++ gmp-6.3.0/mpn/alpha/ev6/aorslsh1_n.asm | 172 ++ gmp-6.3.0/mpn/alpha/ev6/aorsmul_1.asm | 398 +++++ gmp-6.3.0/mpn/alpha/ev6/gmp-mparam.h | 209 +++ gmp-6.3.0/mpn/alpha/ev6/mod_1_4.asm | 336 ++++ gmp-6.3.0/mpn/alpha/ev6/mul_1.asm | 496 ++++++ gmp-6.3.0/mpn/alpha/ev6/nails/README | 65 + gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm | 396 +++++ gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm | 146 ++ gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm | 169 ++ gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm | 210 +++ gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm | 233 +++ gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h | 72 + gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm | 364 ++++ gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm | 396 +++++ gmp-6.3.0/mpn/alpha/ev6/slot.pl | 318 ++++ gmp-6.3.0/mpn/alpha/ev6/sub_n.asm | 283 ++++ gmp-6.3.0/mpn/alpha/ev67/gcd_11.asm | 79 + gmp-6.3.0/mpn/alpha/ev67/hamdist.asm | 111 ++ gmp-6.3.0/mpn/alpha/ev67/popcount.asm | 101 ++ gmp-6.3.0/mpn/alpha/gmp-mparam.h | 86 + gmp-6.3.0/mpn/alpha/invert_limb.asm | 95 ++ gmp-6.3.0/mpn/alpha/lshift.asm | 182 ++ gmp-6.3.0/mpn/alpha/mod_34lsub1.asm | 164 ++ gmp-6.3.0/mpn/alpha/mode1o.asm | 209 +++ gmp-6.3.0/mpn/alpha/mul_1.asm | 102 ++ gmp-6.3.0/mpn/alpha/rshift.asm | 180 ++ gmp-6.3.0/mpn/alpha/sec_tabselect.asm | 137 ++ gmp-6.3.0/mpn/alpha/sqr_diag_addlsh1.asm | 93 ++ gmp-6.3.0/mpn/alpha/sub_n.asm | 164 ++ gmp-6.3.0/mpn/alpha/submul_1.asm | 99 ++ gmp-6.3.0/mpn/alpha/umul.asm | 44 + gmp-6.3.0/mpn/alpha/unicos.m4 | 131 ++ gmp-6.3.0/mpn/and_n.c | 1 + gmp-6.3.0/mpn/andn_n.c | 1 + gmp-6.3.0/mpn/arm/README | 35 + gmp-6.3.0/mpn/arm/aors_n.asm | 112 ++ gmp-6.3.0/mpn/arm/aorslsh1_n.asm | 167 ++ gmp-6.3.0/mpn/arm/aorsmul_1.asm | 135 ++ gmp-6.3.0/mpn/arm/arm-defs.m4 | 100 ++ gmp-6.3.0/mpn/arm/bdiv_dbm1c.asm | 113 ++ gmp-6.3.0/mpn/arm/bdiv_q_1.asm | 162 ++ gmp-6.3.0/mpn/arm/cnd_aors_n.asm | 134 ++ gmp-6.3.0/mpn/arm/com.asm | 75 + gmp-6.3.0/mpn/arm/copyd.asm | 84 + gmp-6.3.0/mpn/arm/copyi.asm | 79 + gmp-6.3.0/mpn/arm/dive_1.asm | 151 ++ gmp-6.3.0/mpn/arm/gmp-mparam.h | 127 ++ gmp-6.3.0/mpn/arm/invert_limb.asm | 93 ++ gmp-6.3.0/mpn/arm/logops_n.asm | 139 ++ gmp-6.3.0/mpn/arm/lshift.asm | 88 + gmp-6.3.0/mpn/arm/lshiftc.asm | 95 ++ gmp-6.3.0/mpn/arm/mod_34lsub1.asm | 124 ++ gmp-6.3.0/mpn/arm/mode1o.asm | 92 + 
gmp-6.3.0/mpn/arm/mul_1.asm | 94 ++ gmp-6.3.0/mpn/arm/neon/README | 2 + gmp-6.3.0/mpn/arm/neon/hamdist.asm | 194 +++ gmp-6.3.0/mpn/arm/neon/lorrshift.asm | 279 ++++ gmp-6.3.0/mpn/arm/neon/lshiftc.asm | 242 +++ gmp-6.3.0/mpn/arm/neon/popcount.asm | 166 ++ gmp-6.3.0/mpn/arm/neon/sec_tabselect.asm | 140 ++ gmp-6.3.0/mpn/arm/rsh1aors_n.asm | 124 ++ gmp-6.3.0/mpn/arm/rshift.asm | 86 + gmp-6.3.0/mpn/arm/sec_tabselect.asm | 131 ++ gmp-6.3.0/mpn/arm/udiv.asm | 104 ++ gmp-6.3.0/mpn/arm/v5/gcd_11.asm | 70 + gmp-6.3.0/mpn/arm/v5/gcd_22.asm | 117 ++ gmp-6.3.0/mpn/arm/v5/mod_1_1.asm | 129 ++ gmp-6.3.0/mpn/arm/v5/mod_1_2.asm | 156 ++ gmp-6.3.0/mpn/arm/v6/addmul_1.asm | 112 ++ gmp-6.3.0/mpn/arm/v6/addmul_2.asm | 125 ++ gmp-6.3.0/mpn/arm/v6/addmul_3.asm | 191 +++ gmp-6.3.0/mpn/arm/v6/dive_1.asm | 149 ++ gmp-6.3.0/mpn/arm/v6/gmp-mparam.h | 187 +++ gmp-6.3.0/mpn/arm/v6/mode1o.asm | 95 ++ gmp-6.3.0/mpn/arm/v6/mul_1.asm | 115 ++ gmp-6.3.0/mpn/arm/v6/mul_2.asm | 135 ++ gmp-6.3.0/mpn/arm/v6/popham.asm | 139 ++ gmp-6.3.0/mpn/arm/v6/sqr_basecase.asm | 544 ++++++ gmp-6.3.0/mpn/arm/v6/submul_1.asm | 125 ++ gmp-6.3.0/mpn/arm/v6t2/divrem_1.asm | 212 +++ gmp-6.3.0/mpn/arm/v6t2/gcd_11.asm | 65 + gmp-6.3.0/mpn/arm/v6t2/gcd_22.asm | 113 ++ gmp-6.3.0/mpn/arm/v7a/cora15/addmul_1.asm | 145 ++ gmp-6.3.0/mpn/arm/v7a/cora15/aors_n.asm | 162 ++ gmp-6.3.0/mpn/arm/v7a/cora15/bdiv_q_1.asm | 36 + gmp-6.3.0/mpn/arm/v7a/cora15/cnd_aors_n.asm | 158 ++ gmp-6.3.0/mpn/arm/v7a/cora15/com.asm | 180 ++ gmp-6.3.0/mpn/arm/v7a/cora15/gmp-mparam.h | 212 +++ gmp-6.3.0/mpn/arm/v7a/cora15/logops_n.asm | 253 +++ gmp-6.3.0/mpn/arm/v7a/cora15/mul_1.asm | 104 ++ .../mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm | 43 + .../mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm | 43 + .../mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm | 144 ++ gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm | 97 ++ gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm | 110 ++ gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm | 90 + gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm | 177 ++ gmp-6.3.0/mpn/arm/v7a/cora15/submul_1.asm | 159 ++ gmp-6.3.0/mpn/arm/v7a/cora17/addmul_1.asm | 34 + gmp-6.3.0/mpn/arm/v7a/cora17/gmp-mparam.h | 233 +++ gmp-6.3.0/mpn/arm/v7a/cora17/mod_34lsub1.asm | 121 ++ gmp-6.3.0/mpn/arm/v7a/cora17/mul_1.asm | 34 + gmp-6.3.0/mpn/arm/v7a/cora17/submul_1.asm | 34 + gmp-6.3.0/mpn/arm/v7a/cora5/gmp-mparam.h | 205 +++ gmp-6.3.0/mpn/arm/v7a/cora7/gmp-mparam.h | 202 +++ gmp-6.3.0/mpn/arm/v7a/cora8/bdiv_q_1.asm | 158 ++ gmp-6.3.0/mpn/arm/v7a/cora8/gmp-mparam.h | 207 +++ gmp-6.3.0/mpn/arm/v7a/cora9/bdiv_q_1.asm | 36 + gmp-6.3.0/mpn/arm/v7a/cora9/gmp-mparam.h | 211 +++ gmp-6.3.0/mpn/arm64/aors_n.asm | 125 ++ gmp-6.3.0/mpn/arm64/aorsmul_1.asm | 145 ++ gmp-6.3.0/mpn/arm64/aorsorrlsh1_n.asm | 43 + gmp-6.3.0/mpn/arm64/aorsorrlsh2_n.asm | 43 + gmp-6.3.0/mpn/arm64/aorsorrlshC_n.asm | 139 ++ gmp-6.3.0/mpn/arm64/applem1/addaddmul_1msb0.asm | 92 + gmp-6.3.0/mpn/arm64/applem1/aorsmul_1.asm | 161 ++ gmp-6.3.0/mpn/arm64/applem1/gmp-mparam.h | 187 +++ gmp-6.3.0/mpn/arm64/applem1/sqr_basecase.asm | 318 ++++ gmp-6.3.0/mpn/arm64/arm64-defs.m4 | 53 + gmp-6.3.0/mpn/arm64/bdiv_dbm1c.asm | 111 ++ gmp-6.3.0/mpn/arm64/bdiv_q_1.asm | 122 ++ gmp-6.3.0/mpn/arm64/cnd_aors_n.asm | 129 ++ gmp-6.3.0/mpn/arm64/com.asm | 92 + gmp-6.3.0/mpn/arm64/copyd.asm | 85 + gmp-6.3.0/mpn/arm64/copyi.asm | 82 + gmp-6.3.0/mpn/arm64/cora53/cnd_aors_n.asm | 99 ++ gmp-6.3.0/mpn/arm64/cora53/gmp-mparam.h | 242 +++ gmp-6.3.0/mpn/arm64/cora57/gmp-mparam.h | 188 +++ gmp-6.3.0/mpn/arm64/cora72/gmp-mparam.h | 242 +++ 
gmp-6.3.0/mpn/arm64/cora73/gmp-mparam.h | 225 +++ gmp-6.3.0/mpn/arm64/darwin.m4 | 50 + gmp-6.3.0/mpn/arm64/divrem_1.asm | 231 +++ gmp-6.3.0/mpn/arm64/gcd_11.asm | 70 + gmp-6.3.0/mpn/arm64/gcd_22.asm | 112 ++ gmp-6.3.0/mpn/arm64/gmp-mparam.h | 192 +++ gmp-6.3.0/mpn/arm64/hamdist.asm | 181 ++ gmp-6.3.0/mpn/arm64/invert_limb.asm | 83 + gmp-6.3.0/mpn/arm64/logops_n.asm | 139 ++ gmp-6.3.0/mpn/arm64/lshift.asm | 138 ++ gmp-6.3.0/mpn/arm64/lshiftc.asm | 141 ++ gmp-6.3.0/mpn/arm64/mod_34lsub1.asm | 124 ++ gmp-6.3.0/mpn/arm64/mul_1.asm | 128 ++ gmp-6.3.0/mpn/arm64/popcount.asm | 157 ++ gmp-6.3.0/mpn/arm64/rsh1aors_n.asm | 168 ++ gmp-6.3.0/mpn/arm64/rshift.asm | 136 ++ gmp-6.3.0/mpn/arm64/sec_tabselect.asm | 122 ++ gmp-6.3.0/mpn/arm64/sqr_diag_addlsh1.asm | 102 ++ gmp-6.3.0/mpn/arm64/xgene1/gmp-mparam.h | 182 ++ gmp-6.3.0/mpn/asm-defs.m4 | 1766 ++++++++++++++++++++ gmp-6.3.0/mpn/bdiv_dbm1c.asm | 1 + gmp-6.3.0/mpn/bdiv_q.c | 1 + gmp-6.3.0/mpn/bdiv_q_1.asm | 1 + gmp-6.3.0/mpn/bdiv_qr.c | 1 + gmp-6.3.0/mpn/binvert.c | 1 + gmp-6.3.0/mpn/broot.c | 1 + gmp-6.3.0/mpn/brootinv.c | 1 + gmp-6.3.0/mpn/bsqrt.c | 1 + gmp-6.3.0/mpn/bsqrtinv.c | 1 + gmp-6.3.0/mpn/cmp.c | 1 + gmp-6.3.0/mpn/cnd_add_n.asm | 1 + gmp-6.3.0/mpn/cnd_sub_n.asm | 1 + gmp-6.3.0/mpn/cnd_swap.c | 1 + gmp-6.3.0/mpn/com.c | 1 + gmp-6.3.0/mpn/comb_tables.c | 1 + gmp-6.3.0/mpn/compute_powtab.c | 1 + gmp-6.3.0/mpn/copyd.asm | 1 + gmp-6.3.0/mpn/copyi.asm | 1 + gmp-6.3.0/mpn/cpp-ccas | 118 ++ gmp-6.3.0/mpn/cray/README | 121 ++ gmp-6.3.0/mpn/cray/add_n.c | 90 + gmp-6.3.0/mpn/cray/cfp/addmul_1.c | 48 + gmp-6.3.0/mpn/cray/cfp/mul_1.c | 47 + gmp-6.3.0/mpn/cray/cfp/mulwwc90.s | 254 +++ gmp-6.3.0/mpn/cray/cfp/mulwwj90.s | 253 +++ gmp-6.3.0/mpn/cray/cfp/submul_1.c | 48 + gmp-6.3.0/mpn/cray/gmp-mparam.h | 74 + gmp-6.3.0/mpn/cray/hamdist.c | 42 + gmp-6.3.0/mpn/cray/ieee/addmul_1.c | 111 ++ gmp-6.3.0/mpn/cray/ieee/gmp-mparam.h | 73 + gmp-6.3.0/mpn/cray/ieee/invert_limb.c | 127 ++ gmp-6.3.0/mpn/cray/ieee/mul_1.c | 103 ++ gmp-6.3.0/mpn/cray/ieee/mul_basecase.c | 107 ++ gmp-6.3.0/mpn/cray/ieee/sqr_basecase.c | 105 ++ gmp-6.3.0/mpn/cray/ieee/submul_1.c | 111 ++ gmp-6.3.0/mpn/cray/lshift.c | 58 + gmp-6.3.0/mpn/cray/mulww.f | 63 + gmp-6.3.0/mpn/cray/popcount.c | 42 + gmp-6.3.0/mpn/cray/rshift.c | 58 + gmp-6.3.0/mpn/cray/sub_n.c | 90 + gmp-6.3.0/mpn/dcpi1_bdiv_q.c | 1 + gmp-6.3.0/mpn/dcpi1_bdiv_qr.c | 1 + gmp-6.3.0/mpn/dcpi1_div_q.c | 1 + gmp-6.3.0/mpn/dcpi1_div_qr.c | 1 + gmp-6.3.0/mpn/dcpi1_divappr_q.c | 1 + gmp-6.3.0/mpn/div_q.c | 1 + gmp-6.3.0/mpn/div_qr_1.c | 1 + gmp-6.3.0/mpn/div_qr_1n_pi1.c | 1 + gmp-6.3.0/mpn/div_qr_2.c | 1 + gmp-6.3.0/mpn/div_qr_2n_pi1.c | 1 + gmp-6.3.0/mpn/div_qr_2u_pi1.c | 1 + gmp-6.3.0/mpn/dive_1.asm | 1 + gmp-6.3.0/mpn/diveby3.c | 1 + gmp-6.3.0/mpn/divexact.c | 1 + gmp-6.3.0/mpn/divis.c | 1 + gmp-6.3.0/mpn/divrem.c | 1 + gmp-6.3.0/mpn/divrem_1.asm | 1 + gmp-6.3.0/mpn/divrem_2.asm | 1 + gmp-6.3.0/mpn/dump.c | 1 + gmp-6.3.0/mpn/fib2_ui.c | 1 + gmp-6.3.0/mpn/fib2m.c | 1 + gmp-6.3.0/mpn/fib_table.c | 61 + gmp-6.3.0/mpn/gcd.c | 1 + gmp-6.3.0/mpn/gcd_1.c | 1 + gmp-6.3.0/mpn/gcd_11.asm | 1 + gmp-6.3.0/mpn/gcd_22.c | 1 + gmp-6.3.0/mpn/gcd_subdiv_step.c | 1 + gmp-6.3.0/mpn/gcdext.c | 1 + gmp-6.3.0/mpn/gcdext_1.c | 1 + gmp-6.3.0/mpn/gcdext_lehmer.c | 1 + gmp-6.3.0/mpn/generic/add.c | 33 + gmp-6.3.0/mpn/generic/add_1.c | 33 + gmp-6.3.0/mpn/generic/add_err1_n.c | 100 ++ gmp-6.3.0/mpn/generic/add_err2_n.c | 116 ++ gmp-6.3.0/mpn/generic/add_err3_n.c | 131 ++ gmp-6.3.0/mpn/generic/add_n.c | 89 + gmp-6.3.0/mpn/generic/add_n_sub_n.c | 172 ++ 
gmp-6.3.0/mpn/generic/addmul_1.c | 145 ++ gmp-6.3.0/mpn/generic/bdiv_dbm1c.c | 58 + gmp-6.3.0/mpn/generic/bdiv_q.c | 76 + gmp-6.3.0/mpn/generic/bdiv_q_1.c | 121 ++ gmp-6.3.0/mpn/generic/bdiv_qr.c | 84 + gmp-6.3.0/mpn/generic/binvert.c | 106 ++ gmp-6.3.0/mpn/generic/broot.c | 195 +++ gmp-6.3.0/mpn/generic/brootinv.c | 159 ++ gmp-6.3.0/mpn/generic/bsqrt.c | 47 + gmp-6.3.0/mpn/generic/bsqrtinv.c | 103 ++ gmp-6.3.0/mpn/generic/cmp.c | 33 + gmp-6.3.0/mpn/generic/cnd_add_n.c | 69 + gmp-6.3.0/mpn/generic/cnd_sub_n.c | 69 + gmp-6.3.0/mpn/generic/cnd_swap.c | 50 + gmp-6.3.0/mpn/generic/com.c | 44 + gmp-6.3.0/mpn/generic/comb_tables.c | 47 + gmp-6.3.0/mpn/generic/compute_powtab.c | 373 +++++ gmp-6.3.0/mpn/generic/copyd.c | 40 + gmp-6.3.0/mpn/generic/copyi.c | 42 + gmp-6.3.0/mpn/generic/dcpi1_bdiv_q.c | 161 ++ gmp-6.3.0/mpn/generic/dcpi1_bdiv_qr.c | 176 ++ gmp-6.3.0/mpn/generic/dcpi1_div_q.c | 86 + gmp-6.3.0/mpn/generic/dcpi1_div_qr.c | 248 +++ gmp-6.3.0/mpn/generic/dcpi1_divappr_q.c | 256 +++ gmp-6.3.0/mpn/generic/div_q.c | 313 ++++ gmp-6.3.0/mpn/generic/div_qr_1.c | 125 ++ gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c | 505 ++++++ gmp-6.3.0/mpn/generic/div_qr_1n_pi2.c | 203 +++ gmp-6.3.0/mpn/generic/div_qr_1u_pi2.c | 236 +++ gmp-6.3.0/mpn/generic/div_qr_2.c | 314 ++++ gmp-6.3.0/mpn/generic/div_qr_2n_pi1.c | 84 + gmp-6.3.0/mpn/generic/div_qr_2u_pi1.c | 76 + gmp-6.3.0/mpn/generic/dive_1.c | 146 ++ gmp-6.3.0/mpn/generic/diveby3.c | 173 ++ gmp-6.3.0/mpn/generic/divexact.c | 296 ++++ gmp-6.3.0/mpn/generic/divis.c | 194 +++ gmp-6.3.0/mpn/generic/divrem.c | 103 ++ gmp-6.3.0/mpn/generic/divrem_1.c | 254 +++ gmp-6.3.0/mpn/generic/divrem_2.c | 118 ++ gmp-6.3.0/mpn/generic/dump.c | 99 ++ gmp-6.3.0/mpn/generic/fib2_ui.c | 174 ++ gmp-6.3.0/mpn/generic/fib2m.c | 252 +++ gmp-6.3.0/mpn/generic/gcd.c | 266 +++ gmp-6.3.0/mpn/generic/gcd_1.c | 103 ++ gmp-6.3.0/mpn/generic/gcd_11.c | 74 + gmp-6.3.0/mpn/generic/gcd_22.c | 131 ++ gmp-6.3.0/mpn/generic/gcd_subdiv_step.c | 204 +++ gmp-6.3.0/mpn/generic/gcdext.c | 557 ++++++ gmp-6.3.0/mpn/generic/gcdext_1.c | 275 +++ gmp-6.3.0/mpn/generic/gcdext_lehmer.c | 336 ++++ gmp-6.3.0/mpn/generic/get_d.c | 438 +++++ gmp-6.3.0/mpn/generic/get_str.c | 451 +++++ gmp-6.3.0/mpn/generic/gmp-mparam.h | 33 + gmp-6.3.0/mpn/generic/hgcd.c | 182 ++ gmp-6.3.0/mpn/generic/hgcd2-div.h | 504 ++++++ gmp-6.3.0/mpn/generic/hgcd2.c | 283 ++++ gmp-6.3.0/mpn/generic/hgcd2_jacobi.c | 251 +++ gmp-6.3.0/mpn/generic/hgcd_appr.c | 267 +++ gmp-6.3.0/mpn/generic/hgcd_jacobi.c | 243 +++ gmp-6.3.0/mpn/generic/hgcd_matrix.c | 265 +++ gmp-6.3.0/mpn/generic/hgcd_reduce.c | 242 +++ gmp-6.3.0/mpn/generic/hgcd_step.c | 127 ++ gmp-6.3.0/mpn/generic/invert.c | 86 + gmp-6.3.0/mpn/generic/invertappr.c | 300 ++++ gmp-6.3.0/mpn/generic/jacbase.c | 242 +++ gmp-6.3.0/mpn/generic/jacobi.c | 294 ++++ gmp-6.3.0/mpn/generic/jacobi_2.c | 351 ++++ gmp-6.3.0/mpn/generic/logops_n.c | 77 + gmp-6.3.0/mpn/generic/lshift.c | 72 + gmp-6.3.0/mpn/generic/lshiftc.c | 73 + gmp-6.3.0/mpn/generic/matrix22_mul.c | 321 ++++ .../mpn/generic/matrix22_mul1_inverse_vector.c | 64 + gmp-6.3.0/mpn/generic/mod_1.c | 278 +++ gmp-6.3.0/mpn/generic/mod_1_1.c | 341 ++++ gmp-6.3.0/mpn/generic/mod_1_2.c | 148 ++ gmp-6.3.0/mpn/generic/mod_1_3.c | 155 ++ gmp-6.3.0/mpn/generic/mod_1_4.c | 170 ++ gmp-6.3.0/mpn/generic/mod_34lsub1.c | 128 ++ gmp-6.3.0/mpn/generic/mode1o.c | 235 +++ gmp-6.3.0/mpn/generic/mu_bdiv_q.c | 281 ++++ gmp-6.3.0/mpn/generic/mu_bdiv_qr.c | 312 ++++ gmp-6.3.0/mpn/generic/mu_div_q.c | 184 ++ gmp-6.3.0/mpn/generic/mu_div_qr.c | 417 +++++ 
gmp-6.3.0/mpn/generic/mu_divappr_q.c | 368 ++++ gmp-6.3.0/mpn/generic/mul.c | 441 +++++ gmp-6.3.0/mpn/generic/mul_1.c | 96 ++ gmp-6.3.0/mpn/generic/mul_basecase.c | 165 ++ gmp-6.3.0/mpn/generic/mul_fft.c | 1105 ++++++++++++ gmp-6.3.0/mpn/generic/mul_n.c | 96 ++ gmp-6.3.0/mpn/generic/mullo_basecase.c | 90 + gmp-6.3.0/mpn/generic/mullo_n.c | 243 +++ gmp-6.3.0/mpn/generic/mulmid.c | 255 +++ gmp-6.3.0/mpn/generic/mulmid_basecase.c | 82 + gmp-6.3.0/mpn/generic/mulmid_n.c | 61 + gmp-6.3.0/mpn/generic/mulmod_bknp1.c | 502 ++++++ gmp-6.3.0/mpn/generic/mulmod_bnm1.c | 374 +++++ gmp-6.3.0/mpn/generic/neg.c | 33 + gmp-6.3.0/mpn/generic/nussbaumer_mul.c | 70 + gmp-6.3.0/mpn/generic/perfpow.c | 342 ++++ gmp-6.3.0/mpn/generic/perfsqr.c | 238 +++ gmp-6.3.0/mpn/generic/popham.c | 125 ++ gmp-6.3.0/mpn/generic/pow_1.c | 135 ++ gmp-6.3.0/mpn/generic/powlo.c | 188 +++ gmp-6.3.0/mpn/generic/powm.c | 1003 +++++++++++ gmp-6.3.0/mpn/generic/pre_divrem_1.c | 145 ++ gmp-6.3.0/mpn/generic/pre_mod_1.c | 61 + gmp-6.3.0/mpn/generic/random.c | 50 + gmp-6.3.0/mpn/generic/random2.c | 105 ++ gmp-6.3.0/mpn/generic/redc_1.c | 56 + gmp-6.3.0/mpn/generic/redc_2.c | 110 ++ gmp-6.3.0/mpn/generic/redc_n.c | 80 + gmp-6.3.0/mpn/generic/remove.c | 182 ++ gmp-6.3.0/mpn/generic/rootrem.c | 515 ++++++ gmp-6.3.0/mpn/generic/rshift.c | 69 + gmp-6.3.0/mpn/generic/sbpi1_bdiv_q.c | 96 ++ gmp-6.3.0/mpn/generic/sbpi1_bdiv_qr.c | 82 + gmp-6.3.0/mpn/generic/sbpi1_bdiv_r.c | 79 + gmp-6.3.0/mpn/generic/sbpi1_div_q.c | 302 ++++ gmp-6.3.0/mpn/generic/sbpi1_div_qr.c | 109 ++ gmp-6.3.0/mpn/generic/sbpi1_divappr_q.c | 198 +++ gmp-6.3.0/mpn/generic/scan0.c | 59 + gmp-6.3.0/mpn/generic/scan1.c | 59 + gmp-6.3.0/mpn/generic/sec_aors_1.c | 59 + gmp-6.3.0/mpn/generic/sec_div.c | 131 ++ gmp-6.3.0/mpn/generic/sec_invert.c | 177 ++ gmp-6.3.0/mpn/generic/sec_mul.c | 48 + gmp-6.3.0/mpn/generic/sec_pi1_div.c | 172 ++ gmp-6.3.0/mpn/generic/sec_powm.c | 430 +++++ gmp-6.3.0/mpn/generic/sec_sqr.c | 76 + gmp-6.3.0/mpn/generic/sec_tabselect.c | 134 ++ gmp-6.3.0/mpn/generic/set_str.c | 290 ++++ gmp-6.3.0/mpn/generic/sizeinbase.c | 49 + gmp-6.3.0/mpn/generic/sqr.c | 98 ++ gmp-6.3.0/mpn/generic/sqr_basecase.c | 361 ++++ gmp-6.3.0/mpn/generic/sqrlo.c | 239 +++ gmp-6.3.0/mpn/generic/sqrlo_basecase.c | 194 +++ gmp-6.3.0/mpn/generic/sqrmod_bnm1.c | 328 ++++ gmp-6.3.0/mpn/generic/sqrtrem.c | 555 ++++++ gmp-6.3.0/mpn/generic/strongfibo.c | 219 +++ gmp-6.3.0/mpn/generic/sub.c | 33 + gmp-6.3.0/mpn/generic/sub_1.c | 33 + gmp-6.3.0/mpn/generic/sub_err1_n.c | 100 ++ gmp-6.3.0/mpn/generic/sub_err2_n.c | 116 ++ gmp-6.3.0/mpn/generic/sub_err3_n.c | 131 ++ gmp-6.3.0/mpn/generic/sub_n.c | 89 + gmp-6.3.0/mpn/generic/submul_1.c | 144 ++ gmp-6.3.0/mpn/generic/tdiv_qr.c | 386 +++++ gmp-6.3.0/mpn/generic/toom22_mul.c | 222 +++ gmp-6.3.0/mpn/generic/toom2_sqr.c | 155 ++ gmp-6.3.0/mpn/generic/toom32_mul.c | 320 ++++ gmp-6.3.0/mpn/generic/toom33_mul.c | 316 ++++ gmp-6.3.0/mpn/generic/toom3_sqr.c | 221 +++ gmp-6.3.0/mpn/generic/toom42_mul.c | 234 +++ gmp-6.3.0/mpn/generic/toom42_mulmid.c | 237 +++ gmp-6.3.0/mpn/generic/toom43_mul.c | 238 +++ gmp-6.3.0/mpn/generic/toom44_mul.c | 239 +++ gmp-6.3.0/mpn/generic/toom4_sqr.c | 164 ++ gmp-6.3.0/mpn/generic/toom52_mul.c | 256 +++ gmp-6.3.0/mpn/generic/toom53_mul.c | 331 ++++ gmp-6.3.0/mpn/generic/toom54_mul.c | 142 ++ gmp-6.3.0/mpn/generic/toom62_mul.c | 310 ++++ gmp-6.3.0/mpn/generic/toom63_mul.c | 231 +++ gmp-6.3.0/mpn/generic/toom6_sqr.c | 181 ++ gmp-6.3.0/mpn/generic/toom6h_mul.c | 262 +++ gmp-6.3.0/mpn/generic/toom8_sqr.c | 225 +++ 
gmp-6.3.0/mpn/generic/toom8h_mul.c | 305 ++++ gmp-6.3.0/mpn/generic/toom_couple_handling.c | 80 + gmp-6.3.0/mpn/generic/toom_eval_dgr3_pm1.c | 72 + gmp-6.3.0/mpn/generic/toom_eval_dgr3_pm2.c | 97 ++ gmp-6.3.0/mpn/generic/toom_eval_pm1.c | 89 + gmp-6.3.0/mpn/generic/toom_eval_pm2.c | 130 ++ gmp-6.3.0/mpn/generic/toom_eval_pm2exp.c | 127 ++ gmp-6.3.0/mpn/generic/toom_eval_pm2rexp.c | 101 ++ gmp-6.3.0/mpn/generic/toom_interpolate_12pts.c | 374 +++++ gmp-6.3.0/mpn/generic/toom_interpolate_16pts.c | 545 ++++++ gmp-6.3.0/mpn/generic/toom_interpolate_5pts.c | 198 +++ gmp-6.3.0/mpn/generic/toom_interpolate_6pts.c | 241 +++ gmp-6.3.0/mpn/generic/toom_interpolate_7pts.c | 274 +++ gmp-6.3.0/mpn/generic/toom_interpolate_8pts.c | 211 +++ gmp-6.3.0/mpn/generic/trialdiv.c | 131 ++ gmp-6.3.0/mpn/generic/udiv_w_sdiv.c | 141 ++ gmp-6.3.0/mpn/generic/zero.c | 41 + gmp-6.3.0/mpn/generic/zero_p.c | 33 + gmp-6.3.0/mpn/get_d.c | 1 + gmp-6.3.0/mpn/get_str.c | 1 + gmp-6.3.0/mpn/hamdist.asm | 1 + gmp-6.3.0/mpn/hgcd.c | 1 + gmp-6.3.0/mpn/hgcd2.c | 1 + gmp-6.3.0/mpn/hgcd2_jacobi.c | 1 + gmp-6.3.0/mpn/hgcd_appr.c | 1 + gmp-6.3.0/mpn/hgcd_jacobi.c | 1 + gmp-6.3.0/mpn/hgcd_matrix.c | 1 + gmp-6.3.0/mpn/hgcd_reduce.c | 1 + gmp-6.3.0/mpn/hgcd_step.c | 1 + gmp-6.3.0/mpn/ia64/README | 281 ++++ gmp-6.3.0/mpn/ia64/add_n_sub_n.asm | 307 ++++ gmp-6.3.0/mpn/ia64/addmul_1.asm | 602 +++++++ gmp-6.3.0/mpn/ia64/addmul_2.asm | 715 ++++++++ gmp-6.3.0/mpn/ia64/aors_n.asm | 852 ++++++++++ gmp-6.3.0/mpn/ia64/aorsorrlsh1_n.asm | 48 + gmp-6.3.0/mpn/ia64/aorsorrlsh2_n.asm | 48 + gmp-6.3.0/mpn/ia64/aorsorrlshC_n.asm | 412 +++++ gmp-6.3.0/mpn/ia64/bdiv_dbm1c.asm | 516 ++++++ gmp-6.3.0/mpn/ia64/cnd_aors_n.asm | 264 +++ gmp-6.3.0/mpn/ia64/copyd.asm | 186 +++ gmp-6.3.0/mpn/ia64/copyi.asm | 182 ++ gmp-6.3.0/mpn/ia64/dive_1.asm | 236 +++ gmp-6.3.0/mpn/ia64/divrem_1.asm | 477 ++++++ gmp-6.3.0/mpn/ia64/divrem_2.asm | 280 ++++ gmp-6.3.0/mpn/ia64/gcd_11.asm | 110 ++ gmp-6.3.0/mpn/ia64/gmp-mparam.h | 212 +++ gmp-6.3.0/mpn/ia64/hamdist.asm | 365 ++++ gmp-6.3.0/mpn/ia64/ia64-defs.m4 | 147 ++ gmp-6.3.0/mpn/ia64/invert_limb.asm | 105 ++ gmp-6.3.0/mpn/ia64/logops_n.asm | 292 ++++ gmp-6.3.0/mpn/ia64/lorrshift.asm | 358 ++++ gmp-6.3.0/mpn/ia64/lshiftc.asm | 463 +++++ gmp-6.3.0/mpn/ia64/mod_34lsub1.asm | 237 +++ gmp-6.3.0/mpn/ia64/mode1o.asm | 342 ++++ gmp-6.3.0/mpn/ia64/mul_1.asm | 584 +++++++ gmp-6.3.0/mpn/ia64/mul_2.asm | 625 +++++++ gmp-6.3.0/mpn/ia64/popcount.asm | 200 +++ gmp-6.3.0/mpn/ia64/rsh1aors_n.asm | 447 +++++ gmp-6.3.0/mpn/ia64/sec_tabselect.asm | 148 ++ gmp-6.3.0/mpn/ia64/sqr_diag_addlsh1.asm | 156 ++ gmp-6.3.0/mpn/ia64/submul_1.asm | 647 +++++++ gmp-6.3.0/mpn/invert.c | 1 + gmp-6.3.0/mpn/invertappr.c | 1 + gmp-6.3.0/mpn/ior_n.c | 1 + gmp-6.3.0/mpn/iorn_n.c | 1 + gmp-6.3.0/mpn/jacbase.c | 1 + gmp-6.3.0/mpn/jacobi.c | 1 + gmp-6.3.0/mpn/jacobi_2.c | 1 + gmp-6.3.0/mpn/jacobitab.h | 13 + gmp-6.3.0/mpn/lisp/gmpasm-mode.el | 385 +++++ gmp-6.3.0/mpn/loongarch/64/add_n.asm | 64 + gmp-6.3.0/mpn/loongarch/64/aorslsh1_n.asm | 50 + gmp-6.3.0/mpn/loongarch/64/aorslsh2_n.asm | 50 + gmp-6.3.0/mpn/loongarch/64/aorslshC_n.asm | 116 ++ gmp-6.3.0/mpn/loongarch/64/aorsmul_1.asm | 120 ++ gmp-6.3.0/mpn/loongarch/64/cnd_aors_n.asm | 99 ++ gmp-6.3.0/mpn/loongarch/64/copyd.asm | 75 + gmp-6.3.0/mpn/loongarch/64/copyi.asm | 73 + gmp-6.3.0/mpn/loongarch/64/lshift.asm | 120 ++ gmp-6.3.0/mpn/loongarch/64/mul_1.asm | 97 ++ gmp-6.3.0/mpn/loongarch/64/rshift.asm | 119 ++ gmp-6.3.0/mpn/loongarch/64/sub_n.asm | 106 ++ gmp-6.3.0/mpn/lshift.asm | 1 + gmp-6.3.0/mpn/lshiftc.c | 1 + 
gmp-6.3.0/mpn/m4-ccas | 107 ++ gmp-6.3.0/mpn/m68k/README | 138 ++ gmp-6.3.0/mpn/m68k/aors_n.asm | 99 ++ gmp-6.3.0/mpn/m68k/gmp-mparam.h | 76 + gmp-6.3.0/mpn/m68k/lshift.asm | 175 ++ gmp-6.3.0/mpn/m68k/m68k-defs.m4 | 230 +++ gmp-6.3.0/mpn/m68k/mc68020/aorsmul_1.asm | 101 ++ gmp-6.3.0/mpn/m68k/mc68020/mul_1.asm | 96 ++ gmp-6.3.0/mpn/m68k/mc68020/udiv.asm | 45 + gmp-6.3.0/mpn/m68k/mc68020/umul.asm | 44 + gmp-6.3.0/mpn/m68k/rshift.asm | 175 ++ gmp-6.3.0/mpn/m68k/t-m68k-defs.pl | 91 + gmp-6.3.0/mpn/m88k/README | 61 + gmp-6.3.0/mpn/m88k/add_n.s | 113 ++ gmp-6.3.0/mpn/m88k/mc88110/add_n.S | 209 +++ gmp-6.3.0/mpn/m88k/mc88110/addmul_1.s | 70 + gmp-6.3.0/mpn/m88k/mc88110/mul_1.s | 68 + gmp-6.3.0/mpn/m88k/mc88110/sub_n.S | 285 ++++ gmp-6.3.0/mpn/m88k/mul_1.s | 136 ++ gmp-6.3.0/mpn/m88k/sub_n.s | 115 ++ gmp-6.3.0/mpn/matrix22_mul.c | 1 + gmp-6.3.0/mpn/matrix22_mul1_inverse_vector.c | 1 + gmp-6.3.0/mpn/minithres/gmp-mparam.h | 113 ++ gmp-6.3.0/mpn/mips32/add_n.asm | 124 ++ gmp-6.3.0/mpn/mips32/addmul_1.asm | 101 ++ gmp-6.3.0/mpn/mips32/gmp-mparam.h | 72 + gmp-6.3.0/mpn/mips32/lshift.asm | 99 ++ gmp-6.3.0/mpn/mips32/mips-defs.m4 | 80 + gmp-6.3.0/mpn/mips32/mips.m4 | 80 + gmp-6.3.0/mpn/mips32/mul_1.asm | 89 + gmp-6.3.0/mpn/mips32/rshift.asm | 96 ++ gmp-6.3.0/mpn/mips32/sub_n.asm | 123 ++ gmp-6.3.0/mpn/mips32/submul_1.asm | 101 ++ gmp-6.3.0/mpn/mips32/umul.asm | 45 + gmp-6.3.0/mpn/mips64/README | 60 + gmp-6.3.0/mpn/mips64/add_n.asm | 134 ++ gmp-6.3.0/mpn/mips64/gmp-mparam.h | 72 + gmp-6.3.0/mpn/mips64/hilo/addmul_1.asm | 101 ++ gmp-6.3.0/mpn/mips64/hilo/mul_1.asm | 92 + gmp-6.3.0/mpn/mips64/hilo/sqr_diagonal.asm | 77 + gmp-6.3.0/mpn/mips64/hilo/submul_1.asm | 101 ++ gmp-6.3.0/mpn/mips64/hilo/umul.asm | 45 + gmp-6.3.0/mpn/mips64/lshift.asm | 99 ++ gmp-6.3.0/mpn/mips64/rshift.asm | 96 ++ gmp-6.3.0/mpn/mips64/sub_n.asm | 134 ++ gmp-6.3.0/mpn/mod_1.c | 1 + gmp-6.3.0/mpn/mod_1_1.asm | 1 + gmp-6.3.0/mpn/mod_1_2.c | 1 + gmp-6.3.0/mpn/mod_1_3.c | 1 + gmp-6.3.0/mpn/mod_1_4.asm | 1 + gmp-6.3.0/mpn/mod_34lsub1.asm | 1 + gmp-6.3.0/mpn/mode1o.asm | 1 + gmp-6.3.0/mpn/mp_bases.c | 268 +++ gmp-6.3.0/mpn/mu_bdiv_q.c | 1 + gmp-6.3.0/mpn/mu_bdiv_qr.c | 1 + gmp-6.3.0/mpn/mu_div_q.c | 1 + gmp-6.3.0/mpn/mu_div_qr.c | 1 + gmp-6.3.0/mpn/mu_divappr_q.c | 1 + gmp-6.3.0/mpn/mul.c | 1 + gmp-6.3.0/mpn/mul_1.asm | 1 + gmp-6.3.0/mpn/mul_basecase.asm | 1 + gmp-6.3.0/mpn/mul_fft.c | 1 + gmp-6.3.0/mpn/mul_n.c | 1 + gmp-6.3.0/mpn/mullo_basecase.c | 1 + gmp-6.3.0/mpn/mullo_n.c | 1 + gmp-6.3.0/mpn/mulmid.c | 1 + gmp-6.3.0/mpn/mulmid_basecase.c | 1 + gmp-6.3.0/mpn/mulmid_n.c | 1 + gmp-6.3.0/mpn/mulmod_bknp1.c | 1 + gmp-6.3.0/mpn/mulmod_bnm1.c | 1 + gmp-6.3.0/mpn/nand_n.c | 1 + gmp-6.3.0/mpn/neg.c | 1 + gmp-6.3.0/mpn/nior_n.c | 1 + gmp-6.3.0/mpn/nussbaumer_mul.c | 1 + gmp-6.3.0/mpn/pa32/README | 162 ++ gmp-6.3.0/mpn/pa32/add_n.asm | 63 + gmp-6.3.0/mpn/pa32/gmp-mparam.h | 61 + gmp-6.3.0/mpn/pa32/hppa1_1/addmul_1.asm | 106 ++ gmp-6.3.0/mpn/pa32/hppa1_1/gmp-mparam.h | 72 + gmp-6.3.0/mpn/pa32/hppa1_1/mul_1.asm | 102 ++ gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/add_n.asm | 83 + gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/addmul_1.asm | 201 +++ gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/lshift.asm | 95 ++ gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/rshift.asm | 92 + gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/sub_n.asm | 84 + gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/submul_1.asm | 207 +++ gmp-6.3.0/mpn/pa32/hppa1_1/sqr_diagonal.asm | 60 + gmp-6.3.0/mpn/pa32/hppa1_1/submul_1.asm | 115 ++ gmp-6.3.0/mpn/pa32/hppa1_1/udiv.asm | 102 ++ gmp-6.3.0/mpn/pa32/hppa1_1/umul.asm | 47 + 
gmp-6.3.0/mpn/pa32/hppa2_0/add_n.asm | 107 ++ gmp-6.3.0/mpn/pa32/hppa2_0/gmp-mparam.h | 167 ++ gmp-6.3.0/mpn/pa32/hppa2_0/sqr_diagonal.asm | 112 ++ gmp-6.3.0/mpn/pa32/hppa2_0/sub_n.asm | 107 ++ gmp-6.3.0/mpn/pa32/lshift.asm | 75 + gmp-6.3.0/mpn/pa32/pa-defs.m4 | 64 + gmp-6.3.0/mpn/pa32/rshift.asm | 72 + gmp-6.3.0/mpn/pa32/sub_n.asm | 64 + gmp-6.3.0/mpn/pa32/udiv.asm | 291 ++++ gmp-6.3.0/mpn/pa64/README | 78 + gmp-6.3.0/mpn/pa64/addmul_1.asm | 693 ++++++++ gmp-6.3.0/mpn/pa64/aors_n.asm | 130 ++ gmp-6.3.0/mpn/pa64/aorslsh1_n.asm | 228 +++ gmp-6.3.0/mpn/pa64/gmp-mparam.h | 247 +++ gmp-6.3.0/mpn/pa64/lshift.asm | 114 ++ gmp-6.3.0/mpn/pa64/mul_1.asm | 646 +++++++ gmp-6.3.0/mpn/pa64/rshift.asm | 111 ++ gmp-6.3.0/mpn/pa64/sqr_diagonal.asm | 191 +++ gmp-6.3.0/mpn/pa64/submul_1.asm | 700 ++++++++ gmp-6.3.0/mpn/pa64/udiv.asm | 125 ++ gmp-6.3.0/mpn/pa64/umul.asm | 97 ++ gmp-6.3.0/mpn/perfpow.c | 1 + gmp-6.3.0/mpn/perfsqr.c | 1 + gmp-6.3.0/mpn/perfsqr.h | 50 + gmp-6.3.0/mpn/popcount.asm | 1 + gmp-6.3.0/mpn/pow_1.c | 1 + gmp-6.3.0/mpn/power/add_n.asm | 83 + gmp-6.3.0/mpn/power/addmul_1.asm | 126 ++ gmp-6.3.0/mpn/power/gmp-mparam.h | 69 + gmp-6.3.0/mpn/power/lshift.asm | 61 + gmp-6.3.0/mpn/power/mul_1.asm | 113 ++ gmp-6.3.0/mpn/power/rshift.asm | 59 + gmp-6.3.0/mpn/power/sdiv.asm | 39 + gmp-6.3.0/mpn/power/sub_n.asm | 85 + gmp-6.3.0/mpn/power/submul_1.asm | 131 ++ gmp-6.3.0/mpn/power/umul.asm | 43 + gmp-6.3.0/mpn/powerpc32/750/com.asm | 79 + gmp-6.3.0/mpn/powerpc32/750/gmp-mparam.h | 192 +++ gmp-6.3.0/mpn/powerpc32/750/lshift.asm | 155 ++ gmp-6.3.0/mpn/powerpc32/750/rshift.asm | 153 ++ gmp-6.3.0/mpn/powerpc32/README | 180 ++ gmp-6.3.0/mpn/powerpc32/addlsh1_n.asm | 100 ++ gmp-6.3.0/mpn/powerpc32/addmul_1.asm | 159 ++ gmp-6.3.0/mpn/powerpc32/aix.m4 | 82 + gmp-6.3.0/mpn/powerpc32/aors_n.asm | 157 ++ gmp-6.3.0/mpn/powerpc32/bdiv_dbm1c.asm | 131 ++ gmp-6.3.0/mpn/powerpc32/darwin.m4 | 91 + gmp-6.3.0/mpn/powerpc32/diveby3.asm | 93 ++ gmp-6.3.0/mpn/powerpc32/divrem_2.asm | 182 ++ gmp-6.3.0/mpn/powerpc32/eabi.m4 | 86 + gmp-6.3.0/mpn/powerpc32/elf.m4 | 100 ++ gmp-6.3.0/mpn/powerpc32/gmp-mparam.h | 222 +++ gmp-6.3.0/mpn/powerpc32/invert_limb.asm | 142 ++ gmp-6.3.0/mpn/powerpc32/lshift.asm | 168 ++ gmp-6.3.0/mpn/powerpc32/lshiftc.asm | 170 ++ gmp-6.3.0/mpn/powerpc32/mod_34lsub1.asm | 145 ++ gmp-6.3.0/mpn/powerpc32/mode1o.asm | 127 ++ gmp-6.3.0/mpn/powerpc32/mul_1.asm | 101 ++ gmp-6.3.0/mpn/powerpc32/p3-p7/aors_n.asm | 187 +++ gmp-6.3.0/mpn/powerpc32/p3/gmp-mparam.h | 155 ++ gmp-6.3.0/mpn/powerpc32/p4/gmp-mparam.h | 209 +++ gmp-6.3.0/mpn/powerpc32/p5/gmp-mparam.h | 156 ++ gmp-6.3.0/mpn/powerpc32/p6/gmp-mparam.h | 165 ++ gmp-6.3.0/mpn/powerpc32/p7/gmp-mparam.h | 170 ++ gmp-6.3.0/mpn/powerpc32/powerpc-defs.m4 | 128 ++ gmp-6.3.0/mpn/powerpc32/rshift.asm | 166 ++ gmp-6.3.0/mpn/powerpc32/sec_tabselect.asm | 143 ++ gmp-6.3.0/mpn/powerpc32/sqr_diag_addlsh1.asm | 80 + gmp-6.3.0/mpn/powerpc32/sublsh1_n.asm | 101 ++ gmp-6.3.0/mpn/powerpc32/submul_1.asm | 151 ++ gmp-6.3.0/mpn/powerpc32/umul.asm | 50 + gmp-6.3.0/mpn/powerpc32/vmx/copyd.asm | 203 +++ gmp-6.3.0/mpn/powerpc32/vmx/copyi.asm | 198 +++ gmp-6.3.0/mpn/powerpc32/vmx/logops_n.asm | 310 ++++ gmp-6.3.0/mpn/powerpc32/vmx/mod_34lsub1.asm | 388 +++++ gmp-6.3.0/mpn/powerpc32/vmx/popcount.asm | 34 + gmp-6.3.0/mpn/powerpc64/README | 166 ++ gmp-6.3.0/mpn/powerpc64/aix.m4 | 99 ++ gmp-6.3.0/mpn/powerpc64/com.asm | 136 ++ gmp-6.3.0/mpn/powerpc64/copyd.asm | 84 + gmp-6.3.0/mpn/powerpc64/copyi.asm | 78 + gmp-6.3.0/mpn/powerpc64/darwin.m4 | 122 ++ gmp-6.3.0/mpn/powerpc64/elf.m4 | 123 
++ gmp-6.3.0/mpn/powerpc64/logops_n.asm | 151 ++ gmp-6.3.0/mpn/powerpc64/lshift.asm | 207 +++ gmp-6.3.0/mpn/powerpc64/lshiftc.asm | 210 +++ gmp-6.3.0/mpn/powerpc64/mode32/add_n.asm | 86 + gmp-6.3.0/mpn/powerpc64/mode32/addmul_1.asm | 79 + gmp-6.3.0/mpn/powerpc64/mode32/mul_1.asm | 73 + gmp-6.3.0/mpn/powerpc64/mode32/p4/gmp-mparam.h | 182 ++ gmp-6.3.0/mpn/powerpc64/mode32/sqr_diagonal.asm | 117 ++ gmp-6.3.0/mpn/powerpc64/mode32/sub_n.asm | 88 + gmp-6.3.0/mpn/powerpc64/mode32/submul_1.asm | 81 + gmp-6.3.0/mpn/powerpc64/mode64/aors_n.asm | 189 +++ gmp-6.3.0/mpn/powerpc64/mode64/aorsmul_1.asm | 225 +++ gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh1_n.asm | 43 + gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh2_n.asm | 43 + gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlshC_n.asm | 187 +++ gmp-6.3.0/mpn/powerpc64/mode64/bdiv_dbm1c.asm | 132 ++ gmp-6.3.0/mpn/powerpc64/mode64/bdiv_q_1.asm | 146 ++ gmp-6.3.0/mpn/powerpc64/mode64/cnd_aors_n.asm | 196 +++ gmp-6.3.0/mpn/powerpc64/mode64/dive_1.asm | 135 ++ gmp-6.3.0/mpn/powerpc64/mode64/divrem_1.asm | 274 +++ gmp-6.3.0/mpn/powerpc64/mode64/divrem_2.asm | 187 +++ gmp-6.3.0/mpn/powerpc64/mode64/gcd_11.asm | 77 + gmp-6.3.0/mpn/powerpc64/mode64/gmp-mparam.h | 82 + gmp-6.3.0/mpn/powerpc64/mode64/invert_limb.asm | 88 + gmp-6.3.0/mpn/powerpc64/mode64/mod_1_1.asm | 164 ++ gmp-6.3.0/mpn/powerpc64/mode64/mod_1_4.asm | 270 +++ gmp-6.3.0/mpn/powerpc64/mode64/mod_34lsub1.asm | 132 ++ gmp-6.3.0/mpn/powerpc64/mode64/mode1o.asm | 117 ++ gmp-6.3.0/mpn/powerpc64/mode64/mul_1.asm | 168 ++ gmp-6.3.0/mpn/powerpc64/mode64/mul_basecase.asm | 708 ++++++++ gmp-6.3.0/mpn/powerpc64/mode64/p3/gmp-mparam.h | 179 ++ gmp-6.3.0/mpn/powerpc64/mode64/p4/gmp-mparam.h | 214 +++ gmp-6.3.0/mpn/powerpc64/mode64/p5/gmp-mparam.h | 219 +++ gmp-6.3.0/mpn/powerpc64/mode64/p6/aorsmul_1.asm | 185 ++ gmp-6.3.0/mpn/powerpc64/mode64/p6/gmp-mparam.h | 160 ++ gmp-6.3.0/mpn/powerpc64/mode64/p6/mul_basecase.asm | 589 +++++++ gmp-6.3.0/mpn/powerpc64/mode64/p7/aormul_2.asm | 135 ++ gmp-6.3.0/mpn/powerpc64/mode64/p7/aors_n.asm | 128 ++ .../mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm | 43 + .../mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm | 43 + .../mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm | 129 ++ gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_11.asm | 67 + gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_22.asm | 146 ++ gmp-6.3.0/mpn/powerpc64/mode64/p7/gmp-mparam.h | 175 ++ gmp-6.3.0/mpn/powerpc64/mode64/p8/gmp-mparam.h | 171 ++ gmp-6.3.0/mpn/powerpc64/mode64/p8/invert_limb.asm | 53 + gmp-6.3.0/mpn/powerpc64/mode64/p9/add_n_sub_n.asm | 112 ++ .../mpn/powerpc64/mode64/p9/addaddmul_1msb0.asm | 106 ++ gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_1.asm | 130 ++ gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_2.asm | 193 +++ gmp-6.3.0/mpn/powerpc64/mode64/p9/aorsmul_1.asm | 179 ++ gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_11.asm | 64 + gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_22.asm | 143 ++ gmp-6.3.0/mpn/powerpc64/mode64/p9/gmp-mparam.h | 254 +++ gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_1.asm | 126 ++ gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_2.asm | 181 ++ gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_basecase.asm | 415 +++++ gmp-6.3.0/mpn/powerpc64/mode64/p9/sqr_basecase.asm | 555 ++++++ gmp-6.3.0/mpn/powerpc64/mode64/rsh1aors_n.asm | 173 ++ gmp-6.3.0/mpn/powerpc64/mode64/sqr_basecase.asm | 863 ++++++++++ gmp-6.3.0/mpn/powerpc64/p6/lshift.asm | 132 ++ gmp-6.3.0/mpn/powerpc64/p6/lshiftc.asm | 136 ++ gmp-6.3.0/mpn/powerpc64/p6/rshift.asm | 131 ++ gmp-6.3.0/mpn/powerpc64/p7/copyd.asm | 128 ++ gmp-6.3.0/mpn/powerpc64/p7/copyi.asm | 129 ++ gmp-6.3.0/mpn/powerpc64/p7/hamdist.asm | 110 
++ gmp-6.3.0/mpn/powerpc64/p7/popcount.asm | 90 + gmp-6.3.0/mpn/powerpc64/rshift.asm | 207 +++ gmp-6.3.0/mpn/powerpc64/sec_tabselect.asm | 147 ++ gmp-6.3.0/mpn/powerpc64/umul.asm | 53 + gmp-6.3.0/mpn/powerpc64/vmx/popcount.asm | 230 +++ gmp-6.3.0/mpn/powlo.c | 1 + gmp-6.3.0/mpn/powm.c | 1 + gmp-6.3.0/mpn/pre_mod_1.c | 1 + gmp-6.3.0/mpn/random.c | 1 + gmp-6.3.0/mpn/random2.c | 1 + gmp-6.3.0/mpn/redc_1.c | 1 + gmp-6.3.0/mpn/redc_2.c | 1 + gmp-6.3.0/mpn/redc_n.c | 1 + gmp-6.3.0/mpn/remove.c | 1 + gmp-6.3.0/mpn/riscv/64/aors_n.asm | 135 ++ gmp-6.3.0/mpn/riscv/64/aorsmul_1.asm | 75 + gmp-6.3.0/mpn/riscv/64/cnd_aors_n.asm | 97 ++ gmp-6.3.0/mpn/riscv/64/copyd.asm | 87 + gmp-6.3.0/mpn/riscv/64/copyi.asm | 84 + gmp-6.3.0/mpn/riscv/64/lshift.asm | 121 ++ gmp-6.3.0/mpn/riscv/64/mul_1.asm | 58 + gmp-6.3.0/mpn/riscv/64/rshift.asm | 119 ++ gmp-6.3.0/mpn/riscv/64/sec_tabselect.asm | 140 ++ gmp-6.3.0/mpn/rootrem.c | 1 + gmp-6.3.0/mpn/rshift.asm | 1 + gmp-6.3.0/mpn/s390_32/README | 37 + gmp-6.3.0/mpn/s390_32/addmul_1.asm | 93 ++ gmp-6.3.0/mpn/s390_32/copyd.asm | 145 ++ gmp-6.3.0/mpn/s390_32/copyi.asm | 69 + gmp-6.3.0/mpn/s390_32/esame/addmul_1.asm | 72 + gmp-6.3.0/mpn/s390_32/esame/aors_n.asm | 137 ++ gmp-6.3.0/mpn/s390_32/esame/aorslsh1_n.asm | 173 ++ gmp-6.3.0/mpn/s390_32/esame/bdiv_dbm1c.asm | 65 + gmp-6.3.0/mpn/s390_32/esame/gmp-mparam.h | 177 ++ gmp-6.3.0/mpn/s390_32/esame/mul_1.asm | 66 + gmp-6.3.0/mpn/s390_32/esame/mul_basecase.asm | 130 ++ gmp-6.3.0/mpn/s390_32/esame/sqr_basecase.asm | 203 +++ gmp-6.3.0/mpn/s390_32/esame/submul_1.asm | 70 + gmp-6.3.0/mpn/s390_32/gmp-mparam.h | 138 ++ gmp-6.3.0/mpn/s390_32/logops_n.asm | 295 ++++ gmp-6.3.0/mpn/s390_32/lshift.asm | 144 ++ gmp-6.3.0/mpn/s390_32/lshiftc.asm | 156 ++ gmp-6.3.0/mpn/s390_32/mul_1.asm | 85 + gmp-6.3.0/mpn/s390_32/rshift.asm | 138 ++ gmp-6.3.0/mpn/s390_32/sec_tabselect.asm | 140 ++ gmp-6.3.0/mpn/s390_32/submul_1.asm | 93 ++ gmp-6.3.0/mpn/s390_64/README | 88 + gmp-6.3.0/mpn/s390_64/addmul_1.asm | 72 + gmp-6.3.0/mpn/s390_64/aorrlsh1_n.asm | 168 ++ gmp-6.3.0/mpn/s390_64/aors_n.asm | 136 ++ gmp-6.3.0/mpn/s390_64/bdiv_dbm1c.asm | 65 + gmp-6.3.0/mpn/s390_64/copyd.asm | 144 ++ gmp-6.3.0/mpn/s390_64/copyi.asm | 68 + gmp-6.3.0/mpn/s390_64/gmp-mparam.h | 181 ++ gmp-6.3.0/mpn/s390_64/invert_limb.asm | 94 ++ gmp-6.3.0/mpn/s390_64/logops_n.asm | 291 ++++ gmp-6.3.0/mpn/s390_64/lshift.asm | 196 +++ gmp-6.3.0/mpn/s390_64/lshiftc.asm | 207 +++ gmp-6.3.0/mpn/s390_64/mod_34lsub1.asm | 109 ++ gmp-6.3.0/mpn/s390_64/mul_1.asm | 66 + gmp-6.3.0/mpn/s390_64/mul_basecase.asm | 130 ++ gmp-6.3.0/mpn/s390_64/rshift.asm | 195 +++ gmp-6.3.0/mpn/s390_64/sec_tabselect.asm | 139 ++ gmp-6.3.0/mpn/s390_64/sqr_basecase.asm | 203 +++ gmp-6.3.0/mpn/s390_64/sublsh1_n.asm | 169 ++ gmp-6.3.0/mpn/s390_64/submul_1.asm | 70 + gmp-6.3.0/mpn/s390_64/z10/gmp-mparam.h | 233 +++ gmp-6.3.0/mpn/s390_64/z13/addmul_1.asm | 173 ++ gmp-6.3.0/mpn/s390_64/z13/addmul_1.c | 358 ++++ gmp-6.3.0/mpn/s390_64/z13/aormul_2.c | 476 ++++++ gmp-6.3.0/mpn/s390_64/z13/common-vec.h | 175 ++ gmp-6.3.0/mpn/s390_64/z13/gmp-mparam.h | 162 ++ gmp-6.3.0/mpn/s390_64/z13/hamdist.asm | 76 + gmp-6.3.0/mpn/s390_64/z13/mul_1.asm | 149 ++ gmp-6.3.0/mpn/s390_64/z13/mul_1.c | 31 + gmp-6.3.0/mpn/s390_64/z13/mul_2.asm | 121 ++ gmp-6.3.0/mpn/s390_64/z13/mul_basecase.asm | 264 +++ gmp-6.3.0/mpn/s390_64/z13/mul_basecase.c | 124 ++ gmp-6.3.0/mpn/s390_64/z13/popcount.asm | 69 + gmp-6.3.0/mpn/s390_64/z13/sqr_basecase.c | 82 + gmp-6.3.0/mpn/s390_64/z13/submul_1.asm | 168 ++ gmp-6.3.0/mpn/sbpi1_bdiv_q.c | 1 + 
gmp-6.3.0/mpn/sbpi1_bdiv_qr.c | 1 + gmp-6.3.0/mpn/sbpi1_bdiv_r.c | 1 + gmp-6.3.0/mpn/sbpi1_div_q.c | 1 + gmp-6.3.0/mpn/sbpi1_div_qr.c | 1 + gmp-6.3.0/mpn/sbpi1_divappr_q.c | 1 + gmp-6.3.0/mpn/scan0.c | 1 + gmp-6.3.0/mpn/scan1.c | 1 + gmp-6.3.0/mpn/sec_add_1.c | 1 + gmp-6.3.0/mpn/sec_div_qr.c | 1 + gmp-6.3.0/mpn/sec_div_r.c | 1 + gmp-6.3.0/mpn/sec_invert.c | 1 + gmp-6.3.0/mpn/sec_mul.c | 1 + gmp-6.3.0/mpn/sec_pi1_div_qr.c | 1 + gmp-6.3.0/mpn/sec_pi1_div_r.c | 1 + gmp-6.3.0/mpn/sec_powm.c | 1 + gmp-6.3.0/mpn/sec_sqr.c | 1 + gmp-6.3.0/mpn/sec_sub_1.c | 1 + gmp-6.3.0/mpn/sec_tabselect.asm | 1 + gmp-6.3.0/mpn/set_str.c | 1 + gmp-6.3.0/mpn/sh/add_n.asm | 59 + gmp-6.3.0/mpn/sh/sh2/addmul_1.asm | 65 + gmp-6.3.0/mpn/sh/sh2/mul_1.asm | 62 + gmp-6.3.0/mpn/sh/sh2/submul_1.asm | 65 + gmp-6.3.0/mpn/sh/sub_n.asm | 59 + gmp-6.3.0/mpn/sizeinbase.c | 1 + gmp-6.3.0/mpn/sparc32/README | 71 + gmp-6.3.0/mpn/sparc32/add_n.asm | 245 +++ gmp-6.3.0/mpn/sparc32/addmul_1.asm | 155 ++ gmp-6.3.0/mpn/sparc32/gmp-mparam.h | 67 + gmp-6.3.0/mpn/sparc32/lshift.asm | 105 ++ gmp-6.3.0/mpn/sparc32/mul_1.asm | 146 ++ gmp-6.3.0/mpn/sparc32/rshift.asm | 102 ++ gmp-6.3.0/mpn/sparc32/sparc-defs.m4 | 97 ++ gmp-6.3.0/mpn/sparc32/sub_n.asm | 335 ++++ gmp-6.3.0/mpn/sparc32/submul_1.asm | 155 ++ gmp-6.3.0/mpn/sparc32/udiv.asm | 147 ++ gmp-6.3.0/mpn/sparc32/udiv_nfp.asm | 202 +++ gmp-6.3.0/mpn/sparc32/ultrasparct1/add_n.asm | 70 + gmp-6.3.0/mpn/sparc32/ultrasparct1/addmul_1.asm | 90 + gmp-6.3.0/mpn/sparc32/ultrasparct1/gmp-mparam.h | 153 ++ gmp-6.3.0/mpn/sparc32/ultrasparct1/mul_1.asm | 83 + .../mpn/sparc32/ultrasparct1/sqr_diagonal.asm | 55 + gmp-6.3.0/mpn/sparc32/ultrasparct1/sub_n.asm | 70 + gmp-6.3.0/mpn/sparc32/ultrasparct1/submul_1.asm | 91 + gmp-6.3.0/mpn/sparc32/umul.asm | 77 + gmp-6.3.0/mpn/sparc32/v8/addmul_1.asm | 109 ++ gmp-6.3.0/mpn/sparc32/v8/gmp-mparam.h | 73 + gmp-6.3.0/mpn/sparc32/v8/mul_1.asm | 93 ++ gmp-6.3.0/mpn/sparc32/v8/submul_1.asm | 67 + gmp-6.3.0/mpn/sparc32/v8/supersparc/gmp-mparam.h | 73 + gmp-6.3.0/mpn/sparc32/v8/supersparc/udiv.asm | 131 ++ gmp-6.3.0/mpn/sparc32/v8/udiv.asm | 131 ++ gmp-6.3.0/mpn/sparc32/v8/umul.asm | 40 + gmp-6.3.0/mpn/sparc32/v9/README | 4 + gmp-6.3.0/mpn/sparc32/v9/add_n.asm | 129 ++ gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm | 306 ++++ gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h | 204 +++ gmp-6.3.0/mpn/sparc32/v9/mul_1.asm | 287 ++++ gmp-6.3.0/mpn/sparc32/v9/sqr_diagonal.asm | 462 +++++ gmp-6.3.0/mpn/sparc32/v9/sub_n.asm | 129 ++ gmp-6.3.0/mpn/sparc32/v9/submul_1.asm | 316 ++++ gmp-6.3.0/mpn/sparc32/v9/udiv.asm | 52 + gmp-6.3.0/mpn/sparc64/README | 125 ++ gmp-6.3.0/mpn/sparc64/copyd.asm | 89 + gmp-6.3.0/mpn/sparc64/copyi.asm | 86 + gmp-6.3.0/mpn/sparc64/dive_1.c | 161 ++ gmp-6.3.0/mpn/sparc64/divrem_1.c | 242 +++ gmp-6.3.0/mpn/sparc64/gcd_11.asm | 87 + gmp-6.3.0/mpn/sparc64/gmp-mparam.h | 139 ++ gmp-6.3.0/mpn/sparc64/lshift.asm | 140 ++ gmp-6.3.0/mpn/sparc64/lshiftc.asm | 147 ++ gmp-6.3.0/mpn/sparc64/mod_1.c | 238 +++ gmp-6.3.0/mpn/sparc64/mod_1_4.c | 235 +++ gmp-6.3.0/mpn/sparc64/mode1o.c | 196 +++ gmp-6.3.0/mpn/sparc64/rshift.asm | 142 ++ gmp-6.3.0/mpn/sparc64/sec_tabselect.asm | 162 ++ gmp-6.3.0/mpn/sparc64/sparc64.h | 217 +++ gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm | 241 +++ gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm | 606 +++++++ gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm | 551 ++++++ gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm | 165 ++ gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm | 580 +++++++ .../mpn/sparc64/ultrasparc1234/sqr_diagonal.asm | 342 ++++ 
gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm | 241 +++ gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm | 68 + gmp-6.3.0/mpn/sparc64/ultrasparc34/gmp-mparam.h | 222 +++ gmp-6.3.0/mpn/sparc64/ultrasparct1/add_n.asm | 68 + gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh1_n.asm | 41 + gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh2_n.asm | 41 + gmp-6.3.0/mpn/sparc64/ultrasparct1/addlshC_n.asm | 69 + gmp-6.3.0/mpn/sparc64/ultrasparct1/addmul_1.asm | 86 + gmp-6.3.0/mpn/sparc64/ultrasparct1/gmp-mparam.h | 154 ++ gmp-6.3.0/mpn/sparc64/ultrasparct1/mul_1.asm | 82 + gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh1_n.asm | 41 + gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh2_n.asm | 41 + gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblshC_n.asm | 69 + gmp-6.3.0/mpn/sparc64/ultrasparct1/sub_n.asm | 68 + gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh1_n.asm | 41 + gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh2_n.asm | 41 + gmp-6.3.0/mpn/sparc64/ultrasparct1/sublshC_n.asm | 69 + gmp-6.3.0/mpn/sparc64/ultrasparct1/submul_1.asm | 86 + gmp-6.3.0/mpn/sparc64/ultrasparct3/add_n.asm | 126 ++ gmp-6.3.0/mpn/sparc64/ultrasparct3/addmul_1.asm | 182 ++ gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_2.asm | 228 +++ gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_4.asm | 219 +++ gmp-6.3.0/mpn/sparc64/ultrasparct3/aorslsh_n.asm | 147 ++ gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_dbm1c.asm | 147 ++ gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_q_1.asm | 137 ++ gmp-6.3.0/mpn/sparc64/ultrasparct3/cnd_aors_n.asm | 145 ++ gmp-6.3.0/mpn/sparc64/ultrasparct3/dive_1.asm | 129 ++ gmp-6.3.0/mpn/sparc64/ultrasparct3/hamdist.asm | 78 + gmp-6.3.0/mpn/sparc64/ultrasparct3/invert_limb.asm | 92 + gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.asm | 77 + gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.m4 | 88 + gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_1_4.asm | 233 +++ gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_34lsub1.asm | 117 ++ gmp-6.3.0/mpn/sparc64/ultrasparct3/mode1o.asm | 82 + gmp-6.3.0/mpn/sparc64/ultrasparct3/mul_1.asm | 174 ++ gmp-6.3.0/mpn/sparc64/ultrasparct3/popcount.asm | 70 + .../mpn/sparc64/ultrasparct3/sqr_diag_addlsh1.asm | 93 ++ gmp-6.3.0/mpn/sparc64/ultrasparct3/sub_n.asm | 144 ++ gmp-6.3.0/mpn/sparc64/ultrasparct3/submul_1.asm | 170 ++ gmp-6.3.0/mpn/sparc64/ultrasparct45/gmp-mparam.h | 174 ++ gmp-6.3.0/mpn/sqr.c | 1 + gmp-6.3.0/mpn/sqr_basecase.asm | 1 + gmp-6.3.0/mpn/sqrlo.c | 1 + gmp-6.3.0/mpn/sqrlo_basecase.c | 1 + gmp-6.3.0/mpn/sqrmod_bnm1.c | 1 + gmp-6.3.0/mpn/sqrtrem.c | 1 + gmp-6.3.0/mpn/strongfibo.c | 1 + gmp-6.3.0/mpn/sub.c | 1 + gmp-6.3.0/mpn/sub_1.c | 1 + gmp-6.3.0/mpn/sub_err1_n.c | 1 + gmp-6.3.0/mpn/sub_err2_n.c | 1 + gmp-6.3.0/mpn/sub_err3_n.c | 1 + gmp-6.3.0/mpn/sub_n.asm | 1 + gmp-6.3.0/mpn/submul_1.asm | 1 + gmp-6.3.0/mpn/tdiv_qr.c | 1 + gmp-6.3.0/mpn/thumb/add_n.asm | 63 + gmp-6.3.0/mpn/thumb/sub_n.asm | 63 + gmp-6.3.0/mpn/toom22_mul.c | 1 + gmp-6.3.0/mpn/toom2_sqr.c | 1 + gmp-6.3.0/mpn/toom32_mul.c | 1 + gmp-6.3.0/mpn/toom33_mul.c | 1 + gmp-6.3.0/mpn/toom3_sqr.c | 1 + gmp-6.3.0/mpn/toom42_mul.c | 1 + gmp-6.3.0/mpn/toom42_mulmid.c | 1 + gmp-6.3.0/mpn/toom43_mul.c | 1 + gmp-6.3.0/mpn/toom44_mul.c | 1 + gmp-6.3.0/mpn/toom4_sqr.c | 1 + gmp-6.3.0/mpn/toom52_mul.c | 1 + gmp-6.3.0/mpn/toom53_mul.c | 1 + gmp-6.3.0/mpn/toom54_mul.c | 1 + gmp-6.3.0/mpn/toom62_mul.c | 1 + gmp-6.3.0/mpn/toom63_mul.c | 1 + gmp-6.3.0/mpn/toom6_sqr.c | 1 + gmp-6.3.0/mpn/toom6h_mul.c | 1 + gmp-6.3.0/mpn/toom8_sqr.c | 1 + gmp-6.3.0/mpn/toom8h_mul.c | 1 + gmp-6.3.0/mpn/toom_couple_handling.c | 1 + gmp-6.3.0/mpn/toom_eval_dgr3_pm1.c | 1 + gmp-6.3.0/mpn/toom_eval_dgr3_pm2.c | 1 + 
gmp-6.3.0/mpn/toom_eval_pm1.c | 1 + gmp-6.3.0/mpn/toom_eval_pm2.c | 1 + gmp-6.3.0/mpn/toom_eval_pm2exp.c | 1 + gmp-6.3.0/mpn/toom_eval_pm2rexp.c | 1 + gmp-6.3.0/mpn/toom_interpolate_12pts.c | 1 + gmp-6.3.0/mpn/toom_interpolate_16pts.c | 1 + gmp-6.3.0/mpn/toom_interpolate_5pts.c | 1 + gmp-6.3.0/mpn/toom_interpolate_6pts.c | 1 + gmp-6.3.0/mpn/toom_interpolate_7pts.c | 1 + gmp-6.3.0/mpn/toom_interpolate_8pts.c | 1 + gmp-6.3.0/mpn/trialdiv.c | 1 + gmp-6.3.0/mpn/udiv.asm | 1 + gmp-6.3.0/mpn/umul.asm | 1 + gmp-6.3.0/mpn/vax/add_n.asm | 64 + gmp-6.3.0/mpn/vax/addmul_1.asm | 124 ++ gmp-6.3.0/mpn/vax/elf.m4 | 54 + gmp-6.3.0/mpn/vax/gmp-mparam.h | 60 + gmp-6.3.0/mpn/vax/lshift.asm | 59 + gmp-6.3.0/mpn/vax/mul_1.asm | 118 ++ gmp-6.3.0/mpn/vax/rshift.asm | 57 + gmp-6.3.0/mpn/vax/sub_n.asm | 64 + gmp-6.3.0/mpn/vax/submul_1.asm | 124 ++ gmp-6.3.0/mpn/x86/README | 525 ++++++ gmp-6.3.0/mpn/x86/aors_n.asm | 202 +++ gmp-6.3.0/mpn/x86/aorsmul_1.asm | 214 +++ gmp-6.3.0/mpn/x86/atom/aorrlsh1_n.asm | 53 + gmp-6.3.0/mpn/x86/atom/aorrlsh2_n.asm | 53 + gmp-6.3.0/mpn/x86/atom/aorrlshC_n.asm | 156 ++ gmp-6.3.0/mpn/x86/atom/aors_n.asm | 159 ++ gmp-6.3.0/mpn/x86/atom/aorslshC_n.asm | 247 +++ gmp-6.3.0/mpn/x86/atom/bdiv_q_1.asm | 35 + gmp-6.3.0/mpn/x86/atom/cnd_add_n.asm | 113 ++ gmp-6.3.0/mpn/x86/atom/cnd_sub_n.asm | 124 ++ gmp-6.3.0/mpn/x86/atom/dive_1.asm | 34 + gmp-6.3.0/mpn/x86/atom/gmp-mparam.h | 214 +++ gmp-6.3.0/mpn/x86/atom/logops_n.asm | 151 ++ gmp-6.3.0/mpn/x86/atom/lshift.asm | 218 +++ gmp-6.3.0/mpn/x86/atom/lshiftc.asm | 159 ++ gmp-6.3.0/mpn/x86/atom/mmx/copyd.asm | 34 + gmp-6.3.0/mpn/x86/atom/mmx/copyi.asm | 34 + gmp-6.3.0/mpn/x86/atom/mmx/hamdist.asm | 34 + gmp-6.3.0/mpn/x86/atom/mod_34lsub1.asm | 34 + gmp-6.3.0/mpn/x86/atom/mode1o.asm | 34 + gmp-6.3.0/mpn/x86/atom/rshift.asm | 152 ++ gmp-6.3.0/mpn/x86/atom/sse2/aorsmul_1.asm | 174 ++ gmp-6.3.0/mpn/x86/atom/sse2/bdiv_dbm1c.asm | 34 + gmp-6.3.0/mpn/x86/atom/sse2/divrem_1.asm | 34 + gmp-6.3.0/mpn/x86/atom/sse2/mod_1_1.asm | 34 + gmp-6.3.0/mpn/x86/atom/sse2/mod_1_4.asm | 34 + gmp-6.3.0/mpn/x86/atom/sse2/mul_1.asm | 124 ++ gmp-6.3.0/mpn/x86/atom/sse2/mul_basecase.asm | 501 ++++++ gmp-6.3.0/mpn/x86/atom/sse2/popcount.asm | 35 + gmp-6.3.0/mpn/x86/atom/sse2/sqr_basecase.asm | 634 +++++++ gmp-6.3.0/mpn/x86/atom/sublsh1_n.asm | 34 + gmp-6.3.0/mpn/x86/atom/sublsh2_n.asm | 57 + gmp-6.3.0/mpn/x86/bd1/gmp-mparam.h | 211 +++ gmp-6.3.0/mpn/x86/bd2/gmp-mparam.h | 214 +++ gmp-6.3.0/mpn/x86/bd4/gmp-mparam.h | 225 +++ gmp-6.3.0/mpn/x86/bdiv_dbm1c.asm | 129 ++ gmp-6.3.0/mpn/x86/bdiv_q_1.asm | 208 +++ gmp-6.3.0/mpn/x86/bt1/gmp-mparam.h | 218 +++ gmp-6.3.0/mpn/x86/bt2/gmp-mparam.h | 214 +++ gmp-6.3.0/mpn/x86/cnd_aors_n.asm | 124 ++ gmp-6.3.0/mpn/x86/copyd.asm | 91 + gmp-6.3.0/mpn/x86/copyi.asm | 99 ++ gmp-6.3.0/mpn/x86/core2/gmp-mparam.h | 210 +++ gmp-6.3.0/mpn/x86/coreibwl/gmp-mparam.h | 216 +++ gmp-6.3.0/mpn/x86/coreihwl/gmp-mparam.h | 216 +++ gmp-6.3.0/mpn/x86/coreinhm/gmp-mparam.h | 223 +++ gmp-6.3.0/mpn/x86/coreisbr/gmp-mparam.h | 215 +++ gmp-6.3.0/mpn/x86/darwin.m4 | 102 ++ gmp-6.3.0/mpn/x86/dive_1.asm | 190 +++ gmp-6.3.0/mpn/x86/divrem_1.asm | 233 +++ gmp-6.3.0/mpn/x86/divrem_2.asm | 199 +++ gmp-6.3.0/mpn/x86/fat/com.c | 32 + gmp-6.3.0/mpn/x86/fat/fat.c | 530 ++++++ gmp-6.3.0/mpn/x86/fat/fat_entry.asm | 243 +++ gmp-6.3.0/mpn/x86/fat/gmp-mparam.h | 71 + gmp-6.3.0/mpn/x86/fat/lshiftc.c | 32 + gmp-6.3.0/mpn/x86/fat/mod_1.c | 32 + gmp-6.3.0/mpn/x86/fat/mod_1_1.c | 36 + gmp-6.3.0/mpn/x86/fat/mod_1_2.c | 36 + gmp-6.3.0/mpn/x86/fat/mod_1_4.c | 36 + 
gmp-6.3.0/mpn/x86/fat/mode1o.c | 32 + gmp-6.3.0/mpn/x86/fat/mullo_basecase.c | 32 + gmp-6.3.0/mpn/x86/fat/redc_1.c | 32 + gmp-6.3.0/mpn/x86/fat/redc_2.c | 32 + gmp-6.3.0/mpn/x86/gcd_11.asm | 126 ++ gmp-6.3.0/mpn/x86/geode/gmp-mparam.h | 141 ++ gmp-6.3.0/mpn/x86/gmp-mparam.h | 38 + gmp-6.3.0/mpn/x86/goldmont/gmp-mparam.h | 219 +++ gmp-6.3.0/mpn/x86/i486/gmp-mparam.h | 69 + gmp-6.3.0/mpn/x86/k10/gmp-mparam.h | 217 +++ gmp-6.3.0/mpn/x86/k6/README | 251 +++ gmp-6.3.0/mpn/x86/k6/aors_n.asm | 337 ++++ gmp-6.3.0/mpn/x86/k6/aorsmul_1.asm | 391 +++++ gmp-6.3.0/mpn/x86/k6/cross.pl | 182 ++ gmp-6.3.0/mpn/x86/k6/divrem_1.asm | 203 +++ gmp-6.3.0/mpn/x86/k6/gmp-mparam.h | 166 ++ gmp-6.3.0/mpn/x86/k6/k62mmx/copyd.asm | 118 ++ gmp-6.3.0/mpn/x86/k6/k62mmx/lshift.asm | 294 ++++ gmp-6.3.0/mpn/x86/k6/k62mmx/rshift.asm | 293 ++++ gmp-6.3.0/mpn/x86/k6/mmx/com.asm | 103 ++ gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm | 282 ++++ gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm | 226 +++ gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm | 130 ++ gmp-6.3.0/mpn/x86/k6/mmx/popham.asm | 236 +++ gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm | 130 ++ gmp-6.3.0/mpn/x86/k6/mod_34lsub1.asm | 190 +++ gmp-6.3.0/mpn/x86/k6/mode1o.asm | 176 ++ gmp-6.3.0/mpn/x86/k6/mul_1.asm | 292 ++++ gmp-6.3.0/mpn/x86/k6/mul_basecase.asm | 612 +++++++ gmp-6.3.0/mpn/x86/k6/pre_mod_1.asm | 146 ++ gmp-6.3.0/mpn/x86/k6/sqr_basecase.asm | 680 ++++++++ gmp-6.3.0/mpn/x86/k7/README | 174 ++ gmp-6.3.0/mpn/x86/k7/addlsh1_n.asm | 196 +++ gmp-6.3.0/mpn/x86/k7/aors_n.asm | 258 +++ gmp-6.3.0/mpn/x86/k7/aorsmul_1.asm | 167 ++ gmp-6.3.0/mpn/x86/k7/bdiv_q_1.asm | 245 +++ gmp-6.3.0/mpn/x86/k7/dive_1.asm | 208 +++ gmp-6.3.0/mpn/x86/k7/gcd_11.asm | 107 ++ gmp-6.3.0/mpn/x86/k7/gmp-mparam.h | 263 +++ gmp-6.3.0/mpn/x86/k7/invert_limb.asm | 194 +++ gmp-6.3.0/mpn/x86/k7/mmx/com.asm | 125 ++ gmp-6.3.0/mpn/x86/k7/mmx/copyd.asm | 144 ++ gmp-6.3.0/mpn/x86/k7/mmx/copyi.asm | 157 ++ gmp-6.3.0/mpn/x86/k7/mmx/divrem_1.asm | 832 +++++++++ gmp-6.3.0/mpn/x86/k7/mmx/lshift.asm | 481 ++++++ gmp-6.3.0/mpn/x86/k7/mmx/popham.asm | 213 +++ gmp-6.3.0/mpn/x86/k7/mmx/rshift.asm | 480 ++++++ gmp-6.3.0/mpn/x86/k7/mod_1_1.asm | 221 +++ gmp-6.3.0/mpn/x86/k7/mod_1_4.asm | 260 +++ gmp-6.3.0/mpn/x86/k7/mod_34lsub1.asm | 188 +++ gmp-6.3.0/mpn/x86/k7/mode1o.asm | 181 ++ gmp-6.3.0/mpn/x86/k7/mul_1.asm | 237 +++ gmp-6.3.0/mpn/x86/k7/mul_basecase.asm | 602 +++++++ gmp-6.3.0/mpn/x86/k7/sqr_basecase.asm | 635 +++++++ gmp-6.3.0/mpn/x86/k7/sublsh1_n.asm | 173 ++ gmp-6.3.0/mpn/x86/k8/gmp-mparam.h | 215 +++ gmp-6.3.0/mpn/x86/lshift.asm | 106 ++ gmp-6.3.0/mpn/x86/mmx/sec_tabselect.asm | 163 ++ gmp-6.3.0/mpn/x86/mod_34lsub1.asm | 183 ++ gmp-6.3.0/mpn/x86/mul_1.asm | 140 ++ gmp-6.3.0/mpn/x86/mul_basecase.asm | 223 +++ gmp-6.3.0/mpn/x86/nano/gmp-mparam.h | 162 ++ gmp-6.3.0/mpn/x86/p6/README | 125 ++ gmp-6.3.0/mpn/x86/p6/aors_n.asm | 156 ++ gmp-6.3.0/mpn/x86/p6/aorsmul_1.asm | 320 ++++ gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm | 287 ++++ gmp-6.3.0/mpn/x86/p6/copyd.asm | 178 ++ gmp-6.3.0/mpn/x86/p6/dive_1.asm | 267 +++ gmp-6.3.0/mpn/x86/p6/gcd_11.asm | 83 + gmp-6.3.0/mpn/x86/p6/gmp-mparam.h | 194 +++ gmp-6.3.0/mpn/x86/p6/lshsub_n.asm | 169 ++ gmp-6.3.0/mpn/x86/p6/mmx/divrem_1.asm | 767 +++++++++ gmp-6.3.0/mpn/x86/p6/mmx/gmp-mparam.h | 218 +++ gmp-6.3.0/mpn/x86/p6/mmx/lshift.asm | 38 + gmp-6.3.0/mpn/x86/p6/mmx/popham.asm | 39 + gmp-6.3.0/mpn/x86/p6/mmx/rshift.asm | 38 + gmp-6.3.0/mpn/x86/p6/mod_34lsub1.asm | 190 +++ gmp-6.3.0/mpn/x86/p6/mode1o.asm | 170 ++ gmp-6.3.0/mpn/x86/p6/mul_basecase.asm | 607 +++++++ gmp-6.3.0/mpn/x86/p6/p3mmx/popham.asm | 42 + 
gmp-6.3.0/mpn/x86/p6/sqr_basecase.asm | 649 +++++++ gmp-6.3.0/mpn/x86/p6/sse2/addmul_1.asm | 37 + gmp-6.3.0/mpn/x86/p6/sse2/gmp-mparam.h | 200 +++ gmp-6.3.0/mpn/x86/p6/sse2/mod_1_1.asm | 34 + gmp-6.3.0/mpn/x86/p6/sse2/mod_1_4.asm | 34 + gmp-6.3.0/mpn/x86/p6/sse2/mul_1.asm | 38 + gmp-6.3.0/mpn/x86/p6/sse2/mul_basecase.asm | 35 + gmp-6.3.0/mpn/x86/p6/sse2/popcount.asm | 35 + gmp-6.3.0/mpn/x86/p6/sse2/sqr_basecase.asm | 35 + gmp-6.3.0/mpn/x86/p6/sse2/submul_1.asm | 35 + gmp-6.3.0/mpn/x86/pentium/README | 181 ++ gmp-6.3.0/mpn/x86/pentium/aors_n.asm | 203 +++ gmp-6.3.0/mpn/x86/pentium/aorsmul_1.asm | 144 ++ gmp-6.3.0/mpn/x86/pentium/bdiv_q_1.asm | 266 +++ gmp-6.3.0/mpn/x86/pentium/com.asm | 181 ++ gmp-6.3.0/mpn/x86/pentium/copyd.asm | 146 ++ gmp-6.3.0/mpn/x86/pentium/copyi.asm | 164 ++ gmp-6.3.0/mpn/x86/pentium/dive_1.asm | 264 +++ gmp-6.3.0/mpn/x86/pentium/gmp-mparam.h | 76 + gmp-6.3.0/mpn/x86/pentium/hamdist.asm | 154 ++ gmp-6.3.0/mpn/x86/pentium/logops_n.asm | 176 ++ gmp-6.3.0/mpn/x86/pentium/lshift.asm | 243 +++ gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h | 163 ++ gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm | 40 + gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm | 463 +++++ gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm | 371 ++++ gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm | 468 ++++++ gmp-6.3.0/mpn/x86/pentium/mod_34lsub1.asm | 192 +++ gmp-6.3.0/mpn/x86/pentium/mode1o.asm | 279 ++++ gmp-6.3.0/mpn/x86/pentium/mul_1.asm | 177 ++ gmp-6.3.0/mpn/x86/pentium/mul_2.asm | 150 ++ gmp-6.3.0/mpn/x86/pentium/mul_basecase.asm | 142 ++ gmp-6.3.0/mpn/x86/pentium/popcount.asm | 146 ++ gmp-6.3.0/mpn/x86/pentium/rshift.asm | 243 +++ gmp-6.3.0/mpn/x86/pentium/sqr_basecase.asm | 528 ++++++ gmp-6.3.0/mpn/x86/pentium4/README | 124 ++ gmp-6.3.0/mpn/x86/pentium4/copyd.asm | 71 + gmp-6.3.0/mpn/x86/pentium4/copyi.asm | 93 ++ gmp-6.3.0/mpn/x86/pentium4/mmx/lshift.asm | 39 + gmp-6.3.0/mpn/x86/pentium4/mmx/popham.asm | 203 +++ gmp-6.3.0/mpn/x86/pentium4/mmx/rshift.asm | 39 + gmp-6.3.0/mpn/x86/pentium4/sse2/add_n.asm | 101 ++ gmp-6.3.0/mpn/x86/pentium4/sse2/addlsh1_n.asm | 108 ++ gmp-6.3.0/mpn/x86/pentium4/sse2/addmul_1.asm | 189 +++ gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm | 141 ++ gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_q_1.asm | 234 +++ gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_add_n.asm | 95 ++ gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_sub_n.asm | 114 ++ gmp-6.3.0/mpn/x86/pentium4/sse2/dive_1.asm | 216 +++ gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm | 645 +++++++ gmp-6.3.0/mpn/x86/pentium4/sse2/gmp-mparam.h | 213 +++ gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm | 166 ++ gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_4.asm | 269 +++ gmp-6.3.0/mpn/x86/pentium4/sse2/mod_34lsub1.asm | 175 ++ gmp-6.3.0/mpn/x86/pentium4/sse2/mode1o.asm | 175 ++ gmp-6.3.0/mpn/x86/pentium4/sse2/mul_1.asm | 164 ++ gmp-6.3.0/mpn/x86/pentium4/sse2/mul_basecase.asm | 662 ++++++++ gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm | 281 ++++ gmp-6.3.0/mpn/x86/pentium4/sse2/rsh1add_n.asm | 126 ++ gmp-6.3.0/mpn/x86/pentium4/sse2/sqr_basecase.asm | 705 ++++++++ gmp-6.3.0/mpn/x86/pentium4/sse2/sub_n.asm | 119 ++ gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm | 182 ++ gmp-6.3.0/mpn/x86/rshift.asm | 108 ++ gmp-6.3.0/mpn/x86/sec_tabselect.asm | 106 ++ gmp-6.3.0/mpn/x86/silvermont/gmp-mparam.h | 222 +++ gmp-6.3.0/mpn/x86/skylake/gmp-mparam.h | 211 +++ gmp-6.3.0/mpn/x86/sqr_basecase.asm | 359 ++++ gmp-6.3.0/mpn/x86/t-zdisp.sh | 71 + gmp-6.3.0/mpn/x86/t-zdisp2.pl | 147 ++ gmp-6.3.0/mpn/x86/udiv.asm | 52 + gmp-6.3.0/mpn/x86/umul.asm | 51 + gmp-6.3.0/mpn/x86/x86-defs.m4 | 1024 ++++++++++++ 
gmp-6.3.0/mpn/x86/zn1/gmp-mparam.h | 220 +++ gmp-6.3.0/mpn/x86/zn2/gmp-mparam.h | 226 +++ gmp-6.3.0/mpn/x86_64/README | 74 + gmp-6.3.0/mpn/x86_64/alderlake/addmul_1.asm | 168 ++ gmp-6.3.0/mpn/x86_64/alderlake/gmp-mparam.h | 225 +++ gmp-6.3.0/mpn/x86_64/alderlake/mul_basecase.asm | 474 ++++++ gmp-6.3.0/mpn/x86_64/alderlake/submul_1.asm | 140 ++ gmp-6.3.0/mpn/x86_64/aorrlsh1_n.asm | 170 ++ gmp-6.3.0/mpn/x86_64/aorrlsh2_n.asm | 53 + gmp-6.3.0/mpn/x86_64/aorrlshC_n.asm | 172 ++ gmp-6.3.0/mpn/x86_64/aorrlsh_n.asm | 176 ++ gmp-6.3.0/mpn/x86_64/aors_err1_n.asm | 225 +++ gmp-6.3.0/mpn/x86_64/aors_err2_n.asm | 172 ++ gmp-6.3.0/mpn/x86_64/aors_err3_n.asm | 156 ++ gmp-6.3.0/mpn/x86_64/aors_n.asm | 178 ++ gmp-6.3.0/mpn/x86_64/aorsmul_1.asm | 190 +++ gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm | 186 +++ gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm | 238 +++ gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm | 191 +++ gmp-6.3.0/mpn/x86_64/atom/aors_n.asm | 128 ++ gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm | 194 +++ gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm | 38 + gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm | 38 + gmp-6.3.0/mpn/x86_64/atom/com.asm | 37 + gmp-6.3.0/mpn/x86_64/atom/copyd.asm | 37 + gmp-6.3.0/mpn/x86_64/atom/copyi.asm | 37 + gmp-6.3.0/mpn/x86_64/atom/dive_1.asm | 37 + gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h | 222 +++ gmp-6.3.0/mpn/x86_64/atom/lshift.asm | 123 ++ gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm | 127 ++ gmp-6.3.0/mpn/x86_64/atom/mul_1.asm | 147 ++ gmp-6.3.0/mpn/x86_64/atom/mul_2.asm | 190 +++ gmp-6.3.0/mpn/x86_64/atom/popcount.asm | 35 + gmp-6.3.0/mpn/x86_64/atom/redc_1.asm | 579 +++++++ gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm | 287 ++++ gmp-6.3.0/mpn/x86_64/atom/rshift.asm | 121 ++ gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm | 242 +++ gmp-6.3.0/mpn/x86_64/bd1/README | 11 + gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm | 235 +++ gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm | 37 + gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm | 38 + gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm | 37 + gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm | 190 +++ gmp-6.3.0/mpn/x86_64/bd1/com.asm | 37 + gmp-6.3.0/mpn/x86_64/bd1/copyd.asm | 37 + gmp-6.3.0/mpn/x86_64/bd1/copyi.asm | 37 + gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm | 37 + gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h | 265 +++ gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm | 206 +++ gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm | 193 +++ gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm | 195 +++ gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm | 416 +++++ gmp-6.3.0/mpn/x86_64/bd1/popcount.asm | 191 +++ gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm | 37 + gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm | 37 + gmp-6.3.0/mpn/x86_64/bd2/gcd_11.asm | 96 ++ gmp-6.3.0/mpn/x86_64/bd2/gcd_22.asm | 142 ++ gmp-6.3.0/mpn/x86_64/bd2/gmp-mparam.h | 263 +++ gmp-6.3.0/mpn/x86_64/bd4/aorrlsh_n.asm | 38 + gmp-6.3.0/mpn/x86_64/bd4/gcd_11.asm | 96 ++ gmp-6.3.0/mpn/x86_64/bd4/gcd_22.asm | 37 + gmp-6.3.0/mpn/x86_64/bd4/gmp-mparam.h | 266 +++ gmp-6.3.0/mpn/x86_64/bdiv_dbm1c.asm | 106 ++ gmp-6.3.0/mpn/x86_64/bdiv_q_1.asm | 195 +++ gmp-6.3.0/mpn/x86_64/bt1/aors_n.asm | 159 ++ gmp-6.3.0/mpn/x86_64/bt1/aorsmul_1.asm | 191 +++ gmp-6.3.0/mpn/x86_64/bt1/copyd.asm | 91 + gmp-6.3.0/mpn/x86_64/bt1/copyi.asm | 94 ++ gmp-6.3.0/mpn/x86_64/bt1/gcd_11.asm | 119 ++ gmp-6.3.0/mpn/x86_64/bt1/gcd_22.asm | 37 + gmp-6.3.0/mpn/x86_64/bt1/gmp-mparam.h | 230 +++ gmp-6.3.0/mpn/x86_64/bt1/mul_1.asm | 241 +++ gmp-6.3.0/mpn/x86_64/bt1/mul_basecase.asm | 486 ++++++ gmp-6.3.0/mpn/x86_64/bt1/redc_1.asm | 507 ++++++ gmp-6.3.0/mpn/x86_64/bt1/sqr_basecase.asm | 565 +++++++ gmp-6.3.0/mpn/x86_64/bt2/com.asm | 37 + 
gmp-6.3.0/mpn/x86_64/bt2/copyd.asm | 37 + gmp-6.3.0/mpn/x86_64/bt2/copyi.asm | 37 + gmp-6.3.0/mpn/x86_64/bt2/gcd_11.asm | 37 + gmp-6.3.0/mpn/x86_64/bt2/gcd_22.asm | 37 + gmp-6.3.0/mpn/x86_64/bt2/gmp-mparam.h | 240 +++ gmp-6.3.0/mpn/x86_64/cnd_aors_n.asm | 183 ++ gmp-6.3.0/mpn/x86_64/com.asm | 95 ++ gmp-6.3.0/mpn/x86_64/copyd.asm | 93 ++ gmp-6.3.0/mpn/x86_64/copyi.asm | 92 + gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm | 53 + gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm | 53 + gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm | 38 + gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm | 225 +++ gmp-6.3.0/mpn/x86_64/core2/aors_n.asm | 150 ++ gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm | 188 +++ gmp-6.3.0/mpn/x86_64/core2/com.asm | 37 + gmp-6.3.0/mpn/x86_64/core2/copyd.asm | 37 + gmp-6.3.0/mpn/x86_64/core2/copyi.asm | 37 + gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm | 243 +++ gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm | 93 ++ gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm | 137 ++ gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h | 222 +++ gmp-6.3.0/mpn/x86_64/core2/hamdist.asm | 210 +++ gmp-6.3.0/mpn/x86_64/core2/logops_n.asm | 285 ++++ gmp-6.3.0/mpn/x86_64/core2/lshift.asm | 145 ++ gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm | 159 ++ gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm | 975 +++++++++++ gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm | 427 +++++ gmp-6.3.0/mpn/x86_64/core2/popcount.asm | 185 ++ gmp-6.3.0/mpn/x86_64/core2/redc_1.asm | 430 +++++ gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm | 169 ++ gmp-6.3.0/mpn/x86_64/core2/rshift.asm | 143 ++ gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm | 37 + gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm | 984 +++++++++++ gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm | 47 + gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm | 47 + gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm | 158 ++ gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm | 210 +++ gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h | 246 +++ gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm | 195 +++ gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm | 368 ++++ gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm | 395 +++++ gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm | 710 ++++++++ gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm | 839 ++++++++++ gmp-6.3.0/mpn/x86_64/coreihwl/addmul_2.asm | 241 +++ gmp-6.3.0/mpn/x86_64/coreihwl/aorrlsh_n.asm | 38 + gmp-6.3.0/mpn/x86_64/coreihwl/aors_n.asm | 261 +++ gmp-6.3.0/mpn/x86_64/coreihwl/aorsmul_1.asm | 201 +++ gmp-6.3.0/mpn/x86_64/coreihwl/gcd_22.asm | 138 ++ gmp-6.3.0/mpn/x86_64/coreihwl/gmp-mparam.h | 253 +++ gmp-6.3.0/mpn/x86_64/coreihwl/mul_1.asm | 159 ++ gmp-6.3.0/mpn/x86_64/coreihwl/mul_2.asm | 176 ++ gmp-6.3.0/mpn/x86_64/coreihwl/mul_basecase.asm | 441 +++++ gmp-6.3.0/mpn/x86_64/coreihwl/mullo_basecase.asm | 422 +++++ gmp-6.3.0/mpn/x86_64/coreihwl/redc_1.asm | 437 +++++ gmp-6.3.0/mpn/x86_64/coreihwl/sqr_basecase.asm | 506 ++++++ gmp-6.3.0/mpn/x86_64/coreinhm/aorrlsh_n.asm | 200 +++ gmp-6.3.0/mpn/x86_64/coreinhm/aorsmul_1.asm | 190 +++ gmp-6.3.0/mpn/x86_64/coreinhm/gmp-mparam.h | 238 +++ gmp-6.3.0/mpn/x86_64/coreinhm/hamdist.asm | 196 +++ gmp-6.3.0/mpn/x86_64/coreinhm/popcount.asm | 182 ++ gmp-6.3.0/mpn/x86_64/coreinhm/redc_1.asm | 549 ++++++ gmp-6.3.0/mpn/x86_64/coreinhm/sec_tabselect.asm | 37 + gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm | 224 +++ gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm | 54 + gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm | 56 + gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm | 173 ++ gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm | 215 +++ gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm | 203 +++ gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm | 212 
+++ gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm | 174 ++ gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm | 200 +++ gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm | 37 + gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm | 37 + gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h | 241 +++ gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm | 37 + gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm | 37 + gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm | 199 +++ gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm | 167 ++ gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm | 407 +++++ gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm | 384 +++++ gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm | 546 ++++++ gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm | 193 +++ gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm | 37 + gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm | 37 + gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm | 484 ++++++ gmp-6.3.0/mpn/x86_64/darwin.m4 | 82 + gmp-6.3.0/mpn/x86_64/div_qr_1n_pi1.asm | 247 +++ gmp-6.3.0/mpn/x86_64/div_qr_2n_pi1.asm | 158 ++ gmp-6.3.0/mpn/x86_64/div_qr_2u_pi1.asm | 200 +++ gmp-6.3.0/mpn/x86_64/dive_1.asm | 158 ++ gmp-6.3.0/mpn/x86_64/divrem_1.asm | 314 ++++ gmp-6.3.0/mpn/x86_64/divrem_2.asm | 192 +++ gmp-6.3.0/mpn/x86_64/dos64.m4 | 101 ++ gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm | 181 ++ gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm | 178 ++ gmp-6.3.0/mpn/x86_64/fastsse/README | 22 + gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm | 311 ++++ gmp-6.3.0/mpn/x86_64/fastsse/com.asm | 175 ++ gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm | 254 +++ gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm | 166 ++ gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm | 300 ++++ gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm | 185 ++ gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm | 182 ++ gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm | 173 ++ gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm | 193 +++ gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm | 183 ++ gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm | 201 +++ gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm | 204 +++ gmp-6.3.0/mpn/x86_64/fat/addmul_2.c | 38 + gmp-6.3.0/mpn/x86_64/fat/fat.c | 473 ++++++ gmp-6.3.0/mpn/x86_64/fat/fat_entry.asm | 209 +++ gmp-6.3.0/mpn/x86_64/fat/gmp-mparam.h | 72 + gmp-6.3.0/mpn/x86_64/fat/mod_1.c | 32 + gmp-6.3.0/mpn/x86_64/fat/mul_basecase.c | 32 + gmp-6.3.0/mpn/x86_64/fat/mullo_basecase.c | 32 + gmp-6.3.0/mpn/x86_64/fat/redc_1.c | 32 + gmp-6.3.0/mpn/x86_64/fat/redc_2.c | 32 + gmp-6.3.0/mpn/x86_64/fat/sqr_basecase.c | 32 + gmp-6.3.0/mpn/x86_64/gcd_11.asm | 114 ++ gmp-6.3.0/mpn/x86_64/gcd_22.asm | 163 ++ gmp-6.3.0/mpn/x86_64/gmp-mparam.h | 217 +++ gmp-6.3.0/mpn/x86_64/goldmont/aorrlsh_n.asm | 37 + gmp-6.3.0/mpn/x86_64/goldmont/aors_n.asm | 37 + gmp-6.3.0/mpn/x86_64/goldmont/aorsmul_1.asm | 37 + gmp-6.3.0/mpn/x86_64/goldmont/gmp-mparam.h | 264 +++ gmp-6.3.0/mpn/x86_64/goldmont/mul_1.asm | 37 + gmp-6.3.0/mpn/x86_64/goldmont/redc_1.asm | 37 + gmp-6.3.0/mpn/x86_64/invert_limb.asm | 112 ++ gmp-6.3.0/mpn/x86_64/invert_limb_table.asm | 50 + gmp-6.3.0/mpn/x86_64/k10/gcd_11.asm | 37 + gmp-6.3.0/mpn/x86_64/k10/gcd_22.asm | 142 ++ gmp-6.3.0/mpn/x86_64/k10/gmp-mparam.h | 248 +++ gmp-6.3.0/mpn/x86_64/k10/hamdist.asm | 109 ++ gmp-6.3.0/mpn/x86_64/k10/lshift.asm | 37 + gmp-6.3.0/mpn/x86_64/k10/lshiftc.asm | 37 + gmp-6.3.0/mpn/x86_64/k10/popcount.asm | 138 ++ gmp-6.3.0/mpn/x86_64/k10/rshift.asm | 37 + gmp-6.3.0/mpn/x86_64/k10/sec_tabselect.asm | 37 + gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm | 153 ++ gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm | 195 +++ gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm | 217 +++ gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm | 179 ++ 
gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm | 249 +++ gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h | 237 +++ gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm | 469 ++++++ gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm | 436 +++++ gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm | 559 +++++++ gmp-6.3.0/mpn/x86_64/k8/redc_1.asm | 591 +++++++ gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm | 807 +++++++++ gmp-6.3.0/mpn/x86_64/logops_n.asm | 260 +++ gmp-6.3.0/mpn/x86_64/lshift.asm | 172 ++ gmp-6.3.0/mpn/x86_64/lshiftc.asm | 182 ++ gmp-6.3.0/mpn/x86_64/lshsub_n.asm | 172 ++ gmp-6.3.0/mpn/x86_64/missing-call.m4 | 53 + gmp-6.3.0/mpn/x86_64/missing-inline.m4 | 100 ++ gmp-6.3.0/mpn/x86_64/missing.asm | 130 ++ gmp-6.3.0/mpn/x86_64/mod_1_1.asm | 238 +++ gmp-6.3.0/mpn/x86_64/mod_1_2.asm | 241 +++ gmp-6.3.0/mpn/x86_64/mod_1_4.asm | 272 +++ gmp-6.3.0/mpn/x86_64/mod_34lsub1.asm | 215 +++ gmp-6.3.0/mpn/x86_64/mode1o.asm | 171 ++ gmp-6.3.0/mpn/x86_64/mul_1.asm | 192 +++ gmp-6.3.0/mpn/x86_64/mul_2.asm | 204 +++ gmp-6.3.0/mpn/x86_64/mulx/adx/addmul_1.asm | 157 ++ gmp-6.3.0/mpn/x86_64/nano/copyd.asm | 37 + gmp-6.3.0/mpn/x86_64/nano/copyi.asm | 37 + gmp-6.3.0/mpn/x86_64/nano/dive_1.asm | 166 ++ gmp-6.3.0/mpn/x86_64/nano/gcd_11.asm | 37 + gmp-6.3.0/mpn/x86_64/nano/gmp-mparam.h | 243 +++ gmp-6.3.0/mpn/x86_64/nano/popcount.asm | 35 + gmp-6.3.0/mpn/x86_64/pentium4/addmul_2.asm | 37 + gmp-6.3.0/mpn/x86_64/pentium4/aors_n.asm | 196 +++ gmp-6.3.0/mpn/x86_64/pentium4/aorslsh1_n.asm | 50 + gmp-6.3.0/mpn/x86_64/pentium4/aorslsh2_n.asm | 50 + gmp-6.3.0/mpn/x86_64/pentium4/aorslshC_n.asm | 203 +++ gmp-6.3.0/mpn/x86_64/pentium4/aorsmul_1.asm | 37 + gmp-6.3.0/mpn/x86_64/pentium4/gmp-mparam.h | 257 +++ gmp-6.3.0/mpn/x86_64/pentium4/lshift.asm | 37 + gmp-6.3.0/mpn/x86_64/pentium4/lshiftc.asm | 37 + gmp-6.3.0/mpn/x86_64/pentium4/mod_34lsub1.asm | 167 ++ gmp-6.3.0/mpn/x86_64/pentium4/mul_1.asm | 37 + gmp-6.3.0/mpn/x86_64/pentium4/mul_2.asm | 37 + gmp-6.3.0/mpn/x86_64/pentium4/mul_basecase.asm | 37 + gmp-6.3.0/mpn/x86_64/pentium4/mullo_basecase.asm | 37 + gmp-6.3.0/mpn/x86_64/pentium4/popcount.asm | 35 + gmp-6.3.0/mpn/x86_64/pentium4/redc_1.asm | 37 + gmp-6.3.0/mpn/x86_64/pentium4/rsh1aors_n.asm | 334 ++++ gmp-6.3.0/mpn/x86_64/pentium4/rshift.asm | 169 ++ gmp-6.3.0/mpn/x86_64/pentium4/sec_tabselect.asm | 37 + gmp-6.3.0/mpn/x86_64/pentium4/sqr_basecase.asm | 37 + gmp-6.3.0/mpn/x86_64/popham.asm | 163 ++ gmp-6.3.0/mpn/x86_64/rsh1aors_n.asm | 189 +++ gmp-6.3.0/mpn/x86_64/rshift.asm | 176 ++ gmp-6.3.0/mpn/x86_64/sec_tabselect.asm | 176 ++ gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh1_n.asm | 50 + gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh2_n.asm | 50 + gmp-6.3.0/mpn/x86_64/silvermont/aors_n.asm | 37 + gmp-6.3.0/mpn/x86_64/silvermont/aorsmul_1.asm | 37 + gmp-6.3.0/mpn/x86_64/silvermont/gmp-mparam.h | 252 +++ gmp-6.3.0/mpn/x86_64/silvermont/hamdist.asm | 38 + gmp-6.3.0/mpn/x86_64/silvermont/lshift.asm | 37 + gmp-6.3.0/mpn/x86_64/silvermont/lshiftc.asm | 37 + gmp-6.3.0/mpn/x86_64/silvermont/mul_1.asm | 37 + gmp-6.3.0/mpn/x86_64/silvermont/mul_basecase.asm | 37 + gmp-6.3.0/mpn/x86_64/silvermont/mullo_basecase.asm | 37 + gmp-6.3.0/mpn/x86_64/silvermont/popcount.asm | 38 + gmp-6.3.0/mpn/x86_64/silvermont/rshift.asm | 37 + gmp-6.3.0/mpn/x86_64/silvermont/sqr_basecase.asm | 37 + gmp-6.3.0/mpn/x86_64/skylake/gmp-mparam.h | 246 +++ gmp-6.3.0/mpn/x86_64/sqr_diag_addlsh1.asm | 116 ++ gmp-6.3.0/mpn/x86_64/sublsh1_n.asm | 160 ++ gmp-6.3.0/mpn/x86_64/x86_64-defs.m4 | 493 ++++++ gmp-6.3.0/mpn/x86_64/zen/aorrlsh1_n.asm | 37 + gmp-6.3.0/mpn/x86_64/zen/aorrlsh_n.asm | 227 +++ 
gmp-6.3.0/mpn/x86_64/zen/aorsmul_1.asm | 165 ++ gmp-6.3.0/mpn/x86_64/zen/com.asm | 37 + gmp-6.3.0/mpn/x86_64/zen/copyd.asm | 37 + gmp-6.3.0/mpn/x86_64/zen/copyi.asm | 37 + gmp-6.3.0/mpn/x86_64/zen/gcd_11.asm | 37 + gmp-6.3.0/mpn/x86_64/zen/gcd_22.asm | 37 + gmp-6.3.0/mpn/x86_64/zen/gmp-mparam.h | 280 ++++ gmp-6.3.0/mpn/x86_64/zen/hamdist.asm | 38 + gmp-6.3.0/mpn/x86_64/zen/lshift.asm | 37 + gmp-6.3.0/mpn/x86_64/zen/lshiftc.asm | 37 + gmp-6.3.0/mpn/x86_64/zen/mul_1.asm | 161 ++ gmp-6.3.0/mpn/x86_64/zen/mul_basecase.asm | 455 +++++ gmp-6.3.0/mpn/x86_64/zen/mullo_basecase.asm | 299 ++++ gmp-6.3.0/mpn/x86_64/zen/popcount.asm | 38 + gmp-6.3.0/mpn/x86_64/zen/rshift.asm | 37 + gmp-6.3.0/mpn/x86_64/zen/sbpi1_bdiv_r.asm | 507 ++++++ gmp-6.3.0/mpn/x86_64/zen/sqr_basecase.asm | 482 ++++++ gmp-6.3.0/mpn/x86_64/zen/sublsh1_n.asm | 37 + gmp-6.3.0/mpn/x86_64/zen2/gmp-mparam.h | 276 +++ gmp-6.3.0/mpn/x86_64/zen3/addmul_1.asm | 37 + gmp-6.3.0/mpn/x86_64/zen3/gmp-mparam.h | 222 +++ gmp-6.3.0/mpn/x86_64/zen3/mul_1.asm | 208 +++ gmp-6.3.0/mpn/x86_64/zen3/mul_basecase.asm | 37 + gmp-6.3.0/mpn/x86_64/zen3/sbpi1_bdiv_r.asm | 37 + gmp-6.3.0/mpn/x86_64/zen3/sqr_basecase.asm | 37 + gmp-6.3.0/mpn/xnor_n.c | 1 + gmp-6.3.0/mpn/xor_n.c | 1 + gmp-6.3.0/mpn/zero.c | 1 + gmp-6.3.0/mpn/zero_p.c | 1 + 1543 files changed, 232330 insertions(+) create mode 100644 gmp-6.3.0/mpn/Makeasm.am create mode 100644 gmp-6.3.0/mpn/Makefile create mode 100644 gmp-6.3.0/mpn/Makefile.am create mode 100644 gmp-6.3.0/mpn/Makefile.in create mode 100644 gmp-6.3.0/mpn/README create mode 120000 gmp-6.3.0/mpn/add.c create mode 120000 gmp-6.3.0/mpn/add_1.c create mode 120000 gmp-6.3.0/mpn/add_err1_n.c create mode 120000 gmp-6.3.0/mpn/add_err2_n.c create mode 120000 gmp-6.3.0/mpn/add_err3_n.c create mode 120000 gmp-6.3.0/mpn/add_n.asm create mode 120000 gmp-6.3.0/mpn/add_n_sub_n.c create mode 120000 gmp-6.3.0/mpn/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/alpha/README create mode 100644 gmp-6.3.0/mpn/alpha/add_n.asm create mode 100644 gmp-6.3.0/mpn/alpha/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/alpha/alpha-defs.m4 create mode 100644 gmp-6.3.0/mpn/alpha/aorslsh1_n.asm create mode 100644 gmp-6.3.0/mpn/alpha/aorslsh2_n.asm create mode 100644 gmp-6.3.0/mpn/alpha/bdiv_dbm1c.asm create mode 100644 gmp-6.3.0/mpn/alpha/cntlz.asm create mode 100644 gmp-6.3.0/mpn/alpha/com.asm create mode 100644 gmp-6.3.0/mpn/alpha/copyd.asm create mode 100644 gmp-6.3.0/mpn/alpha/copyi.asm create mode 100644 gmp-6.3.0/mpn/alpha/default.m4 create mode 100644 gmp-6.3.0/mpn/alpha/dive_1.c create mode 100644 gmp-6.3.0/mpn/alpha/divrem_2.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev5/diveby3.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev5/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/alpha/ev6/add_n.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/aorslsh1_n.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/alpha/ev6/mod_1_4.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/mul_1.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/nails/README create mode 100644 gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm create mode 100644 
gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm create mode 100755 gmp-6.3.0/mpn/alpha/ev6/slot.pl create mode 100644 gmp-6.3.0/mpn/alpha/ev6/sub_n.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev67/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev67/hamdist.asm create mode 100644 gmp-6.3.0/mpn/alpha/ev67/popcount.asm create mode 100644 gmp-6.3.0/mpn/alpha/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/alpha/invert_limb.asm create mode 100644 gmp-6.3.0/mpn/alpha/lshift.asm create mode 100644 gmp-6.3.0/mpn/alpha/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/alpha/mode1o.asm create mode 100644 gmp-6.3.0/mpn/alpha/mul_1.asm create mode 100644 gmp-6.3.0/mpn/alpha/rshift.asm create mode 100644 gmp-6.3.0/mpn/alpha/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/alpha/sqr_diag_addlsh1.asm create mode 100644 gmp-6.3.0/mpn/alpha/sub_n.asm create mode 100644 gmp-6.3.0/mpn/alpha/submul_1.asm create mode 100644 gmp-6.3.0/mpn/alpha/umul.asm create mode 100644 gmp-6.3.0/mpn/alpha/unicos.m4 create mode 120000 gmp-6.3.0/mpn/and_n.c create mode 120000 gmp-6.3.0/mpn/andn_n.c create mode 100644 gmp-6.3.0/mpn/arm/README create mode 100644 gmp-6.3.0/mpn/arm/aors_n.asm create mode 100644 gmp-6.3.0/mpn/arm/aorslsh1_n.asm create mode 100644 gmp-6.3.0/mpn/arm/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/arm-defs.m4 create mode 100644 gmp-6.3.0/mpn/arm/bdiv_dbm1c.asm create mode 100644 gmp-6.3.0/mpn/arm/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/arm/cnd_aors_n.asm create mode 100644 gmp-6.3.0/mpn/arm/com.asm create mode 100644 gmp-6.3.0/mpn/arm/copyd.asm create mode 100644 gmp-6.3.0/mpn/arm/copyi.asm create mode 100644 gmp-6.3.0/mpn/arm/dive_1.asm create mode 100644 gmp-6.3.0/mpn/arm/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm/invert_limb.asm create mode 100644 gmp-6.3.0/mpn/arm/logops_n.asm create mode 100644 gmp-6.3.0/mpn/arm/lshift.asm create mode 100644 gmp-6.3.0/mpn/arm/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/arm/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/arm/mode1o.asm create mode 100644 gmp-6.3.0/mpn/arm/mul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/neon/README create mode 100644 gmp-6.3.0/mpn/arm/neon/hamdist.asm create mode 100644 gmp-6.3.0/mpn/arm/neon/lorrshift.asm create mode 100644 gmp-6.3.0/mpn/arm/neon/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/arm/neon/popcount.asm create mode 100644 gmp-6.3.0/mpn/arm/neon/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/arm/rsh1aors_n.asm create mode 100644 gmp-6.3.0/mpn/arm/rshift.asm create mode 100644 gmp-6.3.0/mpn/arm/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/arm/udiv.asm create mode 100644 gmp-6.3.0/mpn/arm/v5/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/arm/v5/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/arm/v5/mod_1_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v5/mod_1_2.asm create mode 100644 gmp-6.3.0/mpn/arm/v6/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v6/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/arm/v6/addmul_3.asm create mode 100644 gmp-6.3.0/mpn/arm/v6/dive_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v6/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm/v6/mode1o.asm create mode 100644 gmp-6.3.0/mpn/arm/v6/mul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v6/mul_2.asm create mode 100644 gmp-6.3.0/mpn/arm/v6/popham.asm create mode 100644 gmp-6.3.0/mpn/arm/v6/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/arm/v6/submul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v6t2/divrem_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v6t2/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/arm/v6t2/gcd_22.asm create mode 100644 
gmp-6.3.0/mpn/arm/v7a/cora15/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/aors_n.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/cnd_aors_n.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/com.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/logops_n.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/mul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/submul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora17/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora17/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora17/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora17/mul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora17/submul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora5/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora7/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora8/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora8/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora9/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora9/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm64/aors_n.asm create mode 100644 gmp-6.3.0/mpn/arm64/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/arm64/aorsorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/arm64/aorsorrlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/arm64/aorsorrlshC_n.asm create mode 100644 gmp-6.3.0/mpn/arm64/applem1/addaddmul_1msb0.asm create mode 100644 gmp-6.3.0/mpn/arm64/applem1/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/arm64/applem1/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm64/applem1/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/arm64/arm64-defs.m4 create mode 100644 gmp-6.3.0/mpn/arm64/bdiv_dbm1c.asm create mode 100644 gmp-6.3.0/mpn/arm64/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/arm64/cnd_aors_n.asm create mode 100644 gmp-6.3.0/mpn/arm64/com.asm create mode 100644 gmp-6.3.0/mpn/arm64/copyd.asm create mode 100644 gmp-6.3.0/mpn/arm64/copyi.asm create mode 100644 gmp-6.3.0/mpn/arm64/cora53/cnd_aors_n.asm create mode 100644 gmp-6.3.0/mpn/arm64/cora53/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm64/cora57/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm64/cora72/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm64/cora73/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm64/darwin.m4 create mode 100644 gmp-6.3.0/mpn/arm64/divrem_1.asm create mode 100644 gmp-6.3.0/mpn/arm64/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/arm64/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/arm64/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm64/hamdist.asm create mode 100644 gmp-6.3.0/mpn/arm64/invert_limb.asm create mode 100644 gmp-6.3.0/mpn/arm64/logops_n.asm create mode 100644 gmp-6.3.0/mpn/arm64/lshift.asm create mode 100644 gmp-6.3.0/mpn/arm64/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/arm64/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/arm64/mul_1.asm create mode 100644 gmp-6.3.0/mpn/arm64/popcount.asm create mode 100644 
gmp-6.3.0/mpn/arm64/rsh1aors_n.asm create mode 100644 gmp-6.3.0/mpn/arm64/rshift.asm create mode 100644 gmp-6.3.0/mpn/arm64/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/arm64/sqr_diag_addlsh1.asm create mode 100644 gmp-6.3.0/mpn/arm64/xgene1/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/asm-defs.m4 create mode 120000 gmp-6.3.0/mpn/bdiv_dbm1c.asm create mode 120000 gmp-6.3.0/mpn/bdiv_q.c create mode 120000 gmp-6.3.0/mpn/bdiv_q_1.asm create mode 120000 gmp-6.3.0/mpn/bdiv_qr.c create mode 120000 gmp-6.3.0/mpn/binvert.c create mode 120000 gmp-6.3.0/mpn/broot.c create mode 120000 gmp-6.3.0/mpn/brootinv.c create mode 120000 gmp-6.3.0/mpn/bsqrt.c create mode 120000 gmp-6.3.0/mpn/bsqrtinv.c create mode 120000 gmp-6.3.0/mpn/cmp.c create mode 120000 gmp-6.3.0/mpn/cnd_add_n.asm create mode 120000 gmp-6.3.0/mpn/cnd_sub_n.asm create mode 120000 gmp-6.3.0/mpn/cnd_swap.c create mode 120000 gmp-6.3.0/mpn/com.c create mode 120000 gmp-6.3.0/mpn/comb_tables.c create mode 120000 gmp-6.3.0/mpn/compute_powtab.c create mode 120000 gmp-6.3.0/mpn/copyd.asm create mode 120000 gmp-6.3.0/mpn/copyi.asm create mode 100755 gmp-6.3.0/mpn/cpp-ccas create mode 100644 gmp-6.3.0/mpn/cray/README create mode 100644 gmp-6.3.0/mpn/cray/add_n.c create mode 100644 gmp-6.3.0/mpn/cray/cfp/addmul_1.c create mode 100644 gmp-6.3.0/mpn/cray/cfp/mul_1.c create mode 100644 gmp-6.3.0/mpn/cray/cfp/mulwwc90.s create mode 100644 gmp-6.3.0/mpn/cray/cfp/mulwwj90.s create mode 100644 gmp-6.3.0/mpn/cray/cfp/submul_1.c create mode 100644 gmp-6.3.0/mpn/cray/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/cray/hamdist.c create mode 100644 gmp-6.3.0/mpn/cray/ieee/addmul_1.c create mode 100644 gmp-6.3.0/mpn/cray/ieee/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/cray/ieee/invert_limb.c create mode 100644 gmp-6.3.0/mpn/cray/ieee/mul_1.c create mode 100644 gmp-6.3.0/mpn/cray/ieee/mul_basecase.c create mode 100644 gmp-6.3.0/mpn/cray/ieee/sqr_basecase.c create mode 100644 gmp-6.3.0/mpn/cray/ieee/submul_1.c create mode 100644 gmp-6.3.0/mpn/cray/lshift.c create mode 100644 gmp-6.3.0/mpn/cray/mulww.f create mode 100644 gmp-6.3.0/mpn/cray/popcount.c create mode 100644 gmp-6.3.0/mpn/cray/rshift.c create mode 100644 gmp-6.3.0/mpn/cray/sub_n.c create mode 120000 gmp-6.3.0/mpn/dcpi1_bdiv_q.c create mode 120000 gmp-6.3.0/mpn/dcpi1_bdiv_qr.c create mode 120000 gmp-6.3.0/mpn/dcpi1_div_q.c create mode 120000 gmp-6.3.0/mpn/dcpi1_div_qr.c create mode 120000 gmp-6.3.0/mpn/dcpi1_divappr_q.c create mode 120000 gmp-6.3.0/mpn/div_q.c create mode 120000 gmp-6.3.0/mpn/div_qr_1.c create mode 120000 gmp-6.3.0/mpn/div_qr_1n_pi1.c create mode 120000 gmp-6.3.0/mpn/div_qr_2.c create mode 120000 gmp-6.3.0/mpn/div_qr_2n_pi1.c create mode 120000 gmp-6.3.0/mpn/div_qr_2u_pi1.c create mode 120000 gmp-6.3.0/mpn/dive_1.asm create mode 120000 gmp-6.3.0/mpn/diveby3.c create mode 120000 gmp-6.3.0/mpn/divexact.c create mode 120000 gmp-6.3.0/mpn/divis.c create mode 120000 gmp-6.3.0/mpn/divrem.c create mode 120000 gmp-6.3.0/mpn/divrem_1.asm create mode 120000 gmp-6.3.0/mpn/divrem_2.asm create mode 120000 gmp-6.3.0/mpn/dump.c create mode 120000 gmp-6.3.0/mpn/fib2_ui.c create mode 120000 gmp-6.3.0/mpn/fib2m.c create mode 100644 gmp-6.3.0/mpn/fib_table.c create mode 120000 gmp-6.3.0/mpn/gcd.c create mode 120000 gmp-6.3.0/mpn/gcd_1.c create mode 120000 gmp-6.3.0/mpn/gcd_11.asm create mode 120000 gmp-6.3.0/mpn/gcd_22.c create mode 120000 gmp-6.3.0/mpn/gcd_subdiv_step.c create mode 120000 gmp-6.3.0/mpn/gcdext.c create mode 120000 gmp-6.3.0/mpn/gcdext_1.c create mode 120000 
gmp-6.3.0/mpn/gcdext_lehmer.c create mode 100644 gmp-6.3.0/mpn/generic/add.c create mode 100644 gmp-6.3.0/mpn/generic/add_1.c create mode 100644 gmp-6.3.0/mpn/generic/add_err1_n.c create mode 100644 gmp-6.3.0/mpn/generic/add_err2_n.c create mode 100644 gmp-6.3.0/mpn/generic/add_err3_n.c create mode 100644 gmp-6.3.0/mpn/generic/add_n.c create mode 100644 gmp-6.3.0/mpn/generic/add_n_sub_n.c create mode 100644 gmp-6.3.0/mpn/generic/addmul_1.c create mode 100644 gmp-6.3.0/mpn/generic/bdiv_dbm1c.c create mode 100644 gmp-6.3.0/mpn/generic/bdiv_q.c create mode 100644 gmp-6.3.0/mpn/generic/bdiv_q_1.c create mode 100644 gmp-6.3.0/mpn/generic/bdiv_qr.c create mode 100644 gmp-6.3.0/mpn/generic/binvert.c create mode 100644 gmp-6.3.0/mpn/generic/broot.c create mode 100644 gmp-6.3.0/mpn/generic/brootinv.c create mode 100644 gmp-6.3.0/mpn/generic/bsqrt.c create mode 100644 gmp-6.3.0/mpn/generic/bsqrtinv.c create mode 100644 gmp-6.3.0/mpn/generic/cmp.c create mode 100644 gmp-6.3.0/mpn/generic/cnd_add_n.c create mode 100644 gmp-6.3.0/mpn/generic/cnd_sub_n.c create mode 100644 gmp-6.3.0/mpn/generic/cnd_swap.c create mode 100644 gmp-6.3.0/mpn/generic/com.c create mode 100644 gmp-6.3.0/mpn/generic/comb_tables.c create mode 100644 gmp-6.3.0/mpn/generic/compute_powtab.c create mode 100644 gmp-6.3.0/mpn/generic/copyd.c create mode 100644 gmp-6.3.0/mpn/generic/copyi.c create mode 100644 gmp-6.3.0/mpn/generic/dcpi1_bdiv_q.c create mode 100644 gmp-6.3.0/mpn/generic/dcpi1_bdiv_qr.c create mode 100644 gmp-6.3.0/mpn/generic/dcpi1_div_q.c create mode 100644 gmp-6.3.0/mpn/generic/dcpi1_div_qr.c create mode 100644 gmp-6.3.0/mpn/generic/dcpi1_divappr_q.c create mode 100644 gmp-6.3.0/mpn/generic/div_q.c create mode 100644 gmp-6.3.0/mpn/generic/div_qr_1.c create mode 100644 gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c create mode 100644 gmp-6.3.0/mpn/generic/div_qr_1n_pi2.c create mode 100644 gmp-6.3.0/mpn/generic/div_qr_1u_pi2.c create mode 100644 gmp-6.3.0/mpn/generic/div_qr_2.c create mode 100644 gmp-6.3.0/mpn/generic/div_qr_2n_pi1.c create mode 100644 gmp-6.3.0/mpn/generic/div_qr_2u_pi1.c create mode 100644 gmp-6.3.0/mpn/generic/dive_1.c create mode 100644 gmp-6.3.0/mpn/generic/diveby3.c create mode 100644 gmp-6.3.0/mpn/generic/divexact.c create mode 100644 gmp-6.3.0/mpn/generic/divis.c create mode 100644 gmp-6.3.0/mpn/generic/divrem.c create mode 100644 gmp-6.3.0/mpn/generic/divrem_1.c create mode 100644 gmp-6.3.0/mpn/generic/divrem_2.c create mode 100644 gmp-6.3.0/mpn/generic/dump.c create mode 100644 gmp-6.3.0/mpn/generic/fib2_ui.c create mode 100644 gmp-6.3.0/mpn/generic/fib2m.c create mode 100644 gmp-6.3.0/mpn/generic/gcd.c create mode 100644 gmp-6.3.0/mpn/generic/gcd_1.c create mode 100644 gmp-6.3.0/mpn/generic/gcd_11.c create mode 100644 gmp-6.3.0/mpn/generic/gcd_22.c create mode 100644 gmp-6.3.0/mpn/generic/gcd_subdiv_step.c create mode 100644 gmp-6.3.0/mpn/generic/gcdext.c create mode 100644 gmp-6.3.0/mpn/generic/gcdext_1.c create mode 100644 gmp-6.3.0/mpn/generic/gcdext_lehmer.c create mode 100644 gmp-6.3.0/mpn/generic/get_d.c create mode 100644 gmp-6.3.0/mpn/generic/get_str.c create mode 100644 gmp-6.3.0/mpn/generic/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/generic/hgcd.c create mode 100644 gmp-6.3.0/mpn/generic/hgcd2-div.h create mode 100644 gmp-6.3.0/mpn/generic/hgcd2.c create mode 100644 gmp-6.3.0/mpn/generic/hgcd2_jacobi.c create mode 100644 gmp-6.3.0/mpn/generic/hgcd_appr.c create mode 100644 gmp-6.3.0/mpn/generic/hgcd_jacobi.c create mode 100644 gmp-6.3.0/mpn/generic/hgcd_matrix.c create mode 100644 
gmp-6.3.0/mpn/generic/hgcd_reduce.c create mode 100644 gmp-6.3.0/mpn/generic/hgcd_step.c create mode 100644 gmp-6.3.0/mpn/generic/invert.c create mode 100644 gmp-6.3.0/mpn/generic/invertappr.c create mode 100644 gmp-6.3.0/mpn/generic/jacbase.c create mode 100644 gmp-6.3.0/mpn/generic/jacobi.c create mode 100644 gmp-6.3.0/mpn/generic/jacobi_2.c create mode 100644 gmp-6.3.0/mpn/generic/logops_n.c create mode 100644 gmp-6.3.0/mpn/generic/lshift.c create mode 100644 gmp-6.3.0/mpn/generic/lshiftc.c create mode 100644 gmp-6.3.0/mpn/generic/matrix22_mul.c create mode 100644 gmp-6.3.0/mpn/generic/matrix22_mul1_inverse_vector.c create mode 100644 gmp-6.3.0/mpn/generic/mod_1.c create mode 100644 gmp-6.3.0/mpn/generic/mod_1_1.c create mode 100644 gmp-6.3.0/mpn/generic/mod_1_2.c create mode 100644 gmp-6.3.0/mpn/generic/mod_1_3.c create mode 100644 gmp-6.3.0/mpn/generic/mod_1_4.c create mode 100644 gmp-6.3.0/mpn/generic/mod_34lsub1.c create mode 100644 gmp-6.3.0/mpn/generic/mode1o.c create mode 100644 gmp-6.3.0/mpn/generic/mu_bdiv_q.c create mode 100644 gmp-6.3.0/mpn/generic/mu_bdiv_qr.c create mode 100644 gmp-6.3.0/mpn/generic/mu_div_q.c create mode 100644 gmp-6.3.0/mpn/generic/mu_div_qr.c create mode 100644 gmp-6.3.0/mpn/generic/mu_divappr_q.c create mode 100644 gmp-6.3.0/mpn/generic/mul.c create mode 100644 gmp-6.3.0/mpn/generic/mul_1.c create mode 100644 gmp-6.3.0/mpn/generic/mul_basecase.c create mode 100644 gmp-6.3.0/mpn/generic/mul_fft.c create mode 100644 gmp-6.3.0/mpn/generic/mul_n.c create mode 100644 gmp-6.3.0/mpn/generic/mullo_basecase.c create mode 100644 gmp-6.3.0/mpn/generic/mullo_n.c create mode 100644 gmp-6.3.0/mpn/generic/mulmid.c create mode 100644 gmp-6.3.0/mpn/generic/mulmid_basecase.c create mode 100644 gmp-6.3.0/mpn/generic/mulmid_n.c create mode 100644 gmp-6.3.0/mpn/generic/mulmod_bknp1.c create mode 100644 gmp-6.3.0/mpn/generic/mulmod_bnm1.c create mode 100644 gmp-6.3.0/mpn/generic/neg.c create mode 100644 gmp-6.3.0/mpn/generic/nussbaumer_mul.c create mode 100644 gmp-6.3.0/mpn/generic/perfpow.c create mode 100644 gmp-6.3.0/mpn/generic/perfsqr.c create mode 100644 gmp-6.3.0/mpn/generic/popham.c create mode 100644 gmp-6.3.0/mpn/generic/pow_1.c create mode 100644 gmp-6.3.0/mpn/generic/powlo.c create mode 100644 gmp-6.3.0/mpn/generic/powm.c create mode 100644 gmp-6.3.0/mpn/generic/pre_divrem_1.c create mode 100644 gmp-6.3.0/mpn/generic/pre_mod_1.c create mode 100644 gmp-6.3.0/mpn/generic/random.c create mode 100644 gmp-6.3.0/mpn/generic/random2.c create mode 100644 gmp-6.3.0/mpn/generic/redc_1.c create mode 100644 gmp-6.3.0/mpn/generic/redc_2.c create mode 100644 gmp-6.3.0/mpn/generic/redc_n.c create mode 100644 gmp-6.3.0/mpn/generic/remove.c create mode 100644 gmp-6.3.0/mpn/generic/rootrem.c create mode 100644 gmp-6.3.0/mpn/generic/rshift.c create mode 100644 gmp-6.3.0/mpn/generic/sbpi1_bdiv_q.c create mode 100644 gmp-6.3.0/mpn/generic/sbpi1_bdiv_qr.c create mode 100644 gmp-6.3.0/mpn/generic/sbpi1_bdiv_r.c create mode 100644 gmp-6.3.0/mpn/generic/sbpi1_div_q.c create mode 100644 gmp-6.3.0/mpn/generic/sbpi1_div_qr.c create mode 100644 gmp-6.3.0/mpn/generic/sbpi1_divappr_q.c create mode 100644 gmp-6.3.0/mpn/generic/scan0.c create mode 100644 gmp-6.3.0/mpn/generic/scan1.c create mode 100644 gmp-6.3.0/mpn/generic/sec_aors_1.c create mode 100644 gmp-6.3.0/mpn/generic/sec_div.c create mode 100644 gmp-6.3.0/mpn/generic/sec_invert.c create mode 100644 gmp-6.3.0/mpn/generic/sec_mul.c create mode 100644 gmp-6.3.0/mpn/generic/sec_pi1_div.c create mode 100644 gmp-6.3.0/mpn/generic/sec_powm.c 
create mode 100644 gmp-6.3.0/mpn/generic/sec_sqr.c create mode 100644 gmp-6.3.0/mpn/generic/sec_tabselect.c create mode 100644 gmp-6.3.0/mpn/generic/set_str.c create mode 100644 gmp-6.3.0/mpn/generic/sizeinbase.c create mode 100644 gmp-6.3.0/mpn/generic/sqr.c create mode 100644 gmp-6.3.0/mpn/generic/sqr_basecase.c create mode 100644 gmp-6.3.0/mpn/generic/sqrlo.c create mode 100644 gmp-6.3.0/mpn/generic/sqrlo_basecase.c create mode 100644 gmp-6.3.0/mpn/generic/sqrmod_bnm1.c create mode 100644 gmp-6.3.0/mpn/generic/sqrtrem.c create mode 100644 gmp-6.3.0/mpn/generic/strongfibo.c create mode 100644 gmp-6.3.0/mpn/generic/sub.c create mode 100644 gmp-6.3.0/mpn/generic/sub_1.c create mode 100644 gmp-6.3.0/mpn/generic/sub_err1_n.c create mode 100644 gmp-6.3.0/mpn/generic/sub_err2_n.c create mode 100644 gmp-6.3.0/mpn/generic/sub_err3_n.c create mode 100644 gmp-6.3.0/mpn/generic/sub_n.c create mode 100644 gmp-6.3.0/mpn/generic/submul_1.c create mode 100644 gmp-6.3.0/mpn/generic/tdiv_qr.c create mode 100644 gmp-6.3.0/mpn/generic/toom22_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom2_sqr.c create mode 100644 gmp-6.3.0/mpn/generic/toom32_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom33_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom3_sqr.c create mode 100644 gmp-6.3.0/mpn/generic/toom42_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom42_mulmid.c create mode 100644 gmp-6.3.0/mpn/generic/toom43_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom44_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom4_sqr.c create mode 100644 gmp-6.3.0/mpn/generic/toom52_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom53_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom54_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom62_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom63_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom6_sqr.c create mode 100644 gmp-6.3.0/mpn/generic/toom6h_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom8_sqr.c create mode 100644 gmp-6.3.0/mpn/generic/toom8h_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom_couple_handling.c create mode 100644 gmp-6.3.0/mpn/generic/toom_eval_dgr3_pm1.c create mode 100644 gmp-6.3.0/mpn/generic/toom_eval_dgr3_pm2.c create mode 100644 gmp-6.3.0/mpn/generic/toom_eval_pm1.c create mode 100644 gmp-6.3.0/mpn/generic/toom_eval_pm2.c create mode 100644 gmp-6.3.0/mpn/generic/toom_eval_pm2exp.c create mode 100644 gmp-6.3.0/mpn/generic/toom_eval_pm2rexp.c create mode 100644 gmp-6.3.0/mpn/generic/toom_interpolate_12pts.c create mode 100644 gmp-6.3.0/mpn/generic/toom_interpolate_16pts.c create mode 100644 gmp-6.3.0/mpn/generic/toom_interpolate_5pts.c create mode 100644 gmp-6.3.0/mpn/generic/toom_interpolate_6pts.c create mode 100644 gmp-6.3.0/mpn/generic/toom_interpolate_7pts.c create mode 100644 gmp-6.3.0/mpn/generic/toom_interpolate_8pts.c create mode 100644 gmp-6.3.0/mpn/generic/trialdiv.c create mode 100644 gmp-6.3.0/mpn/generic/udiv_w_sdiv.c create mode 100644 gmp-6.3.0/mpn/generic/zero.c create mode 100644 gmp-6.3.0/mpn/generic/zero_p.c create mode 120000 gmp-6.3.0/mpn/get_d.c create mode 120000 gmp-6.3.0/mpn/get_str.c create mode 120000 gmp-6.3.0/mpn/hamdist.asm create mode 120000 gmp-6.3.0/mpn/hgcd.c create mode 120000 gmp-6.3.0/mpn/hgcd2.c create mode 120000 gmp-6.3.0/mpn/hgcd2_jacobi.c create mode 120000 gmp-6.3.0/mpn/hgcd_appr.c create mode 120000 gmp-6.3.0/mpn/hgcd_jacobi.c create mode 120000 gmp-6.3.0/mpn/hgcd_matrix.c create mode 120000 gmp-6.3.0/mpn/hgcd_reduce.c create mode 120000 gmp-6.3.0/mpn/hgcd_step.c create mode 100644 
gmp-6.3.0/mpn/ia64/README create mode 100644 gmp-6.3.0/mpn/ia64/add_n_sub_n.asm create mode 100644 gmp-6.3.0/mpn/ia64/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/ia64/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/ia64/aors_n.asm create mode 100644 gmp-6.3.0/mpn/ia64/aorsorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/ia64/aorsorrlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/ia64/aorsorrlshC_n.asm create mode 100644 gmp-6.3.0/mpn/ia64/bdiv_dbm1c.asm create mode 100644 gmp-6.3.0/mpn/ia64/cnd_aors_n.asm create mode 100644 gmp-6.3.0/mpn/ia64/copyd.asm create mode 100644 gmp-6.3.0/mpn/ia64/copyi.asm create mode 100644 gmp-6.3.0/mpn/ia64/dive_1.asm create mode 100644 gmp-6.3.0/mpn/ia64/divrem_1.asm create mode 100644 gmp-6.3.0/mpn/ia64/divrem_2.asm create mode 100644 gmp-6.3.0/mpn/ia64/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/ia64/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/ia64/hamdist.asm create mode 100644 gmp-6.3.0/mpn/ia64/ia64-defs.m4 create mode 100644 gmp-6.3.0/mpn/ia64/invert_limb.asm create mode 100644 gmp-6.3.0/mpn/ia64/logops_n.asm create mode 100644 gmp-6.3.0/mpn/ia64/lorrshift.asm create mode 100644 gmp-6.3.0/mpn/ia64/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/ia64/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/ia64/mode1o.asm create mode 100644 gmp-6.3.0/mpn/ia64/mul_1.asm create mode 100644 gmp-6.3.0/mpn/ia64/mul_2.asm create mode 100644 gmp-6.3.0/mpn/ia64/popcount.asm create mode 100644 gmp-6.3.0/mpn/ia64/rsh1aors_n.asm create mode 100644 gmp-6.3.0/mpn/ia64/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/ia64/sqr_diag_addlsh1.asm create mode 100644 gmp-6.3.0/mpn/ia64/submul_1.asm create mode 120000 gmp-6.3.0/mpn/invert.c create mode 120000 gmp-6.3.0/mpn/invertappr.c create mode 120000 gmp-6.3.0/mpn/ior_n.c create mode 120000 gmp-6.3.0/mpn/iorn_n.c create mode 120000 gmp-6.3.0/mpn/jacbase.c create mode 120000 gmp-6.3.0/mpn/jacobi.c create mode 120000 gmp-6.3.0/mpn/jacobi_2.c create mode 100644 gmp-6.3.0/mpn/jacobitab.h create mode 100644 gmp-6.3.0/mpn/lisp/gmpasm-mode.el create mode 100644 gmp-6.3.0/mpn/loongarch/64/add_n.asm create mode 100644 gmp-6.3.0/mpn/loongarch/64/aorslsh1_n.asm create mode 100644 gmp-6.3.0/mpn/loongarch/64/aorslsh2_n.asm create mode 100644 gmp-6.3.0/mpn/loongarch/64/aorslshC_n.asm create mode 100644 gmp-6.3.0/mpn/loongarch/64/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/loongarch/64/cnd_aors_n.asm create mode 100644 gmp-6.3.0/mpn/loongarch/64/copyd.asm create mode 100644 gmp-6.3.0/mpn/loongarch/64/copyi.asm create mode 100644 gmp-6.3.0/mpn/loongarch/64/lshift.asm create mode 100644 gmp-6.3.0/mpn/loongarch/64/mul_1.asm create mode 100644 gmp-6.3.0/mpn/loongarch/64/rshift.asm create mode 100644 gmp-6.3.0/mpn/loongarch/64/sub_n.asm create mode 120000 gmp-6.3.0/mpn/lshift.asm create mode 120000 gmp-6.3.0/mpn/lshiftc.c create mode 100755 gmp-6.3.0/mpn/m4-ccas create mode 100644 gmp-6.3.0/mpn/m68k/README create mode 100644 gmp-6.3.0/mpn/m68k/aors_n.asm create mode 100644 gmp-6.3.0/mpn/m68k/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/m68k/lshift.asm create mode 100644 gmp-6.3.0/mpn/m68k/m68k-defs.m4 create mode 100644 gmp-6.3.0/mpn/m68k/mc68020/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/m68k/mc68020/mul_1.asm create mode 100644 gmp-6.3.0/mpn/m68k/mc68020/udiv.asm create mode 100644 gmp-6.3.0/mpn/m68k/mc68020/umul.asm create mode 100644 gmp-6.3.0/mpn/m68k/rshift.asm create mode 100644 gmp-6.3.0/mpn/m68k/t-m68k-defs.pl create mode 100644 gmp-6.3.0/mpn/m88k/README create mode 100644 gmp-6.3.0/mpn/m88k/add_n.s create mode 100644 
gmp-6.3.0/mpn/m88k/mc88110/add_n.S create mode 100644 gmp-6.3.0/mpn/m88k/mc88110/addmul_1.s create mode 100644 gmp-6.3.0/mpn/m88k/mc88110/mul_1.s create mode 100644 gmp-6.3.0/mpn/m88k/mc88110/sub_n.S create mode 100644 gmp-6.3.0/mpn/m88k/mul_1.s create mode 100644 gmp-6.3.0/mpn/m88k/sub_n.s create mode 120000 gmp-6.3.0/mpn/matrix22_mul.c create mode 120000 gmp-6.3.0/mpn/matrix22_mul1_inverse_vector.c create mode 100644 gmp-6.3.0/mpn/minithres/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/mips32/add_n.asm create mode 100644 gmp-6.3.0/mpn/mips32/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/mips32/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/mips32/lshift.asm create mode 100644 gmp-6.3.0/mpn/mips32/mips-defs.m4 create mode 100644 gmp-6.3.0/mpn/mips32/mips.m4 create mode 100644 gmp-6.3.0/mpn/mips32/mul_1.asm create mode 100644 gmp-6.3.0/mpn/mips32/rshift.asm create mode 100644 gmp-6.3.0/mpn/mips32/sub_n.asm create mode 100644 gmp-6.3.0/mpn/mips32/submul_1.asm create mode 100644 gmp-6.3.0/mpn/mips32/umul.asm create mode 100644 gmp-6.3.0/mpn/mips64/README create mode 100644 gmp-6.3.0/mpn/mips64/add_n.asm create mode 100644 gmp-6.3.0/mpn/mips64/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/mips64/hilo/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/mips64/hilo/mul_1.asm create mode 100644 gmp-6.3.0/mpn/mips64/hilo/sqr_diagonal.asm create mode 100644 gmp-6.3.0/mpn/mips64/hilo/submul_1.asm create mode 100644 gmp-6.3.0/mpn/mips64/hilo/umul.asm create mode 100644 gmp-6.3.0/mpn/mips64/lshift.asm create mode 100644 gmp-6.3.0/mpn/mips64/rshift.asm create mode 100644 gmp-6.3.0/mpn/mips64/sub_n.asm create mode 120000 gmp-6.3.0/mpn/mod_1.c create mode 120000 gmp-6.3.0/mpn/mod_1_1.asm create mode 120000 gmp-6.3.0/mpn/mod_1_2.c create mode 120000 gmp-6.3.0/mpn/mod_1_3.c create mode 120000 gmp-6.3.0/mpn/mod_1_4.asm create mode 120000 gmp-6.3.0/mpn/mod_34lsub1.asm create mode 120000 gmp-6.3.0/mpn/mode1o.asm create mode 100644 gmp-6.3.0/mpn/mp_bases.c create mode 120000 gmp-6.3.0/mpn/mu_bdiv_q.c create mode 120000 gmp-6.3.0/mpn/mu_bdiv_qr.c create mode 120000 gmp-6.3.0/mpn/mu_div_q.c create mode 120000 gmp-6.3.0/mpn/mu_div_qr.c create mode 120000 gmp-6.3.0/mpn/mu_divappr_q.c create mode 120000 gmp-6.3.0/mpn/mul.c create mode 120000 gmp-6.3.0/mpn/mul_1.asm create mode 120000 gmp-6.3.0/mpn/mul_basecase.asm create mode 120000 gmp-6.3.0/mpn/mul_fft.c create mode 120000 gmp-6.3.0/mpn/mul_n.c create mode 120000 gmp-6.3.0/mpn/mullo_basecase.c create mode 120000 gmp-6.3.0/mpn/mullo_n.c create mode 120000 gmp-6.3.0/mpn/mulmid.c create mode 120000 gmp-6.3.0/mpn/mulmid_basecase.c create mode 120000 gmp-6.3.0/mpn/mulmid_n.c create mode 120000 gmp-6.3.0/mpn/mulmod_bknp1.c create mode 120000 gmp-6.3.0/mpn/mulmod_bnm1.c create mode 120000 gmp-6.3.0/mpn/nand_n.c create mode 120000 gmp-6.3.0/mpn/neg.c create mode 120000 gmp-6.3.0/mpn/nior_n.c create mode 120000 gmp-6.3.0/mpn/nussbaumer_mul.c create mode 100644 gmp-6.3.0/mpn/pa32/README create mode 100644 gmp-6.3.0/mpn/pa32/add_n.asm create mode 100644 gmp-6.3.0/mpn/pa32/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/mul_1.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/add_n.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/lshift.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/rshift.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/sub_n.asm create 
mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/submul_1.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/sqr_diagonal.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/submul_1.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/udiv.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa1_1/umul.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa2_0/add_n.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa2_0/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/pa32/hppa2_0/sqr_diagonal.asm create mode 100644 gmp-6.3.0/mpn/pa32/hppa2_0/sub_n.asm create mode 100644 gmp-6.3.0/mpn/pa32/lshift.asm create mode 100644 gmp-6.3.0/mpn/pa32/pa-defs.m4 create mode 100644 gmp-6.3.0/mpn/pa32/rshift.asm create mode 100644 gmp-6.3.0/mpn/pa32/sub_n.asm create mode 100644 gmp-6.3.0/mpn/pa32/udiv.asm create mode 100644 gmp-6.3.0/mpn/pa64/README create mode 100644 gmp-6.3.0/mpn/pa64/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/pa64/aors_n.asm create mode 100644 gmp-6.3.0/mpn/pa64/aorslsh1_n.asm create mode 100644 gmp-6.3.0/mpn/pa64/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/pa64/lshift.asm create mode 100644 gmp-6.3.0/mpn/pa64/mul_1.asm create mode 100644 gmp-6.3.0/mpn/pa64/rshift.asm create mode 100644 gmp-6.3.0/mpn/pa64/sqr_diagonal.asm create mode 100644 gmp-6.3.0/mpn/pa64/submul_1.asm create mode 100644 gmp-6.3.0/mpn/pa64/udiv.asm create mode 100644 gmp-6.3.0/mpn/pa64/umul.asm create mode 120000 gmp-6.3.0/mpn/perfpow.c create mode 120000 gmp-6.3.0/mpn/perfsqr.c create mode 100644 gmp-6.3.0/mpn/perfsqr.h create mode 120000 gmp-6.3.0/mpn/popcount.asm create mode 120000 gmp-6.3.0/mpn/pow_1.c create mode 100644 gmp-6.3.0/mpn/power/add_n.asm create mode 100644 gmp-6.3.0/mpn/power/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/power/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/power/lshift.asm create mode 100644 gmp-6.3.0/mpn/power/mul_1.asm create mode 100644 gmp-6.3.0/mpn/power/rshift.asm create mode 100644 gmp-6.3.0/mpn/power/sdiv.asm create mode 100644 gmp-6.3.0/mpn/power/sub_n.asm create mode 100644 gmp-6.3.0/mpn/power/submul_1.asm create mode 100644 gmp-6.3.0/mpn/power/umul.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/750/com.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/750/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc32/750/lshift.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/750/rshift.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/README create mode 100644 gmp-6.3.0/mpn/powerpc32/addlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/aix.m4 create mode 100644 gmp-6.3.0/mpn/powerpc32/aors_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/bdiv_dbm1c.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/darwin.m4 create mode 100644 gmp-6.3.0/mpn/powerpc32/diveby3.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/divrem_2.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/eabi.m4 create mode 100644 gmp-6.3.0/mpn/powerpc32/elf.m4 create mode 100644 gmp-6.3.0/mpn/powerpc32/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc32/invert_limb.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/lshift.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/mode1o.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/mul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/p3-p7/aors_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/p3/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc32/p4/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc32/p5/gmp-mparam.h create mode 100644 
gmp-6.3.0/mpn/powerpc32/p6/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc32/p7/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc32/powerpc-defs.m4 create mode 100644 gmp-6.3.0/mpn/powerpc32/rshift.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/sqr_diag_addlsh1.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/sublsh1_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/submul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/umul.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/vmx/copyd.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/vmx/copyi.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/vmx/logops_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/vmx/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/powerpc32/vmx/popcount.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/README create mode 100644 gmp-6.3.0/mpn/powerpc64/aix.m4 create mode 100644 gmp-6.3.0/mpn/powerpc64/com.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/copyd.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/copyi.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/darwin.m4 create mode 100644 gmp-6.3.0/mpn/powerpc64/elf.m4 create mode 100644 gmp-6.3.0/mpn/powerpc64/logops_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/lshift.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode32/add_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode32/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode32/mul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode32/p4/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc64/mode32/sqr_diagonal.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode32/sub_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode32/submul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/aors_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlshC_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/bdiv_dbm1c.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/cnd_aors_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/dive_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/divrem_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/divrem_2.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/invert_limb.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/mod_1_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/mod_1_4.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/mode1o.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/mul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p3/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p4/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p5/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p6/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p6/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p6/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p7/aormul_2.asm create mode 100644 
gmp-6.3.0/mpn/powerpc64/mode64/p7/aors_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p7/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p8/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p8/invert_limb.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/add_n_sub_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/addaddmul_1msb0.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_1.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_2.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/p9/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/rsh1aors_n.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/mode64/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/p6/lshift.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/p6/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/p6/rshift.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/p7/copyd.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/p7/copyi.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/p7/hamdist.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/p7/popcount.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/rshift.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/umul.asm create mode 100644 gmp-6.3.0/mpn/powerpc64/vmx/popcount.asm create mode 120000 gmp-6.3.0/mpn/powlo.c create mode 120000 gmp-6.3.0/mpn/powm.c create mode 120000 gmp-6.3.0/mpn/pre_mod_1.c create mode 120000 gmp-6.3.0/mpn/random.c create mode 120000 gmp-6.3.0/mpn/random2.c create mode 120000 gmp-6.3.0/mpn/redc_1.c create mode 120000 gmp-6.3.0/mpn/redc_2.c create mode 120000 gmp-6.3.0/mpn/redc_n.c create mode 120000 gmp-6.3.0/mpn/remove.c create mode 100644 gmp-6.3.0/mpn/riscv/64/aors_n.asm create mode 100644 gmp-6.3.0/mpn/riscv/64/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/riscv/64/cnd_aors_n.asm create mode 100644 gmp-6.3.0/mpn/riscv/64/copyd.asm create mode 100644 gmp-6.3.0/mpn/riscv/64/copyi.asm create mode 100644 gmp-6.3.0/mpn/riscv/64/lshift.asm create mode 100644 gmp-6.3.0/mpn/riscv/64/mul_1.asm create mode 100644 gmp-6.3.0/mpn/riscv/64/rshift.asm create mode 100644 gmp-6.3.0/mpn/riscv/64/sec_tabselect.asm create mode 120000 gmp-6.3.0/mpn/rootrem.c create mode 120000 gmp-6.3.0/mpn/rshift.asm create mode 100644 gmp-6.3.0/mpn/s390_32/README create mode 100644 gmp-6.3.0/mpn/s390_32/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/s390_32/copyd.asm create mode 100644 gmp-6.3.0/mpn/s390_32/copyi.asm create mode 100644 gmp-6.3.0/mpn/s390_32/esame/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/s390_32/esame/aors_n.asm create mode 100644 gmp-6.3.0/mpn/s390_32/esame/aorslsh1_n.asm create mode 100644 gmp-6.3.0/mpn/s390_32/esame/bdiv_dbm1c.asm create 
mode 100644 gmp-6.3.0/mpn/s390_32/esame/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/s390_32/esame/mul_1.asm create mode 100644 gmp-6.3.0/mpn/s390_32/esame/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/s390_32/esame/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/s390_32/esame/submul_1.asm create mode 100644 gmp-6.3.0/mpn/s390_32/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/s390_32/logops_n.asm create mode 100644 gmp-6.3.0/mpn/s390_32/lshift.asm create mode 100644 gmp-6.3.0/mpn/s390_32/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/s390_32/mul_1.asm create mode 100644 gmp-6.3.0/mpn/s390_32/rshift.asm create mode 100644 gmp-6.3.0/mpn/s390_32/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/s390_32/submul_1.asm create mode 100644 gmp-6.3.0/mpn/s390_64/README create mode 100644 gmp-6.3.0/mpn/s390_64/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/s390_64/aorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/s390_64/aors_n.asm create mode 100644 gmp-6.3.0/mpn/s390_64/bdiv_dbm1c.asm create mode 100644 gmp-6.3.0/mpn/s390_64/copyd.asm create mode 100644 gmp-6.3.0/mpn/s390_64/copyi.asm create mode 100644 gmp-6.3.0/mpn/s390_64/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/s390_64/invert_limb.asm create mode 100644 gmp-6.3.0/mpn/s390_64/logops_n.asm create mode 100644 gmp-6.3.0/mpn/s390_64/lshift.asm create mode 100644 gmp-6.3.0/mpn/s390_64/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/s390_64/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/s390_64/mul_1.asm create mode 100644 gmp-6.3.0/mpn/s390_64/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/s390_64/rshift.asm create mode 100644 gmp-6.3.0/mpn/s390_64/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/s390_64/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/s390_64/sublsh1_n.asm create mode 100644 gmp-6.3.0/mpn/s390_64/submul_1.asm create mode 100644 gmp-6.3.0/mpn/s390_64/z10/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/s390_64/z13/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/s390_64/z13/addmul_1.c create mode 100644 gmp-6.3.0/mpn/s390_64/z13/aormul_2.c create mode 100644 gmp-6.3.0/mpn/s390_64/z13/common-vec.h create mode 100644 gmp-6.3.0/mpn/s390_64/z13/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/s390_64/z13/hamdist.asm create mode 100644 gmp-6.3.0/mpn/s390_64/z13/mul_1.asm create mode 100644 gmp-6.3.0/mpn/s390_64/z13/mul_1.c create mode 100644 gmp-6.3.0/mpn/s390_64/z13/mul_2.asm create mode 100644 gmp-6.3.0/mpn/s390_64/z13/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/s390_64/z13/mul_basecase.c create mode 100644 gmp-6.3.0/mpn/s390_64/z13/popcount.asm create mode 100644 gmp-6.3.0/mpn/s390_64/z13/sqr_basecase.c create mode 100644 gmp-6.3.0/mpn/s390_64/z13/submul_1.asm create mode 120000 gmp-6.3.0/mpn/sbpi1_bdiv_q.c create mode 120000 gmp-6.3.0/mpn/sbpi1_bdiv_qr.c create mode 120000 gmp-6.3.0/mpn/sbpi1_bdiv_r.c create mode 120000 gmp-6.3.0/mpn/sbpi1_div_q.c create mode 120000 gmp-6.3.0/mpn/sbpi1_div_qr.c create mode 120000 gmp-6.3.0/mpn/sbpi1_divappr_q.c create mode 120000 gmp-6.3.0/mpn/scan0.c create mode 120000 gmp-6.3.0/mpn/scan1.c create mode 120000 gmp-6.3.0/mpn/sec_add_1.c create mode 120000 gmp-6.3.0/mpn/sec_div_qr.c create mode 120000 gmp-6.3.0/mpn/sec_div_r.c create mode 120000 gmp-6.3.0/mpn/sec_invert.c create mode 120000 gmp-6.3.0/mpn/sec_mul.c create mode 120000 gmp-6.3.0/mpn/sec_pi1_div_qr.c create mode 120000 gmp-6.3.0/mpn/sec_pi1_div_r.c create mode 120000 gmp-6.3.0/mpn/sec_powm.c create mode 120000 gmp-6.3.0/mpn/sec_sqr.c create mode 120000 gmp-6.3.0/mpn/sec_sub_1.c create mode 120000 gmp-6.3.0/mpn/sec_tabselect.asm 
create mode 120000 gmp-6.3.0/mpn/set_str.c create mode 100644 gmp-6.3.0/mpn/sh/add_n.asm create mode 100644 gmp-6.3.0/mpn/sh/sh2/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/sh/sh2/mul_1.asm create mode 100644 gmp-6.3.0/mpn/sh/sh2/submul_1.asm create mode 100644 gmp-6.3.0/mpn/sh/sub_n.asm create mode 120000 gmp-6.3.0/mpn/sizeinbase.c create mode 100644 gmp-6.3.0/mpn/sparc32/README create mode 100644 gmp-6.3.0/mpn/sparc32/add_n.asm create mode 100644 gmp-6.3.0/mpn/sparc32/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/sparc32/lshift.asm create mode 100644 gmp-6.3.0/mpn/sparc32/mul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/rshift.asm create mode 100644 gmp-6.3.0/mpn/sparc32/sparc-defs.m4 create mode 100644 gmp-6.3.0/mpn/sparc32/sub_n.asm create mode 100644 gmp-6.3.0/mpn/sparc32/submul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/udiv.asm create mode 100644 gmp-6.3.0/mpn/sparc32/udiv_nfp.asm create mode 100644 gmp-6.3.0/mpn/sparc32/ultrasparct1/add_n.asm create mode 100644 gmp-6.3.0/mpn/sparc32/ultrasparct1/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/ultrasparct1/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/sparc32/ultrasparct1/mul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/ultrasparct1/sqr_diagonal.asm create mode 100644 gmp-6.3.0/mpn/sparc32/ultrasparct1/sub_n.asm create mode 100644 gmp-6.3.0/mpn/sparc32/ultrasparct1/submul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/umul.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v8/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v8/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/sparc32/v8/mul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v8/submul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v8/supersparc/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/sparc32/v8/supersparc/udiv.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v8/udiv.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v8/umul.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v9/README create mode 100644 gmp-6.3.0/mpn/sparc32/v9/add_n.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/sparc32/v9/mul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v9/sqr_diagonal.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v9/sub_n.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v9/submul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc32/v9/udiv.asm create mode 100644 gmp-6.3.0/mpn/sparc64/README create mode 100644 gmp-6.3.0/mpn/sparc64/copyd.asm create mode 100644 gmp-6.3.0/mpn/sparc64/copyi.asm create mode 100644 gmp-6.3.0/mpn/sparc64/dive_1.c create mode 100644 gmp-6.3.0/mpn/sparc64/divrem_1.c create mode 100644 gmp-6.3.0/mpn/sparc64/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/sparc64/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/sparc64/lshift.asm create mode 100644 gmp-6.3.0/mpn/sparc64/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/sparc64/mod_1.c create mode 100644 gmp-6.3.0/mpn/sparc64/mod_1_4.c create mode 100644 gmp-6.3.0/mpn/sparc64/mode1o.c create mode 100644 gmp-6.3.0/mpn/sparc64/rshift.asm create mode 100644 gmp-6.3.0/mpn/sparc64/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/sparc64/sparc64.h create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm 
create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparc34/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/add_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/addlshC_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/mul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh1_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh2_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblshC_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/sub_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh1_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh2_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/sublshC_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct1/submul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/add_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_2.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_4.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/aorslsh_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_dbm1c.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/cnd_aors_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/dive_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/hamdist.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/invert_limb.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.m4 create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_1_4.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/mode1o.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/mul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/popcount.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/sqr_diag_addlsh1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/sub_n.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct3/submul_1.asm create mode 100644 gmp-6.3.0/mpn/sparc64/ultrasparct45/gmp-mparam.h create mode 120000 gmp-6.3.0/mpn/sqr.c create mode 120000 gmp-6.3.0/mpn/sqr_basecase.asm create mode 120000 gmp-6.3.0/mpn/sqrlo.c create mode 120000 gmp-6.3.0/mpn/sqrlo_basecase.c create mode 120000 gmp-6.3.0/mpn/sqrmod_bnm1.c create mode 120000 gmp-6.3.0/mpn/sqrtrem.c create mode 120000 gmp-6.3.0/mpn/strongfibo.c create mode 120000 gmp-6.3.0/mpn/sub.c create mode 120000 gmp-6.3.0/mpn/sub_1.c create mode 120000 gmp-6.3.0/mpn/sub_err1_n.c create mode 120000 gmp-6.3.0/mpn/sub_err2_n.c create mode 120000 gmp-6.3.0/mpn/sub_err3_n.c create mode 120000 gmp-6.3.0/mpn/sub_n.asm create mode 120000 gmp-6.3.0/mpn/submul_1.asm create mode 120000 gmp-6.3.0/mpn/tdiv_qr.c create mode 100644 gmp-6.3.0/mpn/thumb/add_n.asm create mode 100644 gmp-6.3.0/mpn/thumb/sub_n.asm create mode 120000 
gmp-6.3.0/mpn/toom22_mul.c create mode 120000 gmp-6.3.0/mpn/toom2_sqr.c create mode 120000 gmp-6.3.0/mpn/toom32_mul.c create mode 120000 gmp-6.3.0/mpn/toom33_mul.c create mode 120000 gmp-6.3.0/mpn/toom3_sqr.c create mode 120000 gmp-6.3.0/mpn/toom42_mul.c create mode 120000 gmp-6.3.0/mpn/toom42_mulmid.c create mode 120000 gmp-6.3.0/mpn/toom43_mul.c create mode 120000 gmp-6.3.0/mpn/toom44_mul.c create mode 120000 gmp-6.3.0/mpn/toom4_sqr.c create mode 120000 gmp-6.3.0/mpn/toom52_mul.c create mode 120000 gmp-6.3.0/mpn/toom53_mul.c create mode 120000 gmp-6.3.0/mpn/toom54_mul.c create mode 120000 gmp-6.3.0/mpn/toom62_mul.c create mode 120000 gmp-6.3.0/mpn/toom63_mul.c create mode 120000 gmp-6.3.0/mpn/toom6_sqr.c create mode 120000 gmp-6.3.0/mpn/toom6h_mul.c create mode 120000 gmp-6.3.0/mpn/toom8_sqr.c create mode 120000 gmp-6.3.0/mpn/toom8h_mul.c create mode 120000 gmp-6.3.0/mpn/toom_couple_handling.c create mode 120000 gmp-6.3.0/mpn/toom_eval_dgr3_pm1.c create mode 120000 gmp-6.3.0/mpn/toom_eval_dgr3_pm2.c create mode 120000 gmp-6.3.0/mpn/toom_eval_pm1.c create mode 120000 gmp-6.3.0/mpn/toom_eval_pm2.c create mode 120000 gmp-6.3.0/mpn/toom_eval_pm2exp.c create mode 120000 gmp-6.3.0/mpn/toom_eval_pm2rexp.c create mode 120000 gmp-6.3.0/mpn/toom_interpolate_12pts.c create mode 120000 gmp-6.3.0/mpn/toom_interpolate_16pts.c create mode 120000 gmp-6.3.0/mpn/toom_interpolate_5pts.c create mode 120000 gmp-6.3.0/mpn/toom_interpolate_6pts.c create mode 120000 gmp-6.3.0/mpn/toom_interpolate_7pts.c create mode 120000 gmp-6.3.0/mpn/toom_interpolate_8pts.c create mode 120000 gmp-6.3.0/mpn/trialdiv.c create mode 120000 gmp-6.3.0/mpn/udiv.asm create mode 120000 gmp-6.3.0/mpn/umul.asm create mode 100644 gmp-6.3.0/mpn/vax/add_n.asm create mode 100644 gmp-6.3.0/mpn/vax/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/vax/elf.m4 create mode 100644 gmp-6.3.0/mpn/vax/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/vax/lshift.asm create mode 100644 gmp-6.3.0/mpn/vax/mul_1.asm create mode 100644 gmp-6.3.0/mpn/vax/rshift.asm create mode 100644 gmp-6.3.0/mpn/vax/sub_n.asm create mode 100644 gmp-6.3.0/mpn/vax/submul_1.asm create mode 100644 gmp-6.3.0/mpn/x86/README create mode 100644 gmp-6.3.0/mpn/x86/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/aorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/aorrlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/aorrlshC_n.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/aorslshC_n.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/cnd_add_n.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/cnd_sub_n.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/dive_1.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/atom/logops_n.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/mmx/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/mmx/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/mmx/hamdist.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/mode1o.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/sse2/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/sse2/bdiv_dbm1c.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/sse2/divrem_1.asm create mode 100644 
gmp-6.3.0/mpn/x86/atom/sse2/mod_1_1.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/sse2/mod_1_4.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/sse2/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/sse2/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/sse2/popcount.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/sse2/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/sublsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86/atom/sublsh2_n.asm create mode 100644 gmp-6.3.0/mpn/x86/bd1/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/bd2/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/bd4/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/bdiv_dbm1c.asm create mode 100644 gmp-6.3.0/mpn/x86/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/x86/bt1/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/bt2/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/cnd_aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86/core2/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/coreibwl/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/coreihwl/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/coreinhm/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/coreisbr/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/darwin.m4 create mode 100644 gmp-6.3.0/mpn/x86/dive_1.asm create mode 100644 gmp-6.3.0/mpn/x86/divrem_1.asm create mode 100644 gmp-6.3.0/mpn/x86/divrem_2.asm create mode 100644 gmp-6.3.0/mpn/x86/fat/com.c create mode 100644 gmp-6.3.0/mpn/x86/fat/fat.c create mode 100644 gmp-6.3.0/mpn/x86/fat/fat_entry.asm create mode 100644 gmp-6.3.0/mpn/x86/fat/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/fat/lshiftc.c create mode 100644 gmp-6.3.0/mpn/x86/fat/mod_1.c create mode 100644 gmp-6.3.0/mpn/x86/fat/mod_1_1.c create mode 100644 gmp-6.3.0/mpn/x86/fat/mod_1_2.c create mode 100644 gmp-6.3.0/mpn/x86/fat/mod_1_4.c create mode 100644 gmp-6.3.0/mpn/x86/fat/mode1o.c create mode 100644 gmp-6.3.0/mpn/x86/fat/mullo_basecase.c create mode 100644 gmp-6.3.0/mpn/x86/fat/redc_1.c create mode 100644 gmp-6.3.0/mpn/x86/fat/redc_2.c create mode 100644 gmp-6.3.0/mpn/x86/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86/geode/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/goldmont/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/i486/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/k10/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/k6/README create mode 100644 gmp-6.3.0/mpn/x86/k6/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86/k6/aorsmul_1.asm create mode 100755 gmp-6.3.0/mpn/x86/k6/cross.pl create mode 100644 gmp-6.3.0/mpn/x86/k6/divrem_1.asm create mode 100644 gmp-6.3.0/mpn/x86/k6/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/k6/k62mmx/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86/k6/k62mmx/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86/k6/k62mmx/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86/k6/mmx/com.asm create mode 100644 gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm create mode 100644 gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm create mode 100644 gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86/k6/mmx/popham.asm create mode 100644 gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86/k6/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/x86/k6/mode1o.asm create mode 100644 gmp-6.3.0/mpn/x86/k6/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86/k6/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86/k6/pre_mod_1.asm create mode 100644 
gmp-6.3.0/mpn/x86/k6/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/README create mode 100644 gmp-6.3.0/mpn/x86/k7/addlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/dive_1.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/k7/invert_limb.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/mmx/com.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/mmx/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/mmx/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/mmx/divrem_1.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/mmx/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/mmx/popham.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/mmx/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/mod_1_1.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/mod_1_4.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/mode1o.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86/k7/sublsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86/k8/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86/mmx/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/x86/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/x86/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86/nano/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/p6/README create mode 100644 gmp-6.3.0/mpn/x86/p6/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/dive_1.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/p6/lshsub_n.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/mmx/divrem_1.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/mmx/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/p6/mmx/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/mmx/popham.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/mmx/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/mode1o.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/p3mmx/popham.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/sse2/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/sse2/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/p6/sse2/mod_1_1.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/sse2/mod_1_4.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/sse2/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/sse2/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/sse2/popcount.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/sse2/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86/p6/sse2/submul_1.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/README create mode 100644 gmp-6.3.0/mpn/x86/pentium/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/com.asm create mode 100644 
gmp-6.3.0/mpn/x86/pentium/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/dive_1.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/pentium/hamdist.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/logops_n.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/mode1o.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/mul_2.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/popcount.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/README create mode 100644 gmp-6.3.0/mpn/x86/pentium4/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/mmx/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/mmx/popham.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/mmx/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/add_n.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/addlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_add_n.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_sub_n.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/dive_1.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_4.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/mode1o.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/rsh1add_n.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/sub_n.asm create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm create mode 100644 gmp-6.3.0/mpn/x86/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/x86/silvermont/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/skylake/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/sqr_basecase.asm create mode 100755 gmp-6.3.0/mpn/x86/t-zdisp.sh create mode 100755 gmp-6.3.0/mpn/x86/t-zdisp2.pl create mode 100644 gmp-6.3.0/mpn/x86/udiv.asm create mode 100644 gmp-6.3.0/mpn/x86/umul.asm create mode 100644 gmp-6.3.0/mpn/x86/x86-defs.m4 create mode 100644 gmp-6.3.0/mpn/x86/zn1/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86/zn2/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/README create mode 100644 
gmp-6.3.0/mpn/x86_64/alderlake/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/alderlake/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/alderlake/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/alderlake/submul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/aorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/aorrlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/aorrlshC_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/aorrlsh_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/aors_err1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/aors_err2_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/aors_err3_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/com.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/dive_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/atom/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/mul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/popcount.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/redc_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/README create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/com.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/popcount.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd2/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd2/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd2/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/bd4/aorrlsh_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd4/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd4/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bd4/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/bdiv_dbm1c.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/aorsmul_1.asm 
create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/redc_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt1/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt2/com.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt2/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt2/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt2/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt2/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/x86_64/bt2/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/cnd_aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/com.asm create mode 100644 gmp-6.3.0/mpn/x86_64/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86_64/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/com.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/core2/hamdist.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/logops_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/popcount.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/redc_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/aorrlsh_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/gmp-mparam.h create mode 100644 
gmp-6.3.0/mpn/x86_64/coreihwl/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/mul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/mullo_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/redc_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreihwl/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreinhm/aorrlsh_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreinhm/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreinhm/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/coreinhm/hamdist.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreinhm/popcount.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreinhm/redc_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreinhm/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/darwin.m4 create mode 100644 gmp-6.3.0/mpn/x86_64/div_qr_1n_pi1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/div_qr_2n_pi1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/div_qr_2u_pi1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/dive_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/divrem_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/divrem_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/dos64.m4 create mode 100644 gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/README create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/com.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm create mode 100644 
gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fat/addmul_2.c create mode 100644 gmp-6.3.0/mpn/x86_64/fat/fat.c create mode 100644 gmp-6.3.0/mpn/x86_64/fat/fat_entry.asm create mode 100644 gmp-6.3.0/mpn/x86_64/fat/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/fat/mod_1.c create mode 100644 gmp-6.3.0/mpn/x86_64/fat/mul_basecase.c create mode 100644 gmp-6.3.0/mpn/x86_64/fat/mullo_basecase.c create mode 100644 gmp-6.3.0/mpn/x86_64/fat/redc_1.c create mode 100644 gmp-6.3.0/mpn/x86_64/fat/redc_2.c create mode 100644 gmp-6.3.0/mpn/x86_64/fat/sqr_basecase.c create mode 100644 gmp-6.3.0/mpn/x86_64/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/x86_64/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/goldmont/aorrlsh_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/goldmont/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/goldmont/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/goldmont/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/goldmont/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/goldmont/redc_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/invert_limb.asm create mode 100644 gmp-6.3.0/mpn/x86_64/invert_limb_table.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k10/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k10/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k10/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/k10/hamdist.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k10/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k10/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k10/popcount.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k10/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k10/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k8/redc_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/logops_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/x86_64/lshsub_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/missing-call.m4 create mode 100644 gmp-6.3.0/mpn/x86_64/missing-inline.m4 create mode 100644 gmp-6.3.0/mpn/x86_64/missing.asm create mode 100644 gmp-6.3.0/mpn/x86_64/mod_1_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/mod_1_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/mod_1_4.asm create mode 100644 gmp-6.3.0/mpn/x86_64/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/mode1o.asm create mode 100644 gmp-6.3.0/mpn/x86_64/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/mul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/mulx/adx/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/nano/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86_64/nano/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86_64/nano/dive_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/nano/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/nano/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/nano/popcount.asm create mode 
100644 gmp-6.3.0/mpn/x86_64/pentium4/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/aorslsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/aorslsh2_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/aorslshC_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/mul_2.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/mullo_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/popcount.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/redc_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/rsh1aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/popham.asm create mode 100644 gmp-6.3.0/mpn/x86_64/rsh1aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/aors_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/hamdist.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/mullo_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/popcount.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/rshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/silvermont/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/skylake/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/sqr_diag_addlsh1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/sublsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/x86_64-defs.m4 create mode 100644 gmp-6.3.0/mpn/x86_64/zen/aorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/aorrlsh_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/aorsmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/com.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/copyd.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/copyi.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/zen/hamdist.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/lshift.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/mullo_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/popcount.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/rshift.asm 
create mode 100644 gmp-6.3.0/mpn/x86_64/zen/sbpi1_bdiv_r.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen/sublsh1_n.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen2/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/zen3/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen3/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/x86_64/zen3/mul_1.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen3/mul_basecase.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen3/sbpi1_bdiv_r.asm create mode 100644 gmp-6.3.0/mpn/x86_64/zen3/sqr_basecase.asm create mode 120000 gmp-6.3.0/mpn/xnor_n.c create mode 120000 gmp-6.3.0/mpn/xor_n.c create mode 120000 gmp-6.3.0/mpn/zero.c create mode 120000 gmp-6.3.0/mpn/zero_p.c diff --git a/gmp-6.3.0/mpn/Makeasm.am b/gmp-6.3.0/mpn/Makeasm.am new file mode 100644 index 0000000..5d7306c --- /dev/null +++ b/gmp-6.3.0/mpn/Makeasm.am @@ -0,0 +1,118 @@ +## Automake asm file rules. + +# Copyright 1996, 1998-2002 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of either: +# +# * the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# or +# +# * the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# or both in parallel, as here. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received copies of the GNU General Public License and the +# GNU Lesser General Public License along with the GNU MP Library. If not, +# see https://www.gnu.org/licenses/. + + +# COMPILE minus CC. +# +COMPILE_FLAGS = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) $(ASMFLAGS) + +# Flags used for preprocessing (in ansi2knr rules). +# +PREPROCESS_FLAGS = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) + + +# Recent versions of automake (1.5 and up for instance) append automake +# generated suffixes to this $(SUFFIXES) list. This is essential for us, +# since .c must come after .s, .S and .asm. If .c is before .s, for +# instance, then in the mpn directory "make" will see add_n.c mentioned in +# an explicit rule (the ansi2knr stuff) and decide it must have add_n.c, +# even if add_n.c doesn't exist but add_n.s does. See GNU make +# documentation "(make)Implicit Rule Search", part 5c. +# +# On IRIX 6 native make this doesn't work properly though. Somehow .c +# remains ahead of .s, perhaps because .c.s is a builtin rule. .asm works +# fine though, and mpn/mips3 uses this. +# +SUFFIXES = .s .S .asm + + +# .s assembler, no preprocessing. +# +.s.o: + $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< +.s.obj: + $(CCAS) $(COMPILE_FLAGS) `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` +.s.lo: + $(LIBTOOL) --mode=compile --tag=CC $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< + + +# can be overridden during development, eg. "make RM_TMP=: mul_1.lo" +RM_TMP = rm -f + + +# .S assembler, preprocessed with cpp.
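+# -- Illustrative sketch (editorial aside, not upstream GMP text). Using the
+# configure results shown in the generated Makefile below (CPP = "gcc -E",
+# CCAS = "gcc -c", DEFS = -DHAVE_CONFIG_H, M4 = m4), the .S rules that follow
+# expand, for a hypothetical add_n.S target, to roughly:
+#
+#   gcc -E -DHAVE_CONFIG_H -I. -I.. -D__GMP_WITHIN_GMP -DOPERATION_add_n \
+#       add_n.S | grep -v '^#' > tmp-add_n.s
+#   gcc -c tmp-add_n.s -o add_n.o
+#   rm -f tmp-add_n.s
+#
+# that is: preprocess with cpp, strip the cpp line markers, assemble the
+# temporary .s file, then delete it.  The .asm rules near the end of this
+# file follow the same temp-file pattern but preprocess with m4 instead,
+# along the lines of: m4 -DOPERATION_add_n add_n.asm > tmp-add_n.s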
+# +# It's necessary to run $(CPP) separately, since it seems not all compilers +# recognise .S files, in particular "cc" on HP-UX 10 and 11 doesn't (and +# will silently do nothing if given a .S). +# +# For .lo we need a helper script, as described below for .asm.lo. +# +.S.o: + $(CPP) $(PREPROCESS_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(RM_TMP) tmp-$*.s +.S.obj: + $(CPP) $(PREPROCESS_FLAGS) `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(RM_TMP) tmp-$*.s +.S.lo: + $(LIBTOOL) --mode=compile --tag=CC $(top_srcdir)/mpn/cpp-ccas --cpp="$(CPP) $(PREPROCESS_FLAGS)" $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< + + +# .asm assembler, preprocessed with m4. +# +# .o and .obj are non-PIC and just need m4 followed by a compile. +# +# .lo is a bit tricky. Libtool (as of version 1.5) has foo.lo as a little +# text file, and .libs/foo.o and foo.o as the PIC and non-PIC objects, +# respectively. It'd be asking for lots of trouble to try to create foo.lo +# ourselves, so instead arrange to invoke libtool like a --mode=compile, but +# with a special m4-ccas script which first m4 preprocesses, then compiles. +# --tag=CC is necessary since foo.asm is otherwise unknown to libtool. +# +# Libtool adds -DPIC when building a shared object and the .asm files look +# for that. But it should be noted that the other PIC flags are on occasion +# important too, in particular FreeBSD 2.2.8 gas 1.92.3 requires -k before +# it accepts PIC constructs like @GOT, and gcc adds that flag only under +# -fPIC. (Later versions of gas are happy to accept PIC stuff any time.) +# +.asm.o: + $(M4) -DOPERATION_$* `test -f '$<' || echo '$(srcdir)/'`$< >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(RM_TMP) tmp-$*.s +.asm.obj: + $(M4) -DOPERATION_$* `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(RM_TMP) tmp-$*.s +.asm.lo: + $(LIBTOOL) --mode=compile --tag=CC $(top_srcdir)/mpn/m4-ccas --m4="$(M4)" $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< diff --git a/gmp-6.3.0/mpn/Makefile b/gmp-6.3.0/mpn/Makefile new file mode 100644 index 0000000..33d5aed --- /dev/null +++ b/gmp-6.3.0/mpn/Makefile @@ -0,0 +1,772 @@ +# Makefile.in generated by automake 1.15 from Makefile.am. +# mpn/Makefile. Generated from Makefile.in by configure. + +# Copyright (C) 1994-2014 Free Software Foundation, Inc. + +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + + + +# Copyright 1996, 1998-2002, 2005, 2011, 2013 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of either: +# +# * the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your +# option) any later version. 
+# +# or +# +# * the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# or both in parallel, as here. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received copies of the GNU General Public License and the +# GNU Lesser General Public License along with the GNU MP Library. If not, +# see https://www.gnu.org/licenses/. + +# Copyright 1996, 1998-2002 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of either: +# +# * the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# or +# +# * the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# or both in parallel, as here. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received copies of the GNU General Public License and the +# GNU Lesser General Public License along with the GNU MP Library. If not, +# see https://www.gnu.org/licenses/. + + +am__is_gnu_make = { \ + if test -z '$(MAKELEVEL)'; then \ + false; \ + elif test -n '$(MAKE_HOST)'; then \ + true; \ + elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ + true; \ + else \ + false; \ + fi; \ +} +am__make_running_with_option = \ + case $${target_option-} in \ + ?) 
;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/gmp +pkgincludedir = $(includedir)/gmp +pkglibdir = $(libdir)/gmp +pkglibexecdir = $(libexecdir)/gmp +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = pentiumm-pc-linux-gnu +host_triplet = pentiumm-pc-linux-gnu +subdir = mpn +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/acinclude.m4 \ + $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +LTLIBRARIES = $(noinst_LTLIBRARIES) +am__DEPENDENCIES_1 = +nodist_libmpn_la_OBJECTS = fib_table.lo mp_bases.lo +libmpn_la_OBJECTS = $(nodist_libmpn_la_OBJECTS) +AM_V_lt = $(am__v_lt_$(V)) +am__v_lt_ = $(am__v_lt_$(AM_DEFAULT_VERBOSITY)) +am__v_lt_0 = --silent +am__v_lt_1 = +AM_V_P = $(am__v_P_$(V)) +am__v_P_ = $(am__v_P_$(AM_DEFAULT_VERBOSITY)) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_$(V)) +am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY)) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_$(V)) +am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY)) +am__v_at_0 = @ +am__v_at_1 = +DEFAULT_INCLUDES = -I. 
-I$(top_builddir) +depcomp = +am__depfiles_maybe = +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CFLAGS) $(CFLAGS) +AM_V_CC = $(am__v_CC_$(V)) +am__v_CC_ = $(am__v_CC_$(AM_DEFAULT_VERBOSITY)) +am__v_CC_0 = @echo " CC " $@; +am__v_CC_1 = +CCLD = $(CC) +LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_$(V)) +am__v_CCLD_ = $(am__v_CCLD_$(AM_DEFAULT_VERBOSITY)) +am__v_CCLD_0 = @echo " CCLD " $@; +am__v_CCLD_1 = +SOURCES = $(nodist_libmpn_la_SOURCES) +DIST_SOURCES = +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. +am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +am__DIST_COMMON = $(srcdir)/Makeasm.am $(srcdir)/Makefile.in README +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ABI = 32 +ACLOCAL = ${SHELL} /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0/missing aclocal-1.15 +AMTAR = $${TAR-tar} +AM_DEFAULT_VERBOSITY = 1 +AR = ar +AS = as +ASMFLAGS = -Wa,--noexecstack +AUTOCONF = ${SHELL} /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0/missing autoconf +AUTOHEADER = ${SHELL} /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0/missing autoheader +AUTOMAKE = ${SHELL} /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0/missing automake-1.15 +AWK = gawk +CALLING_CONVENTIONS_OBJS = x86call.lo x86check$U.lo +CC = gcc +CCAS = gcc -c +CC_FOR_BUILD = gcc +CFLAGS = -m32 -O2 -pedantic -fomit-frame-pointer -mtune=pentium3 -march=pentium3 +CPP = gcc -E +CPPFLAGS = +CPP_FOR_BUILD = gcc -E +CXX = +CXXCPP = +CXXFLAGS = +CYGPATH_W = echo +DEFN_LONG_LONG_LIMB = /* #undef _LONG_LONG_LIMB */ +DEFS = -DHAVE_CONFIG_H +DLLTOOL = dlltool +DSYMUTIL = +DUMPBIN = +ECHO_C = +ECHO_N = -n +ECHO_T = +EGREP = /usr/bin/grep -E +EXEEXT = +EXEEXT_FOR_BUILD = +FGREP = /usr/bin/grep -F +GMP_LDFLAGS = +GMP_LIMB_BITS = 32 +GMP_NAIL_BITS = 0 +GREP = /usr/bin/grep +HAVE_CLOCK_01 = 1 +HAVE_CPUTIME_01 = 0 +HAVE_GETRUSAGE_01 = 1 +HAVE_GETTIMEOFDAY_01 = 1 +HAVE_HOST_CPU_FAMILY_power = 0 +HAVE_HOST_CPU_FAMILY_powerpc = 0 +HAVE_SIGACTION_01 = 1 +HAVE_SIGALTSTACK_01 = 1 +HAVE_SIGSTACK_01 = 1 +HAVE_STACK_T_01 = 1 +HAVE_SYS_RESOURCE_H_01 = 1 +INSTALL = /usr/bin/install -c +INSTALL_DATA = ${INSTALL} -m 644 +INSTALL_PROGRAM = ${INSTALL} +INSTALL_SCRIPT = ${INSTALL} +INSTALL_STRIP_PROGRAM = $(install_sh) -c -s +LD = /usr/bin/ld +LDFLAGS = +LEX = flex +LEXLIB = -lfl +LEX_OUTPUT_ROOT = lex.yy +LIBCURSES = -lncurses +LIBGMPXX_LDFLAGS = +LIBGMP_DLL = 0 +LIBGMP_LDFLAGS = +LIBM = -lm 
+LIBM_FOR_BUILD = -lm +LIBOBJS = +LIBREADLINE = -lreadline +LIBS = +LIBTOOL = $(SHELL) $(top_builddir)/libtool +LIPO = +LN_S = ln -s +LTLIBOBJS = +LT_SYS_LIBRARY_PATH = +M4 = m4 +MAINT = # +MAKEINFO = ${SHELL} /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0/missing makeinfo +MANIFEST_TOOL = : +MKDIR_P = /usr/bin/mkdir -p +NM = /usr/bin/nm -B +NMEDIT = +OBJDUMP = objdump +OBJEXT = o +OTOOL = +OTOOL64 = +PACKAGE = gmp +PACKAGE_BUGREPORT = gmp-bugs@gmplib.org (see https://gmplib.org/manual/Reporting-Bugs.html) +PACKAGE_NAME = GNU MP +PACKAGE_STRING = GNU MP 6.3.0 +PACKAGE_TARNAME = gmp +PACKAGE_URL = http://www.gnu.org/software/gmp/ +PACKAGE_VERSION = 6.3.0 +PATH_SEPARATOR = : +RANLIB = ranlib +SED = /usr/bin/sed +SET_MAKE = +SHELL = /bin/sh +SPEED_CYCLECOUNTER_OBJ = pentium.lo +STRIP = strip +TAL_OBJECT = tal-reent.lo +TUNE_LIBS = +TUNE_SQR_OBJ = +U_FOR_BUILD = +VERSION = 6.3.0 +WITH_READLINE_01 = 1 +YACC = bison -y +YFLAGS = +abs_builddir = /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0/mpn +abs_srcdir = /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0/mpn +abs_top_builddir = /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0 +abs_top_srcdir = /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0 +ac_ct_AR = ar +ac_ct_CC = gcc +ac_ct_CXX = +ac_ct_DUMPBIN = +am__leading_dot = . +am__tar = $${TAR-tar} chof - "$$tardir" +am__untar = $${TAR-tar} xf - +bindir = ${exec_prefix}/bin +build = pentiumm-pc-linux-gnu +build_alias = +build_cpu = pentiumm +build_os = linux-gnu +build_vendor = pc +builddir = . +datadir = ${datarootdir} +datarootdir = ${prefix}/share +docdir = ${datarootdir}/doc/${PACKAGE_TARNAME} +dvidir = ${docdir} +exec_prefix = ${prefix} +gmp_srclinks = mpn/add.c mpn/add_1.c mpn/add_n.asm mpn/sub.c mpn/sub_1.c mpn/sub_n.asm mpn/cnd_add_n.asm mpn/cnd_sub_n.asm mpn/cnd_swap.c mpn/neg.c mpn/com.c mpn/mul_1.asm mpn/addmul_1.asm mpn/submul_1.asm mpn/add_err1_n.c mpn/add_err2_n.c mpn/add_err3_n.c mpn/sub_err1_n.c mpn/sub_err2_n.c mpn/sub_err3_n.c mpn/lshift.asm mpn/rshift.asm mpn/dive_1.asm mpn/diveby3.c mpn/divis.c mpn/divrem.c mpn/divrem_1.asm mpn/divrem_2.asm mpn/fib2_ui.c mpn/fib2m.c mpn/mod_1.c mpn/mod_34lsub1.asm mpn/mode1o.asm mpn/pre_mod_1.c mpn/dump.c mpn/mod_1_1.asm mpn/mod_1_2.c mpn/mod_1_3.c mpn/mod_1_4.asm mpn/lshiftc.c mpn/mul.c mpn/mul_fft.c mpn/mul_n.c mpn/sqr.c mpn/mul_basecase.asm mpn/sqr_basecase.asm mpn/nussbaumer_mul.c mpn/mulmid_basecase.c mpn/toom42_mulmid.c mpn/mulmid_n.c mpn/mulmid.c mpn/random.c mpn/random2.c mpn/pow_1.c mpn/rootrem.c mpn/sqrtrem.c mpn/sizeinbase.c mpn/get_str.c mpn/set_str.c mpn/compute_powtab.c mpn/scan0.c mpn/scan1.c mpn/popcount.asm mpn/hamdist.asm mpn/cmp.c mpn/zero_p.c mpn/perfsqr.c mpn/perfpow.c mpn/strongfibo.c mpn/gcd_11.asm mpn/gcd_22.c mpn/gcd_1.c mpn/gcd.c mpn/gcdext_1.c mpn/gcdext.c mpn/gcd_subdiv_step.c mpn/gcdext_lehmer.c mpn/div_q.c mpn/tdiv_qr.c mpn/jacbase.c mpn/jacobi_2.c mpn/jacobi.c mpn/get_d.c mpn/matrix22_mul.c mpn/matrix22_mul1_inverse_vector.c mpn/hgcd_matrix.c mpn/hgcd2.c mpn/hgcd_step.c mpn/hgcd_reduce.c mpn/hgcd.c mpn/hgcd_appr.c mpn/hgcd2_jacobi.c mpn/hgcd_jacobi.c mpn/mullo_n.c mpn/mullo_basecase.c mpn/sqrlo.c mpn/sqrlo_basecase.c mpn/toom22_mul.c mpn/toom32_mul.c mpn/toom42_mul.c mpn/toom52_mul.c mpn/toom62_mul.c mpn/toom33_mul.c mpn/toom43_mul.c mpn/toom53_mul.c mpn/toom54_mul.c mpn/toom63_mul.c mpn/toom44_mul.c mpn/toom6h_mul.c mpn/toom6_sqr.c mpn/toom8h_mul.c mpn/toom8_sqr.c mpn/toom_couple_handling.c mpn/toom2_sqr.c mpn/toom3_sqr.c mpn/toom4_sqr.c mpn/toom_eval_dgr3_pm1.c mpn/toom_eval_dgr3_pm2.c mpn/toom_eval_pm1.c mpn/toom_eval_pm2.c mpn/toom_eval_pm2exp.c 
mpn/toom_eval_pm2rexp.c mpn/toom_interpolate_5pts.c mpn/toom_interpolate_6pts.c mpn/toom_interpolate_7pts.c mpn/toom_interpolate_8pts.c mpn/toom_interpolate_12pts.c mpn/toom_interpolate_16pts.c mpn/invertappr.c mpn/invert.c mpn/binvert.c mpn/mulmod_bnm1.c mpn/sqrmod_bnm1.c mpn/mulmod_bknp1.c mpn/div_qr_1.c mpn/div_qr_1n_pi1.c mpn/div_qr_2.c mpn/div_qr_2n_pi1.c mpn/div_qr_2u_pi1.c mpn/sbpi1_div_q.c mpn/sbpi1_div_qr.c mpn/sbpi1_divappr_q.c mpn/dcpi1_div_q.c mpn/dcpi1_div_qr.c mpn/dcpi1_divappr_q.c mpn/mu_div_qr.c mpn/mu_divappr_q.c mpn/mu_div_q.c mpn/bdiv_q_1.asm mpn/sbpi1_bdiv_q.c mpn/sbpi1_bdiv_qr.c mpn/sbpi1_bdiv_r.c mpn/dcpi1_bdiv_q.c mpn/dcpi1_bdiv_qr.c mpn/mu_bdiv_q.c mpn/mu_bdiv_qr.c mpn/bdiv_q.c mpn/bdiv_qr.c mpn/broot.c mpn/brootinv.c mpn/bsqrt.c mpn/bsqrtinv.c mpn/divexact.c mpn/bdiv_dbm1c.asm mpn/redc_1.c mpn/redc_2.c mpn/redc_n.c mpn/powm.c mpn/powlo.c mpn/sec_powm.c mpn/sec_mul.c mpn/sec_sqr.c mpn/sec_div_qr.c mpn/sec_div_r.c mpn/sec_pi1_div_qr.c mpn/sec_pi1_div_r.c mpn/sec_add_1.c mpn/sec_sub_1.c mpn/sec_invert.c mpn/trialdiv.c mpn/remove.c mpn/and_n.c mpn/andn_n.c mpn/nand_n.c mpn/ior_n.c mpn/iorn_n.c mpn/nior_n.c mpn/xor_n.c mpn/xnor_n.c mpn/copyi.asm mpn/copyd.asm mpn/zero.c mpn/sec_tabselect.asm mpn/comb_tables.c mpn/umul.asm mpn/udiv.asm mpn/add_n_sub_n.c gmp-mparam.h +host = pentiumm-pc-linux-gnu +host_alias = +host_cpu = pentiumm +host_os = linux-gnu +host_vendor = pc +htmldir = ${docdir} +includedir = ${prefix}/include +infodir = ${datarootdir}/info +install_sh = ${SHELL} /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0/install-sh +libdir = ${exec_prefix}/lib +libexecdir = ${exec_prefix}/libexec +localedir = ${datarootdir}/locale +localstatedir = ${prefix}/var +mandir = ${datarootdir}/man +mkdir_p = $(MKDIR_P) +mpn_objects = add$U.lo add_1$U.lo add_n.lo sub$U.lo sub_1$U.lo sub_n.lo cnd_add_n.lo cnd_sub_n.lo cnd_swap$U.lo neg$U.lo com$U.lo mul_1.lo addmul_1.lo submul_1.lo add_err1_n$U.lo add_err2_n$U.lo add_err3_n$U.lo sub_err1_n$U.lo sub_err2_n$U.lo sub_err3_n$U.lo lshift.lo rshift.lo dive_1.lo diveby3$U.lo divis$U.lo divrem$U.lo divrem_1.lo divrem_2.lo fib2_ui$U.lo fib2m$U.lo mod_1$U.lo mod_34lsub1.lo mode1o.lo pre_mod_1$U.lo dump$U.lo mod_1_1.lo mod_1_2$U.lo mod_1_3$U.lo mod_1_4.lo lshiftc$U.lo mul$U.lo mul_fft$U.lo mul_n$U.lo sqr$U.lo mul_basecase.lo sqr_basecase.lo nussbaumer_mul$U.lo mulmid_basecase$U.lo toom42_mulmid$U.lo mulmid_n$U.lo mulmid$U.lo random$U.lo random2$U.lo pow_1$U.lo rootrem$U.lo sqrtrem$U.lo sizeinbase$U.lo get_str$U.lo set_str$U.lo compute_powtab$U.lo scan0$U.lo scan1$U.lo popcount.lo hamdist.lo cmp$U.lo zero_p$U.lo perfsqr$U.lo perfpow$U.lo strongfibo$U.lo gcd_11.lo gcd_22$U.lo gcd_1$U.lo gcd$U.lo gcdext_1$U.lo gcdext$U.lo gcd_subdiv_step$U.lo gcdext_lehmer$U.lo div_q$U.lo tdiv_qr$U.lo jacbase$U.lo jacobi_2$U.lo jacobi$U.lo get_d$U.lo matrix22_mul$U.lo matrix22_mul1_inverse_vector$U.lo hgcd_matrix$U.lo hgcd2$U.lo hgcd_step$U.lo hgcd_reduce$U.lo hgcd$U.lo hgcd_appr$U.lo hgcd2_jacobi$U.lo hgcd_jacobi$U.lo mullo_n$U.lo mullo_basecase$U.lo sqrlo$U.lo sqrlo_basecase$U.lo toom22_mul$U.lo toom32_mul$U.lo toom42_mul$U.lo toom52_mul$U.lo toom62_mul$U.lo toom33_mul$U.lo toom43_mul$U.lo toom53_mul$U.lo toom54_mul$U.lo toom63_mul$U.lo toom44_mul$U.lo toom6h_mul$U.lo toom6_sqr$U.lo toom8h_mul$U.lo toom8_sqr$U.lo toom_couple_handling$U.lo toom2_sqr$U.lo toom3_sqr$U.lo toom4_sqr$U.lo toom_eval_dgr3_pm1$U.lo toom_eval_dgr3_pm2$U.lo toom_eval_pm1$U.lo toom_eval_pm2$U.lo toom_eval_pm2exp$U.lo toom_eval_pm2rexp$U.lo toom_interpolate_5pts$U.lo toom_interpolate_6pts$U.lo 
toom_interpolate_7pts$U.lo toom_interpolate_8pts$U.lo toom_interpolate_12pts$U.lo toom_interpolate_16pts$U.lo invertappr$U.lo invert$U.lo binvert$U.lo mulmod_bnm1$U.lo sqrmod_bnm1$U.lo mulmod_bknp1$U.lo div_qr_1$U.lo div_qr_1n_pi1$U.lo div_qr_2$U.lo div_qr_2n_pi1$U.lo div_qr_2u_pi1$U.lo sbpi1_div_q$U.lo sbpi1_div_qr$U.lo sbpi1_divappr_q$U.lo dcpi1_div_q$U.lo dcpi1_div_qr$U.lo dcpi1_divappr_q$U.lo mu_div_qr$U.lo mu_divappr_q$U.lo mu_div_q$U.lo bdiv_q_1.lo sbpi1_bdiv_q$U.lo sbpi1_bdiv_qr$U.lo sbpi1_bdiv_r$U.lo dcpi1_bdiv_q$U.lo dcpi1_bdiv_qr$U.lo mu_bdiv_q$U.lo mu_bdiv_qr$U.lo bdiv_q$U.lo bdiv_qr$U.lo broot$U.lo brootinv$U.lo bsqrt$U.lo bsqrtinv$U.lo divexact$U.lo bdiv_dbm1c.lo redc_1$U.lo redc_2$U.lo redc_n$U.lo powm$U.lo powlo$U.lo sec_powm$U.lo sec_mul$U.lo sec_sqr$U.lo sec_div_qr$U.lo sec_div_r$U.lo sec_pi1_div_qr$U.lo sec_pi1_div_r$U.lo sec_add_1$U.lo sec_sub_1$U.lo sec_invert$U.lo trialdiv$U.lo remove$U.lo and_n$U.lo andn_n$U.lo nand_n$U.lo ior_n$U.lo iorn_n$U.lo nior_n$U.lo xor_n$U.lo xnor_n$U.lo copyi.lo copyd.lo zero$U.lo sec_tabselect.lo comb_tables$U.lo umul.lo udiv.lo add_n_sub_n$U.lo +mpn_objs_in_libgmp = mpn/add$U.lo mpn/add_1$U.lo mpn/add_n.lo mpn/sub$U.lo mpn/sub_1$U.lo mpn/sub_n.lo mpn/cnd_add_n.lo mpn/cnd_sub_n.lo mpn/cnd_swap$U.lo mpn/neg$U.lo mpn/com$U.lo mpn/mul_1.lo mpn/addmul_1.lo mpn/submul_1.lo mpn/add_err1_n$U.lo mpn/add_err2_n$U.lo mpn/add_err3_n$U.lo mpn/sub_err1_n$U.lo mpn/sub_err2_n$U.lo mpn/sub_err3_n$U.lo mpn/lshift.lo mpn/rshift.lo mpn/dive_1.lo mpn/diveby3$U.lo mpn/divis$U.lo mpn/divrem$U.lo mpn/divrem_1.lo mpn/divrem_2.lo mpn/fib2_ui$U.lo mpn/fib2m$U.lo mpn/mod_1$U.lo mpn/mod_34lsub1.lo mpn/mode1o.lo mpn/pre_mod_1$U.lo mpn/dump$U.lo mpn/mod_1_1.lo mpn/mod_1_2$U.lo mpn/mod_1_3$U.lo mpn/mod_1_4.lo mpn/lshiftc$U.lo mpn/mul$U.lo mpn/mul_fft$U.lo mpn/mul_n$U.lo mpn/sqr$U.lo mpn/mul_basecase.lo mpn/sqr_basecase.lo mpn/nussbaumer_mul$U.lo mpn/mulmid_basecase$U.lo mpn/toom42_mulmid$U.lo mpn/mulmid_n$U.lo mpn/mulmid$U.lo mpn/random$U.lo mpn/random2$U.lo mpn/pow_1$U.lo mpn/rootrem$U.lo mpn/sqrtrem$U.lo mpn/sizeinbase$U.lo mpn/get_str$U.lo mpn/set_str$U.lo mpn/compute_powtab$U.lo mpn/scan0$U.lo mpn/scan1$U.lo mpn/popcount.lo mpn/hamdist.lo mpn/cmp$U.lo mpn/zero_p$U.lo mpn/perfsqr$U.lo mpn/perfpow$U.lo mpn/strongfibo$U.lo mpn/gcd_11.lo mpn/gcd_22$U.lo mpn/gcd_1$U.lo mpn/gcd$U.lo mpn/gcdext_1$U.lo mpn/gcdext$U.lo mpn/gcd_subdiv_step$U.lo mpn/gcdext_lehmer$U.lo mpn/div_q$U.lo mpn/tdiv_qr$U.lo mpn/jacbase$U.lo mpn/jacobi_2$U.lo mpn/jacobi$U.lo mpn/get_d$U.lo mpn/matrix22_mul$U.lo mpn/matrix22_mul1_inverse_vector$U.lo mpn/hgcd_matrix$U.lo mpn/hgcd2$U.lo mpn/hgcd_step$U.lo mpn/hgcd_reduce$U.lo mpn/hgcd$U.lo mpn/hgcd_appr$U.lo mpn/hgcd2_jacobi$U.lo mpn/hgcd_jacobi$U.lo mpn/mullo_n$U.lo mpn/mullo_basecase$U.lo mpn/sqrlo$U.lo mpn/sqrlo_basecase$U.lo mpn/toom22_mul$U.lo mpn/toom32_mul$U.lo mpn/toom42_mul$U.lo mpn/toom52_mul$U.lo mpn/toom62_mul$U.lo mpn/toom33_mul$U.lo mpn/toom43_mul$U.lo mpn/toom53_mul$U.lo mpn/toom54_mul$U.lo mpn/toom63_mul$U.lo mpn/toom44_mul$U.lo mpn/toom6h_mul$U.lo mpn/toom6_sqr$U.lo mpn/toom8h_mul$U.lo mpn/toom8_sqr$U.lo mpn/toom_couple_handling$U.lo mpn/toom2_sqr$U.lo mpn/toom3_sqr$U.lo mpn/toom4_sqr$U.lo mpn/toom_eval_dgr3_pm1$U.lo mpn/toom_eval_dgr3_pm2$U.lo mpn/toom_eval_pm1$U.lo mpn/toom_eval_pm2$U.lo mpn/toom_eval_pm2exp$U.lo mpn/toom_eval_pm2rexp$U.lo mpn/toom_interpolate_5pts$U.lo mpn/toom_interpolate_6pts$U.lo mpn/toom_interpolate_7pts$U.lo mpn/toom_interpolate_8pts$U.lo mpn/toom_interpolate_12pts$U.lo mpn/toom_interpolate_16pts$U.lo 
mpn/invertappr$U.lo mpn/invert$U.lo mpn/binvert$U.lo mpn/mulmod_bnm1$U.lo mpn/sqrmod_bnm1$U.lo mpn/mulmod_bknp1$U.lo mpn/div_qr_1$U.lo mpn/div_qr_1n_pi1$U.lo mpn/div_qr_2$U.lo mpn/div_qr_2n_pi1$U.lo mpn/div_qr_2u_pi1$U.lo mpn/sbpi1_div_q$U.lo mpn/sbpi1_div_qr$U.lo mpn/sbpi1_divappr_q$U.lo mpn/dcpi1_div_q$U.lo mpn/dcpi1_div_qr$U.lo mpn/dcpi1_divappr_q$U.lo mpn/mu_div_qr$U.lo mpn/mu_divappr_q$U.lo mpn/mu_div_q$U.lo mpn/bdiv_q_1.lo mpn/sbpi1_bdiv_q$U.lo mpn/sbpi1_bdiv_qr$U.lo mpn/sbpi1_bdiv_r$U.lo mpn/dcpi1_bdiv_q$U.lo mpn/dcpi1_bdiv_qr$U.lo mpn/mu_bdiv_q$U.lo mpn/mu_bdiv_qr$U.lo mpn/bdiv_q$U.lo mpn/bdiv_qr$U.lo mpn/broot$U.lo mpn/brootinv$U.lo mpn/bsqrt$U.lo mpn/bsqrtinv$U.lo mpn/divexact$U.lo mpn/bdiv_dbm1c.lo mpn/redc_1$U.lo mpn/redc_2$U.lo mpn/redc_n$U.lo mpn/powm$U.lo mpn/powlo$U.lo mpn/sec_powm$U.lo mpn/sec_mul$U.lo mpn/sec_sqr$U.lo mpn/sec_div_qr$U.lo mpn/sec_div_r$U.lo mpn/sec_pi1_div_qr$U.lo mpn/sec_pi1_div_r$U.lo mpn/sec_add_1$U.lo mpn/sec_sub_1$U.lo mpn/sec_invert$U.lo mpn/trialdiv$U.lo mpn/remove$U.lo mpn/and_n$U.lo mpn/andn_n$U.lo mpn/nand_n$U.lo mpn/ior_n$U.lo mpn/iorn_n$U.lo mpn/nior_n$U.lo mpn/xor_n$U.lo mpn/xnor_n$U.lo mpn/copyi.lo mpn/copyd.lo mpn/zero$U.lo mpn/sec_tabselect.lo mpn/comb_tables$U.lo mpn/umul.lo mpn/udiv.lo mpn/add_n_sub_n$U.lo +oldincludedir = /usr/include +pdfdir = ${docdir} +prefix = /home/dnw/Code/ERA-calc/c-src/gmp-6.3.0/bin +program_transform_name = s,x,x, +psdir = ${docdir} +sbindir = ${exec_prefix}/sbin +sharedstatedir = ${prefix}/com +srcdir = . +sysconfdir = ${prefix}/etc +target_alias = +top_build_prefix = ../ +top_builddir = .. +top_srcdir = .. +AM_CPPFLAGS = -D__GMP_WITHIN_GMP -I$(top_srcdir) \ + -DOPERATION_`echo $* | sed 's/_$$//'` + +OFILES = add$U.lo add_1$U.lo add_n.lo sub$U.lo sub_1$U.lo sub_n.lo cnd_add_n.lo cnd_sub_n.lo cnd_swap$U.lo neg$U.lo com$U.lo mul_1.lo addmul_1.lo submul_1.lo add_err1_n$U.lo add_err2_n$U.lo add_err3_n$U.lo sub_err1_n$U.lo sub_err2_n$U.lo sub_err3_n$U.lo lshift.lo rshift.lo dive_1.lo diveby3$U.lo divis$U.lo divrem$U.lo divrem_1.lo divrem_2.lo fib2_ui$U.lo fib2m$U.lo mod_1$U.lo mod_34lsub1.lo mode1o.lo pre_mod_1$U.lo dump$U.lo mod_1_1.lo mod_1_2$U.lo mod_1_3$U.lo mod_1_4.lo lshiftc$U.lo mul$U.lo mul_fft$U.lo mul_n$U.lo sqr$U.lo mul_basecase.lo sqr_basecase.lo nussbaumer_mul$U.lo mulmid_basecase$U.lo toom42_mulmid$U.lo mulmid_n$U.lo mulmid$U.lo random$U.lo random2$U.lo pow_1$U.lo rootrem$U.lo sqrtrem$U.lo sizeinbase$U.lo get_str$U.lo set_str$U.lo compute_powtab$U.lo scan0$U.lo scan1$U.lo popcount.lo hamdist.lo cmp$U.lo zero_p$U.lo perfsqr$U.lo perfpow$U.lo strongfibo$U.lo gcd_11.lo gcd_22$U.lo gcd_1$U.lo gcd$U.lo gcdext_1$U.lo gcdext$U.lo gcd_subdiv_step$U.lo gcdext_lehmer$U.lo div_q$U.lo tdiv_qr$U.lo jacbase$U.lo jacobi_2$U.lo jacobi$U.lo get_d$U.lo matrix22_mul$U.lo matrix22_mul1_inverse_vector$U.lo hgcd_matrix$U.lo hgcd2$U.lo hgcd_step$U.lo hgcd_reduce$U.lo hgcd$U.lo hgcd_appr$U.lo hgcd2_jacobi$U.lo hgcd_jacobi$U.lo mullo_n$U.lo mullo_basecase$U.lo sqrlo$U.lo sqrlo_basecase$U.lo toom22_mul$U.lo toom32_mul$U.lo toom42_mul$U.lo toom52_mul$U.lo toom62_mul$U.lo toom33_mul$U.lo toom43_mul$U.lo toom53_mul$U.lo toom54_mul$U.lo toom63_mul$U.lo toom44_mul$U.lo toom6h_mul$U.lo toom6_sqr$U.lo toom8h_mul$U.lo toom8_sqr$U.lo toom_couple_handling$U.lo toom2_sqr$U.lo toom3_sqr$U.lo toom4_sqr$U.lo toom_eval_dgr3_pm1$U.lo toom_eval_dgr3_pm2$U.lo toom_eval_pm1$U.lo toom_eval_pm2$U.lo toom_eval_pm2exp$U.lo toom_eval_pm2rexp$U.lo toom_interpolate_5pts$U.lo toom_interpolate_6pts$U.lo toom_interpolate_7pts$U.lo 
toom_interpolate_8pts$U.lo toom_interpolate_12pts$U.lo toom_interpolate_16pts$U.lo invertappr$U.lo invert$U.lo binvert$U.lo mulmod_bnm1$U.lo sqrmod_bnm1$U.lo mulmod_bknp1$U.lo div_qr_1$U.lo div_qr_1n_pi1$U.lo div_qr_2$U.lo div_qr_2n_pi1$U.lo div_qr_2u_pi1$U.lo sbpi1_div_q$U.lo sbpi1_div_qr$U.lo sbpi1_divappr_q$U.lo dcpi1_div_q$U.lo dcpi1_div_qr$U.lo dcpi1_divappr_q$U.lo mu_div_qr$U.lo mu_divappr_q$U.lo mu_div_q$U.lo bdiv_q_1.lo sbpi1_bdiv_q$U.lo sbpi1_bdiv_qr$U.lo sbpi1_bdiv_r$U.lo dcpi1_bdiv_q$U.lo dcpi1_bdiv_qr$U.lo mu_bdiv_q$U.lo mu_bdiv_qr$U.lo bdiv_q$U.lo bdiv_qr$U.lo broot$U.lo brootinv$U.lo bsqrt$U.lo bsqrtinv$U.lo divexact$U.lo bdiv_dbm1c.lo redc_1$U.lo redc_2$U.lo redc_n$U.lo powm$U.lo powlo$U.lo sec_powm$U.lo sec_mul$U.lo sec_sqr$U.lo sec_div_qr$U.lo sec_div_r$U.lo sec_pi1_div_qr$U.lo sec_pi1_div_r$U.lo sec_add_1$U.lo sec_sub_1$U.lo sec_invert$U.lo trialdiv$U.lo remove$U.lo and_n$U.lo andn_n$U.lo nand_n$U.lo ior_n$U.lo iorn_n$U.lo nior_n$U.lo xor_n$U.lo xnor_n$U.lo copyi.lo copyd.lo zero$U.lo sec_tabselect.lo comb_tables$U.lo umul.lo udiv.lo add_n_sub_n$U.lo +noinst_LTLIBRARIES = libmpn.la +nodist_libmpn_la_SOURCES = fib_table.c mp_bases.c +libmpn_la_LIBADD = $(OFILES) +libmpn_la_DEPENDENCIES = $(OFILES) +TARG_DIST = alpha arm arm64 cray generic ia64 lisp loongarch m68k m88k \ + minithres mips32 mips64 pa32 pa64 power powerpc32 powerpc64 \ + riscv s390_32 s390_64 sh sparc32 sparc64 thumb vax x86 x86_64 + +EXTRA_DIST = asm-defs.m4 cpp-ccas m4-ccas $(TARG_DIST) + +# COMPILE minus CC. +# +COMPILE_FLAGS = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) $(ASMFLAGS) + + +# Flags used for preprocessing (in ansi2knr rules). +# +PREPROCESS_FLAGS = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) + + +# Recent versions of automake (1.5 and up for instance) append automake +# generated suffixes to this $(SUFFIXES) list. This is essential for us, +# since .c must come after .s, .S and .asm. If .c is before .s, for +# instance, then in the mpn directory "make" will see add_n.c mentioned in +# an explicit rule (the ansi2knr stuff) and decide it must have add_n.c, +# even if add_n.c doesn't exist but add_n.s does. See GNU make +# documentation "(make)Implicit Rule Search", part 5c. +# +# On IRIX 6 native make this doesn't work properly though. Somehow .c +# remains ahead of .s, perhaps because .c.s is a builtin rule. .asm works +# fine though, and mpn/mips3 uses this. +# +SUFFIXES = .s .S .asm + +# can be overridden during development, eg. "make RM_TMP=: mul_1.lo" +RM_TMP = rm -f +all: all-am + +.SUFFIXES: +.SUFFIXES: .s .S .asm .c .lo .o .obj +$(srcdir)/Makefile.in: # $(srcdir)/Makefile.am $(srcdir)/Makeasm.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu --ignore-deps mpn/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --gnu --ignore-deps mpn/Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' 
in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; +$(srcdir)/Makeasm.am $(am__empty): + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: # $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): # $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + @list='$(noinst_LTLIBRARIES)'; \ + locs=`for p in $$list; do echo $$p; done | \ + sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ + sort -u`; \ + test -z "$$locs" || { \ + echo rm -f $${locs}; \ + rm -f $${locs}; \ + } + +libmpn.la: $(libmpn_la_OBJECTS) $(libmpn_la_DEPENDENCIES) $(EXTRA_libmpn_la_DEPENDENCIES) + $(AM_V_CCLD)$(LINK) $(libmpn_la_OBJECTS) $(libmpn_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +.c.o: + $(AM_V_CC)$(COMPILE) -c -o $@ $< + +.c.obj: + $(AM_V_CC)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.c.lo: + $(AM_V_CC)$(LTCOMPILE) -c -o $@ $< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find 
"$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." +clean: clean-am + +clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \ + clean-libtool clean-noinstLTLIBRARIES cscopelist-am ctags \ + ctags-am distclean distclean-compile distclean-generic \ + distclean-libtool distclean-tags distdir dvi dvi-am html \ + html-am info info-am install install-am install-data \ + install-data-am install-dvi install-dvi-am install-exec \ + install-exec-am install-html install-html-am install-info \ + install-info-am install-man install-pdf install-pdf-am \ + install-ps install-ps-am install-strip installcheck \ + installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags tags-am uninstall uninstall-am + +.PRECIOUS: Makefile + + +# These are BUILT_SOURCES at the top-level, so normally they're built before +# recursing into this directory. 
+# +fib_table.c: + cd ..; $(MAKE) $(AM_MAKEFLAGS) mpn/fib_table.c +mp_bases.c: + cd ..; $(MAKE) $(AM_MAKEFLAGS) mpn/mp_bases.c +perfsqr.h: + cd ..; $(MAKE) $(AM_MAKEFLAGS) mpn/perfsqr.h + +# .s assembler, no preprocessing. +# +.s.o: + $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< +.s.obj: + $(CCAS) $(COMPILE_FLAGS) `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` +.s.lo: + $(LIBTOOL) --mode=compile --tag=CC $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< + +# .S assembler, preprocessed with cpp. +# +# It's necessary to run $(CPP) separately, since it seems not all compilers +# recognise .S files, in particular "cc" on HP-UX 10 and 11 doesn't (and +# will silently do nothing if given a .S). +# +# For .lo we need a helper script, as described below for .asm.lo. +# +.S.o: + $(CPP) $(PREPROCESS_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(RM_TMP) tmp-$*.s +.S.obj: + $(CPP) $(PREPROCESS_FLAGS) `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(RM_TMP) tmp-$*.s +.S.lo: + $(LIBTOOL) --mode=compile --tag=CC $(top_srcdir)/mpn/cpp-ccas --cpp="$(CPP) $(PREPROCESS_FLAGS)" $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< + +# .asm assembler, preprocessed with m4. +# +# .o and .obj are non-PIC and just need m4 followed by a compile. +# +# .lo is a bit tricky. Libtool (as of version 1.5) has foo.lo as a little +# text file, and .libs/foo.o and foo.o as the PIC and non-PIC objects, +# respectively. It'd be asking for lots of trouble to try to create foo.lo +# ourselves, so instead arrange to invoke libtool like a --mode=compile, but +# with a special m4-ccas script which first m4 preprocesses, then compiles. +# --tag=CC is necessary since foo.asm is otherwise unknown to libtool. +# +# Libtool adds -DPIC when building a shared object and the .asm files look +# for that. But it should be noted that the other PIC flags are on occasion +# important too, in particular FreeBSD 2.2.8 gas 1.92.3 requires -k before +# it accepts PIC constructs like @GOT, and gcc adds that flag only under +# -fPIC. (Later versions of gas are happy to accept PIC stuff any time.) +# +.asm.o: + $(M4) -DOPERATION_$* `test -f '$<' || echo '$(srcdir)/'`$< >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(RM_TMP) tmp-$*.s +.asm.obj: + $(M4) -DOPERATION_$* `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(RM_TMP) tmp-$*.s +.asm.lo: + $(LIBTOOL) --mode=compile --tag=CC $(top_srcdir)/mpn/m4-ccas --m4="$(M4)" $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/gmp-6.3.0/mpn/Makefile.am b/gmp-6.3.0/mpn/Makefile.am new file mode 100644 index 0000000..c61926d --- /dev/null +++ b/gmp-6.3.0/mpn/Makefile.am @@ -0,0 +1,59 @@ +## Process this file with automake to generate Makefile.in + +# Copyright 1996, 1998-2002, 2005, 2011, 2013 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. 
+# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of either: +# +# * the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# or +# +# * the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# or both in parallel, as here. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received copies of the GNU General Public License and the +# GNU Lesser General Public License along with the GNU MP Library. If not, +# see https://www.gnu.org/licenses/. + + +AM_CPPFLAGS = -D__GMP_WITHIN_GMP -I$(top_srcdir) \ + -DOPERATION_`echo $* | sed 's/_$$//'` + +OFILES = @mpn_objects@ + +noinst_LTLIBRARIES = libmpn.la +nodist_libmpn_la_SOURCES = fib_table.c mp_bases.c +libmpn_la_LIBADD = $(OFILES) +libmpn_la_DEPENDENCIES = $(OFILES) + +TARG_DIST = alpha arm arm64 cray generic ia64 lisp loongarch m68k m88k \ + minithres mips32 mips64 pa32 pa64 power powerpc32 powerpc64 \ + riscv s390_32 s390_64 sh sparc32 sparc64 thumb vax x86 x86_64 + +EXTRA_DIST = asm-defs.m4 cpp-ccas m4-ccas $(TARG_DIST) + + +# These are BUILT_SOURCES at the top-level, so normally they're built before +# recursing into this directory. +# +fib_table.c: + cd ..; $(MAKE) $(AM_MAKEFLAGS) mpn/fib_table.c +mp_bases.c: + cd ..; $(MAKE) $(AM_MAKEFLAGS) mpn/mp_bases.c +perfsqr.h: + cd ..; $(MAKE) $(AM_MAKEFLAGS) mpn/perfsqr.h + +include Makeasm.am diff --git a/gmp-6.3.0/mpn/Makefile.in b/gmp-6.3.0/mpn/Makefile.in new file mode 100644 index 0000000..b5df4e5 --- /dev/null +++ b/gmp-6.3.0/mpn/Makefile.in @@ -0,0 +1,772 @@ +# Makefile.in generated by automake 1.15 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994-2014 Free Software Foundation, Inc. + +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +# Copyright 1996, 1998-2002, 2005, 2011, 2013 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of either: +# +# * the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# or +# +# * the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# or both in parallel, as here. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. 
+# +# You should have received copies of the GNU General Public License and the +# GNU Lesser General Public License along with the GNU MP Library. If not, +# see https://www.gnu.org/licenses/. + +# Copyright 1996, 1998-2002 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of either: +# +# * the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# or +# +# * the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# or both in parallel, as here. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received copies of the GNU General Public License and the +# GNU Lesser General Public License along with the GNU MP Library. If not, +# see https://www.gnu.org/licenses/. + +VPATH = @srcdir@ +am__is_gnu_make = { \ + if test -z '$(MAKELEVEL)'; then \ + false; \ + elif test -n '$(MAKE_HOST)'; then \ + true; \ + elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ + true; \ + else \ + false; \ + fi; \ +} +am__make_running_with_option = \ + case $${target_option-} in \ + ?) ;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +subdir = mpn +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/acinclude.m4 \ + $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + 
$(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +LTLIBRARIES = $(noinst_LTLIBRARIES) +am__DEPENDENCIES_1 = +nodist_libmpn_la_OBJECTS = fib_table.lo mp_bases.lo +libmpn_la_OBJECTS = $(nodist_libmpn_la_OBJECTS) +AM_V_lt = $(am__v_lt_@AM_V@) +am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) +am__v_lt_0 = --silent +am__v_lt_1 = +AM_V_P = $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 = +DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) +depcomp = +am__depfiles_maybe = +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CFLAGS) $(CFLAGS) +AM_V_CC = $(am__v_CC_@AM_V@) +am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@) +am__v_CC_0 = @echo " CC " $@; +am__v_CC_1 = +CCLD = $(CC) +LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_@AM_V@) +am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) +am__v_CCLD_0 = @echo " CCLD " $@; +am__v_CCLD_1 = +SOURCES = $(nodist_libmpn_la_SOURCES) +DIST_SOURCES = +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. 
+am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +am__DIST_COMMON = $(srcdir)/Makeasm.am $(srcdir)/Makefile.in README +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ABI = @ABI@ +ACLOCAL = @ACLOCAL@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +AR = @AR@ +AS = @AS@ +ASMFLAGS = @ASMFLAGS@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CALLING_CONVENTIONS_OBJS = @CALLING_CONVENTIONS_OBJS@ +CC = @CC@ +CCAS = @CCAS@ +CC_FOR_BUILD = @CC_FOR_BUILD@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CPP_FOR_BUILD = @CPP_FOR_BUILD@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFN_LONG_LONG_LIMB = @DEFN_LONG_LONG_LIMB@ +DEFS = @DEFS@ +DLLTOOL = @DLLTOOL@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +EXEEXT_FOR_BUILD = @EXEEXT_FOR_BUILD@ +FGREP = @FGREP@ +GMP_LDFLAGS = @GMP_LDFLAGS@ +GMP_LIMB_BITS = @GMP_LIMB_BITS@ +GMP_NAIL_BITS = @GMP_NAIL_BITS@ +GREP = @GREP@ +HAVE_CLOCK_01 = @HAVE_CLOCK_01@ +HAVE_CPUTIME_01 = @HAVE_CPUTIME_01@ +HAVE_GETRUSAGE_01 = @HAVE_GETRUSAGE_01@ +HAVE_GETTIMEOFDAY_01 = @HAVE_GETTIMEOFDAY_01@ +HAVE_HOST_CPU_FAMILY_power = @HAVE_HOST_CPU_FAMILY_power@ +HAVE_HOST_CPU_FAMILY_powerpc = @HAVE_HOST_CPU_FAMILY_powerpc@ +HAVE_SIGACTION_01 = @HAVE_SIGACTION_01@ +HAVE_SIGALTSTACK_01 = @HAVE_SIGALTSTACK_01@ +HAVE_SIGSTACK_01 = @HAVE_SIGSTACK_01@ +HAVE_STACK_T_01 = @HAVE_STACK_T_01@ +HAVE_SYS_RESOURCE_H_01 = @HAVE_SYS_RESOURCE_H_01@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LIBCURSES = @LIBCURSES@ +LIBGMPXX_LDFLAGS = @LIBGMPXX_LDFLAGS@ +LIBGMP_DLL = @LIBGMP_DLL@ +LIBGMP_LDFLAGS = @LIBGMP_LDFLAGS@ +LIBM = @LIBM@ +LIBM_FOR_BUILD = @LIBM_FOR_BUILD@ +LIBOBJS = @LIBOBJS@ +LIBREADLINE = @LIBREADLINE@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LIPO = @LIPO@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@ +M4 = @M4@ +MAINT = @MAINT@ +MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ +MKDIR_P = @MKDIR_P@ +NM = @NM@ +NMEDIT = @NMEDIT@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +RANLIB = @RANLIB@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +SPEED_CYCLECOUNTER_OBJ = @SPEED_CYCLECOUNTER_OBJ@ +STRIP = @STRIP@ +TAL_OBJECT = @TAL_OBJECT@ +TUNE_LIBS = @TUNE_LIBS@ +TUNE_SQR_OBJ = @TUNE_SQR_OBJ@ +U_FOR_BUILD = @U_FOR_BUILD@ +VERSION = @VERSION@ +WITH_READLINE_01 = @WITH_READLINE_01@ +YACC = @YACC@ +YFLAGS = @YFLAGS@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +am__leading_dot = @am__leading_dot@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ 
+build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +gmp_srclinks = @gmp_srclinks@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +mpn_objects = @mpn_objects@ +mpn_objs_in_libgmp = @mpn_objs_in_libgmp@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +AM_CPPFLAGS = -D__GMP_WITHIN_GMP -I$(top_srcdir) \ + -DOPERATION_`echo $* | sed 's/_$$//'` + +OFILES = @mpn_objects@ +noinst_LTLIBRARIES = libmpn.la +nodist_libmpn_la_SOURCES = fib_table.c mp_bases.c +libmpn_la_LIBADD = $(OFILES) +libmpn_la_DEPENDENCIES = $(OFILES) +TARG_DIST = alpha arm arm64 cray generic ia64 lisp loongarch m68k m88k \ + minithres mips32 mips64 pa32 pa64 power powerpc32 powerpc64 \ + riscv s390_32 s390_64 sh sparc32 sparc64 thumb vax x86 x86_64 + +EXTRA_DIST = asm-defs.m4 cpp-ccas m4-ccas $(TARG_DIST) + +# COMPILE minus CC. +# +COMPILE_FLAGS = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) $(ASMFLAGS) + + +# Flags used for preprocessing (in ansi2knr rules). +# +PREPROCESS_FLAGS = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) + + +# Recent versions of automake (1.5 and up for instance) append automake +# generated suffixes to this $(SUFFIXES) list. This is essential for us, +# since .c must come after .s, .S and .asm. If .c is before .s, for +# instance, then in the mpn directory "make" will see add_n.c mentioned in +# an explicit rule (the ansi2knr stuff) and decide it must have add_n.c, +# even if add_n.c doesn't exist but add_n.s does. See GNU make +# documentation "(make)Implicit Rule Search", part 5c. +# +# On IRIX 6 native make this doesn't work properly though. Somehow .c +# remains ahead of .s, perhaps because .c.s is a builtin rule. .asm works +# fine though, and mpn/mips3 uses this. +# +SUFFIXES = .s .S .asm + +# can be overridden during development, eg. "make RM_TMP=: mul_1.lo" +RM_TMP = rm -f +all: all-am + +.SUFFIXES: +.SUFFIXES: .s .S .asm .c .lo .o .obj +$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(srcdir)/Makeasm.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu --ignore-deps mpn/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --gnu --ignore-deps mpn/Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' 
in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; +$(srcdir)/Makeasm.am $(am__empty): + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + @list='$(noinst_LTLIBRARIES)'; \ + locs=`for p in $$list; do echo $$p; done | \ + sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ + sort -u`; \ + test -z "$$locs" || { \ + echo rm -f $${locs}; \ + rm -f $${locs}; \ + } + +libmpn.la: $(libmpn_la_OBJECTS) $(libmpn_la_DEPENDENCIES) $(EXTRA_libmpn_la_DEPENDENCIES) + $(AM_V_CCLD)$(LINK) $(libmpn_la_OBJECTS) $(libmpn_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +.c.o: + $(AM_V_CC)$(COMPILE) -c -o $@ $< + +.c.obj: + $(AM_V_CC)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.c.lo: + $(AM_V_CC)$(LTCOMPILE) -c -o $@ $< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if 
test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." +clean: clean-am + +clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \ + clean-libtool clean-noinstLTLIBRARIES cscopelist-am ctags \ + ctags-am distclean distclean-compile distclean-generic \ + distclean-libtool distclean-tags distdir dvi dvi-am html \ + html-am info info-am install install-am install-data \ + install-data-am install-dvi install-dvi-am install-exec \ + install-exec-am install-html install-html-am install-info \ + install-info-am install-man install-pdf install-pdf-am \ + install-ps install-ps-am install-strip installcheck \ + installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags tags-am uninstall uninstall-am + +.PRECIOUS: Makefile + + +# These are BUILT_SOURCES at the top-level, so normally they're built before +# recursing into this directory. 
+# +fib_table.c: + cd ..; $(MAKE) $(AM_MAKEFLAGS) mpn/fib_table.c +mp_bases.c: + cd ..; $(MAKE) $(AM_MAKEFLAGS) mpn/mp_bases.c +perfsqr.h: + cd ..; $(MAKE) $(AM_MAKEFLAGS) mpn/perfsqr.h + +# .s assembler, no preprocessing. +# +.s.o: + $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< +.s.obj: + $(CCAS) $(COMPILE_FLAGS) `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` +.s.lo: + $(LIBTOOL) --mode=compile --tag=CC $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< + +# .S assembler, preprocessed with cpp. +# +# It's necessary to run $(CPP) separately, since it seems not all compilers +# recognise .S files, in particular "cc" on HP-UX 10 and 11 doesn't (and +# will silently do nothing if given a .S). +# +# For .lo we need a helper script, as described below for .asm.lo. +# +.S.o: + $(CPP) $(PREPROCESS_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(RM_TMP) tmp-$*.s +.S.obj: + $(CPP) $(PREPROCESS_FLAGS) `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(RM_TMP) tmp-$*.s +.S.lo: + $(LIBTOOL) --mode=compile --tag=CC $(top_srcdir)/mpn/cpp-ccas --cpp="$(CPP) $(PREPROCESS_FLAGS)" $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< + +# .asm assembler, preprocessed with m4. +# +# .o and .obj are non-PIC and just need m4 followed by a compile. +# +# .lo is a bit tricky. Libtool (as of version 1.5) has foo.lo as a little +# text file, and .libs/foo.o and foo.o as the PIC and non-PIC objects, +# respectively. It'd be asking for lots of trouble to try to create foo.lo +# ourselves, so instead arrange to invoke libtool like a --mode=compile, but +# with a special m4-ccas script which first m4 preprocesses, then compiles. +# --tag=CC is necessary since foo.asm is otherwise unknown to libtool. +# +# Libtool adds -DPIC when building a shared object and the .asm files look +# for that. But it should be noted that the other PIC flags are on occasion +# important too, in particular FreeBSD 2.2.8 gas 1.92.3 requires -k before +# it accepts PIC constructs like @GOT, and gcc adds that flag only under +# -fPIC. (Later versions of gas are happy to accept PIC stuff any time.) +# +.asm.o: + $(M4) -DOPERATION_$* `test -f '$<' || echo '$(srcdir)/'`$< >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(RM_TMP) tmp-$*.s +.asm.obj: + $(M4) -DOPERATION_$* `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(RM_TMP) tmp-$*.s +.asm.lo: + $(LIBTOOL) --mode=compile --tag=CC $(top_srcdir)/mpn/m4-ccas --m4="$(M4)" $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/gmp-6.3.0/mpn/README b/gmp-6.3.0/mpn/README new file mode 100644 index 0000000..bc046be --- /dev/null +++ b/gmp-6.3.0/mpn/README @@ -0,0 +1,44 @@ +Copyright 1996, 1999 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + +This directory contains all code for the mpn layer of GMP. + +Most subdirectories contain machine-dependent code, written in assembly or C. +The `generic' subdirectory contains default code, used when there is no +machine-dependent replacement for a particular machine. + +There is one subdirectory for each ISA family. Note that e.g., 32-bit SPARC +and 64-bit SPARC are very different ISA's, and thus cannot share any code. + +A particular compile will only use code from one subdirectory, and the +`generic' subdirectory. The ISA-specific subdirectories contain hierarchies of +directories for various architecture variants and implementations; the +top-most level contains code that runs correctly on all variants. diff --git a/gmp-6.3.0/mpn/add.c b/gmp-6.3.0/mpn/add.c new file mode 120000 index 0000000..cffec90 --- /dev/null +++ b/gmp-6.3.0/mpn/add.c @@ -0,0 +1 @@ +../mpn/generic/add.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/add_1.c b/gmp-6.3.0/mpn/add_1.c new file mode 120000 index 0000000..6109f30 --- /dev/null +++ b/gmp-6.3.0/mpn/add_1.c @@ -0,0 +1 @@ +../mpn/generic/add_1.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/add_err1_n.c b/gmp-6.3.0/mpn/add_err1_n.c new file mode 120000 index 0000000..9f02b9a --- /dev/null +++ b/gmp-6.3.0/mpn/add_err1_n.c @@ -0,0 +1 @@ +../mpn/generic/add_err1_n.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/add_err2_n.c b/gmp-6.3.0/mpn/add_err2_n.c new file mode 120000 index 0000000..843fbe1 --- /dev/null +++ b/gmp-6.3.0/mpn/add_err2_n.c @@ -0,0 +1 @@ +../mpn/generic/add_err2_n.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/add_err3_n.c b/gmp-6.3.0/mpn/add_err3_n.c new file mode 120000 index 0000000..44b3b4a --- /dev/null +++ b/gmp-6.3.0/mpn/add_err3_n.c @@ -0,0 +1 @@ +../mpn/generic/add_err3_n.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/add_n.asm b/gmp-6.3.0/mpn/add_n.asm new file mode 120000 index 0000000..3f7fcac --- /dev/null +++ b/gmp-6.3.0/mpn/add_n.asm @@ -0,0 +1 @@ +../mpn/x86/p6/aors_n.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/add_n_sub_n.c b/gmp-6.3.0/mpn/add_n_sub_n.c new file mode 120000 index 0000000..d847fce --- /dev/null +++ b/gmp-6.3.0/mpn/add_n_sub_n.c @@ -0,0 +1 @@ +../mpn/generic/add_n_sub_n.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/addmul_1.asm b/gmp-6.3.0/mpn/addmul_1.asm new file mode 120000 index 0000000..31e7eb7 --- /dev/null +++ b/gmp-6.3.0/mpn/addmul_1.asm @@ -0,0 +1 @@ +../mpn/x86/p6/sse2/addmul_1.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/alpha/README b/gmp-6.3.0/mpn/alpha/README new file mode 100644 index 0000000..09c2f04 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/README @@ -0,0 +1,208 @@ +Copyright 1996, 1997, 1999-2005 Free Software Foundation, Inc. + +This file is part of the GNU MP Library.
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + +This directory contains mpn functions optimized for DEC Alpha processors. + +ALPHA ASSEMBLY RULES AND REGULATIONS + +The `.prologue N' pseudo op marks the end of the instructions that need special +handling by unwinding. It also says whether $27 is really needed for computing +the gp. The `.mask M' pseudo op says which registers are saved on the stack, +and at what offset in the frame. + +Cray T3 code is very very different... + +"$6" / "$f6" etc is the usual syntax for registers, but on Unicos instead "r6" +/ "f6" is required. We use the "r6" / "f6" forms, and have m4 defines expand +them to "$6" or "$f6" where necessary. + +"0x" introduces a hex constant in gas and DEC as, but on Unicos "^X" is +required. The X() macro accommodates this difference. + +"cvttqc" is required by DEC as, "cvttq/c" is required by Unicos, and gas will +accept either. We use cvttqc and have an m4 define expand to cvttq/c where +necessary. + +"not" as an alias for "ornot r31, ..." is available in gas and DEC as, but not +the Unicos assembler. The full "ornot" must be used. + +"unop" is not available in Unicos. We make an m4 define to the usual "ldq_u +r31,0(r30)", and in fact use that define on all systems since it comes out the +same. + +"!literal!123" etc explicit relocations as per Tru64 4.0 are apparently not +available in older alpha assemblers (including gas prior to 2.12), according to +the GCC manual, so the assembler macro forms must be used (eg. ldgp). + + + +RELEVANT OPTIMIZATION ISSUES + +EV4 + +1. This chip has very limited store bandwidth. The on-chip L1 cache is +write-through, and a cache line is transferred from the store buffer to the +off-chip L2 in as much as 15 cycles on most systems. This delay hurts mpn_add_n, + mpn_sub_n, mpn_lshift, and mpn_rshift. + +2. Pairing is possible between memory instructions and integer arithmetic + instructions. + +3. mulq and umulh are documented to have a latency of 23 cycles, but 2 of these + cycles are pipelined. Thus, multiply instructions can be issued at a rate + of one each 21st cycle. + +EV5 + +1. The memory bandwidth of this chip is good, both for loads and stores. The + L1 cache can handle two loads or one store per cycle, but two cycles after a + store, no ld can issue. + +2. mulq has a latency of 12 cycles and an issue rate of 1 each 8th cycle. + umulh has a latency of 14 cycles and an issue rate of 1 each 10th cycle. + (Note that published documentation gets these numbers slightly wrong.) + +3. mpn_add_n. With 4-fold unrolling, we need 37 instructions, whereof 12 + are memory operations.
This will take at least + ceil(37/2) [dual issue] + 1 [taken branch] = 19 cycles + We have 12 memory cycles, plus 4 after-store conflict cycles, or 16 data + cache cycles, which should be completely hidden in the 19 issue cycles. + The computation is inherently serial, with these dependencies: + + ldq ldq + \ /\ + (or) addq | + |\ / \ | + | addq cmpult + \ | | + cmpult | + \ / + or + + I.e., 3 operations are needed between carry-in and carry-out, making 12 + cycles the absolute minimum for the 4 limbs. We could replace the `or' with + a cmoveq/cmovne, which could issue one cycle earlier than the `or', but that + might waste a cycle on EV4. The total depth remains unaffected, since cmov + has a latency of 2 cycles. + + addq + / \ + addq cmpult + | \ + cmpult -> cmovne + + Montgomery has a slightly different way of computing carry that requires one + less instruction, but has depth 4 (instead of the current 3). Since the code + is currently instruction issue bound, Montgomery's idea should save us 1/2 + cycle per limb, or bring us down to a total of 17 cycles or 4.25 cycles/limb. + Unfortunately, this method will not be good for the EV6. + +4. addmul_1 and friends: We previously had a scheme for splitting the +single-limb operand in 21-bit chunks and the multi-limb operand in 32-bit chunks, + and then using FP operations for every 2nd multiply, and integer operations + for the remaining multiplies. + + But it seems much better to split the single-limb operand in 16-bit chunks, + since we save many integer shifts and adds that way. See powerpc64/README + for some more details. + +EV6 + +Here we have a really parallel pipeline, capable of issuing up to 4 integer +instructions per cycle. In actual practice, it is never possible to sustain +more than 3.5 integer insns/cycle due to rename register shortage. One integer +multiply instruction can issue each cycle. To get optimal speed, we need to +pretend we are vectorizing the code, i.e., minimize the depth of recurrences. + +There are two dependencies to watch out for. 1) Address arithmetic +dependencies, and 2) carry propagation dependencies. + +We can avoid serializing due to address arithmetic by unrolling loops, so that +addresses don't depend heavily on an index variable. Avoiding serializing +because of carry propagation is trickier; the ultimate performance of the code +will be determined by the number of latency cycles it takes from accepting +carry-in at a vector point until we can generate carry-out. + +Most integer instructions can execute in either the L0, U0, L1, or U1 +pipelines. Shifts only execute in U0 and U1, and multiply only in U1. + +CMOV instructions split into two internal instructions, CMOV1 and CMOV2, in the +mapping process (see pg 2-26 in cmpwrgd.pdf), suggesting that a CMOV +should always be placed as the last instruction of an aligned 4-instruction +block, or perhaps simply avoided. + +Perhaps the most important issue is the latency between the L0/U0 and L1/U1 +clusters; a result obtained on either cluster has an extra cycle of latency for +consumers in the opposite cluster. Because of the dynamic nature of the +implementation, it is hard to predict where an instruction will execute.
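To restate the carry recurrence analysed in EV5 note 3 above (and implemented by mpn/alpha/add_n.asm below) in C: the following is a minimal sketch for illustration only; it is not GMP source, the typedef and function name are invented here, and a 64-bit limb is assumed.

    typedef unsigned long mp_limb_t;   /* assumption: 64-bit limb */

    /* One limb position of an mpn_add_n-style loop: add source limbs u
       and v plus carry-in, store the result limb, return carry-out.
       Each `<' comparison corresponds to a cmpult in the assembly, and
       the final `|' to the bis that combines the two carries.  */
    static mp_limb_t
    add_step (mp_limb_t u, mp_limb_t v, mp_limb_t cy_in, mp_limb_t *rp)
    {
      mp_limb_t s  = u + v;       /* main add */
      mp_limb_t c1 = s < u;       /* cy from main add (cmpult) */
      mp_limb_t r  = s + cy_in;   /* carry add */
      mp_limb_t c2 = r < s;       /* cy from carry add (cmpult) */
      *rp = r;
      return c1 | c2;             /* combine cy (bis) */
    }

The three operations on the path from cy_in to the returned carry (the second add, the second comparison, and the `or') are the depth-3 recurrence that sets the 12-cycle floor for 4 limbs derived above.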
+ +"Alpha 21264/EV67 Microprocessor Hardware Reference Manual", revision 1.4, +Compaq, September 2000, order number DS-0028B-TE. + +"Compiler Writer's Guide for the Alpha 21264", Compaq, June 1999, order number +EC-RJ66A-TE. + +All of the above are available online from + + http://ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html + ftp://ftp.compaq.com/pub/products/alphaCPUdocs + +"Tru64 Unix Assembly Language Programmer's Guide", Compaq, March 1996, part +number AA-PS31D-TE. + +"Digital UNIX Calling Standard for Alpha Systems", Digital Equipment Corp, +March 1996, part number AA-PY8AC-TE. + +The above are available online, + + http://h30097.www3.hp.com/docs/pub_page/V40F_DOCS.HTM + +(Dunno what h30097 means in this URL, but if it moves try searching for "tru64 +online documentation" from the main www.hp.com page.) + + + +---------------- +Local variables: +mode: text +fill-column: 79 +End: diff --git a/gmp-6.3.0/mpn/alpha/add_n.asm b/gmp-6.3.0/mpn/alpha/add_n.asm new file mode 100644 index 0000000..bc572a5 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/add_n.asm @@ -0,0 +1,164 @@ +dnl Alpha mpn_add_n -- Add two limb vectors of the same length > 0 and +dnl store sum in a third limb vector. + +dnl Copyright 1995, 1999, 2000, 2005, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: ? 
+C EV5: 4.75 +C EV6: 3 + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_add_nc) + bis r20,r31,r25 + br L(com) +EPILOGUE() +PROLOGUE(mpn_add_n) + bis r31,r31,r25 C clear cy +L(com): subq r19,4,r19 C decr loop cnt + blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop +C Start software pipeline for 1st loop + ldq r0,0(r18) + ldq r4,0(r17) + ldq r1,8(r18) + ldq r5,8(r17) + addq r17,32,r17 C update s1_ptr + addq r0,r4,r28 C 1st main add + ldq r2,16(r18) + addq r25,r28,r20 C 1st carry add + ldq r3,24(r18) + cmpult r28,r4,r8 C compute cy from last add + ldq r6,-16(r17) + cmpult r20,r28,r25 C compute cy from last add + ldq r7,-8(r17) + bis r8,r25,r25 C combine cy from the two adds + subq r19,4,r19 C decr loop cnt + addq r1,r5,r28 C 2nd main add + addq r18,32,r18 C update s2_ptr + addq r28,r25,r21 C 2nd carry add + cmpult r28,r5,r8 C compute cy from last add + blt r19,$Lend1 C if less than 4 limbs remain, jump +C 1st loop handles groups of 4 limbs in a software pipeline + ALIGN(16) +$Loop: cmpult r21,r28,r25 C compute cy from last add + ldq r0,0(r18) + bis r8,r25,r25 C combine cy from the two adds + ldq r1,8(r18) + addq r2,r6,r28 C 3rd main add + ldq r4,0(r17) + addq r28,r25,r22 C 3rd carry add + ldq r5,8(r17) + cmpult r28,r6,r8 C compute cy from last add + cmpult r22,r28,r25 C compute cy from last add + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two adds + stq r21,8(r16) + addq r3,r7,r28 C 4th main add + addq r28,r25,r23 C 4th carry add + cmpult r28,r7,r8 C compute cy from last add + cmpult r23,r28,r25 C compute cy from last add + addq r17,32,r17 C update s1_ptr + bis r8,r25,r25 C combine cy from the two adds + addq r16,32,r16 C update res_ptr + addq r0,r4,r28 C 1st main add + ldq r2,16(r18) + addq r25,r28,r20 C 1st carry add + ldq r3,24(r18) + cmpult r28,r4,r8 C compute cy from last add + ldq r6,-16(r17) + cmpult r20,r28,r25 C compute cy from last add + ldq r7,-8(r17) + bis r8,r25,r25 C combine cy from the two adds + subq r19,4,r19 C decr loop cnt + stq r22,-16(r16) + addq r1,r5,r28 C 2nd main add + stq r23,-8(r16) + addq r25,r28,r21 C 2nd carry add + addq r18,32,r18 C update s2_ptr + cmpult r28,r5,r8 C compute cy from last add + bge r19,$Loop +C Finish software pipeline for 1st loop +$Lend1: cmpult r21,r28,r25 C compute cy from last add + bis r8,r25,r25 C combine cy from the two adds + addq r2,r6,r28 C 3rd main add + addq r28,r25,r22 C 3rd carry add + cmpult r28,r6,r8 C compute cy from last add + cmpult r22,r28,r25 C compute cy from last add + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two adds + stq r21,8(r16) + addq r3,r7,r28 C 4th main add + addq r28,r25,r23 C 4th carry add + cmpult r28,r7,r8 C compute cy from last add + cmpult r23,r28,r25 C compute cy from last add + bis r8,r25,r25 C combine cy from the two adds + addq r16,32,r16 C update res_ptr + stq r22,-16(r16) + stq r23,-8(r16) +$Lend2: addq r19,4,r19 C restore loop cnt + beq r19,$Lret +C Start software pipeline for 2nd loop + ldq r0,0(r18) + ldq r4,0(r17) + subq r19,1,r19 + beq r19,$Lend0 +C 2nd loop handles remaining 1-3 limbs + ALIGN(16) +$Loop0: addq r0,r4,r28 C main add + ldq r0,8(r18) + cmpult r28,r4,r8 C compute cy from last add + ldq r4,8(r17) + addq r28,r25,r20 C carry add + addq r18,8,r18 + addq r17,8,r17 + stq r20,0(r16) + cmpult r20,r28,r25 C compute cy from last add + subq r19,1,r19 C decr loop cnt + bis r8,r25,r25 C combine cy from the two adds + addq r16,8,r16 + bne r19,$Loop0 +$Lend0: addq r0,r4,r28 C main add + addq r28,r25,r20 C 
carry add + cmpult r28,r4,r8 C compute cy from last add + cmpult r20,r28,r25 C compute cy from last add + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two adds + +$Lret: bis r25,r31,r0 C return cy + ret r31,(r26),1 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/addmul_1.asm b/gmp-6.3.0/mpn/alpha/addmul_1.asm new file mode 100644 index 0000000..c4e6834 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/addmul_1.asm @@ -0,0 +1,99 @@ +dnl Alpha mpn_addmul_1 -- Multiply a limb vector with a limb and add the +dnl result to a second limb vector. + +dnl Copyright 1992, 1994, 1995, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: 42 +C EV5: 18 +C EV6: 7 + +C INPUT PARAMETERS +C rp r16 +C up r17 +C n r18 +C vl r19 + + +ASM_START() +PROLOGUE(mpn_addmul_1) + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + umulh r2,r19,r0 C r0 = prod_high + beq r18,$Lend1 C jump if size was == 1 + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + addq r5,r3,r3 + cmpult r3,r5,r4 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + beq r18,$Lend2 C jump if size was == 2 + + ALIGN(8) +$Loop: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + subq r18,1,r18 C size-- + umulh r2,r19,r4 C r4 = cy_limb + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + addq r5,r3,r3 + cmpult r3,r5,r5 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + addq r5,r0,r0 C combine carries + bne r18,$Loop + +$Lend2: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r4 C r4 = cy_limb + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + addq r5,r3,r3 + cmpult r3,r5,r5 + stq r3,0(r16) + addq r5,r0,r0 C combine carries + addq r4,r0,r0 C cy_limb = prod_high + cy + ret r31,(r26),1 +$Lend1: addq r5,r3,r3 + cmpult r3,r5,r5 + stq r3,0(r16) + addq r0,r5,r0 + ret r31,(r26),1 +EPILOGUE(mpn_addmul_1) +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/alpha-defs.m4 b/gmp-6.3.0/mpn/alpha/alpha-defs.m4 new file mode 100644 index 0000000..af34c92 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/alpha-defs.m4 @@ -0,0 +1,107 @@ +divert(-1) + +dnl m4 macros for Alpha assembler. 
+ +dnl Copyright 2003, 2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl Usage: ASSERT([reg] [,code]) +dnl +dnl Require that the given reg is non-zero after executing the test code. +dnl For example, +dnl +dnl ASSERT(r8, +dnl ` cmpult r16, r17, r8') +dnl +dnl If the register argument is empty then nothing is tested, the code is +dnl just executed. This can be used for setups required by later ASSERTs. +dnl If the code argument is omitted then the register is just tested, with +dnl no special setup code. + +define(ASSERT, +m4_assert_numargs_range(1,2) +m4_assert_defined(`WANT_ASSERT') +`ifelse(WANT_ASSERT,1, +`ifelse(`$2',,,`$2') +ifelse(`$1',,, +` bne $1, L(ASSERTok`'ASSERT_label_counter) + .long 0 C halt +L(ASSERTok`'ASSERT_label_counter): +define(`ASSERT_label_counter',eval(ASSERT_label_counter+1)) +') +')') +define(`ASSERT_label_counter',1) + + +dnl Usage: bigend(`code') +dnl +dnl Emit the given code only for a big-endian system, like Unicos. This +dnl can be used for instance for extra stuff needed by extwl. + +define(bigend, +m4_assert_numargs(1) +`ifdef(`HAVE_LIMB_BIG_ENDIAN',`$1', +`ifdef(`HAVE_LIMB_LITTLE_ENDIAN',`', +`m4_error(`Cannot assemble, unknown limb endianness')')')') + + +dnl Usage: bwx_available_p +dnl +dnl Evaluate to 1 if the BWX byte memory instructions are available, or to +dnl 0 if not. +dnl +dnl Listing the chips which do have BWX means anything we haven't looked at +dnl will use safe non-BWX code. The only targets without BWX currently are +dnl plain alpha (ie. ev4) and alphaev5. + +define(bwx_available_p, +m4_assert_numargs(-1) +`m4_ifdef_anyof_p( + `HAVE_HOST_CPU_alphaev56', + `HAVE_HOST_CPU_alphapca56', + `HAVE_HOST_CPU_alphapca57', + `HAVE_HOST_CPU_alphaev6', + `HAVE_HOST_CPU_alphaev67', + `HAVE_HOST_CPU_alphaev68', + `HAVE_HOST_CPU_alphaev69', + `HAVE_HOST_CPU_alphaev7', + `HAVE_HOST_CPU_alphaev79')') + + +dnl Usage: unop +dnl +dnl The Cray Unicos assembler lacks unop, so give the equivalent ldq_u +dnl explicitly. + +define(unop, +m4_assert_numargs(-1) +`ldq_u r31, 0(r30)') + + +divert diff --git a/gmp-6.3.0/mpn/alpha/aorslsh1_n.asm b/gmp-6.3.0/mpn/alpha/aorslsh1_n.asm new file mode 100644 index 0000000..9525e66 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/aorslsh1_n.asm @@ -0,0 +1,164 @@ +dnl Alpha mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1). + +dnl Copyright 2003, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: ? +C EV5: 6.25 +C EV6: 4.5 + +define(`rp',`r16') +define(`up',`r17') +define(`vp',`r18') +define(`n', `r19') + +define(`u0', `r8') +define(`u1', `r1') +define(`v0', `r4') +define(`v1', `r5') + +define(`cy0', `r0') +define(`cy1', `r20') +define(`cy', `r22') +define(`rr', `r24') +define(`ps', `r25') +define(`sl', `r28') + +ifdef(`OPERATION_addlsh1_n',` + define(ADDSUB, addq) + define(CARRY, `cmpult $1,$2,$3') + define(func, mpn_addlsh1_n) +') +ifdef(`OPERATION_sublsh1_n',` + define(ADDSUB, subq) + define(CARRY, `cmpult $2,$1,$3') + define(func, mpn_sublsh1_n) +') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n) + +ASM_START() +PROLOGUE(func) + and n, 2, cy0 + blbs n, L(bx1) +L(bx0): ldq v1, 0(vp) + ldq u1, 0(up) + nop + bne cy0, L(b10) + +L(b00): lda vp, 48(vp) + lda up, -16(up) + lda rp, -8(rp) + br r31, L(lo0) + +L(b10): lda vp, 32(vp) + lda rp, 8(rp) + lda cy0, 0(r31) + br r31, L(lo2) + +L(bx1): ldq v0, 0(vp) + ldq u0, 0(up) + lda cy1, 0(r31) + beq cy0, L(b01) + +L(b11): lda vp, 40(vp) + lda up, -24(up) + lda rp, 16(rp) + br r31, L(lo3) + +L(b01): lda n, -4(n) + ble n, L(end) + lda vp, 24(vp) + lda up, -8(up) + + ALIGN(16) +L(top): addq v0, v0, sl C left shift vlimb + ldq v1, -16(vp) + ADDSUB u0, sl, ps C ulimb + (vlimb << 1) + cmplt v0, r31, cy0 C carry out #1 + ldq u1, 16(up) + ADDSUB ps, cy1, rr C consume carry from previous operation + CARRY( ps, u0, cy) C carry out #2 + stq rr, 0(rp) + addq cy, cy0, cy0 C combine carry out #1 and #2 + CARRY( rr, ps, cy) C carry out #3 + addq cy, cy0, cy0 C final carry out + lda vp, 32(vp) C bookkeeping +L(lo0): addq v1, v1, sl + ldq v0, -40(vp) + ADDSUB u1, sl, ps + cmplt v1, r31, cy1 + ldq u0, 24(up) + ADDSUB ps, cy0, rr + CARRY( ps, u1, cy) + stq rr, 8(rp) + addq cy, cy1, cy1 + CARRY( rr, ps, cy) + addq cy, cy1, cy1 + lda rp, 32(rp) C bookkeeping +L(lo3): addq v0, v0, sl + ldq v1, -32(vp) + ADDSUB u0, sl, ps + cmplt v0, r31, cy0 + ldq u1, 32(up) + ADDSUB ps, cy1, rr + CARRY( ps, u0, cy) + stq rr, -16(rp) + addq cy, cy0, cy0 + CARRY( rr, ps, cy) + addq cy, cy0, cy0 + lda up, 32(up) C bookkeeping +L(lo2): addq v1, v1, sl + ldq v0, -24(vp) + ADDSUB u1, sl, ps + cmplt v1, r31, cy1 + ldq u0, 8(up) + ADDSUB ps, cy0, rr + CARRY( ps, u1, cy) + stq rr, -8(rp) + addq cy, cy1, cy1 + CARRY( rr, ps, cy) + addq cy, cy1, cy1 + lda n, -4(n) C bookkeeping + bgt n, L(top) + +L(end): addq v0, v0, sl + ADDSUB u0, sl, ps + ADDSUB ps, cy1, rr + cmplt v0, r31, cy0 + CARRY( ps, u0, cy) + stq rr, 0(rp) + addq cy, cy0, cy0 + CARRY( rr, ps, cy) + addq cy, 
cy0, r0
+ ret r31,(r26),1
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/alpha/aorslsh2_n.asm b/gmp-6.3.0/mpn/alpha/aorslsh2_n.asm
new file mode 100644
index 0000000..bdee1d6
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/aorslsh2_n.asm
@@ -0,0 +1,167 @@
+dnl Alpha mpn_addlsh2_n/mpn_sublsh2_n -- rp[] = up[] +- (vp[] << 2).
+
+dnl Copyright 2003, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 6
+C EV6: 3.75
+
+C TODO
+C * Tune to reach 3.5 c/l on ev6 and 5.75 c/l on ev5.
+
+define(`rp',`r16')
+define(`up',`r17')
+define(`vp',`r18')
+define(`n', `r19')
+
+define(`u0', `r8')
+define(`u1', `r1')
+define(`v0', `r4')
+define(`v1', `r5')
+
+define(`cy0', `r0')
+define(`cy1', `r20')
+define(`cy', `r22')
+define(`rr', `r24')
+define(`ps', `r25')
+define(`sl', `r28')
+
+ifdef(`OPERATION_addlsh2_n',`
+ define(ADDSUB, addq)
+ define(CARRY, `cmpult $1,$2,$3')
+ define(func, mpn_addlsh2_n)
+')
+ifdef(`OPERATION_sublsh2_n',`
+ define(ADDSUB, subq)
+ define(CARRY, `cmpult $2,$1,$3')
+ define(func, mpn_sublsh2_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n)
+
+ASM_START()
+PROLOGUE(func)
+ and n, 2, cy0
+ blbs n, L(bx1)
+L(bx0): ldq v1, 0(vp)
+ ldq u1, 0(up)
+ bis r31, r31, r2
+ bne cy0, L(b10)
+
+L(b00): lda vp, 48(vp)
+ lda up, -16(up)
+ lda rp, -8(rp)
+ s4addq v1, r31, sl
+ br r31, L(lo0)
+
+L(b10): lda vp, 32(vp)
+ lda rp, 8(rp)
+ lda cy0, 0(r31)
+ br r31, L(lo2)
+
+L(bx1): ldq v0, 0(vp)
+ ldq u0, 0(up)
+ lda cy1, 0(r31)
+ bis r31, r31, r3
+ nop
+ beq cy0, L(b01)
+
+L(b11): lda vp, 40(vp)
+ lda up, -24(up)
+ lda rp, 16(rp)
+ br r31, L(lo3)
+
+L(b01): lda n, -4(n)
+ ble n, L(end)
+ lda vp, 24(vp)
+ lda up, -8(up)
+
+ ALIGN(16)
+L(top): s4addq v0, r3, sl C combined vlimb
+ ldq v1, -16(vp)
+ ADDSUB u0, sl, ps C ulimb + (vlimb << 2)
+ ldq u1, 16(up)
+ srl v0, 62, r2 C high v bits
+ ADDSUB ps, cy1, rr C consume carry from previous operation
+ CARRY( ps, u0, cy0) C carry out #2
+ stq rr, 0(rp)
+ CARRY( rr, ps, cy) C carry out #3
+ lda vp, 32(vp) C bookkeeping
+ addq cy, cy0, cy0 C final carry out
+ s4addq v1, r2, sl
+L(lo0): ldq v0, -40(vp)
+ ADDSUB u1, sl, ps
+ ldq u0, 24(up)
+ srl v1, 62, r3
+ ADDSUB ps, cy0, rr
+ CARRY( ps, u1, cy1)
+ stq rr, 8(rp)
+ CARRY( rr, ps, cy)
+ lda rp, 32(rp) C bookkeeping
+ addq cy, cy1, cy1
+L(lo3): s4addq v0, r3, sl
+ ldq v1, -32(vp)
+ ADDSUB u0, sl, ps
+ ldq u1, 32(up)
+ srl v0, 62, r2
+ ADDSUB ps, cy1, rr
+ CARRY( ps, u0, cy0)
+ stq rr, -16(rp)
+ CARRY( rr, ps, cy)
+ lda up, 32(up) C bookkeeping
+ addq cy, cy0, cy0
+L(lo2): s4addq v1, r2, sl
+ ldq v0, -24(vp)
+ ADDSUB u1, sl, ps
+ ldq u0, 8(up)
+ srl v1, 62, r3
+ ADDSUB ps, cy0, rr
+ CARRY( ps, u1, cy1)
+ stq rr, -8(rp)
+ CARRY( rr, ps, cy)
+ lda n, -4(n) C bookkeeping
+ addq cy, cy1, cy1
+ bgt n, L(top)
+
+L(end): s4addq v0, r3, sl
+ ADDSUB u0, sl, ps
+ srl v0, 62, r2
+ ADDSUB ps, cy1, rr
+ CARRY( ps, u0, cy0)
+ stq rr, 0(rp)
+ CARRY( rr, ps, cy)
+ addq cy, cy0, cy0
+ addq cy0, r2, r0
+
+ ret r31,(r26),1
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/alpha/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/alpha/bdiv_dbm1c.asm
new file mode 100644
index 0000000..472966c
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/bdiv_dbm1c.asm
@@ -0,0 +1,282 @@
+dnl Alpha mpn_bdiv_dbm1c.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 3
+
+C TODO
+C * Try less unrolling, 2-way should give the same performance.
+C * Optimize feed-in and wind-down code, for speed, and perhaps further for
+C code size.
+C * This runs optimally given the algorithm; r8 is on a 3-operation
+C recurrence path. We have not tried very hard to find a better algorithm.
+C Perhaps it would be a good task for the GNU superoptimizer.
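
For reference, the 3-operation recurrence on r8 noted above is easier to
follow in C. Below is a sketch of the per-limb loop as we read the assembly
that follows; umul_ppmm is the double-width multiply from longlong.h, and the
helper name is ours, for illustration only.

  #include "gmp-impl.h"
  #include "longlong.h"

  static mp_limb_t
  bdiv_dbm1c_sketch (mp_ptr qp, mp_srcptr ap, mp_size_t n,
                     mp_limb_t bd, mp_limb_t h)
  {
    mp_limb_t p1, p0, cy;
    mp_size_t i;
    for (i = 0; i < n; i++)
      {
        umul_ppmm (p1, p0, ap[i], bd);  /* the mulq + umulh pair */
        cy = h < p0;                    /* borrow out of the low word */
        h = h - p0;
        qp[i] = h;
        h = h - p1 - cy;                /* the recurrence carried in r8 */
      }
    return h;
  }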
+ +C INPUT PARAMETERS +define(`rp', `r16') +define(`up', `r17') +define(`n', `r18') +define(`bd', `r19') +define(`cy', `r19') + + +ASM_START() +PROLOGUE(mpn_bdiv_dbm1c) + mov r20, r8 + + ldq r24, 0(r17) + and r18, 3, r28 + lda r18, -4(r18) + beq r28, L(b0) + cmpeq r28, 1, r21 + bne r21, L(b1) + cmpeq r28, 2, r21 + bne r21, L(b2) + + +L(b3): ldq r2, 8(r17) + ldq r3, 16(r17) + bgt r18, L(gt3) + + mulq r24, r19, r5 C U1 + umulh r24, r19, r21 C U1 + mulq r2, r19, r6 C U1 + umulh r2, r19, r22 C U1 + mulq r3, r19, r7 C U1 + umulh r3, r19, r23 C U1 + lda r16, -32(r16) + br L(cj3) + +L(gt3): ldq r0, 24(r17) + mulq r24, r19, r5 C U1 + umulh r24, r19, r21 C U1 + ldq r1, 32(r17) + mulq r2, r19, r6 C U1 + umulh r2, r19, r22 C U1 + ldq r2, 40(r17) + mulq r3, r19, r7 C U1 + umulh r3, r19, r23 C U1 + ldq r3, 48(r17) + lda r18, -4(r18) + lda r17, 56(r17) + mulq r0, r19, r4 C U1 + bgt r18, L(L3) + + br L(cj7) + + +L(b2): ldq r3, 8(r17) + bgt r18, L(gt2) + + mulq r24, r19, r6 C U1 + umulh r24, r19, r22 C U1 + mulq r3, r19, r7 C U1 + umulh r3, r19, r23 C U1 + lda r16, -40(r16) + br L(cj2) + +L(gt2): ldq r0, 16(r17) + ldq r1, 24(r17) + mulq r24, r19, r6 C U1 + umulh r24, r19, r22 C U1 + ldq r2, 32(r17) + mulq r3, r19, r7 C U1 + umulh r3, r19, r23 C U1 + ldq r3, 40(r17) + lda r18, -4(r18) + lda r17, 48(r17) + mulq r0, r19, r4 C U1 + umulh r0, r19, r20 C U1 + lda r16, -8(r16) + bgt r18, L(gt6) + + mulq r1, r19, r5 C U1 + br L(cj6) + +L(gt6): ldq r0, 0(r17) + mulq r1, r19, r5 C U1 + br L(L2) + + +L(b1): bgt r18, L(gt1) + + mulq r24, r19, r7 C U1 + umulh r24, r19, r23 C U1 + lda r16, -48(r16) + br L(cj1) + +L(gt1): ldq r0, 8(r17) + ldq r1, 16(r17) + ldq r2, 24(r17) + mulq r24, r19, r7 C U1 + umulh r24, r19, r23 C U1 + ldq r3, 32(r17) + lda r18, -4(r18) + lda r17, 40(r17) + mulq r0, r19, r4 C U1 + umulh r0, r19, r20 C U1 + lda r16, -16(r16) + bgt r18, L(gt5) + + mulq r1, r19, r5 C U1 + umulh r1, r19, r21 C U1 + mulq r2, r19, r6 C U1 + br L(cj5) + +L(gt5): ldq r0, 0(r17) + mulq r1, r19, r5 C U1 + umulh r1, r19, r21 C U1 + ldq r1, 8(r17) + mulq r2, r19, r6 C U1 + br L(L1) + + +L(b0): ldq r1, 8(r17) + ldq r2, 16(r17) + ldq r3, 24(r17) + lda r17, 32(r17) + lda r16, -24(r16) + mulq r24, r19, r4 C U1 + umulh r24, r19, r20 C U1 + bgt r18, L(gt4) + + mulq r1, r19, r5 C U1 + umulh r1, r19, r21 C U1 + mulq r2, r19, r6 C U1 + umulh r2, r19, r22 C U1 + mulq r3, r19, r7 C U1 + br L(cj4) + +L(gt4): ldq r0, 0(r17) + mulq r1, r19, r5 C U1 + umulh r1, r19, r21 C U1 + ldq r1, 8(r17) + mulq r2, r19, r6 C U1 + umulh r2, r19, r22 C U1 + ldq r2, 16(r17) + mulq r3, r19, r7 C U1 + br L(L0) + +C *** MAIN LOOP START *** + ALIGN(16) +L(top): mulq r0, r19, r4 C U1 + subq r8, r28, r8 +L(L3): umulh r0, r19, r20 C U1 + cmpult r8, r5, r28 + ldq r0, 0(r17) + subq r8, r5, r8 + addq r21, r28, r28 + stq r8, 0(r16) + + mulq r1, r19, r5 C U1 + subq r8, r28, r8 +L(L2): umulh r1, r19, r21 C U1 + cmpult r8, r6, r28 + ldq r1, 8(r17) + subq r8, r6, r8 + addq r22, r28, r28 + stq r8, 8(r16) + + mulq r2, r19, r6 C U1 + subq r8, r28, r8 +L(L1): umulh r2, r19, r22 C U1 + cmpult r8, r7, r28 + ldq r2, 16(r17) + subq r8, r7, r8 + addq r23, r28, r28 + stq r8, 16(r16) + + mulq r3, r19, r7 C U1 + subq r8, r28, r8 +L(L0): umulh r3, r19, r23 C U1 + cmpult r8, r4, r28 + ldq r3, 24(r17) + subq r8, r4, r8 + addq r20, r28, r28 + stq r8, 24(r16) + + lda r18, -4(r18) + lda r17, 32(r17) + lda r16, 32(r16) + bgt r18, L(top) +C *** MAIN LOOP END *** + + mulq r0, r19, r4 C U1 + subq r8, r28, r8 +L(cj7): umulh r0, r19, r20 C U1 + cmpult r8, r5, r28 + subq r8, r5, r8 + addq r21, 
r28, r28 + stq r8, 0(r16) + mulq r1, r19, r5 C U1 + subq r8, r28, r8 +L(cj6): umulh r1, r19, r21 C U1 + cmpult r8, r6, r28 + subq r8, r6, r8 + addq r22, r28, r28 + stq r8, 8(r16) + mulq r2, r19, r6 C U1 + subq r8, r28, r8 +L(cj5): umulh r2, r19, r22 C U1 + cmpult r8, r7, r28 + subq r8, r7, r8 + addq r23, r28, r28 + stq r8, 16(r16) + mulq r3, r19, r7 C U1 + subq r8, r28, r8 +L(cj4): umulh r3, r19, r23 C U1 + cmpult r8, r4, r28 + subq r8, r4, r8 + addq r20, r28, r28 + stq r8, 24(r16) + subq r8, r28, r8 +L(cj3): cmpult r8, r5, r28 + subq r8, r5, r8 + addq r21, r28, r28 + stq r8, 32(r16) + subq r8, r28, r8 +L(cj2): cmpult r8, r6, r28 + subq r8, r6, r8 + addq r22, r28, r28 + stq r8, 40(r16) + subq r8, r28, r8 +L(cj1): cmpult r8, r7, r28 + subq r8, r7, r8 + addq r23, r28, r28 + stq r8, 48(r16) + subq r8, r28, r0 + ret r31, (r26), 1 + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/cntlz.asm b/gmp-6.3.0/mpn/alpha/cntlz.asm new file mode 100644 index 0000000..25af19b --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/cntlz.asm @@ -0,0 +1,55 @@ +dnl Alpha auxiliary for longlong.h's count_leading_zeros + +dnl Copyright 1997, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +ASM_START() +EXTERN(__clz_tab) +PROLOGUE(mpn_count_leading_zeros,gp) + cmpbge r31, r16, r1 + LEA(r3,__clz_tab) + sra r1, 1, r1 + xor r1, 127, r1 + srl r16, 1, r16 + addq r1, r3, r1 + ldq_u r0, 0(r1) + lda r2, 64 + extbl r0, r1, r0 + s8subl r0, 8, r0 + srl r16, r0, r16 + addq r16, r3, r16 + ldq_u r1, 0(r16) + extbl r1, r16, r1 + subq r2, r1, r2 + subq r2, r0, r0 + ret r31, (r26),1 +EPILOGUE(mpn_count_leading_zeros) +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/com.asm b/gmp-6.3.0/mpn/alpha/com.asm new file mode 100644 index 0000000..f084ab5 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/com.asm @@ -0,0 +1,176 @@ +dnl Alpha mpn_com -- mpn one's complement. + +dnl Copyright 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C EV4: 4.75
+C EV5: 2.0
+C EV6: 1.5
+
+
+C mp_limb_t mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
+C of 2.0 c/l. In general, a pattern like this unrolled to N limbs per loop
+C will be 1.5+2/N c/l.
+C
+C 2 cycles of loop control are unavoidable, for pointer updates and the
+C taken branch bubble, but also since ldq cannot issue two cycles after stq
+C (and with a run of stqs that means neither of the two cycles at the end of
+C the loop).
+C
+C The fbeq is forced into the second cycle of the loop using unops, since
+C the first time through it must wait for the cvtqt result. Once that
+C result is ready (a 1 cycle stall) then both the branch and following loads
+C can issue together.
+C
+C The main loop handles an odd count of limbs, being two limbs loaded before
+C each size test, plus one pipelined around from the previous iteration (or
+C set up in the entry sequence).
+C
+C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
+C entry sequence, and an increment of the pointers. For an odd size there's
+C no increment and the first store in the loop (r24) is a repeat of dst[0].
+C
+C Note that the load for r24 after the possible pointer increment is done
+C before the explicit store to dst[0], in case src==dst.
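
As a plain C reference for what the pipelined code below computes (the helper
name is ours, for illustration only; note how the entry sequence peels dst[0]
when the count is even, so the loop proper always sees an odd number of
limbs):

  #include "gmp-impl.h"

  static void
  com_ref (mp_ptr dst, mp_srcptr src, mp_size_t size)
  {
    mp_size_t i = 0;
    if ((size & 1) == 0)        /* even count: dst[0] done up front */
      {
        dst[0] = ~src[0];
        i = 1;
      }
    for (; i < size; i++)       /* odd number of limbs remain */
      dst[i] = ~src[i];
  }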
+ + +ASM_START() + +FLOAT64(L(dat), 2.0) + + ALIGN(16) + +PROLOGUE(mpn_com,gp) + + C r16 dst + C r17 src + C r18 size + + lda r30, -16(r30) C temporary stack space + lda r7, -3(r18) C size - 3 + + ldq r20, 0(r17) C src[0] + srl r7, 1, r6 C (size-3)/2 + + stq r6, 8(r30) C (size-3)/2 + and r7, 1, r5 C 1 if size even + + LEA( r8, L(dat)) + s8addq r5, r17, r17 C skip src[0] if even + + ornot r31, r20, r20 C ~src[0] + unop + + ldt f0, 8(r30) C (size-3)/2 + ldq r24, 0(r17) C src[0 or 1] + + stq r20, 0(r16) C dst[0] + s8addq r5, r16, r19 C skip dst[0] if even + + ldt f1, 0(r8) C data 2.0 + lda r30, 16(r30) C restore stack + unop + cvtqt f0, f0 C (size-3)/2 as float + + ornot r31, r24, r24 + blt r7, L(done_1) C if size<=2 + unop + unop + + + C 16-byte alignment here +L(top): + C r17 src, incrementing + C r19 dst, incrementing + C r24 dst[i] result, ready to store + C f0 (size-3)/2, decrementing + C f1 2.0 + + ldq r20, 8(r17) C src[i+1] + ldq r21, 16(r17) C src[i+2] + unop + unop + + fbeq f0, L(done_2) + unop + ldq r22, 24(r17) C src[i+3] + ldq r23, 32(r17) C src[i+4] + + stq r24, 0(r19) C dst[i] + ornot r31, r20, r20 + subt f0, f1, f0 C count -= 2 + unop + + stq r20, 8(r19) C dst[i+1] + ornot r31, r21, r21 + unop + unop + + stq r21, 16(r19) C dst[i+2] + ornot r31, r22, r22 + + stq r22, 24(r19) C dst[i+3] + ornot r31, r23, r24 + + lda r17, 32(r17) C src += 4 + lda r19, 32(r19) C dst += 4 + unop + fbge f0, L(top) + + +L(done_1): + C r19 &dst[size-1] + C r24 result for dst[size-1] + + stq r24, 0(r19) C dst[size-1] + ret r31, (r26), 1 + + +L(done_2): + C r19 &dst[size-3] + C r20 src[size-2] + C r21 src[size-1] + C r24 result for dst[size-3] + + stq r24, 0(r19) C dst[size-3] + ornot r31, r20, r20 + + stq r20, 8(r19) C dst[size-2] + ornot r31, r21, r21 + + stq r21, 16(r19) C dst[size-1] + ret r31, (r26), 1 + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/copyd.asm b/gmp-6.3.0/mpn/alpha/copyd.asm new file mode 100644 index 0000000..b41b536 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/copyd.asm @@ -0,0 +1,88 @@ +dnl Alpha mpn_copyd -- copy, decrementing. + +dnl Copyright 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
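
A note on the direction: a decrementing copy runs from the most significant
limb downwards, which keeps an overlapping operation correct when the
destination starts at a higher address than the source (mpn_copyi, further
below, is the mirror image for destinations at lower addresses). A C sketch
of the semantics, with a hypothetical helper name; mp_size_t is a signed type
in GMP, so the countdown loop is safe:

  #include "gmp-impl.h"

  static void
  copyd_ref (mp_ptr rp, mp_srcptr up, mp_size_t n)
  {
    mp_size_t i;
    for (i = n - 1; i >= 0; i--)
      rp[i] = up[i];
  }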
+ +include(`../config.m4') + +C cycles/limb +C EV4: 4 +C EV5: 1.75 +C EV6: 1 + +C INPUT PARAMETERS +C rp r16 +C up r17 +C n r18 + + +ASM_START() +PROLOGUE(mpn_copyd) + s8addq r18,r16,r16 C E0 + s8addq r18,r17,r17 C E1 + lda r18,-8(r18) C E0 + blt r18,$Lend C E1 +$Loop: ldq r0,-8(r17) C E0 + ldq r1,-16(r17) C E1 + ldq r2,-24(r17) C E0 + ldq r3,-32(r17) C E1 + ldq r4,-40(r17) C E0 + ldq r5,-48(r17) C E1 + ldq r6,-56(r17) C E0 + ldq r7,-64(r17) C E1 + stq r0,-8(r16) C E0 + lda r17,-64(r17) C E1 + stq r1,-16(r16) C E0 + bis r31, r31, r31 C E1 + stq r2,-24(r16) C E0 + lda r18,-8(r18) C E1 + stq r3,-32(r16) C E0 + bis r31, r31, r31 C E1 + stq r4,-40(r16) C E0 + bis r31, r31, r31 C E1 + stq r5,-48(r16) C E0 + bis r31, r31, r31 C E1 + stq r6,-56(r16) C E0 + bis r31, r31, r31 C E1 + stq r7,-64(r16) C E0 + lda r16,-64(r16) C E1 + bge r18,$Loop C E1 +$Lend: lda r18,7(r18) C E0 + blt r18,$Lret C E1 + ldq r0,-8(r17) C E0 + beq r18,$Lend0 C E1 +$Loop0: stq r0,-8(r16) C E0 + lda r16,-8(r16) C E1 + ldq r0,-16(r17) C E0 + lda r18,-1(r18) C E1 + lda r17,-8(r17) C E0 + bgt r18,$Loop0 C E1 +$Lend0: stq r0,-8(r16) C E0 +$Lret: ret r31,(r26),1 C E1 +EPILOGUE(mpn_copyd) +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/copyi.asm b/gmp-6.3.0/mpn/alpha/copyi.asm new file mode 100644 index 0000000..f7e2ad6 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/copyi.asm @@ -0,0 +1,86 @@ +dnl Alpha mpn_copyi -- copy, incrementing. + +dnl Copyright 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C EV4: 4 +C EV5: 1.75 +C EV6: 1 + +C INPUT PARAMETERS +C rp r16 +C up r17 +C n r18 + + +ASM_START() +PROLOGUE(mpn_copyi) + lda r18,-8(r18) C E0 + blt r18,$Lend C E1 +$Loop: ldq r0,0(r17) C E0 + ldq r1,8(r17) C E1 + ldq r2,16(r17) C E0 + ldq r3,24(r17) C E1 + ldq r4,32(r17) C E0 + ldq r5,40(r17) C E1 + ldq r6,48(r17) C E0 + ldq r7,56(r17) C E1 + stq r0,0(r16) C E0 + lda r17,64(r17) C E1 + stq r1,8(r16) C E0 + bis r31, r31, r31 C E1 + stq r2,16(r16) C E0 + lda r18,-8(r18) C E1 + stq r3,24(r16) C E0 + bis r31, r31, r31 C E1 + stq r4,32(r16) C E0 + bis r31, r31, r31 C E1 + stq r5,40(r16) C E0 + bis r31, r31, r31 C E1 + stq r6,48(r16) C E0 + bis r31, r31, r31 C E1 + stq r7,56(r16) C E0 + lda r16,64(r16) C E1 + bge r18,$Loop C E1 +$Lend: lda r18,7(r18) C E0 + blt r18,$Lret C E1 + ldq r0,0(r17) C E0 + beq r18,$Lend0 C E1 +$Loop0: stq r0,0(r16) C E0 + lda r16,8(r16) C E1 + ldq r0,8(r17) C E0 + lda r18,-1(r18) C E1 + lda r17,8(r17) C E0 + bgt r18,$Loop0 C E1 +$Lend0: stq r0,0(r16) C E0 +$Lret: ret r31,(r26),1 C E1 +EPILOGUE(mpn_copyi) +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/default.m4 b/gmp-6.3.0/mpn/alpha/default.m4 new file mode 100644 index 0000000..8fe7c4e --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/default.m4 @@ -0,0 +1,127 @@ +divert(-1) + +dnl m4 macros for alpha assembler (everywhere except unicos). + + +dnl Copyright 2000, 2002-2004, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl Usage: ASM_START() +define(`ASM_START', +m4_assert_numargs(0) +` .set noreorder + .set noat') + +dnl Usage: X(value) +define(`X', +m4_assert_numargs(1) +`0x$1') + +dnl Usage: FLOAT64(label,value) +define(`FLOAT64', +m4_assert_numargs(2) +` .align 3 +$1: .t_floating $2') + + +dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,gp|noalign]) +dnl EPILOGUE_cpu(GSYM_PREFIX`'foo) + +define(`PROLOGUE_cpu', +m4_assert_numargs_range(1,2) +`ifelse(`$2',gp,, +`ifelse(`$2',noalign,, +`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter +')')')')dnl + .text +ifelse(`$2',noalign,,` ALIGN(16)') + .globl $1 + .ent $1 +$1: + .frame r30,0,r26,0 +ifelse(`$2',gp,` ldgp r29, 0(r27) +`$'$1..ng:') + .prologue ifelse(`$2',gp,1,0)') + +define(`EPILOGUE_cpu', +m4_assert_numargs(1) +` .end $1') + + +dnl Usage: LDGP(dst,src) +dnl +dnl Emit an "ldgp dst,src", but only if the system uses a GOT. + +define(LDGP, +m4_assert_numargs(2) +`ldgp `$1', `$2'') + + +dnl Usage: EXTERN(variable_name) +define(`EXTERN', +m4_assert_numargs(1) +) + +dnl Usage: r0 ... r31 +dnl f0 ... 
f31 +dnl +dnl Map register names r0 to $0, and f0 to $f0, etc. +dnl This is needed on all systems but Unicos +dnl +dnl defreg() is used to protect the $ in $0 (otherwise it would represent a +dnl macro argument). Double quoting is used to protect the f0 in $f0 +dnl (otherwise it would be an infinite recursion). + +forloop(i,0,31,`defreg(`r'i,$i)') +forloop(i,0,31,`deflit(`f'i,``$f''i)') + + +dnl Usage: DATASTART(name,align) or DATASTART(name) +dnl DATAEND() + +define(`DATASTART', +m4_assert_numargs_range(1,2) +` RODATA + ALIGN(ifelse($#,1,2,$2)) +$1:') +define(`DATAEND', +m4_assert_numargs(0) +) + +dnl Load a symbolic address into a register +define(`LEA', +m4_assert_numargs(2) +`lda $1, $2') + +dnl Usage: ASM_END() +define(`ASM_END', +m4_assert_numargs(0) +) + +divert diff --git a/gmp-6.3.0/mpn/alpha/dive_1.c b/gmp-6.3.0/mpn/alpha/dive_1.c new file mode 100644 index 0000000..349d581 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/dive_1.c @@ -0,0 +1,114 @@ +/* Alpha mpn_divexact_1 -- mpn by limb exact division. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2000-2003 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* cycles/limb + EV4: 47.0 + EV5: 30.0 + EV6: 15.0 +*/ + + +/* The dependent chain is as follows (the same as modexact), and this is + what the code runs as. + + ev4 ev5 ev6 + 1 1 1 sub y = x - h + 23 13 7 mulq q = y * inverse + 23 15 7 umulh h = high (q * d) + -- -- -- + 47 30 15 + + The time to load src[i+1] and establish x hides under the umulh latency. 
*/ + +void +mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor) +{ + mp_limb_t inverse, lshift_mask, s, sr, s_next, c, h, x, y, q, dummy; + unsigned rshift, lshift; + + ASSERT (size >= 1); + ASSERT (divisor != 0); + ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size)); + ASSERT_MPN (src, size); + ASSERT_LIMB (divisor); + + s_next = *src++; /* src[0] */ + + rshift = 0; + lshift_mask = 0; + if ((divisor & 1) == 0) + { + count_trailing_zeros (rshift, divisor); + lshift_mask = MP_LIMB_T_MAX; + divisor >>= rshift; + } + + binvert_limb (inverse, divisor); + lshift = 64 - rshift; + + c = 0; + h = 0; + sr = s_next >> rshift; + + size--; + if (LIKELY (size != 0)) + { + do + { + s_next = *src++; /* src[i+1] */ + s = sr | ((s_next << lshift) & lshift_mask); + x = s - c; + c = s < c; + sr = s_next >> rshift; + + y = x - h; + c += (x < h); + q = y * inverse; + *dst++ = q; + umul_ppmm (h, dummy, q, divisor); + + size--; + } + while (size != 0); + } + + x = sr - c; + y = x - h; + q = y * inverse; + *dst = q; /* dst[size-1] */ +} diff --git a/gmp-6.3.0/mpn/alpha/divrem_2.asm b/gmp-6.3.0/mpn/alpha/divrem_2.asm new file mode 100644 index 0000000..046b246 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/divrem_2.asm @@ -0,0 +1,177 @@ +dnl Alpha mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number. + +dnl Copyright 2007, 2008, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C norm frac +C ev4 +C ev5 70 70 +C ev6 29 29 + +C TODO +C * Perhaps inline mpn_invert_limb, that would allow us to not save/restore +C any registers (thus save ~10 cycles per call). +C * Use negated d1 and/or d0 to speed carry propagation. Might save a cycle +C or two. +C * Check cluster delays (for ev6). We very likely could save some cycles. +C * Use branch-free code for computing di. +C * CAVEAT: We rely on r19 not being clobbered by mpn_invert_limb call. 
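
For orientation: the code below first calls mpn_invert_limb on the high
divisor limb and then runs the small adjustment loop at L(L22), which, as we
read it, refines that single-limb inverse into the two-limb "3/2" inverse of
Möller and Granlund's "Improved division by invariant integers", defined by
dinv = floor((B^3 - 1) / (d1*B + d0)) - B, with B = 2^64 and d1 normalized.
A toy illustration of that defining property with 32-bit limbs, so that the
arithmetic fits the GCC/Clang unsigned __int128 type (the helper name is
ours, for illustration only):

  #include <stdint.h>

  static uint32_t
  inverse_3by2_toy (uint32_t d1, uint32_t d0)   /* d1 has its top bit set */
  {
    unsigned __int128 B = (unsigned __int128) 1 << 32;
    unsigned __int128 d = ((unsigned __int128) d1 << 32) | d0;
    return (uint32_t) ((B * B * B - 1) / d - B);  /* fits in 32 bits */
  }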
+ +C INPUT PARAMETERS +define(`qp', `r16') +define(`fn', `r17') +define(`up_param', `r18') +define(`un_param', `r19') +define(`dp', `r20') + +ASM_START() +PROLOGUE(mpn_divrem_2,gp) + lda r30, -80(r30) + stq r26, 0(r30) + stq r9, 8(r30) + stq r10, 16(r30) + stq r11, 24(r30) + stq r12, 32(r30) + stq r13, 40(r30) +C stq r14, 48(r30) + stq r15, 56(r30) + .prologue 1 + stq r16, 64(r30) + bis r31, r17, r15 + s8addq r19, r18, r13 + lda r13, -24(r13) + ldq r12, 8(r20) + ldq r10, 0(r20) + ldq r11, 16(r13) + ldq r9, 8(r13) + + bis r31, r31, r3 C most_significant_q_limb = 0 + cmpult r11, r12, r1 + bne r1, L(L8) + cmpule r11, r12, r1 + cmpult r9, r10, r2 + and r1, r2, r1 + bne r1, L(L8) + subq r11, r12, r11 + subq r11, r2, r11 + subq r9, r10, r9 + lda r3, 1(r31) C most_significant_q_limb = 1 +L(L8): stq r3, 72(r30) + + addq r15, r19, r19 + lda r19, -3(r19) + blt r19, L(L10) + bis r31, r12, r16 + jsr r26, mpn_invert_limb + LDGP( r29, 0(r26)) + mulq r0, r12, r4 C t0 = LO(di * d1) + umulh r0, r10, r2 C s1 = HI(di * d0) + addq r4, r10, r4 C t0 += d0 + cmpule r10, r4, r7 C (t0 < d0) + addq r4, r2, r4 C t0 += s1 + cmpult r4, r2, r1 + subq r1, r7, r7 C t1 (-1, 0, or 1) + blt r7, L(L42) +L(L22): + lda r0, -1(r0) C di-- + cmpult r4, r12, r1 C cy for: t0 -= d1 (below) + subq r7, r1, r7 C t1 -= cy + subq r4, r12, r4 C t0 -= d1 + bge r7, L(L22) +L(L42): + ldq r16, 64(r30) + s8addq r19, r16, r16 + ALIGN(16) +L(loop): + mulq r11, r0, r5 C q0 (early) + umulh r11, r0, r6 C q (early) + addq r5, r9, r8 C q0 += n1 + addq r6, r11, r6 C q += n2 + cmpult r8, r5, r1 C cy for: q0 += n1 + addq r6, r1, r6 C q += cy + unop + mulq r12, r6, r1 C LO(d1 * q) + umulh r10, r6, r7 C t1 = HI(d0 * q) + subq r9, r1, r9 C n1 -= LO(d1 * q) + mulq r10, r6, r4 C t0 = LO(d0 * q) + unop + cmple r15, r19, r5 C condition and n0... + beq r5, L(L31) + ldq r5, 0(r13) + lda r13, -8(r13) +L(L31): subq r9, r12, r9 C n1 -= d1 + cmpult r5, r10, r1 C + subq r9, r1, r9 C + subq r5, r10, r5 C n0 -= d0 + subq r9, r7, r9 C n1 -= t0 + cmpult r5, r4, r1 C + subq r9, r1, r2 C + subq r5, r4, r5 C n0 -= t1 + cmpult r2, r8, r1 C (n1 < q0) + addq r6, r1, r6 C q += cond + lda r1, -1(r1) C -(n1 >= q0) + and r1, r10, r4 C + addq r5, r4, r9 C n0 += mask & d0 + and r1, r12, r1 C + cmpult r9, r5, r11 C cy for: n0 += mask & d0 + addq r2, r1, r1 C n1 += mask & d1 + addq r1, r11, r11 C n1 += cy + cmpult r11, r12, r1 C + beq r1, L(fix) C +L(bck): stq r6, 0(r16) + lda r16, -8(r16) + lda r19, -1(r19) + bge r19, L(loop) + +L(L10): stq r9, 8(r13) + stq r11, 16(r13) + ldq r0, 72(r30) + ldq r26, 0(r30) + ldq r9, 8(r30) + ldq r10, 16(r30) + ldq r11, 24(r30) + ldq r12, 32(r30) + ldq r13, 40(r30) +C ldq r14, 48(r30) + ldq r15, 56(r30) + lda r30, 80(r30) + ret r31, (r26), 1 + +L(fix): cmpule r11, r12, r1 + cmpult r9, r10, r2 + and r1, r2, r1 + bne r1, L(bck) + subq r11, r12, r11 + subq r11, r2, r11 + subq r9, r10, r9 + lda r6, 1(r6) + br L(bck) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/ev5/diveby3.asm b/gmp-6.3.0/mpn/alpha/ev5/diveby3.asm new file mode 100644 index 0000000..3758188 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev5/diveby3.asm @@ -0,0 +1,332 @@ +dnl Alpha mpn_divexact_by3c -- mpn division by 3, expecting no remainder. + +dnl Copyright 2004, 2005, 2009 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: 22 +C EV5: 11.5 +C EV6: 6.3 Note that mpn_bdiv_dbm1c is faster + +C TODO +C * Remove the unops, they benefit just ev6, which no longer uses this file. +C * Try prefetch for destination, using lds. +C * Improve feed-in code, by moving initial mulq earlier; make initial load +C to u0/u0 to save some copying. +C * Combine u0 and u2, u1 and u3. + +C INPUT PARAMETERS +define(`rp', `r16') +define(`up', `r17') +define(`n', `r18') +define(`cy', `r19') + +ASM_START() + +DATASTART(L(LC),8) + .quad 0xAAAAAAAAAAAAAAAB + .quad 0x5555555555555555 + .quad 0xAAAAAAAAAAAAAAAA +DATAEND() + +define(`xAAAAAAAAAAAAAAAB', `r20') +define(`x5555555555555555', `r21') +define(`xAAAAAAAAAAAAAAAA', `r22') +define(`u0', `r0') define(`u1', `r1') +define(`u2', `r2') define(`u3', `r3') +define(`l0', `r25') define(`x', `r8') +define(`q0', `r4') define(`q1', `r5') +define(`p6', `r6') define(`p7', `r7') +define(`t0', `r23') define(`t1', `r24') +define(`cymask',`r28') + + +PROLOGUE(mpn_divexact_by3c,gp) + + ldq r28, 0(up) C load first limb early + +C Put magic constants in registers + lda r0, L(LC) + ldq xAAAAAAAAAAAAAAAB, 0(r0) + ldq x5555555555555555, 8(r0) + ldq xAAAAAAAAAAAAAAAA, 16(r0) + +C Compute initial l0 value + cmpeq cy, 1, p6 + cmpeq cy, 2, p7 + negq p6, p6 + and p6, x5555555555555555, l0 + cmovne p7, xAAAAAAAAAAAAAAAA, l0 + +C Feed-in depending on (n mod 4) + and n, 3, r8 + lda n, -3(n) + cmpeq r8, 1, r4 + cmpeq r8, 2, r5 + bne r4, $Lb01 + bne r5, $Lb10 + beq r8, $Lb00 + +$Lb11: ldq u3, 8(up) + lda up, -24(up) + lda rp, -24(rp) + mulq r28, xAAAAAAAAAAAAAAAB, q0 + mov r28, u2 + br r31, $L11 + +$Lb00: ldq u2, 8(up) + lda up, -16(up) + lda rp, -16(rp) + mulq r28, xAAAAAAAAAAAAAAAB, q1 + mov r28, u1 + br r31, $L00 + +$Lb01: lda rp, -8(rp) + mulq r28, xAAAAAAAAAAAAAAAB, q0 + mov r28, u0 + blt n, $Lcj1 + ldq u1, 8(up) + lda up, -8(up) + br r31, $L01 + +$Lb10: ldq u0, 8(up) + mulq r28, xAAAAAAAAAAAAAAAB, q1 + mov r28, u3 + blt n, $Lend + + ALIGN(16) +$Ltop: +C 0 + cmpult u3, cy, cy C L0 + mulq u0, xAAAAAAAAAAAAAAAB, q0 C U1 + ldq u1, 16(up) C L1 + addq q1, l0, x C U0 +C 1 + negq cy, cymask C L0 + unop C U1 + unop C L1 + cmpult x5555555555555555, x, p6 C U0 +C 2 + cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1 + unop + unop + negq p6, t0 C L0 +C 3 + negq p7, t1 C L0 + and cymask, x5555555555555555, l0 C U1 + addq p6, cy, cy + and t0, x5555555555555555, t0 +C 4 + and t1, x5555555555555555, t1 + addq p7, cy, cy + unop + addq t0, l0, l0 +C 5 + addq t1, l0, l0 + unop + stq x, 0(rp) C L1 + unop +$L01: +C 0 + cmpult 
u0, cy, cy C L0 + mulq u1, xAAAAAAAAAAAAAAAB, q1 C U1 + ldq u2, 24(up) C L1 + addq q0, l0, x C U0 +C 1 + negq cy, cymask C L0 + unop C U1 + unop C L1 + cmpult x5555555555555555, x, p6 C U0 +C 2 + cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1 + unop + unop + negq p6, t0 C L0 +C 3 + negq p7, t1 C L0 + and cymask, x5555555555555555, l0 C U1 + addq p6, cy, cy + and t0, x5555555555555555, t0 +C 4 + and t1, x5555555555555555, t1 + addq p7, cy, cy + unop + addq t0, l0, l0 +C 5 + addq t1, l0, l0 + unop + stq x, 8(rp) C L1 + unop +$L00: +C 0 + cmpult u1, cy, cy C L0 + mulq u2, xAAAAAAAAAAAAAAAB, q0 C U1 + ldq u3, 32(up) C L1 + addq q1, l0, x C U0 +C 1 + negq cy, cymask C L0 + unop C U1 + unop C L1 + cmpult x5555555555555555, x, p6 C U0 +C 2 + cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1 + unop + unop + negq p6, t0 C L0 +C 3 + negq p7, t1 C L0 + and cymask, x5555555555555555, l0 C U1 + addq p6, cy, cy + and t0, x5555555555555555, t0 +C 4 + and t1, x5555555555555555, t1 + addq p7, cy, cy + unop + addq t0, l0, l0 +C 5 + addq t1, l0, l0 + unop + stq x, 16(rp) C L1 + unop +$L11: +C 0 + cmpult u2, cy, cy C L0 + mulq u3, xAAAAAAAAAAAAAAAB, q1 C U1 + ldq u0, 40(up) C L1 + addq q0, l0, x C U0 +C 1 + negq cy, cymask C L0 + unop C U1 + unop C L1 + cmpult x5555555555555555, x, p6 C U0 +C 2 + cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1 + lda n, -4(n) C L1 bookkeeping + unop + negq p6, t0 C L0 +C 3 + negq p7, t1 C L0 + and cymask, x5555555555555555, l0 C U1 + addq p6, cy, cy + and t0, x5555555555555555, t0 +C 4 + and t1, x5555555555555555, t1 + addq p7, cy, cy + unop + addq t0, l0, l0 +C 5 + addq t1, l0, l0 + unop + stq x, 24(rp) C L1 + lda up, 32(up) +C + ldl r31, 256(up) C prefetch + unop + lda rp, 32(rp) + bge n, $Ltop C U1 +C *** MAIN LOOP END *** +$Lend: + + cmpult u3, cy, cy C L0 + mulq u0, xAAAAAAAAAAAAAAAB, q0 C U1 + unop + addq q1, l0, x C U0 +C 1 + negq cy, cymask C L0 + unop C U1 + unop C L1 + cmpult x5555555555555555, x, p6 C U0 +C 2 + cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1 + unop + unop + negq p6, t0 C L0 +C 3 + negq p7, t1 C L0 + and cymask, x5555555555555555, l0 C U1 + addq p6, cy, cy + and t0, x5555555555555555, t0 +C 4 + and t1, x5555555555555555, t1 + addq p7, cy, cy + unop + addq t0, l0, l0 +C 5 + addq t1, l0, l0 + unop + stq x, 0(rp) C L1 + unop +$Lcj1: + cmpult u0, cy, cy C L0 + addq q0, l0, x C U0 + cmpult x5555555555555555, x, p6 C U0 + cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1 + addq p6, cy, cy + addq p7, cy, r0 + stq x, 8(rp) C L1 + + ret r31,(r26),1 +EPILOGUE() +ASM_END() + +C This is useful for playing with various schedules. +C Expand as: one(0)one(1)one(2)one(3) +define(`one',` +C 0 + cmpult `$'eval(($1+3)%4), cy, cy C L0 + mulq `$'$1, xAAAAAAAAAAAAAAAB, `$'eval(4+$1%2) C U1 + ldq `$'eval(($1+1)%4), eval($1*8+16)(up) C L1 + addq `$'eval(4+($1+1)%2), l0, x C U0 +C 1 + negq cy, cymask C L0 + unop C U1 + unop C L1 + cmpult x5555555555555555, x, p6 C U0 +C 2 + cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1 + unop + unop + negq p6, t0 C L0 +C 3 + negq p7, t1 C L0 + and cymask, x5555555555555555, l0 C U1 + addq p6, cy, cy + and t0, x5555555555555555, t0 +C 4 + and t1, x5555555555555555, t1 + addq p7, cy, cy + unop + addq t0, l0, l0 +C 5 + addq t1, l0, l0 + unop + stq x, eval($1*8)(rp) C L1 + unop +') diff --git a/gmp-6.3.0/mpn/alpha/ev5/gmp-mparam.h b/gmp-6.3.0/mpn/alpha/ev5/gmp-mparam.h new file mode 100644 index 0000000..1575a28 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev5/gmp-mparam.h @@ -0,0 +1,191 @@ +/* Alpha EV5 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991-2017 Free Software Foundation, Inc. 
+ +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 600 MHz 21164A */ +/* FFT tuning limit = 5000000 */ +/* Generated by tuneup.c, 2017-02-02, gcc 4.9 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 10 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 22 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 20 +#define USE_PREINV_DIVREM_1 1 /* preinv always */ +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 69 + +#define DIV_1_VS_MUL_1_PERCENT 181 + +#define MUL_TOOM22_THRESHOLD 16 +#define MUL_TOOM33_THRESHOLD 50 +#define MUL_TOOM44_THRESHOLD 118 +#define MUL_TOOM6H_THRESHOLD 173 +#define MUL_TOOM8H_THRESHOLD 236 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 49 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 84 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 81 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 53 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 70 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 22 +#define SQR_TOOM3_THRESHOLD 69 +#define SQR_TOOM4_THRESHOLD 178 +#define SQR_TOOM6_THRESHOLD 189 +#define SQR_TOOM8_THRESHOLD 357 + +#define MULMID_TOOM42_THRESHOLD 18 + +#define MULMOD_BNM1_THRESHOLD 9 +#define SQRMOD_BNM1_THRESHOLD 12 + +#define MUL_FFT_MODF_THRESHOLD 284 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 284, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \ + { 7, 5}, { 15, 6}, { 13, 7}, { 7, 6}, \ + { 15, 7}, { 8, 6}, { 17, 7}, { 13, 8}, \ + { 7, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \ + { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ + { 19, 9}, { 11, 8}, { 25,10}, { 7, 9}, \ + { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \ + { 23, 8}, { 47,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 63, 8}, \ + { 255, 7}, { 511,10}, { 71, 9}, { 143, 8}, \ + { 287, 7}, { 575, 9}, { 159, 8}, { 319,11}, \ + { 47,12}, { 31,11}, { 63, 9}, { 255, 8}, \ + { 511,10}, { 143, 9}, { 287,11}, { 79,10}, \ + { 159, 9}, { 319,10}, { 175, 9}, { 351, 8}, \ + { 703,10}, { 191, 9}, { 383,10}, { 207, 9}, \ + { 415,12}, { 63,10}, { 255,11}, { 143,10}, \ + { 287, 
9}, { 575,11}, { 159,10}, { 319, 9}, \ + { 639,11}, { 175,12}, { 95,11}, { 191,10}, \ + { 383,11}, { 207,10}, { 415,11}, { 223,13}, \ + { 63,11}, { 287,10}, { 575,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 351,12}, { 191,11}, \ + { 415,12}, { 223,11}, { 447,10}, { 895,11}, \ + { 479,12}, { 287,11}, { 575,12}, { 351,13}, \ + { 191,12}, { 479,13}, { 255,12}, { 575,13}, \ + { 319,12}, { 703,13}, { 383,12}, { 831,13}, \ + { 447,14}, { 255,13}, { 8192,14}, { 16384,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 121 +#define MUL_FFT_THRESHOLD 4224 + +#define SQR_FFT_MODF_THRESHOLD 240 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 240, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 14, 5}, { 29, 7}, { 9, 6}, { 19, 7}, \ + { 13, 6}, { 27, 8}, { 7, 7}, { 21, 8}, \ + { 11, 7}, { 29, 8}, { 19, 9}, { 11, 8}, \ + { 27,10}, { 7, 9}, { 15, 8}, { 33, 9}, \ + { 19, 8}, { 39, 9}, { 23, 8}, { 47,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \ + { 127, 8}, { 255,10}, { 71, 9}, { 143, 8}, \ + { 287,10}, { 79,11}, { 47,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 143, 9}, \ + { 287,11}, { 79,10}, { 159, 9}, { 319,10}, \ + { 175,11}, { 95,10}, { 191, 9}, { 383,10}, \ + { 207, 9}, { 415,11}, { 111,10}, { 223,12}, \ + { 63,11}, { 175,12}, { 95,11}, { 207,13}, \ + { 63,12}, { 127,11}, { 287,12}, { 159,11}, \ + { 351,12}, { 191,11}, { 415,12}, { 223,11}, \ + { 447,13}, { 127,12}, { 351,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,14}, { 127,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,13}, { 319,12}, { 703,13}, \ + { 383,12}, { 831,13}, { 447,14}, { 255,13}, \ + { 511,12}, { 1023,13}, { 8192,14}, { 16384,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 105 +#define SQR_FFT_THRESHOLD 3968 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 50 +#define MULLO_MUL_N_THRESHOLD 5558 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 78 +#define SQRLO_SQR_THRESHOLD 3597 + +#define DC_DIV_QR_THRESHOLD 47 +#define DC_DIVAPPR_Q_THRESHOLD 167 +#define DC_BDIV_QR_THRESHOLD 47 +#define DC_BDIV_Q_THRESHOLD 110 + +#define INV_MULMOD_BNM1_THRESHOLD 30 +#define INV_NEWTON_THRESHOLD 181 +#define INV_APPR_THRESHOLD 173 + +#define BINV_NEWTON_THRESHOLD 182 +#define REDC_1_TO_REDC_N_THRESHOLD 47 + +#define MU_DIV_QR_THRESHOLD 979 +#define MU_DIVAPPR_Q_THRESHOLD 1142 +#define MUPI_DIV_QR_THRESHOLD 90 +#define MU_BDIV_QR_THRESHOLD 748 +#define MU_BDIV_Q_THRESHOLD 979 + +#define POWM_SEC_TABLE 1,16,90,386,2177 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 26 +#define SET_STR_DC_THRESHOLD 363 +#define SET_STR_PRECOMPUTE_THRESHOLD 1201 + +#define FAC_DSC_THRESHOLD 342 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 13 +#define HGCD_THRESHOLD 105 +#define HGCD_APPR_THRESHOLD 108 +#define HGCD_REDUCE_THRESHOLD 1679 +#define GCD_DC_THRESHOLD 238 +#define GCDEXT_DC_THRESHOLD 199 +#define JACOBI_BASE_METHOD 2 diff --git a/gmp-6.3.0/mpn/alpha/ev6/add_n.asm b/gmp-6.3.0/mpn/alpha/ev6/add_n.asm new file mode 100644 index 0000000..9261f31 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/add_n.asm @@ -0,0 +1,283 @@ +dnl Alpha ev6 mpn_add_n -- Add two limb vectors of the same length > 0 
and +dnl store sum in a third limb vector. + +dnl Copyright 2000, 2003, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: ? +C EV5: 5.4 +C EV6: 2.125 + +C INPUT PARAMETERS +C rp r16 +C up r17 +C vp r18 +C n r19 +C cy r20 (for mpn_add_nc) + +C TODO +C Finish cleaning up cy registers r22, r23 (make them use cy0/cy1) +C Use multi-pronged feed-in. +C Perform additional micro-tuning + +C This code was written in cooperation with ev6 pipeline expert Steve Root. + +C Pair loads and stores where possible +C Store pairs oct-aligned where possible (didn't need it here) +C Stores are delayed every third cycle +C Loads and stores are delayed by fills +C U stays still, put code there where possible (note alternation of U1 and U0) +C L moves because of loads and stores +C Note dampers in L to limit damage + +C This odd-looking optimization expects that we have random bits in our +C data, so that a pure zero result is unlikely. So we penalize the unlikely +C case to help the common case. + +define(`u0', `r0') define(`u1', `r3') +define(`v0', `r1') define(`v1', `r4') + +define(`cy0', `r20') define(`cy1', `r21') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc) + +ASM_START() +PROLOGUE(mpn_add_nc) + br r31, $entry +EPILOGUE() +PROLOGUE(mpn_add_n) + bis r31, r31, cy0 C clear carry in +$entry: cmpult r19, 5, r22 C L1 move counter + ldq u1, 0(r17) C L0 get next ones + ldq v1, 0(r18) C L1 + bne r22, $Lsmall + + ldq u0, 8(r17) C L0 get next ones + ldq v0, 8(r18) C L1 + addq u1, v1, r5 C U0 add two data + + cmpult r5, v1, r23 C U0 did it carry + ldq u1, 16(r17) C L0 get next ones + ldq v1, 16(r18) C L1 + + addq u0, v0, r8 C U1 add two data + addq r5, cy0, r5 C U0 carry in + + cmpult r8, v0, r22 C U1 did it carry + beq r5, $fix5f C U0 fix exact zero +$ret5f: ldq u0, 24(r17) C L0 get next ones + ldq v0, 24(r18) C L1 + + addq r8, r23, r8 C U1 carry from last + addq u1, v1, r7 C U0 add two data + + beq r8, $fix6f C U1 fix exact zero +$ret6f: cmpult r7, v1, r23 C U0 did it carry + ldq u1, 32(r17) C L0 get next ones + ldq v1, 32(r18) C L1 + + lda r17, 40(r17) C L0 move pointer + lda r18, 40(r18) C L1 move pointer + + lda r16, -8(r16) + lda r19, -13(r19) C L1 move counter + blt r19, $Lend C U1 loop control + + +C Main loop. 8-way unrolled.
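The trick the header comment describes -- branching only on an exact-zero partial sum -- looks like this in plain C. A sketch of one limb step only; mp_limb_t and add_step are illustrative names, not GMP's actual code, and a 64-bit limb is assumed:

    #include <assert.h>

    typedef unsigned long long mp_limb_t;

    /* One limb step of mpn_add_nc, mirroring the ev6 carry handling. */
    static mp_limb_t add_step (mp_limb_t u, mp_limb_t v, mp_limb_t *cy)
    {
      mp_limb_t cin = *cy;    /* incoming carry, 0 or 1 */
      mp_limb_t s = u + v;    /* addq:   partial sum */
      mp_limb_t c = s < v;    /* cmpult: did u + v carry? */
      mp_limb_t r = s + cin;  /* fold in the incoming carry */
      /* Adding cin can only wrap when s == ~0 and cin == 1, and in exactly
         that case r == 0.  With random data r == 0 is rare, so the hot path
         skips a second cmpult; a rarely-taken beq to a $fix* stub merges the
         deferred carry instead.  */
      if (r == 0)             /* beq r, $fix...  (unlikely) */
        c |= cin;             /* bis: bring forward carry */
      *cy = c;
      return r;
    }

    int main (void)
    {
      mp_limb_t cy = 1;
      assert (add_step (~0ULL, 0, &cy) == 0 && cy == 1);  /* wrap via carry-in */
      cy = 0;
      assert (add_step (1, ~0ULL, &cy) == 0 && cy == 1);  /* wrap via u + v */
      return 0;
    }

Branching on a zero result instead of issuing a dependent cmpult shortens the critical path in the common case; the loop that follows applies exactly this step, 8-way unrolled, with the $fix* stubs at the bottom of the file as the rare-case repairs.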
+ ALIGN(16) +$Loop: addq u0, v0, r2 C U1 add two data + addq r7, r22, r7 C U0 add in carry + stq r5, 8(r16) C L0 put an answer + stq r8, 16(r16) C L1 pair + + cmpult r2, v0, cy1 C U1 did it carry + beq r7, $fix7 C U0 fix exact 0 +$ret7: ldq u0, 0(r17) C L0 get next ones + ldq v0, 0(r18) C L1 + + bis r31, r31, r31 C L damp out + addq r2, r23, r2 C U1 carry from last + bis r31, r31, r31 C L moves in L ! + addq u1, v1, r5 C U0 add two data + + beq r2, $fix0 C U1 fix exact zero +$ret0: cmpult r5, v1, cy0 C U0 did it carry + ldq u1, 8(r17) C L0 get next ones + ldq v1, 8(r18) C L1 + + addq u0, v0, r8 C U1 add two data + addq r5, cy1, r5 C U0 carry from last + stq r7, 24(r16) C L0 store pair + stq r2, 32(r16) C L1 + + cmpult r8, v0, r22 C U1 did it carry + beq r5, $fix1 C U0 fix exact zero +$ret1: ldq u0, 16(r17) C L0 get next ones + ldq v0, 16(r18) C L1 + + lda r16, 64(r16) C L0 move pointer + addq r8, cy0, r8 C U1 carry from last + lda r19, -8(r19) C L1 move counter + addq u1, v1, r7 C U0 add two data + + beq r8, $fix2 C U1 fix exact zero +$ret2: cmpult r7, v1, r23 C U0 did it carry + ldq u1, 24(r17) C L0 get next ones + ldq v1, 24(r18) C L1 + + addq u0, v0, r2 C U1 add two data + addq r7, r22, r7 C U0 add in carry + stq r5, -24(r16) C L0 put an answer + stq r8, -16(r16) C L1 pair + + cmpult r2, v0, cy1 C U1 did it carry + beq r7, $fix3 C U0 fix exact 0 +$ret3: ldq u0, 32(r17) C L0 get next ones + ldq v0, 32(r18) C L1 + + bis r31, r31, r31 C L damp out + addq r2, r23, r2 C U1 carry from last + bis r31, r31, r31 C L moves in L ! + addq u1, v1, r5 C U0 add two data + + beq r2, $fix4 C U1 fix exact zero +$ret4: cmpult r5, v1, cy0 C U0 did it carry + ldq u1, 40(r17) C L0 get next ones + ldq v1, 40(r18) C L1 + + addq u0, v0, r8 C U1 add two data + addq r5, cy1, r5 C U0 carry from last + stq r7, -8(r16) C L0 store pair + stq r2, 0(r16) C L1 + + cmpult r8, v0, r22 C U1 did it carry + beq r5, $fix5 C U0 fix exact zero +$ret5: ldq u0, 48(r17) C L0 get next ones + ldq v0, 48(r18) C L1 + + ldl r31, 256(r17) C L0 prefetch + addq r8, cy0, r8 C U1 carry from last + ldl r31, 256(r18) C L1 prefetch + addq u1, v1, r7 C U0 add two data + + beq r8, $fix6 C U1 fix exact zero +$ret6: cmpult r7, v1, r23 C U0 did it carry + ldq u1, 56(r17) C L0 get next ones + ldq v1, 56(r18) C L1 + + lda r17, 64(r17) C L0 move pointer + bis r31, r31, r31 C U + lda r18, 64(r18) C L1 move pointer + bge r19, $Loop C U1 loop control +C ==== main loop end + +$Lend: addq u0, v0, r2 C U1 add two data + addq r7, r22, r7 C U0 add in carry + stq r5, 8(r16) C L0 put an answer + stq r8, 16(r16) C L1 pair + cmpult r2, v0, cy1 C U1 did it carry + beq r7, $fix7c C U0 fix exact 0 +$ret7c: addq r2, r23, r2 C U1 carry from last + addq u1, v1, r5 C U0 add two data + beq r2, $fix0c C U1 fix exact zero +$ret0c: cmpult r5, v1, cy0 C U0 did it carry + addq r5, cy1, r5 C U0 carry from last + stq r7, 24(r16) C L0 store pair + stq r2, 32(r16) C L1 + beq r5, $fix1c C U0 fix exact zero +$ret1c: stq r5, 40(r16) C L0 put an answer + lda r16, 48(r16) C L0 move pointer + + lda r19, 8(r19) + beq r19, $Lret + + ldq u1, 0(r17) + ldq v1, 0(r18) +$Lsmall: + lda r19, -1(r19) + beq r19, $Lend0 + + ALIGN(8) +$Loop0: addq u1, v1, r2 C main add + cmpult r2, v1, r8 C compute cy from last add + ldq u1, 8(r17) + ldq v1, 8(r18) + addq r2, cy0, r5 C carry add + lda r17, 8(r17) + lda r18, 8(r18) + stq r5, 0(r16) + cmpult r5, r2, cy0 C compute cy from last add + lda r19, -1(r19) C decr loop cnt + bis r8, cy0, cy0 C combine cy from the two adds + lda r16, 8(r16) + bne r19, $Loop0 
+$Lend0: addq u1, v1, r2 C main add + addq r2, cy0, r5 C carry add + cmpult r2, v1, r8 C compute cy from last add + cmpult r5, r2, cy0 C compute cy from last add + stq r5, 0(r16) + bis r8, cy0, r0 C combine cy from the two adds + ret r31,(r26),1 + + ALIGN(8) +$Lret: lda r0, 0(cy0) C copy carry into return register + ret r31,(r26),1 + +$fix5f: bis r23, cy0, r23 C bring forward carry + br r31, $ret5f +$fix6f: bis r22, r23, r22 C bring forward carry + br r31, $ret6f +$fix0: bis cy1, r23, cy1 C bring forward carry + br r31, $ret0 +$fix1: bis cy0, cy1, cy0 C bring forward carry + br r31, $ret1 +$fix2: bis r22, cy0, r22 C bring forward carry + br r31, $ret2 +$fix3: bis r23, r22, r23 C bring forward carry + br r31, $ret3 +$fix4: bis cy1, r23, cy1 C bring forward carry + br r31, $ret4 +$fix5: bis cy1, cy0, cy0 C bring forward carry + br r31, $ret5 +$fix6: bis r22, cy0, r22 C bring forward carry + br r31, $ret6 +$fix7: bis r23, r22, r23 C bring forward carry + br r31, $ret7 +$fix0c: bis cy1, r23, cy1 C bring forward carry + br r31, $ret0c +$fix1c: bis cy0, cy1, cy0 C bring forward carry + br r31, $ret1c +$fix7c: bis r23, r22, r23 C bring forward carry + br r31, $ret7c + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/ev6/aorslsh1_n.asm b/gmp-6.3.0/mpn/alpha/ev6/aorslsh1_n.asm new file mode 100644 index 0000000..cb966ce --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/aorslsh1_n.asm @@ -0,0 +1,172 @@ +dnl Alpha mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1). + +dnl Copyright 2003, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: ? +C EV5: 7 +C EV6: 4 + +C TODO +C * Tune to reach 3.75 c/l on ev6. 
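For reference, what the addlsh1 case computes, as straightforward C. A sketch only, with a 64-bit limb assumed and ref_addlsh1_n an illustrative name; the sublsh1 variant just flips the ADDSUB/CARRY macros defined below:

    typedef unsigned long long mp_limb_t;

    /* rp[] = up[] + (vp[] << 1); returns the carry out, 0..2. */
    mp_limb_t ref_addlsh1_n (mp_limb_t *rp, const mp_limb_t *up,
                             const mp_limb_t *vp, long n)
    {
      mp_limb_t cy = 0;   /* carry out of the addition, 0 or 1 */
      mp_limb_t vhi = 0;  /* bit shifted out of the previous v limb */
      for (long i = 0; i < n; i++)
        {
          mp_limb_t vl = (vp[i] << 1) | vhi;  /* addq v,v plus carried-in bit */
          vhi = vp[i] >> 63;                  /* cmplt v,0: the bit shifted out */
          mp_limb_t s = up[i] + vl;           /* ADDSUB */
          mp_limb_t c = s < vl;               /* CARRY */
          mp_limb_t r = s + cy;               /* consume previous carry */
          c += r < s;                         /* the two carries are mutually exclusive */
          rp[i] = r;
          cy = c;
        }
      return cy + vhi;  /* the asm's final addq cy0, r2, r0 */
    }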
+ +define(`rp',`r16') +define(`up',`r17') +define(`vp',`r18') +define(`n', `r19') + +define(`u0', `r8') +define(`u1', `r1') +define(`v0', `r4') +define(`v1', `r5') + +define(`cy0', `r0') +define(`cy1', `r20') +define(`cy', `r22') +define(`rr', `r24') +define(`ps', `r25') +define(`sl', `r28') + +ifdef(`OPERATION_addlsh1_n',` + define(ADDSUB, addq) + define(CARRY, `cmpult $1,$2,$3') + define(func, mpn_addlsh1_n) +') +ifdef(`OPERATION_sublsh1_n',` + define(ADDSUB, subq) + define(CARRY, `cmpult $2,$1,$3') + define(func, mpn_sublsh1_n) +') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n) + +ASM_START() +PROLOGUE(func) + and n, 2, cy0 + blbs n, L(bx1) +L(bx0): ldq v1, 0(vp) + ldq u1, 0(up) + lda r2, 0(r31) + bne cy0, L(b10) + +L(b00): lda vp, 48(vp) + lda up, -16(up) + lda rp, -8(rp) + lda cy0, 0(r31) + br r31, L(lo0) + +L(b10): lda vp, 32(vp) + lda rp, 8(rp) + lda cy0, 0(r31) + br r31, L(lo2) + +L(bx1): ldq v0, 0(vp) + ldq u0, 0(up) + lda r3, 0(r31) + beq cy0, L(b01) + +L(b11): lda vp, 40(vp) + lda up, -24(up) + lda rp, 16(rp) + lda cy1, 0(r31) + br r31, L(lo3) + +L(b01): lda n, -4(n) + lda cy1, 0(r31) + ble n, L(end) + lda vp, 24(vp) + lda up, -8(up) + + ALIGN(16) +L(top): addq v0, v0, r6 + ldq v1, -16(vp) + addq r6, r3, sl C combined vlimb + ldq u1, 16(up) + ADDSUB u0, sl, ps C ulimb + (vlimb << 1) + cmplt v0, r31, r2 C high v bits + ADDSUB ps, cy1, rr C consume carry from previous operation + CARRY( ps, u0, cy0) C carry out #2 + stq rr, 0(rp) + CARRY( rr, ps, cy) C carry out #3 + lda vp, 32(vp) C bookkeeping + addq cy, cy0, cy0 C final carry out +L(lo0): addq v1, v1, r7 + ldq v0, -40(vp) + addq r7, r2, sl + ldq u0, 24(up) + ADDSUB u1, sl, ps + cmplt v1, r31, r3 + ADDSUB ps, cy0, rr + CARRY( ps, u1, cy1) + stq rr, 8(rp) + CARRY( rr, ps, cy) + lda rp, 32(rp) C bookkeeping + addq cy, cy1, cy1 +L(lo3): addq v0, v0, r6 + ldq v1, -32(vp) + addq r6, r3, sl + ldq u1, 32(up) + ADDSUB u0, sl, ps + cmplt v0, r31, r2 + ADDSUB ps, cy1, rr + CARRY( ps, u0, cy0) + stq rr, -16(rp) + CARRY( rr, ps, cy) + lda up, 32(up) C bookkeeping + addq cy, cy0, cy0 +L(lo2): addq v1, v1, r7 + ldq v0, -24(vp) + addq r7, r2, sl + ldq u0, 8(up) + ADDSUB u1, sl, ps + cmplt v1, r31, r3 + ADDSUB ps, cy0, rr + CARRY( ps, u1, cy1) + stq rr, -8(rp) + CARRY( rr, ps, cy) + lda n, -4(n) C bookkeeping + addq cy, cy1, cy1 + bgt n, L(top) + +L(end): addq v0, v0, r6 + addq r6, r3, sl + ADDSUB u0, sl, ps + cmplt v0, r31, r2 + ADDSUB ps, cy1, rr + CARRY( ps, u0, cy0) + stq rr, 0(rp) + CARRY( rr, ps, cy) + addq cy, cy0, cy0 + addq cy0, r2, r0 + + ret r31,(r26),1 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/ev6/aorsmul_1.asm b/gmp-6.3.0/mpn/alpha/ev6/aorsmul_1.asm new file mode 100644 index 0000000..0e68e6e --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/aorsmul_1.asm @@ -0,0 +1,398 @@ +dnl Alpha ev6 mpn_addmul_1 and mpn_submul_1. + +dnl Copyright 2000, 2003-2005, 2008 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: 42 +C EV5: 18 +C EV6: 3.5 + +C INPUT PARAMETERS +define(`rp', `r16') +define(`up', `r17') +define(`n', `r18') +define(`v0', `r19') + +dnl This code was written in cooperation with ev6 pipeline expert Steve Root. + +dnl The stores can issue a cycle late so we have paired no-op's to 'catch' +dnl them, so that further disturbance to the schedule is damped. + +dnl We couldn't pair the loads, because the entangled schedule of the carries +dnl has to happen on one side {0} of the machine. + +dnl This is a great schedule for the d_cache, a poor schedule for the b_cache. +dnl The lockup on U0 means that any stall can't be recovered from. Consider a +dnl ldq in L1, say that load gets stalled because it collides with a fill from +dnl the b_cache. On the next cycle, this load gets priority. It first looks +dnl at L0, and goes there. The instruction we intended for L0 gets to look at +dnl L1, which is NOT where we want it. It either stalls 1, because it can't +dnl go in L0, or goes there, and causes a further instruction to stall. + +dnl So for b_cache, we're likely going to want to put one or more cycles back +dnl into the code! And, of course, put in lds prefetch for the rp[] operand. +dnl At a place where we have an mt followed by a bookkeeping, put the +dnl bookkeeping in upper, and the prefetch into lower. + +dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd +dnl like not to have an ldq or an stq to precede a conditional branch in a +dnl quadpack. The conditional branch moves the retire pointer one cycle +dnl later.
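For reference, the operation being scheduled is simply rp[i] += up[i] * v0 (or -=, for submul_1) with carry propagation. A plain C model, with unsigned __int128 -- a compiler extension, assumed available -- standing in for the mulq/umulh pair; this is a sketch, not GMP's portable formulation:

    typedef unsigned long long mp_limb_t;

    /* rp[] += up[] * v0; returns the high limb left over. */
    mp_limb_t ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up,
                            long n, mp_limb_t v0)
    {
      mp_limb_t cy = 0;
      for (long i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * v0;
          mp_limb_t lo = (mp_limb_t) p;          /* mulq  */
          mp_limb_t hi = (mp_limb_t) (p >> 64);  /* umulh */
          mp_limb_t s = rp[i] + lo;              /* ADDSUB */
          hi += s < lo;                          /* CMPCY: lo add => carry */
          mp_limb_t r = s + cy;                  /* consume previous carry */
          hi += r < s;                           /* CMPCY: hi add => carry */
          rp[i] = r;
          cy = hi;                               /* always fits in one limb */
        }
      return cy;
    }

Everything in the scheduling notes above is about overlapping several of these independent limb steps across the two clusters; the arithmetic itself is this simple.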
+ +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `addq') + define(`CMPCY', `cmpult $2,$1') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `subq') + define(`CMPCY', `cmpult $1,$2') + define(`func', `mpn_submul_1') +') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() +PROLOGUE(func) + ldq r3, 0(up) C + and r18, 7, r20 C + lda r18, -9(r18) C + cmpeq r20, 1, r21 C + beq r21, $L1 C + +$1mod8: ldq r5, 0(rp) C + mulq v0, r3, r7 C + umulh v0, r3, r8 C + ADDSUB r5, r7, r23 C + CMPCY( r5, r23), r20 C + addq r8, r20, r0 C + stq r23, 0(rp) C + bge r18, $ent1 C + ret r31, (r26), 1 C + +$L1: lda r8, 0(r31) C zero carry reg + lda r24, 0(r31) C zero carry reg + cmpeq r20, 2, r21 C + bne r21, $2mod8 C + cmpeq r20, 3, r21 C + bne r21, $3mod8 C + cmpeq r20, 4, r21 C + bne r21, $4mod8 C + cmpeq r20, 5, r21 C + bne r21, $5mod8 C + cmpeq r20, 6, r21 C + bne r21, $6mod8 C + cmpeq r20, 7, r21 C + beq r21, $0mod8 C + +$7mod8: ldq r5, 0(rp) C + lda up, 8(up) C + mulq v0, r3, r7 C + umulh v0, r3, r24 C + ADDSUB r5, r7, r23 C + CMPCY( r5, r23), r20 C + addq r24, r20, r24 C + stq r23, 0(rp) C + lda rp, 8(rp) C + ldq r3, 0(up) C +$6mod8: ldq r1, 8(up) C + mulq v0, r3, r25 C + umulh v0, r3, r3 C + mulq v0, r1, r28 C + ldq r0, 16(up) C + ldq r4, 0(rp) C + umulh v0, r1, r8 C + ldq r1, 24(up) C + lda up, 48(up) C L1 bookkeeping + mulq v0, r0, r2 C + ldq r5, 8(rp) C + lda rp, -32(rp) C L1 bookkeeping + umulh v0, r0, r6 C + ADDSUB r4, r25, r25 C lo + acc + mulq v0, r1, r7 C + br r31, $ent6 C + +$ent1: lda up, 8(up) C + lda rp, 8(rp) C + lda r8, 0(r0) C + ldq r3, 0(up) C +$0mod8: ldq r1, 8(up) C + mulq v0, r3, r2 C + umulh v0, r3, r6 C + mulq v0, r1, r7 C + ldq r0, 16(up) C + ldq r4, 0(rp) C + umulh v0, r1, r24 C + ldq r1, 24(up) C + mulq v0, r0, r25 C + ldq r5, 8(rp) C + umulh v0, r0, r3 C + ADDSUB r4, r2, r2 C lo + acc + mulq v0, r1, r28 C + lda rp, -16(rp) C + br r31, $ent0 C + +$3mod8: ldq r5, 0(rp) C + lda up, 8(up) C + mulq v0, r3, r7 C + umulh v0, r3, r8 C + ADDSUB r5, r7, r23 C + CMPCY( r5, r23), r20 C + addq r8, r20, r24 C + stq r23, 0(rp) C + lda rp, 8(rp) C + ldq r3, 0(up) C +$2mod8: ldq r1, 8(up) C + mulq v0, r3, r25 C + umulh v0, r3, r3 C + mulq v0, r1, r28 C + ble r18, $n23 C + ldq r0, 16(up) C + ldq r4, 0(rp) C + umulh v0, r1, r8 C + ldq r1, 24(up) C + lda up, 16(up) C L1 bookkeeping + mulq v0, r0, r2 C + ldq r5, 8(rp) C + lda rp, 0(rp) C L1 bookkeeping + umulh v0, r0, r6 C + ADDSUB r4, r25, r25 C lo + acc + mulq v0, r1, r7 C + br r31, $ent2 C + +$5mod8: ldq r5, 0(rp) C + lda up, 8(up) C + mulq v0, r3, r7 C + umulh v0, r3, r24 C + ADDSUB r5, r7, r23 C + CMPCY( r5, r23), r20 C + addq r24, r20, r8 C + stq r23, 0(rp) C + lda rp, 8(rp) C + ldq r3, 0(up) C +$4mod8: ldq r1, 8(up) C + mulq v0, r3, r2 C + umulh v0, r3, r6 C + mulq v0, r1, r7 C + ldq r0, 16(up) C + ldq r4, 0(rp) C + umulh v0, r1, r24 C + ldq r1, 24(up) C + lda up, 32(up) C L1 bookkeeping + mulq v0, r0, r25 C + ldq r5, 8(rp) C + lda rp, 16(rp) C L1 bookkeeping + umulh v0, r0, r3 C + ADDSUB r4, r2, r2 C lo + acc + mulq v0, r1, r28 C + CMPCY( r4, r2), r20 C L0 lo add => carry + ADDSUB r2, r8, r22 C U0 hi add => answer + ble r18, $Lend C + ALIGN(16) +$Loop: + bis r31, r31, r31 C U1 mt + CMPCY( r2, r22), r21 C L0 hi add => carry + addq r6, r20, r6 C U0 hi mul + carry + ldq r0, 0(up) C + + bis r31, r31, r31 C U1 mt + ADDSUB r5, r7, r7 C L0 lo + acc + addq r6, r21, r6 C U0 hi mul + carry + ldq r4, 0(rp) C L1 + + umulh v0, r1, r8 C U1 + CMPCY( r5, r7), r20 C L0 lo add => carry + ADDSUB r7, r6, r23 C U0 hi 
add => answer + ldq r1, 8(up) C L1 + + mulq v0, r0, r2 C U1 + CMPCY( r7, r23), r21 C L0 hi add => carry + addq r24, r20, r24 C U0 hi mul + carry + ldq r5, 8(rp) C L1 + + umulh v0, r0, r6 C U1 + ADDSUB r4, r25, r25 C U0 lo + acc + stq r22, -16(rp) C L0 + stq r23, -8(rp) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq v0, r1, r7 C U1 + bis r31, r31, r31 C L1 st slosh + addq r24, r21, r24 C U0 hi mul + carry +$ent2: + CMPCY( r4, r25), r20 C L0 lo add => carry + bis r31, r31, r31 C U1 mt + lda r18, -8(r18) C L1 bookkeeping + ADDSUB r25, r24, r22 C U0 hi add => answer + + bis r31, r31, r31 C U1 mt + CMPCY( r25, r22), r21 C L0 hi add => carry + addq r3, r20, r3 C U0 hi mul + carry + ldq r0, 16(up) C L1 + + bis r31, r31, r31 C U1 mt + ADDSUB r5, r28, r28 C L0 lo + acc + addq r3, r21, r3 C U0 hi mul + carry + ldq r4, 16(rp) C L1 + + umulh v0, r1, r24 C U1 + CMPCY( r5, r28), r20 C L0 lo add => carry + ADDSUB r28, r3, r23 C U0 hi add => answer + ldq r1, 24(up) C L1 + + mulq v0, r0, r25 C U1 + CMPCY( r28, r23), r21 C L0 hi add => carry + addq r8, r20, r8 C U0 hi mul + carry + ldq r5, 24(rp) C L1 + + umulh v0, r0, r3 C U1 + ADDSUB r4, r2, r2 C U0 lo + acc + stq r22, 0(rp) C L0 + stq r23, 8(rp) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq v0, r1, r28 C U1 + bis r31, r31, r31 C L1 st slosh + addq r8, r21, r8 C U0 hi mul + carry +$ent0: + CMPCY( r4, r2), r20 C L0 lo add => carry + bis r31, r31, r31 C U1 mt + lda up, 64(up) C L1 bookkeeping + ADDSUB r2, r8, r22 C U0 hi add => answer + + bis r31, r31, r31 C U1 mt + CMPCY( r2, r22), r21 C L0 hi add => carry + addq r6, r20, r6 C U0 hi mul + carry + ldq r0, -32(up) C L1 + + bis r31, r31, r31 C U1 mt + ADDSUB r5, r7, r7 C L0 lo + acc + addq r6, r21, r6 C U0 hi mul + carry + ldq r4, 32(rp) C L1 + + umulh v0, r1, r8 C U1 + CMPCY( r5, r7), r20 C L0 lo add => carry + ADDSUB r7, r6, r23 C U0 hi add => answer + ldq r1, -24(up) C L1 + + mulq v0, r0, r2 C U1 + CMPCY( r7, r23), r21 C L0 hi add => carry + addq r24, r20, r24 C U0 hi mul + carry + ldq r5, 40(rp) C L1 + + umulh v0, r0, r6 C U1 + ADDSUB r4, r25, r25 C U0 lo + acc + stq r22, 16(rp) C L0 + stq r23, 24(rp) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq v0, r1, r7 C U1 + bis r31, r31, r31 C L1 st slosh + addq r24, r21, r24 C U0 hi mul + carry +$ent6: + CMPCY( r4, r25), r20 C L0 lo add => carry + bis r31, r31, r31 C U1 mt + lda rp, 64(rp) C L1 bookkeeping + ADDSUB r25, r24, r22 C U0 hi add => answer + + bis r31, r31, r31 C U1 mt + CMPCY( r25, r22), r21 C L0 hi add => carry + addq r3, r20, r3 C U0 hi mul + carry + ldq r0, -16(up) C L1 + + bis r31, r31, r31 C U1 mt + ADDSUB r5, r28, r28 C L0 lo + acc + addq r3, r21, r3 C U0 hi mul + carry + ldq r4, -16(rp) C L1 + + umulh v0, r1, r24 C U1 + CMPCY( r5, r28), r20 C L0 lo add => carry + ADDSUB r28, r3, r23 C U0 hi add => answer + ldq r1, -8(up) C L1 + + mulq v0, r0, r25 C U1 + CMPCY( r28, r23), r21 C L0 hi add => carry + addq r8, r20, r8 C U0 hi mul + carry + ldq r5, -8(rp) C L1 + + umulh v0, r0, r3 C U1 + ADDSUB r4, r2, r2 C U0 lo + acc + stq r22, -32(rp) C L0 + stq r23, -24(rp) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq v0, r1, r28 C U1 + bis r31, r31, r31 C L1 st slosh + addq r8, r21, r8 C U0 hi mul + carry + + CMPCY( r4, r2), r20 C L0 lo add => carry + ADDSUB r2, r8, r22 C U0 hi add => answer + ldl r31, 256(up) C prefetch up[] + bgt r18, $Loop C U1 bookkeeping + +$Lend: CMPCY( r2, r22), r21 C + addq r6, r20, r6 C + ADDSUB r5, r7, r7 C + addq r6, r21, r6 C + ldq r4, 0(rp) C + umulh v0, r1, r8 C + CMPCY( r5, r7), r20 C + ADDSUB r7, r6, r23 C + CMPCY(r7, 
r23), r21 C + addq r24, r20, r24 C + ldq r5, 8(rp) C + ADDSUB r4, r25, r25 C + stq r22, -16(rp) C + stq r23, -8(rp) C + addq r24, r21, r24 C + br L(x) + + ALIGN(16) +$n23: ldq r4, 0(rp) C + ldq r5, 8(rp) C + umulh v0, r1, r8 C + ADDSUB r4, r25, r25 C +L(x): CMPCY( r4, r25), r20 C + ADDSUB r25, r24, r22 C + CMPCY( r25, r22), r21 C + addq r3, r20, r3 C + ADDSUB r5, r28, r28 C + addq r3, r21, r3 C + CMPCY( r5, r28), r20 C + ADDSUB r28, r3, r23 C + CMPCY( r28, r23), r21 C + addq r8, r20, r8 C + stq r22, 0(rp) C + stq r23, 8(rp) C + addq r8, r21, r0 C + ret r31, (r26), 1 C +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/ev6/gmp-mparam.h b/gmp-6.3.0/mpn/alpha/ev6/gmp-mparam.h new file mode 100644 index 0000000..e51d6b0 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/gmp-mparam.h @@ -0,0 +1,209 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2004, 2005, 2008-2010, 2014 Free +Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +#define DIVEXACT_BY3_METHOD 0 /* override ../diveby3.asm */ + +/* 500 MHz 21164 (agnesi.math.su.se) */ +/* FFT tuning limit = 20000000 */ +/* Generated by tuneup.c, 2014-03-14, gcc 3.3 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 21 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7 +#define USE_PREINV_DIVREM_1 1 /* preinv always */ +#define DIV_QR_1N_PI1_METHOD 2 +#define DIV_QR_1_NORM_THRESHOLD 5 +#define DIV_QR_1_UNNORM_THRESHOLD 1 +#define DIV_QR_2_PI2_THRESHOLD 8 +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 20 + +#define MUL_TOOM22_THRESHOLD 32 +#define MUL_TOOM33_THRESHOLD 117 +#define MUL_TOOM44_THRESHOLD 124 +#define MUL_TOOM6H_THRESHOLD 230 +#define MUL_TOOM8H_THRESHOLD 357 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 88 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 105 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 136 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 59 +#define SQR_TOOM3_THRESHOLD 123 +#define SQR_TOOM4_THRESHOLD 163 +#define SQR_TOOM6_THRESHOLD 333 +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 52 + +#define MULMOD_BNM1_THRESHOLD 19 +#define SQRMOD_BNM1_THRESHOLD 5 + +#define MUL_FFT_MODF_THRESHOLD 468 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 468, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 19, 7}, { 10, 6}, \ + { 24, 7}, { 13, 6}, { 27, 7}, { 14, 6}, \ + { 29, 7}, { 17, 6}, { 35, 7}, { 29, 8}, \ + { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 29, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 51, 9}, { 27, 8}, { 55, 9}, { 35, 8}, \ + { 71, 9}, { 39,10}, { 23, 9}, { 55,10}, \ + { 31, 9}, { 67,10}, { 39, 9}, { 79,10}, \ + { 47, 9}, { 95,10}, { 55,11}, { 31,10}, \ + { 79,11}, { 47,10}, { 103,12}, { 31,11}, \ + { 63,10}, { 135,11}, { 79,10}, { 167,11}, \ + { 95,10}, { 199,11}, { 111,12}, { 63,11}, \ + { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319,12}, { 95,11}, { 191,10}, { 383,11}, \ + { 207,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,12}, { 159,11}, { 319,10}, { 639,11}, \ + { 335,10}, { 671,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ + { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,12}, { 319,11}, { 671,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,14}, { 127,13}, \ + { 255,12}, { 575,11}, { 1151,12}, { 607,13}, \ + { 319,12}, { 735,13}, { 383,12}, { 767,11}, \ + { 1535,12}, { 831,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1215,13}, { 639,12}, { 1343,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \ + { 511,13}, { 1215,14}, { 639,13}, { 1407,14}, \ + { 767,13}, { 1663,14}, { 895,13}, { 1855,15}, \ + { 511,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } 
+#define MUL_FFT_TABLE3_SIZE 151 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 412 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 412, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \ + { 27, 7}, { 14, 6}, { 29, 7}, { 28, 8}, \ + { 15, 7}, { 31, 8}, { 17, 7}, { 36, 8}, \ + { 19, 7}, { 39, 8}, { 29, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,11}, { 79,10}, { 159, 9}, \ + { 319,10}, { 167,11}, { 95,10}, { 191, 9}, \ + { 383,11}, { 111,12}, { 63,11}, { 127,10}, \ + { 271,11}, { 143,10}, { 287, 9}, { 575,10}, \ + { 303,11}, { 159,10}, { 319,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543,11}, { 287,10}, { 575,11}, { 303,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 335,10}, \ + { 671,11}, { 351,10}, { 703,11}, { 367,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ + { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,12}, { 319,11}, { 639,10}, { 1279,11}, \ + { 671,12}, { 351,11}, { 703,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,11}, { 895,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 575,11}, { 1151,12}, { 607,13}, \ + { 319,12}, { 703,11}, { 1407,12}, { 735,13}, \ + { 383,12}, { 831,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1151,13}, { 639,12}, { 1279,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \ + { 511,13}, { 1215,14}, { 639,13}, { 1407,14}, \ + { 767,13}, { 1663,14}, { 895,13}, { 1791,15}, \ + { 511,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 159 +#define SQR_FFT_THRESHOLD 5056 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 100 +#define MULLO_MUL_N_THRESHOLD 11355 + +#define DC_DIV_QR_THRESHOLD 124 +#define DC_DIVAPPR_Q_THRESHOLD 438 +#define DC_BDIV_QR_THRESHOLD 153 +#define DC_BDIV_Q_THRESHOLD 318 + +#define INV_MULMOD_BNM1_THRESHOLD 62 +#define INV_NEWTON_THRESHOLD 384 +#define INV_APPR_THRESHOLD 402 + +#define BINV_NEWTON_THRESHOLD 381 +#define REDC_1_TO_REDC_N_THRESHOLD 110 + +#define MU_DIV_QR_THRESHOLD 1752 +#define MU_DIVAPPR_Q_THRESHOLD 1895 +#define MUPI_DIV_QR_THRESHOLD 174 +#define MU_BDIV_QR_THRESHOLD 1387 +#define MU_BDIV_Q_THRESHOLD 1787 + +#define POWM_SEC_TABLE 1,13,66,82,579 + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD_THRESHOLD 318 +#define HGCD_APPR_THRESHOLD 363 +#define HGCD_REDUCE_THRESHOLD 2384 +#define GCD_DC_THRESHOLD 2504 +#define GCDEXT_DC_THRESHOLD 671 +#define JACOBI_BASE_METHOD 3 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 25 +#define SET_STR_DC_THRESHOLD 3754 +#define SET_STR_PRECOMPUTE_THRESHOLD 8097 + +#define FAC_DSC_THRESHOLD 951 +#define FAC_ODD_THRESHOLD 24 diff --git a/gmp-6.3.0/mpn/alpha/ev6/mod_1_4.asm b/gmp-6.3.0/mpn/alpha/ev6/mod_1_4.asm new file mode 100644 index 0000000..82c42ae --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/mod_1_4.asm @@ -0,0 +1,336 @@ +dnl Alpha mpn_mod_1s_4p + +dnl Contributed to the GNU project by Torbjorn Granlund. 
+ +dnl Copyright 2009, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO: +C * Optimise. 2.75 c/l should be possible. +C * Write a proper mpn_mod_1s_4p_cps. The code below was compiler generated. +C * Optimise feed-in code, starting the sw pipeline in switch code. +C * Shorten software pipeline. The mul instructions are scheduled too far +C from their users. Fixing this will allow us to use fewer registers. +C * If we cannot reduce register usage, write perhaps small-n basecase. +C * Does this work for PIC? + +C cycles/limb +C EV4: ? +C EV5: 23 +C EV6: 3 + +define(`ap', `r16') +define(`n', `r17') +define(`pl', `r24') +define(`ph', `r25') +define(`rl', `r6') +define(`rh', `r7') +define(`B1modb', `r1') +define(`B2modb', `r2') +define(`B3modb', `r3') +define(`B4modb', `r4') +define(`B5modb', `r5') + +ASM_START() +PROLOGUE(mpn_mod_1s_4p) + lda r30, -64(r30) + stq r9, 8(r30) + ldq B1modb, 16(r19) + stq r10, 16(r30) + ldq B2modb, 24(r19) + stq r11, 24(r30) + ldq B3modb, 32(r19) + stq r12, 32(r30) + ldq B4modb, 40(r19) + stq r13, 40(r30) + ldq B5modb, 48(r19) + s8addq n, ap, ap C point ap at vector end + + and n, 3, r0 + lda n, -4(n) + beq r0, L(b0) + lda r6, -2(r0) + blt r6, L(b1) + beq r6, L(b2) + +L(b3): ldq r21, -16(ap) + ldq r22, -8(ap) + ldq r20, -24(ap) + mulq r21, B1modb, r8 + umulh r21, B1modb, r12 + mulq r22, B2modb, r9 + umulh r22, B2modb, r13 + addq r8, r20, pl + cmpult pl, r8, r0 + addq r0, r12, ph + addq r9, pl, rl + cmpult rl, r9, r0 + addq r13, ph, ph + addq r0, ph, rh + lda ap, -56(ap) + br L(com) + +L(b0): ldq r21, -24(ap) + ldq r22, -16(ap) + ldq r23, -8(ap) + ldq r20, -32(ap) + mulq r21, B1modb, r8 + umulh r21, B1modb, r12 + mulq r22, B2modb, r9 + umulh r22, B2modb, r13 + mulq r23, B3modb, r10 + umulh r23, B3modb, r27 + addq r8, r20, pl + cmpult pl, r8, r0 + addq r0, r12, ph + addq r9, pl, pl + cmpult pl, r9, r0 + addq r13, ph, ph + addq r0, ph, ph + addq r10, pl, rl + cmpult rl, r10, r0 + addq r27, ph, ph + addq r0, ph, rh + lda ap, -64(ap) + br L(com) + +L(b1): bis r31, r31, rh + ldq rl, -8(ap) + lda ap, -40(ap) + br L(com) + +L(b2): ldq rh, -8(ap) + ldq rl, -16(ap) + lda ap, -48(ap) + +L(com): ble n, L(ed3) + ldq r21, 8(ap) + ldq r22, 16(ap) + ldq r23, 24(ap) + ldq r20, 0(ap) + lda n, -4(n) + lda ap, -32(ap) + mulq r21, B1modb, r8 + umulh r21, B1modb, r12 + mulq r22, B2modb, r9 + umulh r22, B2modb, r13 + mulq r23, B3modb, r10 + umulh r23, B3modb, r27 + mulq rl, B4modb, r11 + umulh rl, B4modb, r28 + ble n, L(ed2) + + ALIGN(16) +L(top): 
ldq r21, 8(ap) + mulq rh, B5modb, rl + addq r8, r20, pl + ldq r22, 16(ap) + cmpult pl, r8, r0 + umulh rh, B5modb, rh + ldq r23, 24(ap) + addq r0, r12, ph + addq r9, pl, pl + mulq r21, B1modb, r8 + cmpult pl, r9, r0 + addq r13, ph, ph + umulh r21, B1modb, r12 + lda ap, -32(ap) + addq r0, ph, ph + addq r10, pl, pl + mulq r22, B2modb, r9 + cmpult pl, r10, r0 + addq r27, ph, ph + addq r11, pl, pl + umulh r22, B2modb, r13 + addq r0, ph, ph + cmpult pl, r11, r0 + addq r28, ph, ph + mulq r23, B3modb, r10 + ldq r20, 32(ap) + addq pl, rl, rl + umulh r23, B3modb, r27 + addq r0, ph, ph + cmpult rl, pl, r0 + mulq rl, B4modb, r11 + addq ph, rh, rh + umulh rl, B4modb, r28 + addq r0, rh, rh + lda n, -4(n) + bgt n, L(top) + +L(ed2): mulq rh, B5modb, rl + addq r8, r20, pl + umulh rh, B5modb, rh + cmpult pl, r8, r0 + addq r0, r12, ph + addq r9, pl, pl + cmpult pl, r9, r0 + addq r13, ph, ph + addq r0, ph, ph + addq r10, pl, pl + cmpult pl, r10, r0 + addq r27, ph, ph + addq r11, pl, pl + addq r0, ph, ph + cmpult pl, r11, r0 + addq r28, ph, ph + addq pl, rl, rl + addq r0, ph, ph + cmpult rl, pl, r0 + addq ph, rh, rh + addq r0, rh, rh + +L(ed3): mulq rh, B1modb, r8 + umulh rh, B1modb, rh + addq r8, rl, rl + cmpult rl, r8, r0 + addq r0, rh, rh + + ldq r24, 8(r19) C cnt + sll rh, r24, rh + subq r31, r24, r25 + srl rl, r25, r2 + sll rl, r24, rl + or r2, rh, rh + + ldq r23, 0(r19) C bi + mulq rh, r23, r8 + umulh rh, r23, r9 + addq rh, 1, r7 + addq r8, rl, r8 C ql + cmpult r8, rl, r0 + addq r9, r7, r9 + addq r0, r9, r9 C qh + mulq r9, r18, r21 C qh * b + subq rl, r21, rl + cmpult r8, rl, r0 C rl > ql + negq r0, r0 + and r0, r18, r0 + addq rl, r0, rl + cmpule r18, rl, r0 C rl >= b + negq r0, r0 + and r0, r18, r0 + subq rl, r0, rl + + srl rl, r24, r0 + + ldq r9, 8(r30) + ldq r10, 16(r30) + ldq r11, 24(r30) + ldq r12, 32(r30) + ldq r13, 40(r30) + lda r30, 64(r30) + ret r31, (r26), 1 +EPILOGUE() + +PROLOGUE(mpn_mod_1s_4p_cps,gp) + lda r30, -32(r30) + stq r26, 0(r30) + stq r9, 8(r30) + stq r10, 16(r30) + stq r11, 24(r30) + mov r16, r11 + LEA( r4, __clz_tab) + lda r10, 65(r31) + cmpbge r31, r17, r1 + srl r1, 1, r1 + xor r1, 127, r1 + addq r1, r4, r1 + ldq_u r2, 0(r1) + extbl r2, r1, r2 + s8subq r2, 7, r2 + srl r17, r2, r3 + subq r10, r2, r10 + addq r3, r4, r3 + ldq_u r1, 0(r3) + extbl r1, r3, r1 + subq r10, r1, r10 + sll r17, r10, r9 + mov r9, r16 + jsr r26, mpn_invert_limb + LDGP( r29, 0(r26)) + subq r31, r10, r2 + lda r1, 1(r31) + sll r1, r10, r1 + subq r31, r9, r3 + srl r0, r2, r2 + ldq r26, 0(r30) + bis r2, r1, r2 + stq r0, 0(r11) + stq r10, 8(r11) + mulq r2, r3, r2 + srl r2, r10, r3 + umulh r2, r0, r1 + stq r3, 16(r11) + mulq r2, r0, r3 + ornot r31, r1, r1 + subq r1, r2, r1 + mulq r1, r9, r1 + addq r1, r9, r2 + cmpule r1, r3, r3 + cmoveq r3, r2, r1 + srl r1, r10, r3 + umulh r1, r0, r2 + stq r3, 24(r11) + mulq r1, r0, r3 + ornot r31, r2, r2 + subq r2, r1, r2 + mulq r2, r9, r2 + addq r2, r9, r1 + cmpule r2, r3, r3 + cmoveq r3, r1, r2 + srl r2, r10, r1 + umulh r2, r0, r3 + stq r1, 32(r11) + mulq r2, r0, r1 + ornot r31, r3, r3 + subq r3, r2, r3 + mulq r3, r9, r3 + addq r3, r9, r2 + cmpule r3, r1, r1 + cmoveq r1, r2, r3 + srl r3, r10, r2 + umulh r3, r0, r1 + stq r2, 40(r11) + mulq r3, r0, r0 + ornot r31, r1, r1 + subq r1, r3, r1 + mulq r1, r9, r1 + addq r1, r9, r9 + cmpule r1, r0, r0 + cmoveq r0, r9, r1 + ldq r9, 8(r30) + srl r1, r10, r1 + ldq r10, 16(r30) + stq r1, 48(r11) + ldq r11, 24(r30) + lda r30, 32(r30) + ret r31, (r26), 1 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/alpha/ev6/mul_1.asm 
b/gmp-6.3.0/mpn/alpha/ev6/mul_1.asm new file mode 100644 index 0000000..8ee19cd --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/mul_1.asm @@ -0,0 +1,496 @@ +dnl Alpha ev6 mpn_mul_1 -- Multiply a limb vector with a limb and store the +dnl result in a second limb vector. + +dnl Copyright 2000, 2001, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r16 +C s1_ptr r17 +C size r18 +C s2_limb r19 + +C This code runs at 2.25 cycles/limb on EV6. + +C This code was written in close cooperation with ev6 pipeline expert +C Steve Root. Any errors are tege's fault, though. + +C Code structure: + +C code for n < 8 +C code for n > 8 code for (n mod 8) +C code for (n div 8) feed-in code +C 8-way unrolled loop +C wind-down code + +C Some notes about unrolled loop: +C +C r1-r8 multiplies and workup +C r21-r28 multiplies and workup +C r9-r12 loads +C r0 -1 +C r20,r29,r13-r15 scramble +C +C We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a +C put-the-carry-into-hi. The idea is that these branches are very rarely +C taken, and since a non-taken branch consumes no resources, that is better +C than an addq. +C +C Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an +C add NEXT cycle #09 which feeds a store in NEXT cycle #02 + +C The code could use some further work: +C 1. Speed up really small multiplies. The default alpha/mul_1.asm code is +C faster than this for size < 3. +C 2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless +C that is too costly. +C 3. Consider using 4-way unrolling, even if that runs slower. +C 4. Reduce register usage. In particular, try to avoid using r29. 
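For reference, the underlying operation in plain C -- a sketch with unsigned __int128 (a compiler extension, assumed available) standing in for the mulq/umulh pair, and ref_mul_1 an illustrative name:

    typedef unsigned long long mp_limb_t;

    /* rp[] = up[] * v0 (res_ptr, s1_ptr, size, s2_limb); returns cy_limb. */
    mp_limb_t ref_mul_1 (mp_limb_t *rp, const mp_limb_t *up,
                         long n, mp_limb_t v0)
    {
      mp_limb_t cy = 0;
      for (long i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * v0;
          mp_limb_t lo = (mp_limb_t) p + cy;       /* prod_low + cy_limb */
          cy = (mp_limb_t) (p >> 64) + (lo < cy);  /* prod_high + carry  */
          rp[i] = lo;
        }
      return cy;
    }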
+ +ASM_START() +PROLOGUE(mpn_mul_1) + cmpult r18, 8, r1 + beq r1, $Large +$Lsmall: + ldq r2,0(r17) C r2 = s1_limb + lda r18,-1(r18) C size-- + mulq r2,r19,r3 C r3 = prod_low + bic r31,r31,r4 C clear cy_limb + umulh r2,r19,r0 C r0 = prod_high + beq r18,$Le1a C jump if size was == 1 + ldq r2,8(r17) C r2 = s1_limb + lda r18,-1(r18) C size-- + stq r3,0(r16) + beq r18,$Le2a C jump if size was == 2 + ALIGN(8) +$Lopa: mulq r2,r19,r3 C r3 = prod_low + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + lda r18,-1(r18) C size-- + umulh r2,r19,r4 C r4 = cy_limb + ldq r2,16(r17) C r2 = s1_limb + lda r17,8(r17) C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + stq r3,8(r16) + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + lda r16,8(r16) C res_ptr++ + bne r18,$Lopa + +$Le2a: mulq r2,r19,r3 C r3 = prod_low + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r4 C r4 = cy_limb + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + stq r3,8(r16) + addq r4,r0,r0 C cy_limb = prod_high + cy + ret r31,(r26),1 +$Le1a: stq r3,0(r16) + ret r31,(r26),1 + +$Large: + lda r30, -224(r30) + stq r26, 0(r30) + stq r9, 8(r30) + stq r10, 16(r30) + stq r11, 24(r30) + stq r12, 32(r30) + stq r13, 40(r30) + stq r14, 48(r30) + stq r15, 56(r30) + stq r29, 64(r30) + + and r18, 7, r20 C count for the first loop, 0-7 + srl r18, 3, r18 C count for unrolled loop + bis r31, r31, r21 + beq r20, $L_8_or_more C skip first loop + +$L_9_or_more: + ldq r2,0(r17) C r2 = s1_limb + lda r17,8(r17) C s1_ptr++ + lda r20,-1(r20) C size-- + mulq r2,r19,r3 C r3 = prod_low + umulh r2,r19,r21 C r21 = prod_high + beq r20,$Le1b C jump if size was == 1 + bis r31, r31, r0 C FIXME: shouldn't need this + ldq r2,0(r17) C r2 = s1_limb + lda r17,8(r17) C s1_ptr++ + lda r20,-1(r20) C size-- + stq r3,0(r16) + lda r16,8(r16) C res_ptr++ + beq r20,$Le2b C jump if size was == 2 + ALIGN(8) +$Lopb: mulq r2,r19,r3 C r3 = prod_low + addq r21,r0,r0 C cy_limb = cy_limb + 'cy' + lda r20,-1(r20) C size-- + umulh r2,r19,r21 C r21 = prod_high + ldq r2,0(r17) C r2 = s1_limb + lda r17,8(r17) C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + stq r3,0(r16) + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + lda r16,8(r16) C res_ptr++ + bne r20,$Lopb + +$Le2b: mulq r2,r19,r3 C r3 = prod_low + addq r21,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r21 C r21 = prod_high + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + stq r3,0(r16) + lda r16,8(r16) C res_ptr++ + addq r21,r0,r21 C cy_limb = prod_high + cy + br r31, $L_8_or_more +$Le1b: stq r3,0(r16) + lda r16,8(r16) C res_ptr++ + +$L_8_or_more: + lda r0, -1(r31) C put -1 in r0, for tricky loop control + lda r17, -32(r17) C L1 bookkeeping + lda r18, -1(r18) C decrement count + + ldq r9, 32(r17) C L1 + ldq r10, 40(r17) C L1 + mulq r9, r19, r22 C U1 #07 + ldq r11, 48(r17) C L1 + umulh r9, r19, r23 C U1 #08 + ldq r12, 56(r17) C L1 + mulq r10, r19, r24 C U1 #09 + ldq r9, 64(r17) C L1 + + lda r17, 64(r17) C L1 bookkeeping + + umulh r10, r19, r25 C U1 #11 + mulq r11, r19, r26 C U1 #12 + umulh r11, r19, r27 C U1 #13 + mulq r12, r19, r28 C U1 #14 + ldq r10, 8(r17) C L1 + umulh r12, r19, r1 C U1 #15 + ldq r11, 16(r17) C L1 + mulq r9, r19, r2 C U1 #16 + ldq r12, 24(r17) C L1 + umulh r9, r19, r3 C U1 #17 + addq r21, r22, r13 C L1 mov + mulq r10, r19, r4 C U1 #18 + addq r23, r24, r22 C L0 sum 2 mul's + cmpult r13, r21, r14 C L1 carry from sum + bgt r18, $L_16_or_more + + cmpult r22, r24, r24 C U0 carry from sum + umulh r10, 
r19, r5 C U1 #02 + addq r25, r26, r23 C U0 sum 2 mul's + mulq r11, r19, r6 C U1 #03 + cmpult r23, r26, r25 C U0 carry from sum + umulh r11, r19, r7 C U1 #04 + addq r27, r28, r28 C U0 sum 2 mul's + mulq r12, r19, r8 C U1 #05 + cmpult r28, r27, r15 C L0 carry from sum + lda r16, 32(r16) C L1 bookkeeping + addq r13, r31, r13 C U0 start carry cascade + umulh r12, r19, r21 C U1 #06 + br r31, $ret0c + +$L_16_or_more: +C --------------------------------------------------------------- + subq r18,1,r18 + cmpult r22, r24, r24 C U0 carry from sum + ldq r9, 32(r17) C L1 + + umulh r10, r19, r5 C U1 #02 + addq r25, r26, r23 C U0 sum 2 mul's + mulq r11, r19, r6 C U1 #03 + cmpult r23, r26, r25 C U0 carry from sum + umulh r11, r19, r7 C U1 #04 + addq r27, r28, r28 C U0 sum 2 mul's + mulq r12, r19, r8 C U1 #05 + cmpult r28, r27, r15 C L0 carry from sum + lda r16, 32(r16) C L1 bookkeeping + addq r13, r31, r13 C U0 start carry cascade + + umulh r12, r19, r21 C U1 #06 +C beq r13, $fix0w C U0 +$ret0w: addq r22, r14, r26 C L0 + ldq r10, 40(r17) C L1 + + mulq r9, r19, r22 C U1 #07 + beq r26, $fix1w C U0 +$ret1w: addq r23, r24, r27 C L0 + ldq r11, 48(r17) C L1 + + umulh r9, r19, r23 C U1 #08 + beq r27, $fix2w C U0 +$ret2w: addq r28, r25, r28 C L0 + ldq r12, 56(r17) C L1 + + mulq r10, r19, r24 C U1 #09 + beq r28, $fix3w C U0 +$ret3w: addq r1, r2, r20 C L0 sum 2 mul's + ldq r9, 64(r17) C L1 + + addq r3, r4, r2 C L0 #10 2 mul's + lda r17, 64(r17) C L1 bookkeeping + cmpult r20, r1, r29 C U0 carry from sum + + umulh r10, r19, r25 C U1 #11 + cmpult r2, r4, r4 C U0 carry from sum + stq r13, -32(r16) C L0 + stq r26, -24(r16) C L1 + + mulq r11, r19, r26 C U1 #12 + addq r5, r6, r14 C U0 sum 2 mul's + stq r27, -16(r16) C L0 + stq r28, -8(r16) C L1 + + umulh r11, r19, r27 C U1 #13 + cmpult r14, r6, r3 C U0 carry from sum +C could do cross-jumping here: +C bra $L_middle_of_unrolled_loop + mulq r12, r19, r28 C U1 #14 + addq r7, r3, r5 C L0 eat carry + addq r20, r15, r20 C U0 carry cascade + ldq r10, 8(r17) C L1 + + umulh r12, r19, r1 C U1 #15 + beq r20, $fix4 C U0 +$ret4w: addq r2, r29, r6 C L0 + ldq r11, 16(r17) C L1 + + mulq r9, r19, r2 C U1 #16 + beq r6, $fix5 C U0 +$ret5w: addq r14, r4, r7 C L0 + ldq r12, 24(r17) C L1 + + umulh r9, r19, r3 C U1 #17 + beq r7, $fix6 C U0 +$ret6w: addq r5, r8, r8 C L0 sum 2 + addq r21, r22, r13 C L1 sum 2 mul's + + mulq r10, r19, r4 C U1 #18 + addq r23, r24, r22 C L0 sum 2 mul's + cmpult r13, r21, r14 C L1 carry from sum + ble r18, $Lend C U0 +C --------------------------------------------------------------- + ALIGN(16) +$Loop: + umulh r0, r18, r18 C U1 #01 decrement r18! 
+ cmpult r8, r5, r29 C L0 carry from last bunch + cmpult r22, r24, r24 C U0 carry from sum + ldq r9, 32(r17) C L1 + + umulh r10, r19, r5 C U1 #02 + addq r25, r26, r23 C U0 sum 2 mul's + stq r20, 0(r16) C L0 + stq r6, 8(r16) C L1 + + mulq r11, r19, r6 C U1 #03 + cmpult r23, r26, r25 C U0 carry from sum + stq r7, 16(r16) C L0 + stq r8, 24(r16) C L1 + + umulh r11, r19, r7 C U1 #04 + bis r31, r31, r31 C L0 st slosh + bis r31, r31, r31 C L1 st slosh + addq r27, r28, r28 C U0 sum 2 mul's + + mulq r12, r19, r8 C U1 #05 + cmpult r28, r27, r15 C L0 carry from sum + lda r16, 64(r16) C L1 bookkeeping + addq r13, r29, r13 C U0 start carry cascade + + umulh r12, r19, r21 C U1 #06 + beq r13, $fix0 C U0 +$ret0: addq r22, r14, r26 C L0 + ldq r10, 40(r17) C L1 + + mulq r9, r19, r22 C U1 #07 + beq r26, $fix1 C U0 +$ret1: addq r23, r24, r27 C L0 + ldq r11, 48(r17) C L1 + + umulh r9, r19, r23 C U1 #08 + beq r27, $fix2 C U0 +$ret2: addq r28, r25, r28 C L0 + ldq r12, 56(r17) C L1 + + mulq r10, r19, r24 C U1 #09 + beq r28, $fix3 C U0 +$ret3: addq r1, r2, r20 C L0 sum 2 mul's + ldq r9, 64(r17) C L1 + + addq r3, r4, r2 C L0 #10 2 mul's + bis r31, r31, r31 C U1 mul hole + lda r17, 64(r17) C L1 bookkeeping + cmpult r20, r1, r29 C U0 carry from sum + + umulh r10, r19, r25 C U1 #11 + cmpult r2, r4, r4 C U0 carry from sum + stq r13, -32(r16) C L0 + stq r26, -24(r16) C L1 + + mulq r11, r19, r26 C U1 #12 + addq r5, r6, r14 C U0 sum 2 mul's + stq r27, -16(r16) C L0 + stq r28, -8(r16) C L1 + + umulh r11, r19, r27 C U1 #13 + bis r31, r31, r31 C L0 st slosh + bis r31, r31, r31 C L1 st slosh + cmpult r14, r6, r3 C U0 carry from sum +$L_middle_of_unrolled_loop: + mulq r12, r19, r28 C U1 #14 + addq r7, r3, r5 C L0 eat carry + addq r20, r15, r20 C U0 carry cascade + ldq r10, 8(r17) C L1 + + umulh r12, r19, r1 C U1 #15 + beq r20, $fix4 C U0 +$ret4: addq r2, r29, r6 C L0 + ldq r11, 16(r17) C L1 + + mulq r9, r19, r2 C U1 #16 + beq r6, $fix5 C U0 +$ret5: addq r14, r4, r7 C L0 + ldq r12, 24(r17) C L1 + + umulh r9, r19, r3 C U1 #17 + beq r7, $fix6 C U0 +$ret6: addq r5, r8, r8 C L0 sum 2 + addq r21, r22, r13 C L1 sum 2 mul's + + mulq r10, r19, r4 C U1 #18 + addq r23, r24, r22 C L0 sum 2 mul's + cmpult r13, r21, r14 C L1 carry from sum + bgt r18, $Loop C U0 +C --------------------------------------------------------------- +$Lend: + cmpult r8, r5, r29 C L0 carry from last bunch + cmpult r22, r24, r24 C U0 carry from sum + + umulh r10, r19, r5 C U1 #02 + addq r25, r26, r23 C U0 sum 2 mul's + stq r20, 0(r16) C L0 + stq r6, 8(r16) C L1 + + mulq r11, r19, r6 C U1 #03 + cmpult r23, r26, r25 C U0 carry from sum + stq r7, 16(r16) C L0 + stq r8, 24(r16) C L1 + + umulh r11, r19, r7 C U1 #04 + addq r27, r28, r28 C U0 sum 2 mul's + + mulq r12, r19, r8 C U1 #05 + cmpult r28, r27, r15 C L0 carry from sum + lda r16, 64(r16) C L1 bookkeeping + addq r13, r29, r13 C U0 start carry cascade + + umulh r12, r19, r21 C U1 #06 + beq r13, $fix0c C U0 +$ret0c: addq r22, r14, r26 C L0 + beq r26, $fix1c C U0 +$ret1c: addq r23, r24, r27 C L0 + beq r27, $fix2c C U0 +$ret2c: addq r28, r25, r28 C L0 + beq r28, $fix3c C U0 +$ret3c: addq r1, r2, r20 C L0 sum 2 mul's + addq r3, r4, r2 C L0 #10 2 mul's + lda r17, 64(r17) C L1 bookkeeping + cmpult r20, r1, r29 C U0 carry from sum + cmpult r2, r4, r4 C U0 carry from sum + stq r13, -32(r16) C L0 + stq r26, -24(r16) C L1 + addq r5, r6, r14 C U0 sum 2 mul's + stq r27, -16(r16) C L0 + stq r28, -8(r16) C L1 + cmpult r14, r6, r3 C U0 carry from sum + addq r7, r3, r5 C L0 eat carry + addq r20, r15, r20 C U0 carry cascade + beq 
r20, $fix4c C U0 +$ret4c: addq r2, r29, r6 C L0 + beq r6, $fix5c C U0 +$ret5c: addq r14, r4, r7 C L0 + beq r7, $fix6c C U0 +$ret6c: addq r5, r8, r8 C L0 sum 2 + cmpult r8, r5, r29 C L0 carry from last bunch + stq r20, 0(r16) C L0 + stq r6, 8(r16) C L1 + stq r7, 16(r16) C L0 + stq r8, 24(r16) C L1 + addq r29, r21, r0 + + ldq r26, 0(r30) + ldq r9, 8(r30) + ldq r10, 16(r30) + ldq r11, 24(r30) + ldq r12, 32(r30) + ldq r13, 40(r30) + ldq r14, 48(r30) + ldq r15, 56(r30) + ldq r29, 64(r30) + lda r30, 224(r30) + ret r31, (r26), 1 + +C $fix0w: bis r14, r29, r14 C join carries +C br r31, $ret0w +$fix1w: bis r24, r14, r24 C join carries + br r31, $ret1w +$fix2w: bis r25, r24, r25 C join carries + br r31, $ret2w +$fix3w: bis r15, r25, r15 C join carries + br r31, $ret3w +$fix0: bis r14, r29, r14 C join carries + br r31, $ret0 +$fix1: bis r24, r14, r24 C join carries + br r31, $ret1 +$fix2: bis r25, r24, r25 C join carries + br r31, $ret2 +$fix3: bis r15, r25, r15 C join carries + br r31, $ret3 +$fix4: bis r29, r15, r29 C join carries + br r31, $ret4 +$fix5: bis r4, r29, r4 C join carries + br r31, $ret5 +$fix6: addq r5, r4, r5 C can't carry twice! + br r31, $ret6 +$fix0c: bis r14, r29, r14 C join carries + br r31, $ret0c +$fix1c: bis r24, r14, r24 C join carries + br r31, $ret1c +$fix2c: bis r25, r24, r25 C join carries + br r31, $ret2c +$fix3c: bis r15, r25, r15 C join carries + br r31, $ret3c +$fix4c: bis r29, r15, r29 C join carries + br r31, $ret4c +$fix5c: bis r4, r29, r4 C join carries + br r31, $ret5c +$fix6c: addq r5, r4, r5 C can't carry twice! + br r31, $ret6c + +EPILOGUE(mpn_mul_1) +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/README b/gmp-6.3.0/mpn/alpha/ev6/nails/README new file mode 100644 index 0000000..b214ac5 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/nails/README @@ -0,0 +1,65 @@ +Copyright 2002, 2005 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + +This directory contains assembly code for nails-enabled 21264. The code is not +very well optimized. + +For addmul_N, as N grows larger, we could make multiple loads together, then do +about 3.3 i/c. 10 cycles after the last load, we can increase to 4 i/c. This +would surely allow addmul_4 to run at 2 c/l, but the same should be possible +also for addmul_3 and perhaps even addmul_2. 
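For background, "nails" means each 64-bit word holds only GMP_NUMB_BITS significant bits, keeping the top GMP_NAIL_BITS zero so several partial products can be accumulated before any spill has to be propagated. Schematically, in C (illustrative only; NAIL_BITS is fixed at 4 here, whereas the real value comes from configure):

    #define NAIL_BITS 4                 /* stands in for GMP_NAIL_BITS */
    #define NUMB_BITS (64 - NAIL_BITS)  /* stands in for GMP_NUMB_BITS */

    typedef unsigned long long mp_limb_t;

    static const mp_limb_t numb_mask = ~0ULL >> NAIL_BITS;

    /* Split an accumulator into the stored numb part and the deferred
       carry, as the asm does with "and acc,numb_mask" / "srl acc,NUMB_BITS". */
    static mp_limb_t numb_split (mp_limb_t acc, mp_limb_t *nail)
    {
      *nail = acc >> NUMB_BITS;  /* carry deferred into the next limb */
      return acc & numb_mask;    /* the NUMB_BITS kept in this limb */
    }

Deferring carries this way is what lets the loops below sum partial products with plain addq and no cmpult carry chains; the projected rates are tabulated next.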
+ + + current fair best +Routine c/l unroll c/l unroll c/l i/c +mul_1 3.25 2.75 2.75 3.273 +addmul_1 4.0 4 3.5 4 14 3.25 3.385 +addmul_2 4.0 1 2.5 2 10 2.25 3.333 +addmul_3 3.0 1 2.33 2 14 2 3.333 +addmul_4 2.5 1 2.125 2 17 2 3.135 + +addmul_5 2 1 10 +addmul_6 2 1 12 +addmul_7 2 1 14 + +(The "best" column doesn't account for bookkeeping instructions and +thereby assumes infinite unrolling.) + +Basecase usages: + +1 addmul_1 +2 addmul_2 +3 addmul_3 +4 addmul_4 +5 addmul_3 + addmul_2 2.3998 +6 addmul_4 + addmul_2 +7 addmul_4 + addmul_3 diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm new file mode 100644 index 0000000..711d4e6 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_1.asm @@ -0,0 +1,396 @@ +dnl Alpha ev6 nails mpn_addmul_1. + +dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: 42 +C EV5: 18 +C EV6: 4 + +C TODO +C * Reroll loop for 3.75 c/l with current 4-way unrolling. +C * The loop is overscheduled wrt loads and wrt multiplies, in particular +C umulh. +C * Use FP loop count and multiple exit points, that would simplify feed-in lp0 +C and would work since the loop structure is really regular. 
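Per limb, the loop below computes low-product + previous high-product + rp limb + deferred nail, then splits off the new nail. As a plain C model -- a sketch only, with unsigned __int128 standing in for the mulq/umulh pair on the pre-shifted v0, and NAIL_BITS >= 2 (cf. NAILS_SUPPORT below) keeping the 64-bit sum from overflowing:

    #define NAIL_BITS 4                 /* stands in for GMP_NAIL_BITS */
    #define NUMB_BITS (64 - NAIL_BITS)

    typedef unsigned long long mp_limb_t;

    /* rp[] += up[] * v0, where limbs carry NUMB_BITS significant bits. */
    mp_limb_t ref_addmul_1_nails (mp_limb_t *rp, const mp_limb_t *up,
                                  long n, mp_limb_t v0)
    {
      mp_limb_t numb_mask = ~0ULL >> NAIL_BITS;
      mp_limb_t hi = 0, nail = 0;
      for (long i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * v0;
          mp_limb_t plo = (mp_limb_t) p & numb_mask;     /* srl m0a,NAIL_BITS */
          mp_limb_t phi = (mp_limb_t) (p >> NUMB_BITS);  /* umulh of v0<<NAIL_BITS */
          mp_limb_t acc = plo + hi + rp[i] + nail;       /* plain addq, no carry chain */
          rp[i] = acc & numb_mask;                       /* and acc,numb_mask */
          nail = acc >> NUMB_BITS;                       /* srl acc,NUMB_BITS */
          hi = phi;
        }
      return hi + nail;  /* the asm's final addq t1, m1b, r0 */
    }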
+ +C INPUT PARAMETERS +define(`rp',`r16') +define(`up',`r17') +define(`n', `r18') +define(`vl0',`r19') + +define(`numb_mask',`r6') + +define(`m0a',`r0') +define(`m0b',`r1') +define(`m1a',`r2') +define(`m1b',`r3') +define(`m2a',`r20') +define(`m2b',`r21') +define(`m3a',`r22') +define(`m3b',`r23') + +define(`acc0',`r25') +define(`acc1',`r27') + +define(`ul0',`r4') +define(`ul1',`r5') +define(`ul2',`r4') +define(`ul3',`r5') + +define(`rl0',`r24') +define(`rl1',`r24') +define(`rl2',`r24') +define(`rl3',`r24') + +define(`t0',`r7') +define(`t1',`r8') + +define(`NAIL_BITS',`GMP_NAIL_BITS') +define(`NUMB_BITS',`GMP_NUMB_BITS') + +dnl This declaration is munged by configure +NAILS_SUPPORT(2-63) + +ASM_START() +PROLOGUE(mpn_addmul_1) + sll vl0, NAIL_BITS, vl0 + lda numb_mask, -1(r31) + srl numb_mask, NAIL_BITS, numb_mask + + and n, 3, r25 + cmpeq r25, 1, r21 + bne r21, L(1m4) + cmpeq r25, 2, r21 + bne r21, L(2m4) + beq r25, L(0m4) + +L(3m4): ldq ul3, 0(up) + lda n, -4(n) + ldq ul0, 8(up) + mulq vl0, ul3, m3a + umulh vl0, ul3, m3b + ldq ul1, 16(up) + lda up, 24(up) + lda rp, -8(rp) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + bge n, L(ge3) + + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq rl3, 8(rp) + srl m3a,NAIL_BITS, t0 + addq t0, r31, acc1 + addq rl3, acc1, acc1 + ldq rl0, 16(rp) + srl m0a,NAIL_BITS, t0 + addq t0, m3b, acc0 + srl acc1,NUMB_BITS, t1 + br r31, L(ta3) + +L(ge3): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq rl3, 8(rp) + srl m3a,NAIL_BITS, t0 + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + addq t0, r31, acc1 + umulh vl0, ul2, m2b + addq rl3, acc1, acc1 + ldq rl0, 16(rp) + srl m0a,NAIL_BITS, t0 + ldq ul0, 16(up) + mulq vl0, ul3, m3a + addq t0, m3b, acc0 + srl acc1,NUMB_BITS, t1 + br r31, L(el3) + +L(0m4): lda n, -8(n) + ldq ul2, 0(up) + ldq ul3, 8(up) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + ldq ul0, 16(up) + mulq vl0, ul3, m3a + umulh vl0, ul3, m3b + ldq ul1, 24(up) + lda up, 32(up) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + bge n, L(ge4) + + ldq rl2, 0(rp) + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, r31, acc0 + umulh vl0, ul1, m1b + addq rl2, acc0, acc0 + ldq rl3, 8(rp) + srl m3a,NAIL_BITS, t0 + addq t0, m2b, acc1 + srl acc0,NUMB_BITS, t1 + br r31, L(ta4) + +L(ge4): ldq rl2, 0(rp) + srl m2a,NAIL_BITS, t0 + ldq ul2, 0(up) + mulq vl0, ul1, m1a + addq t0, r31, acc0 + umulh vl0, ul1, m1b + addq rl2, acc0, acc0 + ldq rl3, 8(rp) + srl m3a,NAIL_BITS, t0 + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + addq t0, m2b, acc1 + srl acc0,NUMB_BITS, t1 + br r31, L(el0) + +L(2m4): lda n, -4(n) + ldq ul0, 0(up) + ldq ul1, 8(up) + lda up, 16(up) + lda rp, -16(rp) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + bge n, L(ge2) + + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq rl0, 16(rp) + srl m0a,NAIL_BITS, t0 + addq t0, r31, acc0 + addq rl0, acc0, acc0 + ldq rl1, 24(rp) + srl m1a,NAIL_BITS, t0 + addq t0, m0b, acc1 + srl acc0,NUMB_BITS, t1 + br r31, L(ta2) + +L(ge2): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + ldq rl0, 16(rp) + srl m0a,NAIL_BITS, t0 + ldq ul0, 16(up) + mulq vl0, ul3, m3a + addq t0, r31, acc0 + umulh vl0, ul3, m3b + addq rl0, acc0, acc0 + ldq rl1, 24(rp) + srl m1a,NAIL_BITS, t0 + ldq ul1, 24(up) + lda up, 32(up) + lda rp, 32(rp) + mulq vl0, ul0, m0a + addq t0, m0b, acc1 + srl acc0,NUMB_BITS, t1 + bge n, L(el2) + + br r31, L(ta6) + +L(1m4): lda n, -4(n) + ldq ul1, 0(up) + lda up, 8(up) + lda rp, -24(rp) + bge n, L(ge1) + + mulq vl0, ul1, m1a + umulh 
vl0, ul1, m1b + ldq rl1, 24(rp) + srl m1a,NAIL_BITS, t0 + addq rl1, t0, acc1 + and acc1,numb_mask, r28 + srl acc1,NUMB_BITS, t1 + stq r28, 24(rp) + addq t1, m1b, r0 + ret r31, (r26), 1 + +L(ge1): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + ldq ul0, 16(up) + mulq vl0, ul3, m3a + umulh vl0, ul3, m3b + ldq rl1, 24(rp) + srl m1a,NAIL_BITS, t0 + ldq ul1, 24(up) + lda up, 32(up) + lda rp, 32(rp) + mulq vl0, ul0, m0a + addq t0, r31, acc1 + umulh vl0, ul0, m0b + addq rl1, acc1, acc1 + ldq rl2, 0(rp) + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, m1b, acc0 + srl acc1,NUMB_BITS, t1 + blt n, L(ta5) + +L(ge5): ldq ul2, 0(up) + br r31, L(el1) + + ALIGN(16) +L(top): mulq vl0, ul0, m0a C U1 + addq t0, m0b, acc1 C L0 + srl acc0,NUMB_BITS, t1 C U0 + stq r28, -24(rp) C L1 +C +L(el2): umulh vl0, ul0, m0b C U1 + and acc0,numb_mask, r28 C L0 + addq rl1, acc1, acc1 C U0 + ldq rl2, 0(rp) C L1 +C + unop C U1 + addq t1, acc1, acc1 C L0 + srl m2a,NAIL_BITS, t0 C U0 + ldq ul2, 0(up) C L1 +C + mulq vl0, ul1, m1a C U1 + addq t0, m1b, acc0 C L0 + srl acc1,NUMB_BITS, t1 C U0 + stq r28, -16(rp) C L1 +C +L(el1): umulh vl0, ul1, m1b C U1 + and acc1,numb_mask, r28 C L0 + addq rl2, acc0, acc0 C U0 + ldq rl3, 8(rp) C L1 +C + lda n, -4(n) C L1 + addq t1, acc0, acc0 C L0 + srl m3a,NAIL_BITS, t0 C U0 + ldq ul3, 8(up) C L1 +C + mulq vl0, ul2, m2a C U1 + addq t0, m2b, acc1 C L0 + srl acc0,NUMB_BITS, t1 C U0 + stq r28, -8(rp) C L1 +C +L(el0): umulh vl0, ul2, m2b C U1 + and acc0,numb_mask, r28 C L0 + addq rl3, acc1, acc1 C U0 + ldq rl0, 16(rp) C L1 +C + unop C U1 + addq t1, acc1, acc1 C L0 + srl m0a,NAIL_BITS, t0 C U0 + ldq ul0, 16(up) C L1 +C + mulq vl0, ul3, m3a C U1 + addq t0, m3b, acc0 C L0 + srl acc1,NUMB_BITS, t1 C U0 + stq r28, 0(rp) C L1 +C +L(el3): umulh vl0, ul3, m3b C U1 + and acc1,numb_mask, r28 C L0 + addq rl0, acc0, acc0 C U0 + ldq rl1, 24(rp) C L1 +C + unop C U1 + addq t1, acc0, acc0 C L0 + srl m1a,NAIL_BITS, t0 C U0 + ldq ul1, 24(up) C L1 +C + lda up, 32(up) C L0 + unop C U1 + lda rp, 32(rp) C L1 + bge n, L(top) C U0 + +L(end): mulq vl0, ul0, m0a + addq t0, m0b, acc1 + srl acc0,NUMB_BITS, t1 + stq r28, -24(rp) +L(ta6): umulh vl0, ul0, m0b + and acc0,numb_mask, r28 + addq rl1, acc1, acc1 + ldq rl2, 0(rp) + addq t1, acc1, acc1 + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, m1b, acc0 + srl acc1,NUMB_BITS, t1 + stq r28, -16(rp) +L(ta5): umulh vl0, ul1, m1b + and acc1,numb_mask, r28 + addq rl2, acc0, acc0 + ldq rl3, 8(rp) + addq t1, acc0, acc0 + srl m3a,NAIL_BITS, t0 + addq t0, m2b, acc1 + srl acc0,NUMB_BITS, t1 + stq r28, -8(rp) + unop + ALIGN(16) +L(ta4): and acc0,numb_mask, r28 + addq rl3, acc1, acc1 + ldq rl0, 16(rp) + addq t1, acc1, acc1 + srl m0a,NAIL_BITS, t0 + addq t0, m3b, acc0 + srl acc1,NUMB_BITS, t1 + stq r28, 0(rp) + unop + ALIGN(16) +L(ta3): and acc1,numb_mask, r28 + addq rl0, acc0, acc0 + ldq rl1, 24(rp) + addq t1, acc0, acc0 + srl m1a,NAIL_BITS, t0 + addq t0, m0b, acc1 + srl acc0,NUMB_BITS, t1 + stq r28, 8(rp) + unop + ALIGN(16) +L(ta2): and acc0,numb_mask, r28 + addq rl1, acc1, acc1 + addq t1, acc1, acc1 + srl acc1,NUMB_BITS, t1 + stq r28, 16(rp) + and acc1,numb_mask, r28 + addq t1, m1b, r0 + stq r28, 24(rp) + ret r31, (r26), 1 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm new file mode 100644 index 0000000..6ff6b3a --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_2.asm @@ -0,0 +1,146 @@ +dnl Alpha ev6 nails 
mpn_addmul_2. + +dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C Runs at 4.0 cycles/limb. + +C We could either go for 2-way unrolling over 11 cycles, or 2.75 c/l, +C or 4-way unrolling over 20 cycles, for 2.5 c/l. + + +C INPUT PARAMETERS +define(`rp',`r16') +define(`up',`r17') +define(`n',`r18') +define(`vp',`r19') + +C Useful register aliases +define(`numb_mask',`r24') +define(`ulimb',`r25') +define(`rlimb',`r27') + +define(`m0a',`r0') +define(`m0b',`r1') +define(`m1a',`r2') +define(`m1b',`r3') + +define(`acc0',`r4') +define(`acc1',`r5') + +define(`v0',`r6') +define(`v1',`r7') + +C Used for temps: r8 r19 r28 + +define(`NAIL_BITS',`GMP_NAIL_BITS') +define(`NUMB_BITS',`GMP_NUMB_BITS') + +C This declaration is munged by configure +NAILS_SUPPORT(3-63) + +ASM_START() +PROLOGUE(mpn_addmul_2) + lda numb_mask,-1(r31) + srl numb_mask,NAIL_BITS,numb_mask + + ldq v0, 0(vp) + ldq v1, 8(vp) + + bis r31, r31, acc0 C zero acc0 + sll v0,NAIL_BITS, v0 + bis r31, r31, acc1 C zero acc1 + sll v1,NAIL_BITS, v1 + bis r31, r31, r19 + + ldq ulimb, 0(up) + lda up, 8(up) + mulq v0, ulimb, m0a C U1 + umulh v0, ulimb, m0b C U1 + mulq v1, ulimb, m1a C U1 + umulh v1, ulimb, m1b C U1 + lda n, -1(n) + beq n, L(end) C U0 + + ALIGN(16) +L(top): bis r31, r31, r31 C U1 nop + addq r19, acc0, acc0 C U0 propagate nail + ldq rlimb, 0(rp) C L0 + ldq ulimb, 0(up) C L1 + + lda rp, 8(rp) C L1 + srl m0a,NAIL_BITS, r8 C U0 + lda up, 8(up) C L0 + mulq v0, ulimb, m0a C U1 + + addq r8, acc0, r19 C U0 + addq m0b, acc1, acc0 C L1 + umulh v0, ulimb, m0b C U1 + bis r31, r31, r31 C L0 nop + + addq rlimb, r19, r19 C L1 FINAL PROD-SUM + srl m1a,NAIL_BITS, r8 C U0 + lda n, -1(n) C L0 + mulq v1, ulimb, m1a C U1 + + addq r8, acc0, acc0 C U0 + bis r31, m1b, acc1 C L1 + umulh v1, ulimb, m1b C U1 + and r19,numb_mask, r28 C L0 extract numb part + + unop + srl r19,NUMB_BITS, r19 C U1 extract nail part + stq r28, -8(rp) C L1 + bne n, L(top) C U0 + +L(end): ldq rlimb, 0(rp) + addq r19, acc0, acc0 C propagate nail + lda rp, 8(rp) + srl m0a,NAIL_BITS, r8 C U0 + addq r8, acc0, r19 + addq m0b, acc1, acc0 + addq rlimb, r19, r19 + srl m1a,NAIL_BITS, r8 C U0 + addq r8, acc0, acc0 + bis r31, m1b, acc1 + and r19,numb_mask, r28 C extract limb + + srl r19,NUMB_BITS, r19 C extract nail + stq r28, -8(rp) + + addq r19, acc0, acc0 C propagate nail + and acc0,numb_mask, r28 + stq r28, 0(rp) + srl acc0,NUMB_BITS, r19 + addq r19, acc1, r0 + + ret r31, (r26), 1 +EPILOGUE() +ASM_END() diff --git 
a/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm new file mode 100644 index 0000000..a1ffb68 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_3.asm @@ -0,0 +1,169 @@ +dnl Alpha ev6 nails mpn_addmul_3. + +dnl Copyright 2002, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C Runs at 3.0 cycles/limb. + +C With 2-way unrolling, we could probably reach 2.25 c/l (3.33 i/c). + + +C INPUT PARAMETERS +define(`rp',`r16') +define(`up',`r17') +define(`n',`r18') +define(`vp',`r19') + +C Useful register aliases +define(`numb_mask',`r24') +define(`ulimb',`r25') +define(`rlimb',`r27') + +define(`m0a',`r0') +define(`m0b',`r1') +define(`m1a',`r2') +define(`m1b',`r3') +define(`m2a',`r20') +define(`m2b',`r21') + +define(`acc0',`r4') +define(`acc1',`r5') +define(`acc2',`r22') + +define(`v0',`r6') +define(`v1',`r7') +define(`v2',`r23') + +C Used for temps: r8 r19 r28 + +define(`NAIL_BITS',`GMP_NAIL_BITS') +define(`NUMB_BITS',`GMP_NUMB_BITS') + +C This declaration is munged by configure +NAILS_SUPPORT(3-63) + +ASM_START() +PROLOGUE(mpn_addmul_3) + lda numb_mask,-1(r31) + srl numb_mask,NAIL_BITS,numb_mask + + ldq v0, 0(vp) + ldq v1, 8(vp) + ldq v2, 16(vp) + + bis r31, r31, acc0 C zero acc0 + sll v0,NAIL_BITS, v0 + bis r31, r31, acc1 C zero acc1 + sll v1,NAIL_BITS, v1 + bis r31, r31, acc2 C zero acc2 + sll v2,NAIL_BITS, v2 + bis r31, r31, r19 + + ldq ulimb, 0(up) + lda up, 8(up) + mulq v0, ulimb, m0a C U1 + umulh v0, ulimb, m0b C U1 + mulq v1, ulimb, m1a C U1 + umulh v1, ulimb, m1b C U1 + lda n, -1(n) + mulq v2, ulimb, m2a C U1 + umulh v2, ulimb, m2b C U1 + beq n, L(end) C U0 + + ALIGN(16) +L(top): ldq rlimb, 0(rp) C L1 + ldq ulimb, 0(up) C L0 + bis r31, r31, r31 C U0 nop + addq r19, acc0, acc0 C U1 propagate nail + + lda rp, 8(rp) C L1 + srl m0a,NAIL_BITS, r8 C U0 + lda up, 8(up) C L0 + mulq v0, ulimb, m0a C U1 + + addq r8, acc0, r19 C U0 + addq m0b, acc1, acc0 C L1 + umulh v0, ulimb, m0b C U1 + bis r31, r31, r31 C L0 nop + + addq rlimb, r19, r19 C L1 + srl m1a,NAIL_BITS, r8 C U0 + bis r31, r31, r31 C L0 nop + mulq v1, ulimb, m1a C U1 + + addq r8, acc0, acc0 C U0 + addq m1b, acc2, acc1 C L1 + umulh v1, ulimb, m1b C U1 + and r19,numb_mask, r28 C L0 extract numb part + + bis r31, r31, r31 C L1 nop + srl m2a,NAIL_BITS, r8 C U0 + lda n, -1(n) C L0 + mulq v2, ulimb, m2a C U1 + + addq r8, acc1, acc1 C L0 + bis r31, m2b, acc2 C L1 + umulh v2, ulimb, m2b C U1 + srl r19,NUMB_BITS, r19 C U0 extract nail part + + stq r28, -8(rp) C L + bne n, 
L(top) C U0 + +L(end): ldq rlimb, 0(rp) + addq r19, acc0, acc0 C propagate nail + lda rp, 8(rp) + srl m0a,NAIL_BITS, r8 C U0 + addq r8, acc0, r19 + addq m0b, acc1, acc0 + addq rlimb, r19, r19 + srl m1a,NAIL_BITS, r8 C U0 + addq r8, acc0, acc0 + addq m1b, acc2, acc1 + and r19,numb_mask, r28 C extract limb + srl m2a,NAIL_BITS, r8 C U0 + addq r8, acc1, acc1 + bis r31, m2b, acc2 + srl r19,NUMB_BITS, r19 C extract nail + stq r28, -8(rp) + + addq r19, acc0, acc0 C propagate nail + and acc0,numb_mask, r28 + stq r28, 0(rp) + srl acc0,NUMB_BITS, r19 + addq r19, acc1, acc1 + + and acc1,numb_mask, r28 + stq r28, 8(rp) + srl acc1,NUMB_BITS, r19 + addq r19, acc2, m0a + + ret r31, (r26), 1 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm new file mode 100644 index 0000000..77e02a4 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/nails/addmul_4.asm @@ -0,0 +1,210 @@ +dnl Alpha ev6 nails mpn_addmul_4. + +dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C Runs at 2.5 cycles/limb. + +C We should go for 2-way unrolling over 17 cycles, for 2.125 c/l corresponding +C to 3.24 insn/cycle. 
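+
+C In C terms the operation is (a reference sketch built from the public
+C mpn_addmul_1/mpn_add_1 entry points, not what the code below calls):
+C
+C	/* {rp,n+3} = {rp,n} + {up,n} * {vp,4}; returns the top limb.
+C	   Only rp[0..n-1] are read; rp[n..n+2] are pure outputs.  */
+C	rp[n] = rp[n+1] = rp[n+2] = 0;
+C	msl = 0;
+C	for (k = 0; k < 4; k++)
+C	  {
+C	    cy = mpn_addmul_1 (rp + k, up, n, vp[k]);
+C	    msl += (k < 3 ? mpn_add_1 (rp + n + k, rp + n + k, 3 - k, cy) : cy);
+C	  }
+C	return msl;
+C
+C Doing all four v limbs in one pass is what buys the 2.5 c/l: the up and
+C rp traffic is shared across four limb products.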
+ + +C INPUT PARAMETERS +define(`rp',`r16') +define(`up',`r17') +define(`n',`r18') +define(`vp',`r19') + +C Useful register aliases +define(`numb_mask',`r24') +define(`ulimb',`r25') +define(`rlimb',`r27') + +define(`m0a',`r0') +define(`m0b',`r1') +define(`m1a',`r2') +define(`m1b',`r3') +define(`m2a',`r20') +define(`m2b',`r21') +define(`m3a',`r12') +define(`m3b',`r13') + +define(`acc0',`r4') +define(`acc1',`r5') +define(`acc2',`r22') +define(`acc3',`r14') + +define(`v0',`r6') +define(`v1',`r7') +define(`v2',`r23') +define(`v3',`r15') + +C Used for temps: r8 r19 r28 + +define(`NAIL_BITS',`GMP_NAIL_BITS') +define(`NUMB_BITS',`GMP_NUMB_BITS') + +C This declaration is munged by configure +NAILS_SUPPORT(4-63) + +ASM_START() +PROLOGUE(mpn_addmul_4) + lda r30, -240(r30) + stq r12, 32(r30) + stq r13, 40(r30) + stq r14, 48(r30) + stq r15, 56(r30) + + lda numb_mask,-1(r31) + srl numb_mask,NAIL_BITS,numb_mask + + ldq v0, 0(vp) + ldq v1, 8(vp) + ldq v2, 16(vp) + ldq v3, 24(vp) + + bis r31, r31, acc0 C zero acc0 + sll v0,NAIL_BITS, v0 + bis r31, r31, acc1 C zero acc1 + sll v1,NAIL_BITS, v1 + bis r31, r31, acc2 C zero acc2 + sll v2,NAIL_BITS, v2 + bis r31, r31, acc3 C zero acc3 + sll v3,NAIL_BITS, v3 + bis r31, r31, r19 + + ldq ulimb, 0(up) + lda up, 8(up) + mulq v0, ulimb, m0a C U1 + umulh v0, ulimb, m0b C U1 + mulq v1, ulimb, m1a C U1 + umulh v1, ulimb, m1b C U1 + lda n, -1(n) + mulq v2, ulimb, m2a C U1 + umulh v2, ulimb, m2b C U1 + mulq v3, ulimb, m3a C U1 + umulh v3, ulimb, m3b C U1 + beq n, L(end) C U0 + + ALIGN(16) +L(top): bis r31, r31, r31 C U1 nop + ldq rlimb, 0(rp) C L0 + ldq ulimb, 0(up) C L1 + addq r19, acc0, acc0 C U0 propagate nail + + bis r31, r31, r31 C L0 nop + bis r31, r31, r31 C U1 nop + bis r31, r31, r31 C L1 nop + bis r31, r31, r31 C U0 nop + + lda rp, 8(rp) C L0 + srl m0a,NAIL_BITS, r8 C U0 + lda up, 8(up) C L1 + mulq v0, ulimb, m0a C U1 + + addq r8, acc0, r19 C U0 + addq m0b, acc1, acc0 C L0 + umulh v0, ulimb, m0b C U1 + bis r31, r31, r31 C L1 nop + + addq rlimb, r19, r19 C L0 + srl m1a,NAIL_BITS, r8 C U0 + bis r31, r31, r31 C L1 nop + mulq v1, ulimb, m1a C U1 + + addq r8, acc0, acc0 C U0 + addq m1b, acc2, acc1 C L0 + umulh v1, ulimb, m1b C U1 + and r19,numb_mask, r28 C L1 extract numb part + + bis r31, r31, r31 C L0 nop + srl m2a,NAIL_BITS, r8 C U0 + lda n, -1(n) C L1 + mulq v2, ulimb, m2a C U1 + + addq r8, acc1, acc1 C L1 + addq m2b, acc3, acc2 C L0 + umulh v2, ulimb, m2b C U1 + srl r19,NUMB_BITS, r19 C U0 extract nail part + + bis r31, r31, r31 C L0 nop + srl m3a,NAIL_BITS, r8 C U0 + stq r28, -8(rp) C L1 + mulq v3, ulimb, m3a C U1 + + addq r8, acc2, acc2 C L0 + bis r31, m3b, acc3 C L1 + umulh v3, ulimb, m3b C U1 + bne n, L(top) C U0 + +L(end): ldq rlimb, 0(rp) + addq r19, acc0, acc0 C propagate nail + lda rp, 8(rp) C FIXME: DELETE + srl m0a,NAIL_BITS, r8 C U0 + addq r8, acc0, r19 + addq m0b, acc1, acc0 + addq rlimb, r19, r19 + srl m1a,NAIL_BITS, r8 C U0 + addq r8, acc0, acc0 + addq m1b, acc2, acc1 + and r19,numb_mask, r28 C extract limb + srl m2a,NAIL_BITS, r8 C U0 + addq r8, acc1, acc1 + addq m2b, acc3, acc2 + srl r19,NUMB_BITS, r19 C extract nail + srl m3a,NAIL_BITS, r8 C U0 + stq r28, -8(rp) + addq r8, acc2, acc2 + bis r31, m3b, acc3 + + addq r19, acc0, acc0 C propagate nail + and acc0,numb_mask, r28 + stq r28, 0(rp) + srl acc0,NUMB_BITS, r19 + addq r19, acc1, acc1 + + and acc1,numb_mask, r28 + stq r28, 8(rp) + srl acc1,NUMB_BITS, r19 + addq r19, acc2, acc2 + + and acc2,numb_mask, r28 + stq r28, 16(rp) + srl acc2,NUMB_BITS, r19 + addq r19, acc3, r0 + + ldq r12, 32(r30) + ldq 
r13, 40(r30) + ldq r14, 48(r30) + ldq r15, 56(r30) + lda r30, 240(r30) + ret r31, (r26), 1 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm new file mode 100644 index 0000000..f658677 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/nails/aors_n.asm @@ -0,0 +1,233 @@ +dnl Alpha ev6 nails mpn_add_n and mpn_sub_n. + +dnl Copyright 2002, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl Runs at 2.5 cycles/limb. It would be possible to reach 2.0 cycles/limb +dnl with 8-way unrolling. + +include(`../config.m4') + +dnl INPUT PARAMETERS +define(`rp',`r16') +define(`up',`r17') +define(`vp',`r18') +define(`n',`r19') + +define(`rl0',`r0') +define(`rl1',`r1') +define(`rl2',`r2') +define(`rl3',`r3') + +define(`ul0',`r4') +define(`ul1',`r5') +define(`ul2',`r6') +define(`ul3',`r7') + +define(`vl0',`r22') +define(`vl1',`r23') +define(`vl2',`r24') +define(`vl3',`r25') + +define(`numb_mask',`r21') + +define(`NAIL_BITS',`GMP_NAIL_BITS') +define(`CYSH',`GMP_NUMB_BITS') + +dnl This declaration is munged by configure +NAILS_SUPPORT(1-63) + +ifdef(`OPERATION_add_n', ` + define(`OP', addq) + define(`CYSH',`GMP_NUMB_BITS') + define(`func', mpn_add_n)') +ifdef(`OPERATION_sub_n', ` + define(`OP', subq) + define(`CYSH',63) + define(`func', mpn_sub_n)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n) + +ASM_START() +PROLOGUE(func) + lda numb_mask, -1(r31) + srl numb_mask, NAIL_BITS, numb_mask + bis r31, r31, r20 + + and n, 3, r25 + lda n, -4(n) + beq r25, L(ge4) + +L(lp0): ldq ul0, 0(up) + lda up, 8(up) + ldq vl0, 0(vp) + lda vp, 8(vp) + lda rp, 8(rp) + lda r25, -1(r25) + OP ul0, vl0, rl0 + OP rl0, r20, rl0 + and rl0, numb_mask, r28 + stq r28, -8(rp) + srl rl0, CYSH, r20 + bne r25, L(lp0) + + blt n, L(ret) + +L(ge4): ldq ul0, 0(up) + ldq vl0, 0(vp) + ldq ul1, 8(up) + ldq vl1, 8(vp) + ldq ul2, 16(up) + ldq vl2, 16(vp) + ldq ul3, 24(up) + ldq vl3, 24(vp) + lda up, 32(up) + lda vp, 32(vp) + lda n, -4(n) + bge n, L(ge8) + + OP ul0, vl0, rl0 C main-add 0 + OP rl0, r20, rl0 C cy-add 0 + OP ul1, vl1, rl1 C main-add 1 + srl rl0, CYSH, r20 C gen cy 0 + OP rl1, r20, rl1 C cy-add 1 + and rl0,numb_mask, r27 + br r31, L(cj0) + +L(ge8): OP ul0, vl0, rl0 C main-add 0 + ldq ul0, 0(up) + ldq vl0, 0(vp) + OP rl0, r20, rl0 C cy-add 0 + OP ul1, vl1, rl1 C main-add 1 + srl rl0, CYSH, r20 C gen cy 0 + ldq ul1, 8(up) + ldq vl1, 8(vp) + OP rl1, r20, rl1 C cy-add 1 + and rl0,numb_mask, r27 + OP ul2, vl2, rl2 C main-add 2 + srl rl1, CYSH, r20 C gen cy 1 + ldq ul2, 16(up) + 
ldq vl2, 16(vp) + OP rl2, r20, rl2 C cy-add 2 + and rl1,numb_mask, r28 + stq r27, 0(rp) + OP ul3, vl3, rl3 C main-add 3 + srl rl2, CYSH, r20 C gen cy 2 + ldq ul3, 24(up) + ldq vl3, 24(vp) + OP rl3, r20, rl3 C cy-add 3 + and rl2,numb_mask, r27 + stq r28, 8(rp) + lda rp, 32(rp) + lda up, 32(up) + lda vp, 32(vp) + lda n, -4(n) + blt n, L(end) + + ALIGN(32) +L(top): OP ul0, vl0, rl0 C main-add 0 + srl rl3, CYSH, r20 C gen cy 3 + ldq ul0, 0(up) + ldq vl0, 0(vp) + + OP rl0, r20, rl0 C cy-add 0 + and rl3,numb_mask, r28 + stq r27, -16(rp) + bis r31, r31, r31 + + OP ul1, vl1, rl1 C main-add 1 + srl rl0, CYSH, r20 C gen cy 0 + ldq ul1, 8(up) + ldq vl1, 8(vp) + + OP rl1, r20, rl1 C cy-add 1 + and rl0,numb_mask, r27 + stq r28, -8(rp) + bis r31, r31, r31 + + OP ul2, vl2, rl2 C main-add 2 + srl rl1, CYSH, r20 C gen cy 1 + ldq ul2, 16(up) + ldq vl2, 16(vp) + + OP rl2, r20, rl2 C cy-add 2 + and rl1,numb_mask, r28 + stq r27, 0(rp) + bis r31, r31, r31 + + OP ul3, vl3, rl3 C main-add 3 + srl rl2, CYSH, r20 C gen cy 2 + ldq ul3, 24(up) + ldq vl3, 24(vp) + + OP rl3, r20, rl3 C cy-add 3 + and rl2,numb_mask, r27 + stq r28, 8(rp) + bis r31, r31, r31 + + bis r31, r31, r31 + lda n, -4(n) + lda up, 32(up) + lda vp, 32(vp) + + bis r31, r31, r31 + bis r31, r31, r31 + lda rp, 32(rp) + bge n, L(top) + +L(end): OP ul0, vl0, rl0 C main-add 0 + srl rl3, CYSH, r20 C gen cy 3 + OP rl0, r20, rl0 C cy-add 0 + and rl3,numb_mask, r28 + stq r27, -16(rp) + OP ul1, vl1, rl1 C main-add 1 + srl rl0, CYSH, r20 C gen cy 0 + OP rl1, r20, rl1 C cy-add 1 + and rl0,numb_mask, r27 + stq r28, -8(rp) +L(cj0): OP ul2, vl2, rl2 C main-add 2 + srl rl1, CYSH, r20 C gen cy 1 + OP rl2, r20, rl2 C cy-add 2 + and rl1,numb_mask, r28 + stq r27, 0(rp) + OP ul3, vl3, rl3 C main-add 3 + srl rl2, CYSH, r20 C gen cy 2 + OP rl3, r20, rl3 C cy-add 3 + and rl2,numb_mask, r27 + stq r28, 8(rp) + + srl rl3, CYSH, r20 C gen cy 3 + and rl3,numb_mask, r28 + stq r27, 16(rp) + stq r28, 24(rp) + +L(ret): and r20, 1, r0 + ret r31, (r26), 1 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h b/gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h new file mode 100644 index 0000000..7949fe8 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/nails/gmp-mparam.h @@ -0,0 +1,72 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Generated by tuneup.c, 2004-02-07, gcc 3.3 */ + +#define MUL_TOOM22_THRESHOLD 40 +#define MUL_TOOM33_THRESHOLD 236 + +#define SQR_BASECASE_THRESHOLD 7 /* karatsuba */ +#define SQR_TOOM2_THRESHOLD 0 /* never sqr_basecase */ +#define SQR_TOOM3_THRESHOLD 120 + +#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ +#define DIV_DC_THRESHOLD 48 +#define POWM_THRESHOLD 113 + +#define HGCD_THRESHOLD 78 +#define GCD_ACCEL_THRESHOLD 3 +#define GCD_DC_THRESHOLD 392 +#define JACOBI_BASE_METHOD 1 + +#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ +#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ +#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ +#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ +#define USE_PREINV_DIVREM_1 0 /* no preinv with nails */ +#define USE_PREINV_MOD_1 0 /* no preinv with nails */ +#define DIVREM_2_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define MODEXACT_1_ODD_THRESHOLD 0 /* always */ + +#define GET_STR_DC_THRESHOLD 15 +#define GET_STR_PRECOMPUTE_THRESHOLD 24 +#define SET_STR_THRESHOLD 6336 + +#define MUL_FFT_TABLE { 688, 1440, 3648, 6400, 25600, 0 } +#define MUL_FFT_MODF_THRESHOLD 488 +#define MUL_FFT_THRESHOLD 3712 + +#define SQR_FFT_TABLE { 432, 864, 3136, 6400, 25600, 0 } +#define SQR_FFT_MODF_THRESHOLD 480 +#define SQR_FFT_THRESHOLD 2976 diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm new file mode 100644 index 0000000..da2ee3d --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/nails/mul_1.asm @@ -0,0 +1,364 @@ +dnl Alpha ev6 nails mpn_mul_1. + +dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: 42 +C EV5: 18 +C EV6: 3.25 + +C TODO +C * Reroll loop for 3.0 c/l with current 4-way unrolling. +C * The loop is overscheduled wrt loads and wrt multiplies, in particular +C umulh. +C * Use FP loop count and multiple exit points, that would simplify feed-in lp0 +C and would work since the loop structure is really regular. 
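+
+C Same recurrence as sketched in addmul_1.asm in this directory, minus the
+C rp[i] load.  Per limb (C model, assuming unsigned __int128, with
+C vs = v0 << GMP_NAIL_BITS precomputed as in the asm):
+C
+C	p   = (unsigned __int128) up[i] * vs;
+C	acc = hi_prev + ((mp_limb_t) p >> GMP_NAIL_BITS) + cy;
+C	cy  = acc >> GMP_NUMB_BITS;
+C	rp[i] = acc & GMP_NUMB_MASK;
+C	hi_prev = (mp_limb_t) (p >> 64);
+C
+C With no rp addend, acc sums just two NUMB-bit terms and a 1-bit carry,
+C so a single nail bit suffices -- hence NAILS_SUPPORT(1-63) here versus
+C NAILS_SUPPORT(2-63) in addmul_1 and submul_1.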
+ +C INPUT PARAMETERS +define(`rp',`r16') +define(`up',`r17') +define(`n', `r18') +define(`vl0',`r19') + +define(`numb_mask',`r6') + +define(`m0a',`r0') +define(`m0b',`r1') +define(`m1a',`r2') +define(`m1b',`r3') +define(`m2a',`r20') +define(`m2b',`r21') +define(`m3a',`r22') +define(`m3b',`r23') + +define(`acc0',`r25') +define(`acc1',`r27') + +define(`ul0',`r4') +define(`ul1',`r5') +define(`ul2',`r4') +define(`ul3',`r5') + +define(`rl0',`r24') +define(`rl1',`r24') +define(`rl2',`r24') +define(`rl3',`r24') + +define(`t0',`r7') +define(`t1',`r8') + +define(`NAIL_BITS',`GMP_NAIL_BITS') +define(`NUMB_BITS',`GMP_NUMB_BITS') + +dnl This declaration is munged by configure +NAILS_SUPPORT(1-63) + +ASM_START() +PROLOGUE(mpn_mul_1) + sll vl0, NAIL_BITS, vl0 + lda numb_mask, -1(r31) + srl numb_mask, NAIL_BITS, numb_mask + + and n, 3, r25 + cmpeq r25, 1, r21 + bne r21, L(1m4) + cmpeq r25, 2, r21 + bne r21, L(2m4) + beq r25, L(0m4) + +L(3m4): ldq ul3, 0(up) + lda n, -4(n) + ldq ul0, 8(up) + mulq vl0, ul3, m3a + umulh vl0, ul3, m3b + ldq ul1, 16(up) + lda up, 24(up) + lda rp, -8(rp) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + bge n, L(ge3) + + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + srl m3a,NAIL_BITS, t0 + addq t0, r31, acc1 + srl m0a,NAIL_BITS, t0 + addq t0, m3b, acc0 + srl acc1,NUMB_BITS, t1 + br r31, L(ta3) + +L(ge3): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + srl m3a,NAIL_BITS, t0 + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + addq t0, r31, acc1 + umulh vl0, ul2, m2b + srl m0a,NAIL_BITS, t0 + ldq ul0, 16(up) + mulq vl0, ul3, m3a + addq t0, m3b, acc0 + srl acc1,NUMB_BITS, t1 + br r31, L(el3) + +L(0m4): lda n, -8(n) + ldq ul2, 0(up) + ldq ul3, 8(up) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + ldq ul0, 16(up) + mulq vl0, ul3, m3a + umulh vl0, ul3, m3b + ldq ul1, 24(up) + lda up, 32(up) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + bge n, L(ge4) + + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, r31, acc0 + umulh vl0, ul1, m1b + srl m3a,NAIL_BITS, t0 + addq t0, m2b, acc1 + srl acc0,NUMB_BITS, t1 + br r31, L(ta4) + +L(ge4): srl m2a,NAIL_BITS, t0 + ldq ul2, 0(up) + mulq vl0, ul1, m1a + addq t0, r31, acc0 + umulh vl0, ul1, m1b + srl m3a,NAIL_BITS, t0 + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + addq t0, m2b, acc1 + srl acc0,NUMB_BITS, t1 + br r31, L(el0) + +L(2m4): lda n, -4(n) + ldq ul0, 0(up) + ldq ul1, 8(up) + lda up, 16(up) + lda rp, -16(rp) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + bge n, L(ge2) + + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + srl m0a,NAIL_BITS, t0 + addq t0, r31, acc0 + srl m1a,NAIL_BITS, t0 + addq t0, m0b, acc1 + srl acc0,NUMB_BITS, t1 + br r31, L(ta2) + +L(ge2): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + srl m0a,NAIL_BITS, t0 + ldq ul0, 16(up) + mulq vl0, ul3, m3a + addq t0, r31, acc0 + umulh vl0, ul3, m3b + srl m1a,NAIL_BITS, t0 + ldq ul1, 24(up) + lda up, 32(up) + lda rp, 32(rp) + mulq vl0, ul0, m0a + addq t0, m0b, acc1 + srl acc0,NUMB_BITS, t1 + bge n, L(el2) + + br r31, L(ta6) + +L(1m4): lda n, -4(n) + ldq ul1, 0(up) + lda up, 8(up) + lda rp, -24(rp) + bge n, L(ge1) + + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + srl m1a,NAIL_BITS, t0 + addq t0, r31, acc1 + and acc1,numb_mask, r28 + srl acc1,NUMB_BITS, t1 + stq r28, 24(rp) + addq t1, m1b, r0 + ret r31, (r26), 1 + +L(ge1): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + ldq ul0, 16(up) + mulq vl0, ul3, m3a + 
umulh vl0, ul3, m3b + srl m1a,NAIL_BITS, t0 + ldq ul1, 24(up) + lda up, 32(up) + lda rp, 32(rp) + mulq vl0, ul0, m0a + addq t0, r31, acc1 + umulh vl0, ul0, m0b + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, m1b, acc0 + srl acc1,NUMB_BITS, t1 + blt n, L(ta5) + +L(ge5): ldq ul2, 0(up) + br r31, L(el1) + + ALIGN(16) +L(top): mulq vl0, ul0, m0a C U1 + addq t0, m0b, acc1 C L0 + srl acc0,NUMB_BITS, t1 C U0 + stq r28, -24(rp) C L1 +C +L(el2): umulh vl0, ul0, m0b C U1 + and acc0,numb_mask, r28 C L0 + unop C U0 + unop C L1 +C + unop C U1 + addq t1, acc1, acc1 C L0 + srl m2a,NAIL_BITS, t0 C U0 + ldq ul2, 0(up) C L1 +C + mulq vl0, ul1, m1a C U1 + addq t0, m1b, acc0 C L0 + srl acc1,NUMB_BITS, t1 C U0 + stq r28, -16(rp) C L1 +C +L(el1): umulh vl0, ul1, m1b C U1 + and acc1,numb_mask, r28 C L0 + unop C U0 + lda n, -4(n) C L1 +C + unop C U1 + addq t1, acc0, acc0 C L0 + srl m3a,NAIL_BITS, t0 C U0 + ldq ul3, 8(up) C L1 +C + mulq vl0, ul2, m2a C U1 + addq t0, m2b, acc1 C L0 + srl acc0,NUMB_BITS, t1 C U0 + stq r28, -8(rp) C L1 +C +L(el0): umulh vl0, ul2, m2b C U1 + and acc0,numb_mask, r28 C L0 + unop C U0 + unop C L1 +C + unop C U1 + addq t1, acc1, acc1 C L0 + srl m0a,NAIL_BITS, t0 C U0 + ldq ul0, 16(up) C L1 +C + mulq vl0, ul3, m3a C U1 + addq t0, m3b, acc0 C L0 + srl acc1,NUMB_BITS, t1 C U0 + stq r28, 0(rp) C L1 +C +L(el3): umulh vl0, ul3, m3b C U1 + and acc1,numb_mask, r28 C L0 + unop C U0 + unop C L1 +C + unop C U1 + addq t1, acc0, acc0 C L0 + srl m1a,NAIL_BITS, t0 C U0 + ldq ul1, 24(up) C L1 +C + lda up, 32(up) C L0 + unop C U1 + lda rp, 32(rp) C L1 + bge n, L(top) C U0 + +L(end): mulq vl0, ul0, m0a + addq t0, m0b, acc1 + srl acc0,NUMB_BITS, t1 + stq r28, -24(rp) +L(ta6): umulh vl0, ul0, m0b + and acc0,numb_mask, r28 + addq t1, acc1, acc1 + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, m1b, acc0 + srl acc1,NUMB_BITS, t1 + stq r28, -16(rp) +L(ta5): umulh vl0, ul1, m1b + and acc1,numb_mask, r28 + addq t1, acc0, acc0 + srl m3a,NAIL_BITS, t0 + addq t0, m2b, acc1 + srl acc0,NUMB_BITS, t1 + stq r28, -8(rp) + ALIGN(16) +L(ta4): and acc0,numb_mask, r28 + addq t1, acc1, acc1 + srl m0a,NAIL_BITS, t0 + addq t0, m3b, acc0 + srl acc1,NUMB_BITS, t1 + stq r28, 0(rp) + unop + ALIGN(16) +L(ta3): and acc1,numb_mask, r28 + addq t1, acc0, acc0 + srl m1a,NAIL_BITS, t0 + addq t0, m0b, acc1 + srl acc0,NUMB_BITS, t1 + stq r28, 8(rp) + unop + ALIGN(16) +L(ta2): and acc0,numb_mask, r28 + addq t1, acc1, acc1 + srl acc1,NUMB_BITS, t1 + stq r28, 16(rp) + and acc1,numb_mask, r28 + addq t1, m1b, r0 + stq r28, 24(rp) + ret r31, (r26), 1 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm b/gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm new file mode 100644 index 0000000..f473a59 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/nails/submul_1.asm @@ -0,0 +1,396 @@ +dnl Alpha ev6 nails mpn_submul_1. + +dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: 42 +C EV5: 18 +C EV6: 4 + +C TODO +C * Reroll loop for 3.75 c/l with current 4-way unrolling. +C * The loop is overscheduled wrt loads and wrt multiplies, in particular +C umulh. +C * Use FP loop count and multiple exit points, that would simplify feed-in lp0 +C and would work since the loop structure is really regular. + +C INPUT PARAMETERS +define(`rp',`r16') +define(`up',`r17') +define(`n', `r18') +define(`vl0',`r19') + +define(`numb_mask',`r6') + +define(`m0a',`r0') +define(`m0b',`r1') +define(`m1a',`r2') +define(`m1b',`r3') +define(`m2a',`r20') +define(`m2b',`r21') +define(`m3a',`r22') +define(`m3b',`r23') + +define(`acc0',`r25') +define(`acc1',`r27') + +define(`ul0',`r4') +define(`ul1',`r5') +define(`ul2',`r4') +define(`ul3',`r5') + +define(`rl0',`r24') +define(`rl1',`r24') +define(`rl2',`r24') +define(`rl3',`r24') + +define(`t0',`r7') +define(`t1',`r8') + +define(`NAIL_BITS',`GMP_NAIL_BITS') +define(`NUMB_BITS',`GMP_NUMB_BITS') + +dnl This declaration is munged by configure +NAILS_SUPPORT(2-63) + +ASM_START() +PROLOGUE(mpn_submul_1) + sll vl0, NAIL_BITS, vl0 + lda numb_mask, -1(r31) + srl numb_mask, NAIL_BITS, numb_mask + + and n, 3, r25 + cmpeq r25, 1, r21 + bne r21, L(1m4) + cmpeq r25, 2, r21 + bne r21, L(2m4) + beq r25, L(0m4) + +L(3m4): ldq ul3, 0(up) + lda n, -4(n) + ldq ul0, 8(up) + mulq vl0, ul3, m3a + umulh vl0, ul3, m3b + ldq ul1, 16(up) + lda up, 24(up) + lda rp, -8(rp) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + bge n, L(ge3) + + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq rl3, 8(rp) + srl m3a,NAIL_BITS, t0 + addq t0, r31, acc1 + subq rl3, acc1, acc1 + ldq rl0, 16(rp) + srl m0a,NAIL_BITS, t0 + addq t0, m3b, acc0 + sra acc1,NUMB_BITS, t1 + br r31, L(ta3) + +L(ge3): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq rl3, 8(rp) + srl m3a,NAIL_BITS, t0 + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + addq t0, r31, acc1 + umulh vl0, ul2, m2b + subq rl3, acc1, acc1 + ldq rl0, 16(rp) + srl m0a,NAIL_BITS, t0 + ldq ul0, 16(up) + mulq vl0, ul3, m3a + addq t0, m3b, acc0 + sra acc1,NUMB_BITS, t1 + br r31, L(el3) + +L(0m4): lda n, -8(n) + ldq ul2, 0(up) + ldq ul3, 8(up) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + ldq ul0, 16(up) + mulq vl0, ul3, m3a + umulh vl0, ul3, m3b + ldq ul1, 24(up) + lda up, 32(up) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + bge n, L(ge4) + + ldq rl2, 0(rp) + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, r31, acc0 + umulh vl0, ul1, m1b + subq rl2, acc0, acc0 + ldq rl3, 8(rp) + srl m3a,NAIL_BITS, t0 + addq t0, m2b, acc1 + sra acc0,NUMB_BITS, t1 + br r31, L(ta4) + +L(ge4): ldq rl2, 0(rp) + srl m2a,NAIL_BITS, t0 + ldq ul2, 0(up) + mulq vl0, ul1, m1a + addq t0, r31, acc0 + umulh vl0, ul1, m1b + subq rl2, acc0, acc0 + ldq rl3, 8(rp) + srl m3a,NAIL_BITS, t0 + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + addq t0, m2b, acc1 + sra acc0,NUMB_BITS, t1 + br r31, L(el0) + +L(2m4): lda n, -4(n) + ldq ul0, 0(up) + ldq ul1, 8(up) + lda up, 16(up) + lda rp, -16(rp) + mulq vl0, ul0, m0a + umulh vl0, ul0, m0b + bge n, L(ge2) + + 
mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq rl0, 16(rp) + srl m0a,NAIL_BITS, t0 + addq t0, r31, acc0 + subq rl0, acc0, acc0 + ldq rl1, 24(rp) + srl m1a,NAIL_BITS, t0 + addq t0, m0b, acc1 + sra acc0,NUMB_BITS, t1 + br r31, L(ta2) + +L(ge2): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + ldq rl0, 16(rp) + srl m0a,NAIL_BITS, t0 + ldq ul0, 16(up) + mulq vl0, ul3, m3a + addq t0, r31, acc0 + umulh vl0, ul3, m3b + subq rl0, acc0, acc0 + ldq rl1, 24(rp) + srl m1a,NAIL_BITS, t0 + ldq ul1, 24(up) + lda up, 32(up) + lda rp, 32(rp) + mulq vl0, ul0, m0a + addq t0, m0b, acc1 + sra acc0,NUMB_BITS, t1 + bge n, L(el2) + + br r31, L(ta6) + +L(1m4): lda n, -4(n) + ldq ul1, 0(up) + lda up, 8(up) + lda rp, -24(rp) + bge n, L(ge1) + + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq rl1, 24(rp) + srl m1a,NAIL_BITS, t0 + subq rl1, t0, acc1 + and acc1,numb_mask, r28 + sra acc1,NUMB_BITS, t1 + stq r28, 24(rp) + subq m1b, t1, r0 + ret r31, (r26), 1 + +L(ge1): ldq ul2, 0(up) + mulq vl0, ul1, m1a + umulh vl0, ul1, m1b + ldq ul3, 8(up) + lda n, -4(n) + mulq vl0, ul2, m2a + umulh vl0, ul2, m2b + ldq ul0, 16(up) + mulq vl0, ul3, m3a + umulh vl0, ul3, m3b + ldq rl1, 24(rp) + srl m1a,NAIL_BITS, t0 + ldq ul1, 24(up) + lda up, 32(up) + lda rp, 32(rp) + mulq vl0, ul0, m0a + addq t0, r31, acc1 + umulh vl0, ul0, m0b + subq rl1, acc1, acc1 + ldq rl2, 0(rp) + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, m1b, acc0 + sra acc1,NUMB_BITS, t1 + blt n, L(ta5) + +L(ge5): ldq ul2, 0(up) + br r31, L(el1) + + ALIGN(16) +L(top): mulq vl0, ul0, m0a C U1 + addq t0, m0b, acc1 C L0 + sra acc0,NUMB_BITS, t1 C U0 + stq r28, -24(rp) C L1 +C +L(el2): umulh vl0, ul0, m0b C U1 + and acc0,numb_mask, r28 C L0 + subq rl1, acc1, acc1 C U0 + ldq rl2, 0(rp) C L1 +C + unop C U1 + addq t1, acc1, acc1 C L0 + srl m2a,NAIL_BITS, t0 C U0 + ldq ul2, 0(up) C L1 +C + mulq vl0, ul1, m1a C U1 + addq t0, m1b, acc0 C L0 + sra acc1,NUMB_BITS, t1 C U0 + stq r28, -16(rp) C L1 +C +L(el1): umulh vl0, ul1, m1b C U1 + and acc1,numb_mask, r28 C L0 + subq rl2, acc0, acc0 C U0 + ldq rl3, 8(rp) C L1 +C + lda n, -4(n) C L1 + addq t1, acc0, acc0 C L0 + srl m3a,NAIL_BITS, t0 C U0 + ldq ul3, 8(up) C L1 +C + mulq vl0, ul2, m2a C U1 + addq t0, m2b, acc1 C L0 + sra acc0,NUMB_BITS, t1 C U0 + stq r28, -8(rp) C L1 +C +L(el0): umulh vl0, ul2, m2b C U1 + and acc0,numb_mask, r28 C L0 + subq rl3, acc1, acc1 C U0 + ldq rl0, 16(rp) C L1 +C + unop C U1 + addq t1, acc1, acc1 C L0 + srl m0a,NAIL_BITS, t0 C U0 + ldq ul0, 16(up) C L1 +C + mulq vl0, ul3, m3a C U1 + addq t0, m3b, acc0 C L0 + sra acc1,NUMB_BITS, t1 C U0 + stq r28, 0(rp) C L1 +C +L(el3): umulh vl0, ul3, m3b C U1 + and acc1,numb_mask, r28 C L0 + subq rl0, acc0, acc0 C U0 + ldq rl1, 24(rp) C L1 +C + unop C U1 + addq t1, acc0, acc0 C L0 + srl m1a,NAIL_BITS, t0 C U0 + ldq ul1, 24(up) C L1 +C + lda up, 32(up) C L0 + unop C U1 + lda rp, 32(rp) C L1 + bge n, L(top) C U0 + +L(end): mulq vl0, ul0, m0a + addq t0, m0b, acc1 + sra acc0,NUMB_BITS, t1 + stq r28, -24(rp) +L(ta6): umulh vl0, ul0, m0b + and acc0,numb_mask, r28 + subq rl1, acc1, acc1 + ldq rl2, 0(rp) + addq t1, acc1, acc1 + srl m2a,NAIL_BITS, t0 + mulq vl0, ul1, m1a + addq t0, m1b, acc0 + sra acc1,NUMB_BITS, t1 + stq r28, -16(rp) +L(ta5): umulh vl0, ul1, m1b + and acc1,numb_mask, r28 + subq rl2, acc0, acc0 + ldq rl3, 8(rp) + addq t1, acc0, acc0 + srl m3a,NAIL_BITS, t0 + addq t0, m2b, acc1 + sra acc0,NUMB_BITS, t1 + stq r28, -8(rp) + unop + ALIGN(16) +L(ta4): and acc0,numb_mask, r28 + subq 
rl3, acc1, acc1 + ldq rl0, 16(rp) + addq t1, acc1, acc1 + srl m0a,NAIL_BITS, t0 + addq t0, m3b, acc0 + sra acc1,NUMB_BITS, t1 + stq r28, 0(rp) + unop + ALIGN(16) +L(ta3): and acc1,numb_mask, r28 + subq rl0, acc0, acc0 + ldq rl1, 24(rp) + addq t1, acc0, acc0 + srl m1a,NAIL_BITS, t0 + addq t0, m0b, acc1 + sra acc0,NUMB_BITS, t1 + stq r28, 8(rp) + unop + ALIGN(16) +L(ta2): and acc0,numb_mask, r28 + subq rl1, acc1, acc1 + addq t1, acc1, acc1 + sra acc1,NUMB_BITS, t1 + stq r28, 16(rp) + and acc1,numb_mask, r28 + subq m1b, t1, r0 + stq r28, 24(rp) + ret r31, (r26), 1 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/ev6/slot.pl b/gmp-6.3.0/mpn/alpha/ev6/slot.pl new file mode 100755 index 0000000..a4c8a36 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/slot.pl @@ -0,0 +1,318 @@ +#!/usr/bin/perl -w + +# Copyright 2000, 2001, 2003-2005, 2011 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of either: +# +# * the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# or +# +# * the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# or both in parallel, as here. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received copies of the GNU General Public License and the +# GNU Lesser General Public License along with the GNU MP Library. If not, +# see https://www.gnu.org/licenses/. + + +# Usage: slot.pl [filename.o]... +# +# Run "objdump" to produce a disassembly of the given object file(s) and +# annotate the output with "U" or "L" slotting which Alpha EV6 will use. +# +# When an instruction is E (ie. either U or L), an "eU" or "eL" is shown, as +# a reminder that it wasn't a fixed requirement that gave the U or L, but +# the octaword slotting rules. +# +# If an instruction is not recognised, that octaword does not get any U/L +# shown, only lower-case "u", "l" or "e" for the instructions which are +# known. Add any unknown instructions to %optable below. + + +use strict; + +# The U or L which various instructions demand, or E if either. +# +my %optable = + ( + 'addq' => 'E', + 'and' => 'E', + 'andnot' => 'E', + 'beq' => 'U', + 'bge' => 'U', + 'bgt' => 'U', + 'bic' => 'E', + 'bis' => 'E', + 'blt' => 'U', + 'bne' => 'U', + 'br' => 'L', + 'clr' => 'E', + 'cmpule' => 'E', + 'cmpult' => 'E', + 'cmpeq' => 'E', + 'cmoveq' => 'E', + 'cmovne' => 'E', + 'ctpop' => 'U', + 'ctlz' => 'U', + 'cttz' => 'U', + 'extbl' => 'U', + 'extlh' => 'U', + 'extll' => 'U', + 'extqh' => 'U', + 'extql' => 'U', + 'extwh' => 'U', + 'extwl' => 'U', + 'jsr' => 'L', + 'lda' => 'E', + 'ldah' => 'E', + 'ldbu' => 'L', + 'ldl' => 'L', + 'ldq' => 'L', + 'ldt' => 'L', + 'ret' => 'L', + 'mov' => 'E', + 'mull' => 'U', + 'mulq' => 'U', + 'negq' => 'E', + 'nop' => 'E', + 'not' => 'E', + 's8addq' => 'E', + 's8subq' => 'E', + # 'sextb' => ? + # 'sextl' => ? + 'sll' => 'U', + 'srl' => 'U', + 'stq' => 'L', + 'subq' => 'E', + 'umulh' => 'U', + 'unop' => 'E', + 'xor' => 'E', + ); + +# Slottings used for a given pattern of U/L/E in an octaword. 
This is as +# per the "Ebox Slotting" section of the EV6 hardware reference manual. +# +my %slottable = + ( + 'EEEE' => 'ULUL', + 'EEEL' => 'ULUL', + 'EEEU' => 'ULLU', + 'EELE' => 'ULLU', + 'EELL' => 'UULL', + 'EELU' => 'ULLU', + 'EEUE' => 'ULUL', + 'EEUL' => 'ULUL', + 'EEUU' => 'LLUU', + 'ELEE' => 'ULUL', + 'ELEL' => 'ULUL', + 'ELEU' => 'ULLU', + 'ELLE' => 'ULLU', + 'ELLL' => 'ULLL', + 'ELLU' => 'ULLU', + 'ELUE' => 'ULUL', + 'ELUL' => 'ULUL', + + 'LLLL' => 'LLLL', + 'LLLU' => 'LLLU', + 'LLUE' => 'LLUU', + 'LLUL' => 'LLUL', + 'LLUU' => 'LLUU', + 'LUEE' => 'LULU', + 'LUEL' => 'LUUL', + 'LUEU' => 'LULU', + 'LULE' => 'LULU', + 'LULL' => 'LULL', + 'LULU' => 'LULU', + 'LUUE' => 'LUUL', + 'LUUL' => 'LUUL', + 'LUUU' => 'LUUU', + 'UEEE' => 'ULUL', + 'UEEL' => 'ULUL', + 'UEEU' => 'ULLU', + + 'ELUU' => 'LLUU', + 'EUEE' => 'LULU', + 'EUEL' => 'LUUL', + 'EUEU' => 'LULU', + 'EULE' => 'LULU', + 'EULL' => 'UULL', + 'EULU' => 'LULU', + 'EUUE' => 'LUUL', + 'EUUL' => 'LUUL', + 'EUUU' => 'LUUU', + 'LEEE' => 'LULU', + 'LEEL' => 'LUUL', + 'LEEU' => 'LULU', + 'LELE' => 'LULU', + 'LELL' => 'LULL', + 'LELU' => 'LULU', + 'LEUE' => 'LUUL', + 'LEUL' => 'LUUL', + 'LEUU' => 'LLUU', + 'LLEE' => 'LLUU', + 'LLEL' => 'LLUL', + 'LLEU' => 'LLUU', + 'LLLE' => 'LLLU', + + 'UELE' => 'ULLU', + 'UELL' => 'UULL', + 'UELU' => 'ULLU', + 'UEUE' => 'ULUL', + 'UEUL' => 'ULUL', + 'UEUU' => 'ULUU', + 'ULEE' => 'ULUL', + 'ULEL' => 'ULUL', + 'ULEU' => 'ULLU', + 'ULLE' => 'ULLU', + 'ULLL' => 'ULLL', + 'ULLU' => 'ULLU', + 'ULUE' => 'ULUL', + 'ULUL' => 'ULUL', + 'ULUU' => 'ULUU', + 'UUEE' => 'UULL', + 'UUEL' => 'UULL', + 'UUEU' => 'UULU', + 'UULE' => 'UULL', + 'UULL' => 'UULL', + 'UULU' => 'UULU', + 'UUUE' => 'UUUL', + 'UUUL' => 'UUUL', + 'UUUU' => 'UUUU', + ); + +# Check all combinations of U/L/E are present in %slottable. +sub coverage { + foreach my $a ('U', 'L', 'E') { + foreach my $b ('U', 'L', 'E') { + foreach my $c ('U', 'L', 'E') { + foreach my $d ('U', 'L', 'E') { + my $x = $a . $b . $c . $d; + if (! defined $slottable{$x}) { + print "slottable missing: $x\n" + } + } + } + } + } +} + +# Certain consistency checks for %slottable. +sub check { + foreach my $x (keys %slottable) { + my $a = substr($x,0,1); + my $b = substr($x,1,1); + my $c = substr($x,2,1); + my $d = substr($x,3,1); + my $es = ($a eq 'E') + ($b eq 'E') + ($c eq 'E') + ($d eq 'E'); + my $ls = ($a eq 'L') + ($b eq 'L') + ($c eq 'L') + ($d eq 'L'); + my $us = ($a eq 'U') + ($b eq 'U') + ($c eq 'U') + ($d eq 'U'); + + my $got = $slottable{$x}; + my $want = $x; + + if ($es == 0) { + + } elsif ($es == 1) { + # when only one E, it's mapped to whichever of U or L is otherwise + # used the least + if ($ls > $us) { + $want =~ s/E/U/; + } else { + $want =~ s/E/L/; + } + } elsif ($es == 2) { + # when two E's and two U, then the E's map to L; vice versa for two E + # and two L + if ($ls == 2) { + $want =~ s/E/U/g; + } elsif ($us == 2) { + $want =~ s/E/L/g; + } else { + next; + } + } elsif ($es == 3) { + next; + + } else { # $es == 4 + next; + } + + if ($want ne $got) { + print "slottable $x want $want got $got\n"; + } + } +} + +sub disassemble { + my ($file) = @_; + + open (IN, "objdump -Srfh $file |") || die "Cannot open pipe from objdump\n"; + + my (%pre, %post, %type); + while () { + my $line = $_ . ""; + + if ($line =~ /(^[ \t]*[0-9a-f]*([0-9a-f]):[ \t]*[0-9a-f][0-9a-f] [0-9a-f][0-9a-f] [0-9a-f][0-9a-f] [0-9a-f][0-9a-f] )\t(([a-z0-9]+).*)/) { + my ($this_pre, $addr, $this_post, $opcode) = ($1, $2, $3, $4); + + my $this_type = $optable{$opcode}; + if (! 
defined ($this_type)) { $this_type = ' '; } + + $pre{$addr} = $this_pre; + $post{$addr} = $this_post; + $type{$addr} = $this_type; + + if ($addr eq 'c') { + my %slot = ('0'=>' ', '4'=>' ', '8'=>' ', 'c'=>' '); + + my $str = $type{'c'} . $type{'8'} . $type{'4'} . $type{'0'}; + $str = $slottable{$str}; + if (defined $str) { + $slot{'c'} = substr($str,0,1); + $slot{'8'} = substr($str,1,1); + $slot{'4'} = substr($str,2,1); + $slot{'0'} = substr($str,3,1); + } + + foreach my $i ('0', '4', '8', 'c') { + if ($slot{$i} eq $type{$i}) { $type{$i} = ' '; } + print $pre{$i}, ' ', lc($type{$i}),$slot{$i}, ' ', $post{$i}, "\n"; + } + + %pre = (); + %type = (); + %post = (); + } + } + } + + close IN || die "Error from objdump (or objdump not available)\n"; +} + +coverage(); +check(); + +my @files; +if ($#ARGV >= 0) { + @files = @ARGV; +} else { + die +} + +foreach (@files) { + disassemble($_); +} diff --git a/gmp-6.3.0/mpn/alpha/ev6/sub_n.asm b/gmp-6.3.0/mpn/alpha/ev6/sub_n.asm new file mode 100644 index 0000000..a35ba40 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev6/sub_n.asm @@ -0,0 +1,283 @@ +dnl Alpha ev6 mpn_sub_n -- Subtract two limb vectors of the same length > 0 +dnl and store difference in a third limb vector. + +dnl Copyright 2000, 2003, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: ? +C EV5: 5.4 +C EV6: 2.125 + +C INPUT PARAMETERS +C rp r16 +C up r17 +C vp r18 +C n r19 +C cy r20 (for mpn_add_nc) + +C TODO +C Finish cleaning up cy registers r22, r23 (make them use cy0/cy1) +C Use multi-pronged feed-in. +C Perform additional micro-tuning + +C This code was written in cooperation with ev6 pipeline expert Steve Root. + +C Pair loads and stores where possible +C Store pairs oct-aligned where possible (didn't need it here) +C Stores are delayed every third cycle +C Loads and stores are delayed by fills +C U stays still, put code there where possible (note alternation of U1 and U0) +C L moves because of loads and stores +C Note dampers in L to limit damage + +C This odd-looking optimization expects that were having random bits in our +C data, so that a pure zero result is unlikely. so we penalize the unlikely +C case to help the common case. 
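+
+C In C, the per-limb step and the rare-case fix look like this (sketch):
+C
+C	d  = u - v;          /* subq: partial difference */
+C	bw = u < v;          /* cmpult: borrow of the main sub */
+C	r  = d - bin;        /* subq: apply incoming borrow */
+C	if (d == 0)          /* beq to a $fix label, rarely taken */
+C	  bw |= bin;         /* bis: borrow passes straight through */
+C	bin = bw;
+C
+C The borrow out of u - v - bin is (u < v) except when d == 0, where it is
+C the incoming borrow itself; with random data d == 0 is rare, so the fix
+C branches stay off the fast path.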
+ +define(`u0', `r0') define(`u1', `r3') +define(`v0', `r1') define(`v1', `r4') + +define(`cy0', `r20') define(`cy1', `r21') + +MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc) + +ASM_START() +PROLOGUE(mpn_sub_nc) + br r31, $entry +EPILOGUE() +PROLOGUE(mpn_sub_n) + bis r31, r31, cy0 C clear carry in +$entry: cmpult r19, 5, r22 C L1 move counter + ldq u1, 0(r17) C L0 get next ones + ldq v1, 0(r18) C L1 + bne r22, $Lsmall + + ldq u0, 8(r17) C L0 get next ones + ldq v0, 8(r18) C L1 + subq u1, v1, r5 C U0 sub two data + + cmpult u1, v1, r23 C U0 did it borrow + ldq u1, 16(r17) C L0 get next ones + ldq v1, 16(r18) C L1 + + subq u0, v0, r8 C U1 sub two data + subq r5, cy0, r24 C U0 borrow in + + cmpult u0, v0, r22 C U1 did it borrow + beq r5, $fix5f C U0 fix exact zero +$ret5f: ldq u0, 24(r17) C L0 get next ones + ldq v0, 24(r18) C L1 + + subq r8, r23, r25 C U1 borrow from last + subq u1, v1, r7 C U0 sub two data + + beq r8, $fix6f C U1 fix exact zero +$ret6f: cmpult u1, v1, r23 C U0 did it borrow + ldq u1, 32(r17) C L0 get next ones + ldq v1, 32(r18) C L1 + + lda r17, 40(r17) C L0 move pointer + lda r18, 40(r18) C L1 move pointer + + lda r16, -8(r16) + lda r19, -13(r19) C L1 move counter + blt r19, $Lend C U1 loop control + + +C Main loop. 8-way unrolled. + ALIGN(16) +$Loop: subq u0, v0, r2 C U1 sub two data + stq r24, 8(r16) C L0 put an answer + subq r7, r22, r24 C U0 borrow from last + stq r25, 16(r16) C L1 pair + + cmpult u0, v0, cy1 C U1 did it borrow + beq r7, $fix7 C U0 fix exact 0 +$ret7: ldq u0, 0(r17) C L0 get next ones + ldq v0, 0(r18) C L1 + + bis r31, r31, r31 C L damp out + subq r2, r23, r25 C U1 borrow from last + bis r31, r31, r31 C L moves in L ! + subq u1, v1, r5 C U0 sub two data + + beq r2, $fix0 C U1 fix exact zero +$ret0: cmpult u1, v1, cy0 C U0 did it borrow + ldq u1, 8(r17) C L0 get next ones + ldq v1, 8(r18) C L1 + + subq u0, v0, r8 C U1 sub two data + stq r24, 24(r16) C L0 store pair + subq r5, cy1, r24 C U0 borrow from last + stq r25, 32(r16) C L1 + + cmpult u0, v0, r22 C U1 did it borrow + beq r5, $fix1 C U0 fix exact zero +$ret1: ldq u0, 16(r17) C L0 get next ones + ldq v0, 16(r18) C L1 + + lda r16, 64(r16) C L0 move pointer + subq r8, cy0, r25 C U1 borrow from last + lda r19, -8(r19) C L1 move counter + subq u1, v1, r7 C U0 sub two data + + beq r8, $fix2 C U1 fix exact zero +$ret2: cmpult u1, v1, r23 C U0 did it borrow + ldq u1, 24(r17) C L0 get next ones + ldq v1, 24(r18) C L1 + + subq u0, v0, r2 C U1 sub two data + stq r24, -24(r16) C L0 put an answer + subq r7, r22, r24 C U0 borrow from last + stq r25, -16(r16) C L1 pair + + cmpult u0, v0, cy1 C U1 did it borrow + beq r7, $fix3 C U0 fix exact 0 +$ret3: ldq u0, 32(r17) C L0 get next ones + ldq v0, 32(r18) C L1 + + bis r31, r31, r31 C L damp out + subq r2, r23, r25 C U1 borrow from last + bis r31, r31, r31 C L moves in L ! 
+ subq u1, v1, r5 C U0 sub two data + + beq r2, $fix4 C U1 fix exact zero +$ret4: cmpult u1, v1, cy0 C U0 did it borrow + ldq u1, 40(r17) C L0 get next ones + ldq v1, 40(r18) C L1 + + subq u0, v0, r8 C U1 sub two data + stq r24, -8(r16) C L0 store pair + subq r5, cy1, r24 C U0 borrow from last + stq r25, 0(r16) C L1 + + cmpult u0, v0, r22 C U1 did it borrow + beq r5, $fix5 C U0 fix exact zero +$ret5: ldq u0, 48(r17) C L0 get next ones + ldq v0, 48(r18) C L1 + + ldl r31, 256(r17) C L0 prefetch + subq r8, cy0, r25 C U1 borrow from last + ldl r31, 256(r18) C L1 prefetch + subq u1, v1, r7 C U0 sub two data + + beq r8, $fix6 C U1 fix exact zero +$ret6: cmpult u1, v1, r23 C U0 did it borrow + ldq u1, 56(r17) C L0 get next ones + ldq v1, 56(r18) C L1 + + lda r17, 64(r17) C L0 move pointer + bis r31, r31, r31 C U + lda r18, 64(r18) C L1 move pointer + bge r19, $Loop C U1 loop control +C ==== main loop end + +$Lend: subq u0, v0, r2 C U1 sub two data + stq r24, 8(r16) C L0 put an answer + subq r7, r22, r24 C U0 borrow from last + stq r25, 16(r16) C L1 pair + cmpult u0, v0, cy1 C U1 did it borrow + beq r7, $fix7c C U0 fix exact 0 +$ret7c: subq r2, r23, r25 C U1 borrow from last + subq u1, v1, r5 C U0 sub two data + beq r2, $fix0c C U1 fix exact zero +$ret0c: cmpult u1, v1, cy0 C U0 did it borrow + stq r24, 24(r16) C L0 store pair + subq r5, cy1, r24 C U0 borrow from last + stq r25, 32(r16) C L1 + beq r5, $fix1c C U0 fix exact zero +$ret1c: stq r24, 40(r16) C L0 put an answer + lda r16, 48(r16) C L0 move pointer + + lda r19, 8(r19) + beq r19, $Lret + + ldq u1, 0(r17) + ldq v1, 0(r18) +$Lsmall: + lda r19, -1(r19) + beq r19, $Lend0 + + ALIGN(8) +$Loop0: subq u1, v1, r2 C main sub + cmpult u1, v1, r8 C compute bw from last sub + ldq u1, 8(r17) + ldq v1, 8(r18) + subq r2, cy0, r5 C borrow sub + lda r17, 8(r17) + lda r18, 8(r18) + stq r5, 0(r16) + cmpult r2, cy0, cy0 C compute bw from last sub + lda r19, -1(r19) C decr loop cnt + bis r8, cy0, cy0 C combine bw from the two subs + lda r16, 8(r16) + bne r19, $Loop0 +$Lend0: subq u1, v1, r2 C main sub + subq r2, cy0, r5 C borrow sub + cmpult u1, v1, r8 C compute bw from last sub + cmpult r2, cy0, cy0 C compute bw from last sub + stq r5, 0(r16) + bis r8, cy0, r0 C combine bw from the two subs + ret r31,(r26),1 + + ALIGN(8) +$Lret: lda r0, 0(cy0) C copy borrow into return register + ret r31,(r26),1 + +$fix5f: bis r23, cy0, r23 C bring forward borrow + br r31, $ret5f +$fix6f: bis r22, r23, r22 C bring forward borrow + br r31, $ret6f +$fix0: bis cy1, r23, cy1 C bring forward borrow + br r31, $ret0 +$fix1: bis cy0, cy1, cy0 C bring forward borrow + br r31, $ret1 +$fix2: bis r22, cy0, r22 C bring forward borrow + br r31, $ret2 +$fix3: bis r23, r22, r23 C bring forward borrow + br r31, $ret3 +$fix4: bis cy1, r23, cy1 C bring forward borrow + br r31, $ret4 +$fix5: bis cy1, cy0, cy0 C bring forward borrow + br r31, $ret5 +$fix6: bis r22, cy0, r22 C bring forward borrow + br r31, $ret6 +$fix7: bis r23, r22, r23 C bring forward borrow + br r31, $ret7 +$fix0c: bis cy1, r23, cy1 C bring forward borrow + br r31, $ret0c +$fix1c: bis cy0, cy1, cy0 C bring forward borrow + br r31, $ret1c +$fix7c: bis r23, r22, r23 C bring forward borrow + br r31, $ret7c + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/ev67/gcd_11.asm b/gmp-6.3.0/mpn/alpha/ev67/gcd_11.asm new file mode 100644 index 0000000..03c234b --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev67/gcd_11.asm @@ -0,0 +1,79 @@ +dnl Alpha ev67 mpn_gcd_11 -- Nx1 greatest common divisor. 
+ +dnl Copyright 2003, 2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C ev67: 3.4 cycles/bitpair for 1x1 part + + +C mp_limb_t mpn_gcd_11 (mp_limb_t x, mp_limb_t y); +C +C In the 1x1 part, the algorithm is to change x,y to abs(x-y),min(x,y) and +C strip trailing zeros from abs(x-y) to maintain x and y both odd. +C +C The trailing zeros are calculated from just x-y, since in twos-complement +C there's the same number of trailing zeros on d or -d. This means the cttz +C runs in parallel with abs(x-y). +C +C The loop takes 5 cycles, and at 0.68 iterations per bit for two N-bit +C operands with this algorithm gives the measured 3.4 c/l. +C +C The slottings shown are for SVR4 style systems, Unicos differs in the +C initial gp setup and the LEA. + + +ASM_START() +PROLOGUE(mpn_gcd_11) + mov r16, r0 + mov r17, r1 + + ALIGN(16) +L(top): subq r0, r1, r7 C l0 d = x - y + cmpult r0, r1, r16 C u0 test x >= y + + subq r1, r0, r4 C l0 new_x = y - x + cttz r7, r8 C U0 d twos + + cmoveq r16, r7, r4 C l0 new_x = d if x>=y + cmovne r16, r0, r1 C u0 y = x if x<y + + srl r4, r8, r0 C U0 new_x >> twos + bne r7, L(top) C U1 stop when d==0 + + +L(end): mov r1, r0 C U0 return y + ret r31, (r26), 1 C L0 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/ev67/hamdist.asm b/gmp-6.3.0/mpn/alpha/ev67/hamdist.asm new file mode 100644 index 0000000..4b13e9f --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev67/hamdist.asm @@ -0,0 +1,111 @@ +dnl Alpha ev67 mpn_hamdist -- mpn hamming distance. + +dnl Copyright 2003, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details.
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C ev67: 2.5 cycles/limb + + +C unsigned long mpn_hamdist (mp_srcptr xp, mp_srcptr yp, mp_size_t size); +C +C The hope was for 2.0 c/l here, but that isn't achieved. We're limited by +C renaming register shortage. Since we need 5 instructions per limb, further +C unrolling could approach 1.5 c/l. +C +C The main loop processes two limbs from each operand on each iteration. An +C odd size is handled by processing xp[0]^yp[0] at the start. If the size +C is even that result is discarded, and is repeated by the main loop. +C + +ASM_START() +PROLOGUE(mpn_hamdist) + + C r16 xp + C r17 yp + C r18 size + + ldq r1, 0(r16) C L0 xp[0] + ldq r2, 0(r17) C L1 yp[0] + and r18, 1, r8 C U1 1 if size odd + srl r18, 1, r18 C U0 size, limb pairs + + clr r0 C L0 initial total + s8addq r8, r17, r17 C U1 yp++ if size odd + s8addq r8, r16, r16 C L1 xp++ if size odd + clr r6 C U0 dummy initial xor 1 + + xor r1, r2, r5 C L initial xor 0 + beq r18, L(one) C U if size==1 + + cmoveq r8, r31, r5 C L discard first limb if size even + unop C U + + + ALIGN(16) +L(top): + C r0 total accumulating + C r7 xor 0 + C r8 xor 1 + C r16 xp, incrementing + C r17 yp, incrementing + C r18 size, limb pairs, decrementing + + ldq r1, 0(r16) C L + ldq r2, 0(r17) C L + ctpop r5, r7 C U0 + lda r16, 16(r16) C U + + ldq r3, -8(r16) C L + ldq r4, 8(r17) C L + ctpop r6, r8 C U0 + lda r17, 16(r17) C U + + ldl r31, 256(r16) C L prefetch + ldl r31, 256(r17) C L prefetch + xor r1, r2, r5 C U + lda r18, -1(r18) C U + + xor r3, r4, r6 C U + addq r0, r7, r0 C L + addq r0, r8, r0 C L + bne r18, L(top) C U + + + ctpop r6, r8 C U0 + addq r0, r8, r0 C L +L(one): + ctpop r5, r7 C U0 + addq r0, r7, r0 C L + + ret r31, (r26), 1 C L0 + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/ev67/popcount.asm b/gmp-6.3.0/mpn/alpha/ev67/popcount.asm new file mode 100644 index 0000000..049c1cd --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/ev67/popcount.asm @@ -0,0 +1,101 @@ +dnl Alpha ev67 mpn_popcount -- mpn bit population count. + +dnl Copyright 2003, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C ev67: 1.5 cycles/limb + + +C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); +C +C This schedule seems necessary for the full 1.5 c/l, the IQ can't quite hide +C all latencies, the addq's must be deferred to the next iteration. +C +C Since we need just 3 instructions per limb, further unrolling could approach +C 1.0 c/l. +C +C The main loop processes two limbs at a time. An odd size is handled by +C processing src[0] at the start. If the size is even that result is +C discarded, and src[0] is repeated by the main loop. +C + +ASM_START() +PROLOGUE(mpn_popcount) + + C r16 src + C r17 size + + ldq r0, 0(r16) C L0 src[0] + and r17, 1, r8 C U1 1 if size odd + srl r17, 1, r17 C U0 size, limb pairs + + s8addq r8, r16, r16 C L1 src++ if size odd + ctpop r0, r0 C U0 + beq r17, L(one) C U1 if size==1 + + cmoveq r8, r31, r0 C L discard first limb if size even + clr r3 C L + + clr r4 C L + unop C U + unop C L + unop C U + + + ALIGN(16) +L(top): + C r0 total accumulating + C r3 pop 0 + C r4 pop 1 + C r16 src, incrementing + C r17 size, decrementing + + ldq r1, 0(r16) C L + ldq r2, 8(r16) C L + lda r16, 16(r16) C U + lda r17, -1(r17) C U + + addq r0, r3, r0 C L + addq r0, r4, r0 C L + ctpop r1, r3 C U0 + ctpop r2, r4 C U0 + + ldl r31, 512(r16) C L prefetch + bne r17, L(top) C U + + + addq r0, r3, r0 C L + addq r0, r4, r0 C U +L(one): + ret r31, (r26), 1 C L0 + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/gmp-mparam.h b/gmp-6.3.0/mpn/alpha/gmp-mparam.h new file mode 100644 index 0000000..b850bd2 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/gmp-mparam.h @@ -0,0 +1,86 @@ +/* Alpha EV4 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2004, 2005, 2009 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + + +/* 175MHz 21064 */ + +/* Generated by tuneup.c, 2009-01-15, gcc 3.2 */ + +#define MUL_TOOM22_THRESHOLD 12 +#define MUL_TOOM33_THRESHOLD 69 +#define MUL_TOOM44_THRESHOLD 88 + +#define SQR_BASECASE_THRESHOLD 4 +#define SQR_TOOM2_THRESHOLD 20 +#define SQR_TOOM3_THRESHOLD 62 +#define SQR_TOOM4_THRESHOLD 155 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 40 +#define MULLO_MUL_N_THRESHOLD 202 + +#define DIV_SB_PREINV_THRESHOLD 0 /* preinv always */ +#define DIV_DC_THRESHOLD 38 +#define POWM_THRESHOLD 60 + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD_THRESHOLD 80 +#define GCD_DC_THRESHOLD 237 +#define GCDEXT_DC_THRESHOLD 198 +#define JACOBI_BASE_METHOD 2 + +#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1_THRESHOLD 2 +#define MOD_1_2_THRESHOLD 9 +#define MOD_1_4_THRESHOLD 20 +#define USE_PREINV_DIVREM_1 1 /* preinv always */ +#define USE_PREINV_MOD_1 1 /* preinv always */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define MODEXACT_1_ODD_THRESHOLD 0 /* always */ + +#define GET_STR_DC_THRESHOLD 20 +#define GET_STR_PRECOMPUTE_THRESHOLD 37 +#define SET_STR_DC_THRESHOLD 746 +#define SET_STR_PRECOMPUTE_THRESHOLD 1332 + +#define MUL_FFT_TABLE { 240, 480, 1344, 2304, 5120, 20480, 49152, 0 } +#define MUL_FFT_MODF_THRESHOLD 232 +#define MUL_FFT_THRESHOLD 1664 + +#define SQR_FFT_TABLE { 240, 480, 1216, 2304, 5120, 12288, 49152, 0 } +#define SQR_FFT_MODF_THRESHOLD 232 +#define SQR_FFT_THRESHOLD 1408 diff --git a/gmp-6.3.0/mpn/alpha/invert_limb.asm b/gmp-6.3.0/mpn/alpha/invert_limb.asm new file mode 100644 index 0000000..afc010f --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/invert_limb.asm @@ -0,0 +1,95 @@ +dnl Alpha mpn_invert_limb -- Invert a normalized limb. + +dnl Copyright 1996, 2000-2003, 2007, 2011, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: ? +C EV5: 137/140 (with BWX/without BWX) +C EV6: 71/72 (with BWX/without BWX) + +C This was compiler generated, with minimal manual edits. Surely several +C cycles could be cut with some thought. 
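For reference, the value this routine produces can be written directly in C (an illustrative sketch, assuming a 64-bit limb and a compiler with unsigned __int128; the assembly avoids the 128-bit division by starting from the approx_tab seed and refining it with multiply steps, each of which roughly doubles the number of correct bits):

#include <stdint.h>

/* For a normalized divisor d (top bit set), the reciprocal limb is
   floor((B^2 - 1) / d) - B, with B = 2^64.  The result always fits
   in one limb for such d. */
static uint64_t
invert_limb_sketch (uint64_t d)
{
  unsigned __int128 b2m1 = ~(unsigned __int128) 0;  /* B^2 - 1 */
  return (uint64_t) (b2m1 / d - ((unsigned __int128) 1 << 64));
}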
+ +ASM_START() +PROLOGUE(mpn_invert_limb,gp) + LEA( r2, approx_tab) + srl r16, 54, r1 + srl r16, 24, r4 + and r16, 1, r5 + bic r1, 1, r7 + lda r4, 1(r4) + srl r16, 1, r3 + addq r7, r2, r1 +ifelse(bwx_available_p,1,` + ldwu r0, -512(r1) +',` + ldq_u r0, -512(r1) + extwl r0, r7, r0 +') + addq r3, r5, r3 + mull r0, r0, r1 + sll r0, 11, r0 + mulq r1, r4, r1 + srl r1, 40, r1 + subq r0, r1, r0 + lda r0, -1(r0) + mulq r0, r0, r2 + sll r0, 60, r1 + sll r0, 13, r0 + mulq r2, r4, r2 + subq r1, r2, r1 + srl r1, 47, r1 + addq r0, r1, r0 + mulq r0, r3, r3 + srl r0, 1, r1 + cmoveq r5, 0, r1 + subq r1, r3, r1 + umulh r1, r0, r3 + sll r0, 31, r0 + srl r3, 1, r1 + addq r0, r1, r0 + mulq r0, r16, r2 + umulh r0, r16, r3 + addq r2, r16, r1 + addq r3, r16, r16 + cmpult r1, r2, r1 + addq r16, r1, r3 + subq r0, r3, r0 + ret r31, (r26), 1 +EPILOGUE() +DATASTART(approx_tab,8) +forloop(i,256,512-1,dnl +` .word eval(0x7fd00/i) +')dnl + SIZE(approx_tab, 512) + TYPE(approx_tab, object) +DATAEND() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/lshift.asm b/gmp-6.3.0/mpn/alpha/lshift.asm new file mode 100644 index 0000000..c62a856 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/lshift.asm @@ -0,0 +1,182 @@ +dnl Alpha mpn_lshift -- Shift a number left. + +dnl Copyright 1994, 1995, 2000, 2003, 2009 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: ? 
+C EV5: 3.25 +C EV6: 1.75 + +C INPUT PARAMETERS +C rp r16 +C up r17 +C n r18 +C cnt r19 + + +ASM_START() +PROLOGUE(mpn_lshift) + s8addq r18,r17,r17 C make r17 point at end of s1 + ldq r4,-8(r17) C load first limb + subq r31,r19,r20 + s8addq r18,r16,r16 C make r16 point at end of RES + subq r18,1,r18 + and r18,4-1,r28 C number of limbs in first loop + srl r4,r20,r0 C compute function result + + beq r28,L(L0) + subq r18,r28,r18 + + ALIGN(8) +L(top0): + ldq r3,-16(r17) + subq r16,8,r16 + sll r4,r19,r5 + subq r17,8,r17 + subq r28,1,r28 + srl r3,r20,r6 + bis r3,r3,r4 + bis r5,r6,r8 + stq r8,0(r16) + bne r28,L(top0) + +L(L0): sll r4,r19,r24 + beq r18,L(end) +C warm up phase 1 + ldq r1,-16(r17) + subq r18,4,r18 + ldq r2,-24(r17) + ldq r3,-32(r17) + ldq r4,-40(r17) +C warm up phase 2 + srl r1,r20,r7 + sll r1,r19,r21 + srl r2,r20,r8 + beq r18,L(end1) + ldq r1,-48(r17) + sll r2,r19,r22 + ldq r2,-56(r17) + srl r3,r20,r5 + bis r7,r24,r7 + sll r3,r19,r23 + bis r8,r21,r8 + srl r4,r20,r6 + ldq r3,-64(r17) + sll r4,r19,r24 + ldq r4,-72(r17) + subq r18,4,r18 + beq r18,L(end2) + ALIGN(16) +C main loop +L(top): stq r7,-8(r16) + bis r5,r22,r5 + stq r8,-16(r16) + bis r6,r23,r6 + + srl r1,r20,r7 + subq r18,4,r18 + sll r1,r19,r21 + unop C ldq r31,-96(r17) + + srl r2,r20,r8 + ldq r1,-80(r17) + sll r2,r19,r22 + ldq r2,-88(r17) + + stq r5,-24(r16) + bis r7,r24,r7 + stq r6,-32(r16) + bis r8,r21,r8 + + srl r3,r20,r5 + unop C ldq r31,-96(r17) + sll r3,r19,r23 + subq r16,32,r16 + + srl r4,r20,r6 + ldq r3,-96(r17) + sll r4,r19,r24 + ldq r4,-104(r17) + + subq r17,32,r17 + bne r18,L(top) +C cool down phase 2/1 +L(end2): + stq r7,-8(r16) + bis r5,r22,r5 + stq r8,-16(r16) + bis r6,r23,r6 + srl r1,r20,r7 + sll r1,r19,r21 + srl r2,r20,r8 + sll r2,r19,r22 + stq r5,-24(r16) + bis r7,r24,r7 + stq r6,-32(r16) + bis r8,r21,r8 + srl r3,r20,r5 + sll r3,r19,r23 + srl r4,r20,r6 + sll r4,r19,r24 +C cool down phase 2/2 + stq r7,-40(r16) + bis r5,r22,r5 + stq r8,-48(r16) + bis r6,r23,r6 + stq r5,-56(r16) + stq r6,-64(r16) +C cool down phase 2/3 + stq r24,-72(r16) + ret r31,(r26),1 + +C cool down phase 1/1 +L(end1): + sll r2,r19,r22 + srl r3,r20,r5 + bis r7,r24,r7 + sll r3,r19,r23 + bis r8,r21,r8 + srl r4,r20,r6 + sll r4,r19,r24 +C cool down phase 1/2 + stq r7,-8(r16) + bis r5,r22,r5 + stq r8,-16(r16) + bis r6,r23,r6 + stq r5,-24(r16) + stq r6,-32(r16) + stq r24,-40(r16) + ret r31,(r26),1 + +L(end): stq r24,-8(r16) + ret r31,(r26),1 +EPILOGUE(mpn_lshift) +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/mod_34lsub1.asm b/gmp-6.3.0/mpn/alpha/mod_34lsub1.asm new file mode 100644 index 0000000..1b03b63 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/mod_34lsub1.asm @@ -0,0 +1,164 @@ +dnl Alpha mpn_mod_34lsub1. + +dnl Copyright 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: 4 (?) +C EV5: 2.67 +C EV6: 1.67 + + +dnl INPUT PARAMETERS +dnl up r16 +dnl n r17 + +define(`l0',`r18') +define(`l1',`r19') +define(`l2',`r20') +define(`a0',`r21') +define(`a1',`r22') +define(`a2',`r23') +define(`c0',`r24') +define(`c1',`r5') +define(`c2',`r6') + +ASM_START() +PROLOGUE(mpn_mod_34lsub1) + bis r31, r31, c0 + bis r31, r31, c1 + bis r31, r31, c2 + + lda r17, -3(r17) + bge r17, $L_3_or_more + bis r31, r31, a0 + bis r31, r31, a1 + bis r31, r31, a2 + br r31, $L_012 + +$L_3_or_more: + ldq a0, 0(r16) + ldq a1, 8(r16) + ldq a2, 16(r16) + lda r16, 24(r16) + lda r17, -3(r17) + blt r17, $L_012 + +$L_6_or_more: + ldq l0, 0(r16) + ldq l1, 8(r16) + ldq l2, 16(r16) + addq l0, a0, a0 + + lda r16, 24(r16) + lda r17, -3(r17) + blt r17, $L_end + + ALIGN(16) +C Main loop +$L_9_or_more: +$Loop: cmpult a0, l0, r0 + ldq l0, 0(r16) + addq r0, c0, c0 + addq l1, a1, a1 + cmpult a1, l1, r0 + ldq l1, 8(r16) + addq r0, c1, c1 + addq l2, a2, a2 + cmpult a2, l2, r0 + ldq l2, 16(r16) + addq r0, c2, c2 + addq l0, a0, a0 + lda r16, 24(r16) + lda r17, -3(r17) + bge r17, $Loop + +$L_end: cmpult a0, l0, r0 + addq r0, c0, c0 + addq l1, a1, a1 + cmpult a1, l1, r0 + addq r0, c1, c1 + addq l2, a2, a2 + cmpult a2, l2, r0 + addq r0, c2, c2 + +C Handle the last (n mod 3) limbs +$L_012: lda r17, 2(r17) + blt r17, $L_0 + ldq l0, 0(r16) + addq l0, a0, a0 + cmpult a0, l0, r0 + addq r0, c0, c0 + beq r17, $L_0 + ldq l1, 8(r16) + addq l1, a1, a1 + cmpult a1, l1, r0 + addq r0, c1, c1 + +C Align and sum our 3 main accumulators and 3 carry accumulators +$L_0: srl a0, 48, r2 + srl a1, 32, r4 +ifdef(`HAVE_LIMB_LITTLE_ENDIAN', +` insll a1, 2, r1', C (a1 & 0xffffffff) << 16 +` zapnot a1, 15, r25 + sll r25, 16, r1') + zapnot a0, 63, r0 C a0 & 0xffffffffffff + srl a2, 16, a1 +ifdef(`HAVE_LIMB_LITTLE_ENDIAN', +` inswl a2, 4, r3', C (a2 & 0xffff) << 32 +` zapnot a2, 3, r25 + sll r25, 32, r3') + addq r1, r4, r1 + addq r0, r2, r0 + srl c0, 32, a2 +ifdef(`HAVE_LIMB_LITTLE_ENDIAN', +` insll c0, 2, r4', C (c0 & 0xffffffff) << 16 +` zapnot c0, 15, r25 + sll r25, 16, r4') + addq r0, r1, r0 + addq r3, a1, r3 + addq r0, r3, r0 + srl c1, 16, c0 +ifdef(`HAVE_LIMB_LITTLE_ENDIAN', +` inswl c1, 4, r2', C (c1 & 0xffff) << 32 +` zapnot c1, 3, r25 + sll r25, 32, r2') + addq r4, a2, r4 +C srl c2, 48, r3 C This will be 0 in practise + zapnot c2, 63, r1 C r1 = c2 & 0xffffffffffff + addq r0, r4, r0 + addq r2, c0, r2 + addq r0, r2, r0 +C addq r1, r3, r1 + addq r0, r1, r0 + + ret r31, (r26), 1 +EPILOGUE(mpn_mod_34lsub1) +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/mode1o.asm b/gmp-6.3.0/mpn/alpha/mode1o.asm new file mode 100644 index 0000000..96dccc7 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/mode1o.asm @@ -0,0 +1,209 @@ +dnl Alpha mpn_modexact_1c_odd -- mpn exact remainder + +dnl Copyright 2003, 2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C EV4: 47 +C EV5: 30 +C EV6: 15 + + +C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, +C mp_limb_t c) +C +C This code follows the "alternate" code in mpn/generic/mode1o.c, +C eliminating cbit+climb from the dependent chain. This leaves, +C +C ev4 ev5 ev6 +C 1 3 1 subq y = x - h +C 23 13 7 mulq q = y * inverse +C 23 14 7 umulh h = high (q * d) +C -- -- -- +C 47 30 15 +C +C In each case, the load latency, loop control, and extra carry bit handling +C hide under the multiply latencies. Those latencies are long enough that +C we don't need to worry about alignment or pairing to squeeze out +C performance. +C +C For the first limb, some of the loop code is broken out and scheduled back +C since it can be done earlier. +C +C - The first ldq src[0] is near the start of the routine, for maximum +C time from memory. +C +C - The subq y=x-climb can be done without waiting for the inverse. +C +C - The mulq y*inverse is replicated after the final subq for the inverse, +C instead of branching to the mulq in the main loop. On ev4 a branch +C there would cost cycles, but we can hide them under the mulq latency. 
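Concretely, the loop described above looks roughly like this in C (an illustrative sketch, not the GMP source: it assumes a 64-bit limb, uses unsigned __int128 for the umulh, computes the inverse with Newton steps instead of the table lookup used below, and omits the special last-limb handling that gets the result fully reduced):

#include <stdint.h>
#include <stddef.h>

/* Inverse of odd d mod 2^64; each step doubles the correct low bits. */
static uint64_t
binvert64 (uint64_t d)
{
  uint64_t inv = (3 * d) ^ 2;  /* at least 5 correct bits */
  inv *= 2 - d * inv;          /* 10 */
  inv *= 2 - d * inv;          /* 20 */
  inv *= 2 - d * inv;          /* 40 */
  inv *= 2 - d * inv;          /* 80 >= 64 */
  return inv;
}

static uint64_t
modexact_1c_odd_sketch (const uint64_t *src, size_t size, uint64_t d,
                        uint64_t c)
{
  uint64_t inv = binvert64 (d);
  uint64_t cbit = 0, climb = c;
  for (size_t i = 0; i < size; i++)
    {
      uint64_t s = src[i];
      uint64_t x = s - cbit;            /* subq: x = s - cbit */
      uint64_t cbit2 = s < cbit;        /* cmpult: borrow out */
      uint64_t y = x - climb;           /* subq: y = x - climb */
      cbit = cbit2 + (x < climb);       /* collect both borrows */
      uint64_t q = y * inv;             /* mulq: q = y * inverse */
      climb = (uint64_t) (((unsigned __int128) q * d) >> 64);  /* umulh */
    }
  return climb + cbit;
}

Note how cbit never feeds the subq/mulq/umulh chain, which is the whole point of the instruction-count table above.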
+C +C For the last limb, high> 1 + + and r20, 127, r20 C idx = d>>1 & 0x7F + + addq r0, r20, r21 C table + idx + +ifelse(bwx_available_p,1, +` ldbu r20, 0(r21) C table[idx], inverse 8 bits +',` + ldq_u r20, 0(r21) C table[idx] qword + extbl r20, r21, r20 C table[idx], inverse 8 bits +') + + mull r20, r20, r7 C i*i + addq r20, r20, r20 C 2*i + + ldq r2, 0(r16) C x = s = src[0] + lda r17, -1(r17) C size-- + clr r0 C initial cbit=0 + + mull r7, r18, r7 C i*i*d + + subq r20, r7, r20 C 2*i-i*i*d, inverse 16 bits + + mull r20, r20, r7 C i*i + addq r20, r20, r20 C 2*i + + mull r7, r18, r7 C i*i*d + + subq r20, r7, r20 C 2*i-i*i*d, inverse 32 bits + + mulq r20, r20, r7 C i*i + addq r20, r20, r20 C 2*i + + mulq r7, r18, r7 C i*i*d + subq r2, r19, r3 C y = x - climb + + subq r20, r7, r20 C inv = 2*i-i*i*d, inverse 64 bits + +ASSERT(r7, C should have d*inv==1 mod 2^64 +` mulq r18, r20, r7 + cmpeq r7, 1, r7') + + mulq r3, r20, r4 C first q = y * inv + + beq r17, L(one) C if size==1 + br L(entry) + + +L(top): + C r0 cbit + C r16 src, incrementing + C r17 size, decrementing + C r18 d + C r19 climb + C r20 inv + + ldq r1, 0(r16) C s = src[i] + subq r1, r0, r2 C x = s - cbit + cmpult r1, r0, r0 C new cbit = s < cbit + + subq r2, r19, r3 C y = x - climb + + mulq r3, r20, r4 C q = y * inv +L(entry): + cmpult r2, r19, r5 C cbit2 = x < climb + addq r5, r0, r0 C cbit += cbit2 + lda r16, 8(r16) C src++ + lda r17, -1(r17) C size-- + + umulh r4, r18, r19 C climb = q * d + bne r17, L(top) C while 2 or more limbs left + + + + C r0 cbit + C r18 d + C r19 climb + C r20 inv + + ldq r1, 0(r16) C s = src[size-1] high limb + + cmpult r1, r18, r2 C test high 0 +dnl and store difference in a third limb vector. + +dnl Copyright 1995, 1999, 2000, 2005, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C EV4: ? 
+C EV5: 4.75 +C EV6: 3 + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_sub_nc) + bis r31,r20,r25 + br L(com) +EPILOGUE() +PROLOGUE(mpn_sub_n) + bis r31,r31,r25 C clear cy +L(com): subq r19,4,r19 C decr loop cnt + blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop +C Start software pipeline for 1st loop + ldq r0,0(r18) + ldq r4,0(r17) + ldq r1,8(r18) + ldq r5,8(r17) + addq r17,32,r17 C update s1_ptr + subq r4,r0,r28 C 1st main subtract + ldq r2,16(r18) + subq r28,r25,r20 C 1st carry subtract + ldq r3,24(r18) + cmpult r4,r0,r8 C compute cy from last subtract + ldq r6,-16(r17) + cmpult r28,r25,r25 C compute cy from last subtract + ldq r7,-8(r17) + bis r8,r25,r25 C combine cy from the two subtracts + subq r19,4,r19 C decr loop cnt + subq r5,r1,r28 C 2nd main subtract + addq r18,32,r18 C update s2_ptr + subq r28,r25,r21 C 2nd carry subtract + cmpult r5,r1,r8 C compute cy from last subtract + blt r19,$Lend1 C if less than 4 limbs remain, jump +C 1st loop handles groups of 4 limbs in a software pipeline + ALIGN(16) +$Loop: cmpult r28,r25,r25 C compute cy from last subtract + ldq r0,0(r18) + bis r8,r25,r25 C combine cy from the two subtracts + ldq r1,8(r18) + subq r6,r2,r28 C 3rd main subtract + ldq r4,0(r17) + subq r28,r25,r22 C 3rd carry subtract + ldq r5,8(r17) + cmpult r6,r2,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two subtracts + stq r21,8(r16) + subq r7,r3,r28 C 4th main subtract + subq r28,r25,r23 C 4th carry subtract + cmpult r7,r3,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + addq r17,32,r17 C update s1_ptr + bis r8,r25,r25 C combine cy from the two subtracts + addq r16,32,r16 C update res_ptr + subq r4,r0,r28 C 1st main subtract + ldq r2,16(r18) + subq r28,r25,r20 C 1st carry subtract + ldq r3,24(r18) + cmpult r4,r0,r8 C compute cy from last subtract + ldq r6,-16(r17) + cmpult r28,r25,r25 C compute cy from last subtract + ldq r7,-8(r17) + bis r8,r25,r25 C combine cy from the two subtracts + subq r19,4,r19 C decr loop cnt + stq r22,-16(r16) + subq r5,r1,r28 C 2nd main subtract + stq r23,-8(r16) + subq r28,r25,r21 C 2nd carry subtract + addq r18,32,r18 C update s2_ptr + cmpult r5,r1,r8 C compute cy from last subtract + bge r19,$Loop +C Finish software pipeline for 1st loop +$Lend1: cmpult r28,r25,r25 C compute cy from last subtract + bis r8,r25,r25 C combine cy from the two subtracts + subq r6,r2,r28 C cy add + subq r28,r25,r22 C 3rd main subtract + cmpult r6,r2,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two subtracts + stq r21,8(r16) + subq r7,r3,r28 C cy add + subq r28,r25,r23 C 4th main subtract + cmpult r7,r3,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + bis r8,r25,r25 C combine cy from the two subtracts + addq r16,32,r16 C update res_ptr + stq r22,-16(r16) + stq r23,-8(r16) +$Lend2: addq r19,4,r19 C restore loop cnt + beq r19,$Lret +C Start software pipeline for 2nd loop + ldq r0,0(r18) + ldq r4,0(r17) + subq r19,1,r19 + beq r19,$Lend0 +C 2nd loop handles remaining 1-3 limbs + ALIGN(16) +$Loop0: subq r4,r0,r28 C main subtract + cmpult r4,r0,r8 C compute cy from last subtract + ldq r0,8(r18) + ldq r4,8(r17) + subq r28,r25,r20 C carry subtract + addq r18,8,r18 + addq r17,8,r17 + stq r20,0(r16) + cmpult r28,r25,r25 C compute cy from 
last subtract + subq r19,1,r19 C decr loop cnt + bis r8,r25,r25 C combine cy from the two subtracts + addq r16,8,r16 + bne r19,$Loop0 +$Lend0: subq r4,r0,r28 C main subtract + subq r28,r25,r20 C carry subtract + cmpult r4,r0,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two subtracts + +$Lret: bis r25,r31,r0 C return cy + ret r31,(r26),1 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/submul_1.asm b/gmp-6.3.0/mpn/alpha/submul_1.asm new file mode 100644 index 0000000..2b63b52 --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/submul_1.asm @@ -0,0 +1,99 @@ +dnl Alpha mpn_submul_1 -- Multiply a limb vector with a limb and subtract +dnl the result from a second limb vector. + +dnl Copyright 1992, 1994, 1995, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
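For orientation before the assembly, the same operation in portable C (a sketch assuming 64-bit limbs and unsigned __int128; the real loop below keeps the multiply's carry and the subtract's borrow in separate registers and folds them with an addq):

#include <stdint.h>
#include <stddef.h>

/* rp[] -= up[] * vl, returning the final carry limb (high limb of the
   last product plus all propagated borrows). */
static uint64_t
submul_1_sketch (uint64_t *rp, const uint64_t *up, size_t n, uint64_t vl)
{
  uint64_t cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * vl;
      uint64_t plo = (uint64_t) p;
      uint64_t phi = (uint64_t) (p >> 64);
      plo += cy;                 /* fold in previous carry limb */
      phi += plo < cy;           /* its carry-out */
      uint64_t r = rp[i];
      rp[i] = r - plo;
      cy = phi + (r < plo);      /* borrow from the subtract */
    }
  return cy;
}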
+ +include(`../config.m4') + +C cycles/limb +C EV4: 42 +C EV5: 18 +C EV6: 7 + +C INPUT PARAMETERS +C rp r16 +C up r17 +C n r18 +C limb r19 + + +ASM_START() +PROLOGUE(mpn_submul_1) + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + umulh r2,r19,r0 C r0 = prod_high + beq r18,$Lend1 C jump if size was == 1 + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + subq r5,r3,r3 + cmpult r5,r3,r4 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + beq r18,$Lend2 C jump if size was == 2 + + ALIGN(8) +$Loop: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + subq r18,1,r18 C size-- + umulh r2,r19,r4 C r4 = cy_limb + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + subq r5,r3,r3 + cmpult r5,r3,r5 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + addq r5,r0,r0 C combine carries + bne r18,$Loop + +$Lend2: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r4 C r4 = cy_limb + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + subq r5,r3,r3 + cmpult r5,r3,r5 + stq r3,0(r16) + addq r5,r0,r0 C combine carries + addq r4,r0,r0 C cy_limb = prod_high + cy + ret r31,(r26),1 +$Lend1: subq r5,r3,r3 + cmpult r5,r3,r5 + stq r3,0(r16) + addq r0,r5,r0 + ret r31,(r26),1 +EPILOGUE(mpn_submul_1) +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/umul.asm b/gmp-6.3.0/mpn/alpha/umul.asm new file mode 100644 index 0000000..039081e --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/umul.asm @@ -0,0 +1,44 @@ +dnl mpn_umul_ppmm -- 1x1->2 limb multiplication + +dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2); +C + +ASM_START() +PROLOGUE(mpn_umul_ppmm) + mulq r17, r18, r1 + umulh r17, r18, r0 + stq r1, 0(r16) + ret r31, (r26), 1 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/alpha/unicos.m4 b/gmp-6.3.0/mpn/alpha/unicos.m4 new file mode 100644 index 0000000..e05cf5c --- /dev/null +++ b/gmp-6.3.0/mpn/alpha/unicos.m4 @@ -0,0 +1,131 @@ +divert(-1) + +dnl m4 macros for alpha assembler on unicos. + + +dnl Copyright 2000, 2002-2004, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl Note that none of the standard GMP_ASM_ autoconf tests are done for +dnl unicos, so none of the config.m4 results can be used here. + +dnl No underscores on unicos +define(`GSYM_PREFIX') + +define(`ASM_START', +m4_assert_numargs(0) +` .ident dummy') + +define(`X', +m4_assert_numargs(1) +`^X$1') + +define(`FLOAT64', +m4_assert_numargs(2) +` .psect $1@crud,data +$1: .t_floating $2 + .endp') + +dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,gp|noalign]) +dnl EPILOGUE_cpu(GSYM_PREFIX`'foo) + +define(`PROLOGUE_cpu', +m4_assert_numargs_range(1,2) +`ifelse(`$2',gp,, +`ifelse(`$2',noalign,, +`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter +')')')')dnl + .stack 192 ; What does this mean? Only Cray knows. + .psect $1@code,code,cache +$1::') + +define(`EPILOGUE_cpu', +m4_assert_numargs(1) +` .endp') + + +dnl Usage: LDGP(dst,src) +dnl +dnl Emit an "ldgp dst,src", but only on systems using a GOT (which unicos +dnl doesn't). + +define(LDGP, +m4_assert_numargs(2) +) + + +dnl Usage: EXTERN(variable_name) +define(`EXTERN', +m4_assert_numargs(1) +` .extern $1') + +define(`DATASTART', +m4_assert_numargs_range(1,2) +` .psect $1@crud,data + ALIGN(ifelse($#,1,2,$2)) +$1:') + +define(`DATAEND', +m4_assert_numargs(0) +` .endp') + +define(`ASM_END', +m4_assert_numargs(0) +` .end') + +define(`cvttqc', +m4_assert_numargs(-1) +`cvttq/c') + +dnl Load a symbolic address into a register +define(`LEA', +m4_assert_numargs(2) + `laum $1, $2(r31) + sll $1, 32, $1 + lalm $1, $2($1) + lal $1, $2($1)') + + +dnl Usage: ALIGN(bytes) +dnl +dnl Unicos assembler .align emits zeros, even in code segments, so disable +dnl aligning. +dnl +dnl GCC uses a macro emitting nops until the desired alignment is reached +dnl (see unicosmk_file_start in alpha.c). Could do something like that if +dnl we cared. The maximum desired alignment must be established at the +dnl start of the section though, since of course emitting nops only +dnl advances relative to the section beginning.
+ +define(`ALIGN', +m4_assert_numargs(1) +) + + +divert diff --git a/gmp-6.3.0/mpn/and_n.c b/gmp-6.3.0/mpn/and_n.c new file mode 120000 index 0000000..0a553d9 --- /dev/null +++ b/gmp-6.3.0/mpn/and_n.c @@ -0,0 +1 @@ +../mpn/generic/logops_n.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/andn_n.c b/gmp-6.3.0/mpn/andn_n.c new file mode 120000 index 0000000..0a553d9 --- /dev/null +++ b/gmp-6.3.0/mpn/andn_n.c @@ -0,0 +1 @@ +../mpn/generic/logops_n.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/arm/README b/gmp-6.3.0/mpn/arm/README new file mode 100644 index 0000000..53c7214 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/README @@ -0,0 +1,35 @@ +Copyright 2002, 2012, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + +This directory contains mpn functions for ARM processors. It has been +optimised mainly for Cortex-A9 and Cortex-A15, but the code in the top-level +directory should run on all ARM processors at architecture level v4 or later. diff --git a/gmp-6.3.0/mpn/arm/aors_n.asm b/gmp-6.3.0/mpn/arm/aors_n.asm new file mode 100644 index 0000000..fdad9f7 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/aors_n.asm @@ -0,0 +1,112 @@ +dnl ARM mpn_add_n and mpn_sub_n + +dnl Contributed to the GNU project by Robert Harley. + +dnl Copyright 1997, 2000, 2001, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? 
+C Cortex-A9 2.5 slightly fluctuating +C Cortex-A15 2.25 + +define(`rp', `r0') +define(`up', `r1') +define(`vp', `r2') +define(`n', `r3') + +ifdef(`OPERATION_add_n', ` + define(`ADDSUB', adds) + define(`ADDSUBC', adcs) + define(`CLRCY', `cmn r0, #0') + define(`SETCY', `cmp $1, #1') + define(`RETVAL', `adc r0, n, #0') + define(`func', mpn_add_n) + define(`func_nc', mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(`ADDSUB', subs) + define(`ADDSUBC', sbcs) + define(`CLRCY', `cmp r0, r0') + define(`SETCY', `rsbs $1, $1, #0') + define(`RETVAL', `sbc r0, r0, r0 + and r0, r0, #1') + define(`func', mpn_sub_n) + define(`func_nc', mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ASM_START() +PROLOGUE(func_nc) + ldr r12, [sp, #0] + stmfd sp!, { r8, r9, lr } + SETCY( r12) + b L(ent) +EPILOGUE() +PROLOGUE(func) + stmfd sp!, { r8, r9, lr } + CLRCY( r12) +L(ent): tst n, #1 + beq L(skip1) + ldr r12, [up], #4 + ldr lr, [vp], #4 + ADDSUBC r12, r12, lr + str r12, [rp], #4 +L(skip1): + tst n, #2 + beq L(skip2) + ldmia up!, { r8, r9 } + ldmia vp!, { r12, lr } + ADDSUBC r8, r8, r12 + ADDSUBC r9, r9, lr + stmia rp!, { r8, r9 } +L(skip2): + bics n, n, #3 + beq L(rtn) + stmfd sp!, { r4, r5, r6, r7 } + +L(top): ldmia up!, { r4, r5, r6, r7 } + ldmia vp!, { r8, r9, r12, lr } + ADDSUBC r4, r4, r8 + sub n, n, #4 + ADDSUBC r5, r5, r9 + ADDSUBC r6, r6, r12 + ADDSUBC r7, r7, lr + stmia rp!, { r4, r5, r6, r7 } + teq n, #0 + bne L(top) + + ldmfd sp!, { r4, r5, r6, r7 } + +L(rtn): RETVAL + ldmfd sp!, { r8, r9, pc } +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/aorslsh1_n.asm b/gmp-6.3.0/mpn/arm/aorslsh1_n.asm new file mode 100644 index 0000000..889e654 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/aorslsh1_n.asm @@ -0,0 +1,167 @@ +dnl ARM mpn_addlsh1_n and mpn_sublsh1_n + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C addlsh1_n sublsh1_n +C cycles/limb cycles/limb +C StrongARM ? ? +C XScale ? ? +C Cortex-A7 ? ? +C Cortex-A8 ? ? +C Cortex-A9 3.12 3.7 +C Cortex-A15 ? ? + +C TODO +C * The addlsh1_n code runs well, but is only barely faster than mpn_addmul_1. +C The sublsh1_n code could surely be tweaked, its REVCY slows down things +C very much. If two insns are really needed, it might help to separate them +C for better micro-parallelism. 
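Stripped of the scheduling and flag juggling, the operation is plain in C (an illustrative sketch with 32-bit limbs to match this code; both carries are just separate variables here, whereas the assembly has a single flags register, hence the SAVECY/RESTCY shuffling):

#include <stdint.h>
#include <stddef.h>

/* rp[] = up[] +- (vp[] << 1); shown for the add case.  Returns the
   carry out, 0..2: one bit from the shift, one from the addition. */
static uint32_t
addlsh1_n_sketch (uint32_t *rp, const uint32_t *up, const uint32_t *vp,
                  size_t n)
{
  uint32_t shift_cy = 0, add_cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint32_t v2 = (vp[i] << 1) | shift_cy; /* 2*vp[i] + bit from below */
      shift_cy = vp[i] >> 31;
      uint32_t s = up[i] + v2;
      uint32_t c = s < v2;                   /* carry from the add */
      uint32_t r = s + add_cy;
      add_cy = c + (r < add_cy);             /* at most one can be set */
      rp[i] = r;
    }
  return shift_cy + add_cy;
}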
+ +define(`rp', `r0') +define(`up', `r1') +define(`vp', `r2') +define(`n', `r3') + +ifdef(`OPERATION_addlsh1_n', ` + define(`ADDSUB', adds) + define(`ADDSUBC', adcs) + define(`SETCY', `cmp $1, #1') + define(`RETVAL', `adc r0, $1, #2') + define(`SAVECY', `sbc $1, $2, #0') + define(`RESTCY', `cmn $1, #1') + define(`REVCY', `') + define(`INICYR', `mov $1, #0') + define(`r10r11', `r11') + define(`func', mpn_addlsh1_n) + define(`func_nc', mpn_addlsh1_nc)') +ifdef(`OPERATION_sublsh1_n', ` + define(`ADDSUB', subs) + define(`ADDSUBC', sbcs) + define(`SETCY', `rsbs $1, $1, #0') + define(`RETVAL', `adc r0, $1, #1') + define(`SAVECY', `sbc $1, $1, $1') + define(`RESTCY', `cmn $1, #1') + define(`REVCY', `sbc $1, $1, $1 + cmn $1, #1') + define(`INICYR', `mvn $1, #0') + define(`r10r11', `r10') + define(`func', mpn_sublsh1_n) + define(`func_nc', mpn_sublsh1_nc)') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n) + +ASM_START() +PROLOGUE(func) + push {r4-r10r11, r14} + +ifdef(`OPERATION_addlsh1_n', ` + mvn r11, #0 +') + INICYR( r14) + subs n, n, #3 + blt L(le2) C carry clear on branch path + + cmn r0, #0 C clear carry + ldmia vp!, {r8, r9, r10} + b L(mid) + +L(top): RESTCY( r14) + ADDSUBC r4, r4, r8 + ADDSUBC r5, r5, r9 + ADDSUBC r6, r6, r10 + ldmia vp!, {r8, r9, r10} + stmia rp!, {r4, r5, r6} + REVCY(r14) + adcs r8, r8, r8 + adcs r9, r9, r9 + adcs r10, r10, r10 + ldmia up!, {r4, r5, r6} + SAVECY( r14, r11) + subs n, n, #3 + blt L(exi) + RESTCY( r12) + ADDSUBC r4, r4, r8 + ADDSUBC r5, r5, r9 + ADDSUBC r6, r6, r10 + ldmia vp!, {r8, r9, r10} + stmia rp!, {r4, r5, r6} + REVCY(r12) +L(mid): adcs r8, r8, r8 + adcs r9, r9, r9 + adcs r10, r10, r10 + ldmia up!, {r4, r5, r6} + SAVECY( r12, r11) + subs n, n, #3 + bge L(top) + + mov r7, r12 C swap alternating... + mov r12, r14 C ...carry-save... + mov r14, r7 C ...registers + +L(exi): RESTCY( r12) + ADDSUBC r4, r4, r8 + ADDSUBC r5, r5, r9 + ADDSUBC r6, r6, r10 + stmia rp!, {r4, r5, r6} + + REVCY(r12) +L(le2): tst n, #1 C n = {-1,-2,-3} map to [2], [1], [0] + beq L(e1) + +L(e02): tst n, #2 + beq L(rt0) + ldm vp, {r8, r9} + adcs r8, r8, r8 + adcs r9, r9, r9 + ldm up, {r4, r5} + SAVECY( r12, r11) + RESTCY( r14) + ADDSUBC r4, r4, r8 + ADDSUBC r5, r5, r9 + stm rp, {r4, r5} + b L(rt1) + +L(e1): ldr r8, [vp] + adcs r8, r8, r8 + ldr r4, [up] + SAVECY( r12, r11) + RESTCY( r14) + ADDSUBC r4, r4, r8 + str r4, [rp] + +L(rt1): mov r14, r12 + REVCY(r12) +L(rt0): RETVAL( r14) + pop {r4-r10r11, r14} + return r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/aorsmul_1.asm b/gmp-6.3.0/mpn/arm/aorsmul_1.asm new file mode 100644 index 0000000..b02fbb3 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/aorsmul_1.asm @@ -0,0 +1,135 @@ +dnl ARM mpn_addmul_1 and mpn_submul_1. + +dnl Copyright 1998, 2000, 2001, 2003, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM: ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 5.25 +C Cortex-A15 4 + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`vl', `r3') +define(`rl', `r12') +define(`ul', `r6') +define(`r', `lr') + +ifdef(`OPERATION_addmul_1', ` + define(`ADDSUB', adds) + define(`ADDSUBC', adcs) + define(`CLRRCY', `mov $1, #0 + adds r0, r0, #0') + define(`RETVAL', `adc r0, r4, #0') + define(`func', mpn_addmul_1)') +ifdef(`OPERATION_submul_1', ` + define(`ADDSUB', subs) + define(`ADDSUBC', sbcs) + define(`CLRRCY', `subs $1, r0, r0') + define(`RETVAL', `sbc r0, r0, r0 + sub r0, $1, r0') + define(`func', mpn_submul_1)') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() +PROLOGUE(func) + stmfd sp!, { r4-r6, lr } + CLRRCY( r4) + tst n, #1 + beq L(skip1) + ldr ul, [up], #4 + ldr rl, [rp, #0] + umull r5, r4, ul, vl + ADDSUB r, rl, r5 + str r, [rp], #4 +L(skip1): + tst n, #2 + beq L(skip2) + ldr ul, [up], #4 + ldr rl, [rp, #0] + mov r5, #0 + umlal r4, r5, ul, vl + ldr ul, [up], #4 + ADDSUBC r, rl, r4 + ldr rl, [rp, #4] + mov r4, #0 + umlal r5, r4, ul, vl + str r, [rp], #4 + ADDSUBC r, rl, r5 + str r, [rp], #4 +L(skip2): + bics n, n, #3 + beq L(rtn) + + ldr ul, [up], #4 + ldr rl, [rp, #0] + mov r5, #0 + umlal r4, r5, ul, vl + b L(in) + +L(top): ldr ul, [up], #4 + ADDSUBC r, rl, r5 + ldr rl, [rp, #4] + mov r5, #0 + umlal r4, r5, ul, vl + str r, [rp], #4 +L(in): ldr ul, [up], #4 + ADDSUBC r, rl, r4 + ldr rl, [rp, #4] + mov r4, #0 + umlal r5, r4, ul, vl + str r, [rp], #4 + ldr ul, [up], #4 + ADDSUBC r, rl, r5 + ldr rl, [rp, #4] + mov r5, #0 + umlal r4, r5, ul, vl + str r, [rp], #4 + ldr ul, [up], #4 + ADDSUBC r, rl, r4 + ldr rl, [rp, #4] + mov r4, #0 + umlal r5, r4, ul, vl + sub n, n, #4 + tst n, n + str r, [rp], #4 + bne L(top) + + ADDSUBC r, rl, r5 + str r, [rp] + +L(rtn): RETVAL( r4) + ldmfd sp!, { r4-r6, pc } +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/arm-defs.m4 b/gmp-6.3.0/mpn/arm/arm-defs.m4 new file mode 100644 index 0000000..4b4fa0b --- /dev/null +++ b/gmp-6.3.0/mpn/arm/arm-defs.m4 @@ -0,0 +1,100 @@ +divert(-1) + +dnl m4 macros for ARM assembler. + +dnl Copyright 2001, 2012-2016, 2018-2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl Standard commenting is with @, the default m4 # is for constants and we +dnl don't want to disable macro expansions in or after them. + +changecom(@&*$) + +define(`ASM_START', +m4_assert_numargs_range(0,1) +`ifelse($1,`neon',`.fpu neon', + $1,,`', + 1,1,`m4_error(`$0 got invalid argument $1')')') + +dnl APCS register names. + +deflit(a1,r0) +deflit(a2,r1) +deflit(a3,r2) +deflit(a4,r3) +dnl deflit(v1,r4) +dnl deflit(v2,r5) +dnl deflit(v3,r6) +dnl deflit(v4,r7) +dnl deflit(v5,r8) +dnl deflit(v6,r9) +deflit(sb,r9) +dnl deflit(v7,r10) +deflit(sl,r10) +deflit(fp,r11) +deflit(ip,r12) +dnl deflit(sp,r13) +deflit(lr,r14) +deflit(pc,r15) + + +define(`lea_list', `') +define(`lea_num',0) + +dnl LEA(reg,gmp_symbol) +dnl +dnl Load the address of gmp_symbol into a register. The gmp_symbol must be +dnl either local or protected/hidden, since we assume it has a fixed distance +dnl from the point of use. + +define(`LEA',`dnl +ldr $1, L(ptr`'lea_num) +ifdef(`PIC',dnl +`dnl +L(bas`'lea_num):dnl + add $1, $1, pc`'dnl + m4append(`lea_list',` +L(ptr'lea_num`): .word GSYM_PREFIX`'$2-L(bas'lea_num`)-8') + define(`lea_num', eval(lea_num+1))dnl +',`dnl + m4append(`lea_list',` +L(ptr'lea_num`): .word GSYM_PREFIX`'$2') + define(`lea_num', eval(lea_num+1))dnl +')dnl +') + +define(`return',`ifdef(`NOTHUMB',`mov pc, ',`bx')') + + +define(`EPILOGUE_cpu', +`lea_list + SIZE(`$1',.-`$1')' +`define(`lea_list', `')') + +divert diff --git a/gmp-6.3.0/mpn/arm/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/arm/bdiv_dbm1c.asm new file mode 100644 index 0000000..b919dc4 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/bdiv_dbm1c.asm @@ -0,0 +1,113 @@ +dnl ARM mpn_bdiv_dbm1c. + +dnl Copyright 2008, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 4.25 +C Cortex-A15 2.5 + +C TODO +C * Try using umlal or umaal. +C * Try using ldm/stm. 
+ +define(`qp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`bd', `r3') +define(`cy', `sp,#0') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_bdiv_dbm1c) + push {r4, r5, r6, r7, r8} + ldr r4, [up], #4 + ldr r5, [sp, #20] + ands r12, n, #3 + beq L(fi0) + cmp r12, #2 + bcc L(fi1) + beq L(fi2) + +L(fi3): umull r8, r12, r4, bd + ldr r4, [up], #4 + b L(lo3) + +L(fi0): umull r6, r7, r4, bd + ldr r4, [up], #4 + b L(lo0) + +L(fi1): subs n, n, #1 + umull r8, r12, r4, bd + bls L(wd1) + ldr r4, [up], #4 + b L(lo1) + +L(fi2): umull r6, r7, r4, bd + ldr r4, [up], #4 + b L(lo2) + +L(top): ldr r4, [up], #4 + subs r5, r5, r6 + str r5, [qp], #4 + sbc r5, r5, r7 +L(lo1): umull r6, r7, r4, bd + ldr r4, [up], #4 + subs r5, r5, r8 + str r5, [qp], #4 + sbc r5, r5, r12 +L(lo0): umull r8, r12, r4, bd + ldr r4, [up], #4 + subs r5, r5, r6 + str r5, [qp], #4 + sbc r5, r5, r7 +L(lo3): umull r6, r7, r4, bd + ldr r4, [up], #4 + subs r5, r5, r8 + str r5, [qp], #4 + sbc r5, r5, r12 +L(lo2): subs n, n, #4 + umull r8, r12, r4, bd + bhi L(top) + +L(wd2): subs r5, r5, r6 + str r5, [qp], #4 + sbc r5, r5, r7 +L(wd1): subs r5, r5, r8 + str r5, [qp] + sbc r0, r5, r12 + pop {r4, r5, r6, r7, r8} + return lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/bdiv_q_1.asm b/gmp-6.3.0/mpn/arm/bdiv_q_1.asm new file mode 100644 index 0000000..ae395d1 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/bdiv_q_1.asm @@ -0,0 +1,162 @@ +dnl ARM v4 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C norm unorm +C 1176 13 18 +C Cortex-A5 8 12 +C Cortex-A7 10.5 18 +C Cortex-A8 14 15 +C Cortex-A9 10 12 not measured since latest edits +C Cortex-A15 9 9 +C Cortex-A53 14 20 + +C Architecture requirements: +C v5 - +C v5t - +C v5te - +C v6 - +C v6t2 - +C v7a - + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`d', `r3') +define(`di_arg', `sp[0]') C just mpn_pi1_bdiv_q_1 +define(`cnt_arg', `sp[4]') C just mpn_pi1_bdiv_q_1 + +define(`cy', `r7') +define(`cnt', `r6') +define(`tnc', `r8') + +ASM_START() +PROLOGUE(mpn_bdiv_q_1) + tst d, #1 + push {r6-r11} + mov cnt, #0 + bne L(inv) + +C count trailing zeros + movs r10, d, lsl #16 + moveq d, d, lsr #16 + moveq cnt, #16 + tst d, #0xff + moveq d, d, lsr #8 + addeq cnt, cnt, #8 + LEA( r10, ctz_tab) + and r11, d, #0xff + ldrb r10, [r10, r11] + mov d, d, lsr r10 + add cnt, cnt, r10 + +C binvert limb +L(inv): LEA( r10, binvert_limb_table) + and r12, d, #254 + ldrb r10, [r10, r12, lsr #1] + mul r12, r10, r10 + mul r12, d, r12 + rsb r12, r12, r10, lsl #1 + mul r10, r12, r12 + mul r10, d, r10 + rsb r10, r10, r12, lsl #1 C r10 = inverse + b L(pi1) +EPILOGUE() + +PROLOGUE(mpn_pi1_bdiv_q_1) + push {r6-r11} + + ldr cnt, [sp, #28] + ldr r10, [sp, #24] + +L(pi1): ldr r11, [up], #4 C up[0] + cmp cnt, #0 + mov cy, #0 + bne L(unorm) + +L(norm): + subs n, n, #1 C set carry as side-effect + beq L(edn) + + ALIGN(16) +L(tpn): sbcs cy, r11, cy + ldr r11, [up], #4 + sub n, n, #1 + mul r9, r10, cy + tst n, n + umull r12, cy, d, r9 + str r9, [rp], #4 + bne L(tpn) + +L(edn): sbc cy, r11, cy + mul r9, r10, cy + str r9, [rp] + pop {r6-r11} + return r14 + +L(unorm): + rsb tnc, cnt, #32 + mov r11, r11, lsr cnt + subs n, n, #1 C set carry as side-effect + beq L(edu) + + ALIGN(16) +L(tpu): ldr r12, [up], #4 + orr r9, r11, r12, lsl tnc + mov r11, r12, lsr cnt + sbcs cy, r9, cy C critical path ->cy->cy-> + sub n, n, #1 + mul r9, r10, cy C critical path ->cy->r9-> + tst n, n + umull r12, cy, d, r9 C critical path ->r9->cy-> + str r9, [rp], #4 + bne L(tpu) + +L(edu): sbc cy, r11, cy + mul r9, r10, cy + str r9, [rp] + pop {r6-r11} + return r14 +EPILOGUE() + + RODATA +ctz_tab: + .byte 8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 diff --git a/gmp-6.3.0/mpn/arm/cnd_aors_n.asm b/gmp-6.3.0/mpn/arm/cnd_aors_n.asm new file mode 100644 index 0000000..0479f0d --- /dev/null +++ b/gmp-6.3.0/mpn/arm/cnd_aors_n.asm @@ -0,0 +1,134 @@ +dnl ARM mpn_cnd_add_n, mpn_cnd_sub_n + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 3 +C Cortex-A15 2.5 + +define(`cnd', `r0') +define(`rp', `r1') +define(`up', `r2') +define(`vp', `r3') + +define(`n', `r12') + + +ifdef(`OPERATION_cnd_add_n', ` + define(`ADDSUB', adds) + define(`ADDSUBC', adcs) + define(`INITCY', `cmn r0, #0') + define(`RETVAL', `adc r0, n, #0') + define(func, mpn_cnd_add_n)') +ifdef(`OPERATION_cnd_sub_n', ` + define(`ADDSUB', subs) + define(`ADDSUBC', sbcs) + define(`INITCY', `cmp r0, #0') + define(`RETVAL', `adc r0, n, #0 + rsb r0, r0, #1') + define(func, mpn_cnd_sub_n)') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ASM_START() +PROLOGUE(func) + push {r4-r11} + ldr n, [sp, #32] + + cmp cnd, #1 + sbc cnd, cnd, cnd C conditionally set to 0xffffffff + + INITCY C really only needed for n = 0 (mod 4) + + ands r4, n, #3 + beq L(top) + cmp r4, #2 + bcc L(b1) + beq L(b2) + +L(b3): ldm vp!, {r4,r5,r6} + ldm up!, {r8,r9,r10} + bic r4, r4, cnd + bic r5, r5, cnd + bic r6, r6, cnd + ADDSUB r8, r8, r4 + ADDSUBC r9, r9, r5 + ADDSUBC r10, r10, r6 + stm rp!, {r8,r9,r10} + sub n, n, #3 + teq n, #0 + bne L(top) + b L(end) + +L(b2): ldm vp!, {r4,r5} + ldm up!, {r8,r9} + bic r4, r4, cnd + bic r5, r5, cnd + ADDSUB r8, r8, r4 + ADDSUBC r9, r9, r5 + stm rp!, {r8,r9} + sub n, n, #2 + teq n, #0 + bne L(top) + b L(end) + +L(b1): ldr r4, [vp], #4 + ldr r8, [up], #4 + bic r4, r4, cnd + ADDSUB r8, r8, r4 + str r8, [rp], #4 + sub n, n, #1 + teq n, #0 + beq L(end) + +L(top): ldm vp!, {r4,r5,r6,r7} + ldm up!, {r8,r9,r10,r11} + bic r4, r4, cnd + bic r5, r5, cnd + bic r6, r6, cnd + bic r7, r7, cnd + ADDSUBC r8, r8, r4 + ADDSUBC r9, r9, r5 + ADDSUBC r10, r10, r6 + ADDSUBC r11, r11, r7 + sub n, n, #4 + stm rp!, {r8,r9,r10,r11} + teq n, #0 + bne L(top) + +L(end): RETVAL + pop {r4-r11} + return r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/com.asm b/gmp-6.3.0/mpn/arm/com.asm new file mode 100644 index 0000000..850b10a --- /dev/null +++ b/gmp-6.3.0/mpn/arm/com.asm @@ -0,0 +1,75 @@ +dnl ARM mpn_com. + +dnl Copyright 2003, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 2.0 +C Cortex-A15 1.75 + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') + +ASM_START() +PROLOGUE(mpn_com) + tst n, #1 + beq L(skip1) + ldr r3, [up], #4 + mvn r3, r3 + str r3, [rp], #4 +L(skip1): + tst n, #2 + beq L(skip2) + ldmia up!, { r3, r12 } C load 2 limbs + mvn r3, r3 + mvn r12, r12 + stmia rp!, { r3, r12 } C store 2 limbs +L(skip2): + bics n, n, #3 + beq L(rtn) + stmfd sp!, { r7, r8, r9 } C save regs on stack + +L(top): ldmia up!, { r3, r8, r9, r12 } C load 4 limbs + subs n, n, #4 + mvn r3, r3 + mvn r8, r8 + mvn r9, r9 + mvn r12, r12 + stmia rp!, { r3, r8, r9, r12 } C store 4 limbs + bne L(top) + + ldmfd sp!, { r7, r8, r9 } C restore regs from stack +L(rtn): return lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/copyd.asm b/gmp-6.3.0/mpn/arm/copyd.asm new file mode 100644 index 0000000..bcad98d --- /dev/null +++ b/gmp-6.3.0/mpn/arm/copyd.asm @@ -0,0 +1,84 @@ +dnl ARM mpn_copyd. + +dnl Contributed to the GNU project by Robert Harley and Torbjörn Granlund. + +dnl Copyright 2003, 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 1.25-1.5 +C Cortex-A15 1.25 + +C TODO +C * Consider wider unrolling. Analogous 8-way code runs 10% faster on both A9 +C and A15. But it probably slows things down for 8 <= n < a few dozen. 
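In scalar terms the whole file reduces to the loop below; the point of the decreasing direction is overlap safety, writing high limbs first so the operation is safe when rp >= up (copyi covers the rp <= up case). A sketch with illustrative names:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb;

    /* Top-down copy; safe for overlapping operands with rp >= up. */
    void copyd_model(limb *rp, const limb *up, size_t n)
    {
        while (n > 0) {
            n--;
            rp[n] = up[n];
        }
    }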
+ +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') + +ASM_START() +PROLOGUE(mpn_copyd) + mov r12, n, lsl #2 + sub r12, r12, #4 + add rp, rp, r12 + add up, up, r12 + + tst n, #1 + beq L(skip1) + ldr r3, [up], #-4 + str r3, [rp], #-4 +L(skip1): + tst n, #2 + beq L(skip2) + ldmda up!, { r3,r12 } + stmda rp!, { r3,r12 } +L(skip2): + bics n, n, #3 + beq L(rtn) + + push { r4-r5 } + subs n, n, #4 + ldmda up!, { r3,r4,r5,r12 } + beq L(end) + +L(top): subs n, n, #4 + stmda rp!, { r3,r4,r5,r12 } + ldmda up!, { r3,r4,r5,r12 } + bne L(top) + +L(end): stmda rp, { r3,r4,r5,r12 } + pop { r4-r5 } +L(rtn): return lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/copyi.asm b/gmp-6.3.0/mpn/arm/copyi.asm new file mode 100644 index 0000000..421930f --- /dev/null +++ b/gmp-6.3.0/mpn/arm/copyi.asm @@ -0,0 +1,79 @@ +dnl ARM mpn_copyi. + +dnl Contributed to the GNU project by Robert Harley and Torbjörn Granlund. + +dnl Copyright 2003, 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 1.25-1.5 +C Cortex-A15 1.25 + +C TODO +C * Consider wider unrolling. Analogous 8-way code runs 10% faster on both A9 +C and A15. But it probably slows things down for 8 <= n < a few dozen. + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') + +ASM_START() +PROLOGUE(mpn_copyi) + tst n, #1 + beq L(skip1) + ldr r3, [up], #4 + str r3, [rp], #4 +L(skip1): + tst n, #2 + beq L(skip2) + ldmia up!, { r3,r12 } + stmia rp!, { r3,r12 } +L(skip2): + bics n, n, #3 + beq L(rtn) + + push { r4-r5 } + subs n, n, #4 + ldmia up!, { r3,r4,r5,r12 } + beq L(end) + +L(top): subs n, n, #4 + stmia rp!, { r3,r4,r5,r12 } + ldmia up!, { r3,r4,r5,r12 } + bne L(top) + +L(end): stm rp, { r3,r4,r5,r12 } + pop { r4-r5 } +L(rtn): return lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/dive_1.asm b/gmp-6.3.0/mpn/arm/dive_1.asm new file mode 100644 index 0000000..8bffb0c --- /dev/null +++ b/gmp-6.3.0/mpn/arm/dive_1.asm @@ -0,0 +1,151 @@ +dnl ARM v4 mpn_divexact_1. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb +C norm unorm modexact_1c_odd +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 10 12 +C Cortex-A15 9 9 + +C Architecture requirements: +C v5 - +C v5t - +C v5te - +C v6 - +C v6t2 - +C v7a - + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`d', `r3') + +define(`cy', `r7') +define(`cnt', `r6') +define(`tnc', `r8') + +ASM_START() +PROLOGUE(mpn_divexact_1) + tst d, #1 + push {r4-r9} + mov cnt, #0 + bne L(inv) + +C count trailing zeros + movs r4, d, lsl #16 + moveq d, d, lsr #16 + moveq cnt, #16 + tst d, #0xff + moveq d, d, lsr #8 + addeq cnt, cnt, #8 + LEA( r4, ctz_tab) + and r5, d, #0xff + ldrb r4, [r4, r5] + mov d, d, lsr r4 + add cnt, cnt, r4 + +C binvert limb +L(inv): LEA( r4, binvert_limb_table) + and r12, d, #254 + ldrb r4, [r4, r12, lsr #1] + mul r12, r4, r4 + mul r12, d, r12 + rsb r12, r12, r4, lsl #1 + mul r4, r12, r12 + mul r4, d, r4 + rsb r4, r4, r12, lsl #1 C r4 = inverse + + tst cnt, cnt + ldr r5, [up], #4 C up[0] + mov cy, #0 + bne L(unnorm) + +L(norm): + subs n, n, #1 C set carry as side-effect + beq L(edn) + + ALIGN(16) +L(tpn): sbcs cy, r5, cy + ldr r5, [up], #4 + sub n, n, #1 + mul r9, r4, cy + tst n, n + umull r12, cy, d, r9 + str r9, [rp], #4 + bne L(tpn) + +L(edn): sbc cy, r5, cy + mul r9, r4, cy + str r9, [rp] + pop {r4-r9} + return r14 + +L(unnorm): + rsb tnc, cnt, #32 + mov r5, r5, lsr cnt + subs n, n, #1 C set carry as side-effect + beq L(edu) + + ALIGN(16) +L(tpu): ldr r12, [up], #4 + orr r9, r5, r12, lsl tnc + mov r5, r12, lsr cnt + sbcs cy, r9, cy C critical path ->cy->cy-> + sub n, n, #1 + mul r9, r4, cy C critical path ->cy->r9-> + tst n, n + umull r12, cy, d, r9 C critical path ->r9->cy-> + str r9, [rp], #4 + bne L(tpu) + +L(edu): sbc cy, r5, cy + mul r9, r4, cy + str r9, [rp] + pop {r4-r9} + return r14 +EPILOGUE() + + RODATA +ctz_tab: + .byte 8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 diff --git a/gmp-6.3.0/mpn/arm/gmp-mparam.h b/gmp-6.3.0/mpn/arm/gmp-mparam.h new file mode 100644 index 0000000..87eec3a --- /dev/null +++ b/gmp-6.3.0/mpn/arm/gmp-mparam.h @@ -0,0 +1,127 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 1193MHz ARM (gcc55.fsffrance.org) */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 56 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 11 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 71 +#define USE_PREINV_DIVREM_1 1 /* preinv always */ +#define DIVREM_2_THRESHOLD 0 /* preinv always */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 41 + +#define MUL_TOOM22_THRESHOLD 36 +#define MUL_TOOM33_THRESHOLD 125 +#define MUL_TOOM44_THRESHOLD 193 +#define MUL_TOOM6H_THRESHOLD 303 +#define MUL_TOOM8H_THRESHOLD 418 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 125 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 176 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 114 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 129 + +#define SQR_BASECASE_THRESHOLD 12 +#define SQR_TOOM2_THRESHOLD 78 +#define SQR_TOOM3_THRESHOLD 137 +#define SQR_TOOM4_THRESHOLD 212 +#define SQR_TOOM6_THRESHOLD 306 +#define SQR_TOOM8_THRESHOLD 422 + +#define MULMOD_BNM1_THRESHOLD 20 +#define SQRMOD_BNM1_THRESHOLD 26 + +#define MUL_FFT_MODF_THRESHOLD 436 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 436, 5}, { 27, 6}, { 28, 7}, { 15, 6}, \ + { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 256, 9}, { 512,10}, { 1024,11}, { 2048,12}, \ + { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 28 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 404 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 404, 5}, { 13, 4}, { 27, 5}, { 27, 6}, \ + { 28, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \ + { 35, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 512,10}, \ + { 1024,11}, { 2048,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 26 +#define SQR_FFT_THRESHOLD 3776 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 137 +#define MULLO_MUL_N_THRESHOLD 11479 + +#define DC_DIV_QR_THRESHOLD 150 +#define DC_DIVAPPR_Q_THRESHOLD 494 +#define DC_BDIV_QR_THRESHOLD 148 +#define DC_BDIV_Q_THRESHOLD 345 + +#define INV_MULMOD_BNM1_THRESHOLD 70 +#define INV_NEWTON_THRESHOLD 474 +#define INV_APPR_THRESHOLD 478 + +#define 
BINV_NEWTON_THRESHOLD 542 +#define REDC_1_TO_REDC_N_THRESHOLD 117 + +#define MU_DIV_QR_THRESHOLD 2089 +#define MU_DIVAPPR_Q_THRESHOLD 2172 +#define MUPI_DIV_QR_THRESHOLD 225 +#define MU_BDIV_QR_THRESHOLD 1528 +#define MU_BDIV_Q_THRESHOLD 2089 + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD_THRESHOLD 197 +#define GCD_DC_THRESHOLD 902 +#define GCDEXT_DC_THRESHOLD 650 +#define JACOBI_BASE_METHOD 2 + +#define GET_STR_DC_THRESHOLD 20 +#define GET_STR_PRECOMPUTE_THRESHOLD 39 +#define SET_STR_DC_THRESHOLD 1045 +#define SET_STR_PRECOMPUTE_THRESHOLD 2147 diff --git a/gmp-6.3.0/mpn/arm/invert_limb.asm b/gmp-6.3.0/mpn/arm/invert_limb.asm new file mode 100644 index 0000000..af7502d --- /dev/null +++ b/gmp-6.3.0/mpn/arm/invert_limb.asm @@ -0,0 +1,93 @@ +dnl ARM mpn_invert_limb -- Invert a normalized limb. + +dnl Copyright 2001, 2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
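The table lookup and Newton refinements below approximate, then sharpen, a fixed-point reciprocal whose definition is one line of C; the wide division is exactly what the asm is avoiding. A reference sketch (assumed name, 32-bit limbs):

    #include <stdint.h>

    /* For a normalized divisor d (top bit set), the reciprocal is
       floor((B^2 - 1) / d) - B with B = 2^32.  Since d >= 2^31, the
       result always fits in 32 bits. */
    uint32_t invert_limb_ref(uint32_t d)
    {
        return (uint32_t)(0xFFFFFFFFFFFFFFFFull / d - 0x100000000ull);
    }

This is the precomputed inverse consumed by udiv_qrnnd_preinv-style division, which replaces each hardware divide with a multiply plus a small adjustment.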
+ +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_invert_limb) + LEA( r2, approx_tab-512) + mov r3, r0, lsr #23 + mov r3, r3, asl #1 + ldrh r3, [r3, r2] + mov r1, r3, asl #17 + mul r12, r3, r3 + umull r3, r2, r12, r0 + sub r1, r1, r2, asl #1 + umull r3, r2, r1, r1 + umull r12, r3, r0, r3 + umull r2, r12, r0, r2 + adds r2, r2, r3 + adc r12, r12, #0 + rsb r1, r12, r1 + mvn r2, r2, lsr #30 + add r2, r2, r1, asl #2 + umull r12, r3, r0, r2 + adds r1, r12, r0 + adc r3, r3, r0 + rsb r0, r3, r2 + return lr +EPILOGUE() + + RODATA + ALIGN(2) +approx_tab: + .short 0xffc0,0xfec0,0xfdc0,0xfcc0,0xfbc0,0xfac0,0xfa00,0xf900 + .short 0xf800,0xf700,0xf640,0xf540,0xf440,0xf380,0xf280,0xf180 + .short 0xf0c0,0xefc0,0xef00,0xee00,0xed40,0xec40,0xeb80,0xeac0 + .short 0xe9c0,0xe900,0xe840,0xe740,0xe680,0xe5c0,0xe500,0xe400 + .short 0xe340,0xe280,0xe1c0,0xe100,0xe040,0xdf80,0xdec0,0xde00 + .short 0xdd40,0xdc80,0xdbc0,0xdb00,0xda40,0xd980,0xd8c0,0xd800 + .short 0xd740,0xd680,0xd600,0xd540,0xd480,0xd3c0,0xd340,0xd280 + .short 0xd1c0,0xd140,0xd080,0xcfc0,0xcf40,0xce80,0xcdc0,0xcd40 + .short 0xcc80,0xcc00,0xcb40,0xcac0,0xca00,0xc980,0xc8c0,0xc840 + .short 0xc780,0xc700,0xc640,0xc5c0,0xc540,0xc480,0xc400,0xc380 + .short 0xc2c0,0xc240,0xc1c0,0xc100,0xc080,0xc000,0xbf80,0xbec0 + .short 0xbe40,0xbdc0,0xbd40,0xbc80,0xbc00,0xbb80,0xbb00,0xba80 + .short 0xba00,0xb980,0xb900,0xb840,0xb7c0,0xb740,0xb6c0,0xb640 + .short 0xb5c0,0xb540,0xb4c0,0xb440,0xb3c0,0xb340,0xb2c0,0xb240 + .short 0xb1c0,0xb140,0xb0c0,0xb080,0xb000,0xaf80,0xaf00,0xae80 + .short 0xae00,0xad80,0xad40,0xacc0,0xac40,0xabc0,0xab40,0xaac0 + .short 0xaa80,0xaa00,0xa980,0xa900,0xa8c0,0xa840,0xa7c0,0xa740 + .short 0xa700,0xa680,0xa600,0xa5c0,0xa540,0xa4c0,0xa480,0xa400 + .short 0xa380,0xa340,0xa2c0,0xa240,0xa200,0xa180,0xa140,0xa0c0 + .short 0xa080,0xa000,0x9f80,0x9f40,0x9ec0,0x9e80,0x9e00,0x9dc0 + .short 0x9d40,0x9d00,0x9c80,0x9c40,0x9bc0,0x9b80,0x9b00,0x9ac0 + .short 0x9a40,0x9a00,0x9980,0x9940,0x98c0,0x9880,0x9840,0x97c0 + .short 0x9780,0x9700,0x96c0,0x9680,0x9600,0x95c0,0x9580,0x9500 + .short 0x94c0,0x9440,0x9400,0x93c0,0x9340,0x9300,0x92c0,0x9240 + .short 0x9200,0x91c0,0x9180,0x9100,0x90c0,0x9080,0x9000,0x8fc0 + .short 0x8f80,0x8f40,0x8ec0,0x8e80,0x8e40,0x8e00,0x8d80,0x8d40 + .short 0x8d00,0x8cc0,0x8c80,0x8c00,0x8bc0,0x8b80,0x8b40,0x8b00 + .short 0x8a80,0x8a40,0x8a00,0x89c0,0x8980,0x8940,0x88c0,0x8880 + .short 0x8840,0x8800,0x87c0,0x8780,0x8740,0x8700,0x8680,0x8640 + .short 0x8600,0x85c0,0x8580,0x8540,0x8500,0x84c0,0x8480,0x8440 + .short 0x8400,0x8380,0x8340,0x8300,0x82c0,0x8280,0x8240,0x8200 + .short 0x81c0,0x8180,0x8140,0x8100,0x80c0,0x8080,0x8040,0x8000 +ASM_END() diff --git a/gmp-6.3.0/mpn/arm/logops_n.asm b/gmp-6.3.0/mpn/arm/logops_n.asm new file mode 100644 index 0000000..7e04165 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/logops_n.asm @@ -0,0 +1,139 @@ +dnl ARM mpn_and_n, mpn_andn_n. mpn_nand_n, etc. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 1997, 2000, 2001, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb +C and andn ior xor nand iorn nior xnor +C StrongARM ? ? +C XScale ? ? +C Cortex-A7 ? ? +C Cortex-A8 ? ? +C Cortex-A9 2.5-2.72 2.75-3 +C Cortex-A15 2.25 2.75 + +C TODO +C * It seems that 2.25 c/l and 2.75 c/l is possible for A9. +C * Debug popping issue, see comment below. + +define(`rp', `r0') +define(`up', `r1') +define(`vp', `r2') +define(`n', `r3') + +define(`POSTOP') + +ifdef(`OPERATION_and_n',` + define(`func', `mpn_and_n') + define(`LOGOP', `and $1, $2, $3')') +ifdef(`OPERATION_andn_n',` + define(`func', `mpn_andn_n') + define(`LOGOP', `bic $1, $2, $3')') +ifdef(`OPERATION_nand_n',` + define(`func', `mpn_nand_n') + define(`POSTOP', `mvn $1, $1') + define(`LOGOP', `and $1, $2, $3')') +ifdef(`OPERATION_ior_n',` + define(`func', `mpn_ior_n') + define(`LOGOP', `orr $1, $2, $3')') +ifdef(`OPERATION_iorn_n',` + define(`func', `mpn_iorn_n') + define(`POSTOP', `mvn $1, $1') + define(`LOGOP', `bic $1, $3, $2')') +ifdef(`OPERATION_nior_n',` + define(`func', `mpn_nior_n') + define(`POSTOP', `mvn $1, $1') + define(`LOGOP', `orr $1, $2, $3')') +ifdef(`OPERATION_xor_n',` + define(`func', `mpn_xor_n') + define(`LOGOP', `eor $1, $2, $3')') +ifdef(`OPERATION_xnor_n',` + define(`func', `mpn_xnor_n') + define(`POSTOP', `mvn $1, $1') + define(`LOGOP', `eor $1, $2, $3')') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +ASM_START() +PROLOGUE(func) + push { r8, r9, r10 } + tst n, #1 + beq L(skip1) + ldr r10, [vp], #4 + ldr r12, [up], #4 + LOGOP( r12, r12, r10) + POSTOP( r12) + str r12, [rp], #4 +L(skip1): + tst n, #2 + beq L(skip2) + ldmia vp!, { r10, r12 } + ldmia up!, { r8, r9 } + LOGOP( r8, r8, r10) + LOGOP( r9, r9, r12) + POSTOP( r8) + POSTOP( r9) + stmia rp!, { r8, r9 } +L(skip2): + bics n, n, #3 + beq L(rtn) + push { r4, r5, r6, r7 } + + ldmia vp!, { r8, r9, r10, r12 } + b L(mid) + +L(top): ldmia vp!, { r8, r9, r10, r12 } + POSTOP( r4) + POSTOP( r5) + POSTOP( r6) + POSTOP( r7) + stmia rp!, { r4, r5, r6, r7 } +L(mid): sub n, n, #4 + ldmia up!, { r4, r5, r6, r7 } + teq n, #0 + LOGOP( r4, r4, r8) + LOGOP( r5, r5, r9) + LOGOP( r6, r6, r10) + LOGOP( r7, r7, r12) + bne L(top) + + POSTOP( r4) + POSTOP( r5) + POSTOP( r6) + POSTOP( r7) + stmia rp!, { r4, r5, r6, r7 } + + pop { r4, r5, r6, r7 } C popping r8-r10 here strangely fails + +L(rtn): pop { r8, r9, r10 } + return r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/lshift.asm b/gmp-6.3.0/mpn/arm/lshift.asm new file mode 100644 index 0000000..1d5ce0a --- /dev/null +++ b/gmp-6.3.0/mpn/arm/lshift.asm @@ -0,0 +1,88 @@ +dnl ARM mpn_lshift. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 3.5 +C Cortex-A15 ? + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`cnt', `r3') +define(`tnc', `r12') + +ASM_START() +PROLOGUE(mpn_lshift) + add up, up, n, lsl #2 + push {r4, r6, r7, r8} + ldr r4, [up, #-4]! + add rp, rp, n, lsl #2 + rsb tnc, cnt, #32 + + mov r7, r4, lsl cnt + tst n, #1 + beq L(evn) C n even + +L(odd): subs n, n, #2 + bcc L(1) C n = 1 + ldr r8, [up, #-4]! + b L(mid) + +L(evn): ldr r6, [up, #-4]! + subs n, n, #2 + beq L(end) + +L(top): ldr r8, [up, #-4]! + orr r7, r7, r6, lsr tnc + str r7, [rp, #-4]! + mov r7, r6, lsl cnt +L(mid): ldr r6, [up, #-4]! + orr r7, r7, r8, lsr tnc + str r7, [rp, #-4]! + mov r7, r8, lsl cnt + subs n, n, #2 + bgt L(top) + +L(end): orr r7, r7, r6, lsr tnc + str r7, [rp, #-4]! + mov r7, r6, lsl cnt +L(1): str r7, [rp, #-4] + mov r0, r4, lsr tnc + pop {r4, r6, r7, r8} + return r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/lshiftc.asm b/gmp-6.3.0/mpn/arm/lshiftc.asm new file mode 100644 index 0000000..e5b52df --- /dev/null +++ b/gmp-6.3.0/mpn/arm/lshiftc.asm @@ -0,0 +1,95 @@ +dnl ARM mpn_lshiftc. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 4.0 +C Cortex-A15 ? 
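mpn_lshiftc is mpn_lshift fused with a ones' complement of the result. A scalar model with illustrative names follows; note the two details visible in the code below, that the vacated low bits of the result come out as ones (the mvn r6, #0 at L(1)), and that the returned out-shifted bits are not complemented.

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb;

    /* Shift {up,n} left by cnt (1 <= cnt <= 31), storing the ones'
       complement of the shifted value; returns the bits shifted out
       of the top limb, uncomplemented. */
    limb lshiftc_model(limb *rp, const limb *up, size_t n, unsigned cnt)
    {
        unsigned tnc = 32 - cnt;
        limb retval = up[n - 1] >> tnc;
        for (size_t i = n - 1; i > 0; i--)
            rp[i] = ~((up[i] << cnt) | (up[i - 1] >> tnc));
        rp[0] = ~(up[0] << cnt);   /* vacated low bits become ones */
        return retval;
    }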
+ +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`cnt', `r3') +define(`tnc', `r12') + +ASM_START() +PROLOGUE(mpn_lshiftc) + add up, up, n, lsl #2 + push {r4, r6, r7, r8} + ldr r4, [up, #-4]! + add rp, rp, n, lsl #2 + rsb tnc, cnt, #32 + mvn r6, r4 + + mov r7, r6, lsl cnt + tst n, #1 + beq L(evn) C n even + +L(odd): subs n, n, #2 + bcc L(1) C n = 1 + ldr r8, [up, #-4]! + mvn r8, r8 + b L(mid) + +L(evn): ldr r6, [up, #-4]! + mvn r6, r6 + subs n, n, #2 + beq L(end) + +L(top): ldr r8, [up, #-4]! + orr r7, r7, r6, lsr tnc + str r7, [rp, #-4]! + mvn r8, r8 + mov r7, r6, lsl cnt +L(mid): ldr r6, [up, #-4]! + orr r7, r7, r8, lsr tnc + str r7, [rp, #-4]! + mvn r6, r6 + mov r7, r8, lsl cnt + subs n, n, #2 + bgt L(top) + +L(end): orr r7, r7, r6, lsr tnc + str r7, [rp, #-4]! + mov r7, r6, lsl cnt +L(1): mvn r6, #0 + orr r7, r7, r6, lsr tnc + str r7, [rp, #-4] + mov r0, r4, lsr tnc + pop {r4, r6, r7, r8} + return r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/mod_34lsub1.asm b/gmp-6.3.0/mpn/arm/mod_34lsub1.asm new file mode 100644 index 0000000..596cd3c --- /dev/null +++ b/gmp-6.3.0/mpn/arm/mod_34lsub1.asm @@ -0,0 +1,124 @@ +dnl ARM mpn_mod_34lsub1 -- remainder modulo 2^24-1. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A5 2.67 +C Cortex-A7 2.35 +C Cortex-A8 2.0 +C Cortex-A9 1.33 +C Cortex-A15 1.33 +C Cortex-A17 3.34 +C Cortex-A53 2.0 + +define(`ap', r0) +define(`n', r1) + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n) + +C TODO +C * Write cleverer summation code. +C * Consider loading 6 64-bit aligned registers at a time, to approach 1 c/l. 
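Because B = 2^32 and 2^24 ≡ 1 (mod 2^24 - 1), we have B ≡ 2^8, so limb i contributes its value shifted left by 8i mod 24 bits; the three-register accumulation in the code below exploits that period of 3, and the final folding splits each accumulator into 24-bit and 8-bit pieces. A slow but obviously-correct C reference (illustrative name; note it fully reduces, whereas the asm only promises a value congruent mod 2^24 - 1):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb;

    limb mod_34lsub1_model(const limb *ap, size_t n)
    {
        uint64_t r = 0;
        for (size_t i = 0; i < n; i++)
            r = (r + ((uint64_t)ap[i] << ((8 * i) % 24))) % 0xFFFFFF;
        return (limb)r;
    }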
+ +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mod_34lsub1) + push { r4, r5, r6, r7 } + + subs n, n, #3 + mov r7, #0 + blt L(le2) C n <= 2 + + ldmia ap!, { r2, r3, r12 } + subs n, n, #3 + blt L(sum) C n <= 5 + cmn r0, #0 C clear carry + sub n, n, #3 + b L(mid) + +L(top): adcs r2, r2, r4 + adcs r3, r3, r5 + adcs r12, r12, r6 +L(mid): ldmia ap!, { r4, r5, r6 } + tst n, n + sub n, n, #3 + bpl L(top) + + add n, n, #3 + + adcs r2, r2, r4 + adcs r3, r3, r5 + adcs r12, r12, r6 + movcs r7, #1 C r7 <= 1 + +L(sum): cmn n, #2 + movlo r4, #0 + ldrhs r4, [ap], #4 + movls r5, #0 + ldrhi r5, [ap], #4 + + adds r2, r2, r4 + adcs r3, r3, r5 + adcs r12, r12, #0 + adc r7, r7, #0 C r7 <= 2 + +L(sum2): + bic r0, r2, #0xff000000 + add r0, r0, r2, lsr #24 + add r0, r0, r7 + + mov r7, r3, lsl #8 + bic r1, r7, #0xff000000 + add r0, r0, r1 + add r0, r0, r3, lsr #16 + + mov r7, r12, lsl #16 + bic r1, r7, #0xff000000 + add r0, r0, r1 + add r0, r0, r12, lsr #8 + + pop { r4, r5, r6, r7 } + return lr + +L(le2): cmn n, #1 + bne L(1) + ldmia ap!, { r2, r3 } + mov r12, #0 + b L(sum2) +L(1): ldr r2, [ap] + bic r0, r2, #0xff000000 + add r0, r0, r2, lsr #24 + pop { r4, r5, r6, r7 } + return lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/mode1o.asm b/gmp-6.3.0/mpn/arm/mode1o.asm new file mode 100644 index 0000000..63a7f36 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/mode1o.asm @@ -0,0 +1,92 @@ +dnl ARM mpn_modexact_1c_odd + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? 
+C Cortex-A9 10 +C Cortex-A15 9 + +C Architecture requirements: +C v5 - +C v5t - +C v5te - +C v6 - +C v6t2 - +C v7a - + +define(`up', `r0') +define(`n', `r1') +define(`d', `r2') +define(`cy', `r3') + + .protected binvert_limb_table +ASM_START() +PROLOGUE(mpn_modexact_1c_odd) + stmfd sp!, {r4, r5} + + LEA( r4, binvert_limb_table) + + ldr r5, [up], #4 C up[0] + + and r12, d, #254 + ldrb r4, [r4, r12, lsr #1] + mul r12, r4, r4 + mul r12, d, r12 + rsb r12, r12, r4, asl #1 + mul r4, r12, r12 + mul r4, d, r4 + rsb r4, r4, r12, asl #1 C r4 = inverse + + subs n, n, #1 C set carry as side-effect + beq L(end) + +L(top): sbcs cy, r5, cy + ldr r5, [up], #4 + sub n, n, #1 + mul r12, r4, cy + tst n, n + umull r12, cy, d, r12 + bne L(top) + +L(end): sbcs cy, r5, cy + mul r12, r4, cy + umull r12, r0, d, r12 + addcc r0, r0, #1 + + ldmfd sp!, {r4, r5} + return r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/mul_1.asm b/gmp-6.3.0/mpn/arm/mul_1.asm new file mode 100644 index 0000000..f7bc1bc --- /dev/null +++ b/gmp-6.3.0/mpn/arm/mul_1.asm @@ -0,0 +1,94 @@ +dnl ARM mpn_mul_1 -- Multiply a limb vector with a limb and store the result +dnl in a second limb vector. +dnl Contributed by Robert Harley. + +dnl Copyright 1998, 2000, 2001, 2003, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM 6-8 +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 4.75 +C Cortex-A15 ? + +C We should rewrite this along the lines of addmul_1.asm. That should save a +C cycle on StrongARM, and several cycles on XScale. 
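The carry chain that the umull/umlal instructions below implement, written out in scalar form (illustrative names, 32-bit limbs):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb;

    /* rp[] = {up,n} * vl; returns the high limb that does not fit.
       Each umlal in the unrolled loop is one step of this chain. */
    limb mul_1_model(limb *rp, const limb *up, size_t n, limb vl)
    {
        limb cy = 0;
        for (size_t i = 0; i < n; i++) {
            uint64_t p = (uint64_t)up[i] * vl + cy;
            rp[i] = (limb)p;
            cy = (limb)(p >> 32);
        }
        return cy;
    }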
+ +define(`rp',`r0') +define(`up',`r1') +define(`n',`r2') +define(`vl',`r3') + + +ASM_START() +PROLOGUE(mpn_mul_1) + stmfd sp!, { r8, r9, lr } + ands r12, n, #1 + beq L(skip1) + ldr lr, [up], #4 + umull r9, r12, lr, vl + str r9, [rp], #4 +L(skip1): + tst n, #2 + beq L(skip2) + mov r8, r12 + ldmia up!, { r12, lr } + mov r9, #0 + umlal r8, r9, r12, vl + mov r12, #0 + umlal r9, r12, lr, vl + stmia rp!, { r8, r9 } +L(skip2): + bics n, n, #3 + beq L(rtn) + stmfd sp!, { r6, r7 } + +L(top): mov r6, r12 + ldmia up!, { r8, r9, r12, lr } + ldr r7, [rp, #12] C cache allocate + mov r7, #0 + umlal r6, r7, r8, vl + mov r8, #0 + umlal r7, r8, r9, vl + mov r9, #0 + umlal r8, r9, r12, vl + mov r12, #0 + umlal r9, r12, lr, vl + subs n, n, #4 + stmia rp!, { r6, r7, r8, r9 } + bne L(top) + + ldmfd sp!, { r6, r7 } + +L(rtn): mov r0, r12 + ldmfd sp!, { r8, r9, pc } +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/neon/README b/gmp-6.3.0/mpn/arm/neon/README new file mode 100644 index 0000000..79e3b48 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/neon/README @@ -0,0 +1,2 @@ +This directory contains Neon code which runs and is efficient on all +ARM CPUs which support Neon. diff --git a/gmp-6.3.0/mpn/arm/neon/hamdist.asm b/gmp-6.3.0/mpn/arm/neon/hamdist.asm new file mode 100644 index 0000000..2320896 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/neon/hamdist.asm @@ -0,0 +1,194 @@ +dnl ARM Neon mpn_hamdist -- mpn bit hamming distance. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM: - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 1.89 +C Cortex-A15 0.95 + +C TODO +C * Explore using vldr and vldm. Does it help on A9? (These loads do +C 64-bits-at-a-time, which will mess up in big-endian mode. Except not for +C popcount. Except perhaps also for popcount for the edge loads.) +C * Arrange to align the pointer, if that helps performance. Use the same +C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry +C valgrind!) +C * Explore if explicit align directives, e.g., "[ptr:128]" help. +C * See rth's gmp-devel 2013-02/03 messages about final summation tricks. + +C INPUT PARAMETERS +define(`ap', r0) +define(`bp', r1) +define(`n', r2) + +C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end +C up with 8 16-bit counters. Therefore, we can sum to 8(2^16-1) bits, or +C (8*2^16-1)/32 = 0x3fff limbs. We use a chunksize close to that, but which +C can be represented as a 8-bit ARM constant. 
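To make the bound concrete: 8 counters of at most 2^16 - 1 each cap the total at 8 * 65535 = 524280 bit counts, and 524280 / 32 = 16383.75, so at most 16383 = 0x3fff limbs per chunk. 0x3fff itself is not encodable as an ARM data-processing immediate (an 8-bit value rotated by an even amount), but 0x3f80 = 0xfe << 6 is, which is why the chunk size is 16256 rather than 16383.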
+C +define(`chunksize',0x3f80) + +ASM_START() +PROLOGUE(mpn_hamdist) + + cmp n, #chunksize + bhi L(gt16k) + +L(lt16k): + vmov.i64 q8, #0 C clear summation register + vmov.i64 q9, #0 C clear summation register + + tst n, #1 + beq L(xxx0) + vmov.i64 d0, #0 + vmov.i64 d20, #0 + sub n, n, #1 + vld1.32 {d0[0]}, [ap]! C load 1 limb + vld1.32 {d20[0]}, [bp]! C load 1 limb + veor d0, d0, d20 + vcnt.8 d24, d0 + vpadal.u8 d16, d24 C d16/q8 = 0; could just splat + +L(xxx0):tst n, #2 + beq L(xx00) + sub n, n, #2 + vld1.32 {d0}, [ap]! C load 2 limbs + vld1.32 {d20}, [bp]! C load 2 limbs + veor d0, d0, d20 + vcnt.8 d24, d0 + vpadal.u8 d16, d24 + +L(xx00):tst n, #4 + beq L(x000) + sub n, n, #4 + vld1.32 {q0}, [ap]! C load 4 limbs + vld1.32 {q10}, [bp]! C load 4 limbs + veor q0, q0, q10 + vcnt.8 q12, q0 + vpadal.u8 q8, q12 + +L(x000):tst n, #8 + beq L(0000) + + subs n, n, #8 + vld1.32 {q0,q1}, [ap]! C load 8 limbs + vld1.32 {q10,q11}, [bp]! C load 8 limbs + bls L(sum) + +L(gt8): vld1.32 {q2,q3}, [ap]! C load 8 limbs + vld1.32 {q14,q15}, [bp]! C load 8 limbs + veor q0, q0, q10 + veor q1, q1, q11 + sub n, n, #8 + vcnt.8 q12, q0 + vcnt.8 q13, q1 + b L(mid) + +L(0000):subs n, n, #16 + blo L(e0) + + vld1.32 {q2,q3}, [ap]! C load 8 limbs + vld1.32 {q0,q1}, [ap]! C load 8 limbs + vld1.32 {q14,q15}, [bp]! C load 8 limbs + vld1.32 {q10,q11}, [bp]! C load 8 limbs + veor q2, q2, q14 + veor q3, q3, q15 + vcnt.8 q12, q2 + vcnt.8 q13, q3 + subs n, n, #16 + blo L(end) + +L(top): vld1.32 {q2,q3}, [ap]! C load 8 limbs + vld1.32 {q14,q15}, [bp]! C load 8 limbs + veor q0, q0, q10 + veor q1, q1, q11 + vpadal.u8 q8, q12 + vcnt.8 q12, q0 + vpadal.u8 q9, q13 + vcnt.8 q13, q1 +L(mid): vld1.32 {q0,q1}, [ap]! C load 8 limbs + vld1.32 {q10,q11}, [bp]! C load 8 limbs + veor q2, q2, q14 + veor q3, q3, q15 + subs n, n, #16 + vpadal.u8 q8, q12 + vcnt.8 q12, q2 + vpadal.u8 q9, q13 + vcnt.8 q13, q3 + bhs L(top) + +L(end): vpadal.u8 q8, q12 + vpadal.u8 q9, q13 +L(sum): veor q0, q0, q10 + veor q1, q1, q11 + vcnt.8 q12, q0 + vcnt.8 q13, q1 + vpadal.u8 q8, q12 + vpadal.u8 q9, q13 + vadd.i16 q8, q8, q9 + C we have 8 16-bit counts +L(e0): vpaddl.u16 q8, q8 C we have 4 32-bit counts + vpaddl.u32 q8, q8 C we have 2 64-bit counts + vmov.32 r0, d16[0] + vmov.32 r1, d17[0] + add r0, r0, r1 + bx lr + +C Code for large count. Splits operand and calls above code. +define(`ap2', r5) +define(`bp2', r6) +L(gt16k): + push {r4,r5,r6,r14} + mov ap2, ap + mov bp2, bp + mov r3, n C full count + mov r4, #0 C total sum + +1: mov n, #chunksize C count for this invocation + bl L(lt16k) C could jump deep inside code + add ap2, ap2, #chunksize*4 C point at next chunk + add bp2, bp2, #chunksize*4 C point at next chunk + add r4, r4, r0 + mov ap, ap2 C put chunk pointer in place for call + mov bp, bp2 C put chunk pointer in place for call + sub r3, r3, #chunksize + cmp r3, #chunksize + bhi 1b + + mov n, r3 C count for final invocation + bl L(lt16k) + add r0, r4, r0 + pop {r4,r5,r6,pc} +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/neon/lorrshift.asm b/gmp-6.3.0/mpn/arm/neon/lorrshift.asm new file mode 100644 index 0000000..7ebc780 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/neon/lorrshift.asm @@ -0,0 +1,279 @@ +dnl ARM Neon mpn_lshift and mpn_rshift. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C StrongARM - - +C XScale - - +C Cortex-A7 ? ? +C Cortex-A8 ? ? +C Cortex-A9 3 3 Y +C Cortex-A15 1.5 1.5 Y + + +C We read 64 bits at a time at 32-bit aligned addresses, and except for the +C first and last store, we write using 64-bit aligned addresses. All shifting +C is done on 64-bit words in 'extension' registers. +C +C It should be possible to read also using 64-bit alignment, by manipulating +C the shift count for unaligned operands. Not done, since it does not seem to +C matter for A9 or A15. +C +C This will not work in big-endian mode. + +C TODO +C * Try using 128-bit operations. Note that Neon lacks pure 128-bit shifts, +C which might make it tricky. +C * Clean up and simplify. +C * Consider sharing most of the code for lshift and rshift, since the feed-in +C code, the loop, and most of the wind-down code are identical. +C * Replace the basecase code with code using 'extension' registers. +C * Optimise. It is not clear that this loop insn permutation is optimal for +C either A9 or A15. + +C INPUT PARAMETERS +define(`rp', `r0') +define(`ap', `r1') +define(`n', `r2') +define(`cnt', `r3') + +ifdef(`OPERATION_lshift',` + define(`IFLSH', `$1') + define(`IFRSH', `') + define(`X',`0') + define(`Y',`1') + define(`func',`mpn_lshift') +') +ifdef(`OPERATION_rshift',` + define(`IFLSH', `') + define(`IFRSH', `$1') + define(`X',`1') + define(`Y',`0') + define(`func',`mpn_rshift') +') + +MULFUNC_PROLOGUE(mpn_lshift mpn_rshift) + +ASM_START(neon) + TEXT + ALIGN(64) +PROLOGUE(func) +IFLSH(` mov r12, n, lsl #2 ') +IFLSH(` add rp, rp, r12 ') +IFLSH(` add ap, ap, r12 ') + + cmp n, #4 C SIMD code n limit + ble L(base) + +ifdef(`OPERATION_lshift',` + vdup.32 d6, r3 C left shift count is positive + sub r3, r3, #64 C right shift count is negative + vdup.32 d7, r3 + mov r12, #-8') C lshift pointer update offset +ifdef(`OPERATION_rshift',` + rsb r3, r3, #0 C right shift count is negative + vdup.32 d6, r3 + add r3, r3, #64 C left shift count is positive + vdup.32 d7, r3 + mov r12, #8') C rshift pointer update offset + +IFLSH(` sub ap, ap, #8 ') + vld1.32 {d19}, [ap], r12 C load initial 2 limbs + vshl.u64 d18, d19, d7 C retval + + tst rp, #4 C is rp 64-bit aligned already? + beq L(rp_aligned) C yes, skip +IFLSH(` add ap, ap, #4 ') C move back ap pointer +IFRSH(` sub ap, ap, #4 ') C move back ap pointer + vshl.u64 d4, d19, d6 + sub n, n, #1 C first limb handled +IFLSH(` sub rp, rp, #4 ') + vst1.32 {d4[Y]}, [rp]IFRSH(!) 
C store first limb, rp gets aligned + vld1.32 {d19}, [ap], r12 C load ap[1] and ap[2] + +L(rp_aligned): +IFLSH(` sub rp, rp, #8 ') + subs n, n, #6 + blt L(two_or_three_more) + tst n, #2 + beq L(2) + +L(1): vld1.32 {d17}, [ap], r12 + vshl.u64 d5, d19, d6 + vld1.32 {d16}, [ap], r12 + vshl.u64 d0, d17, d7 + vshl.u64 d4, d17, d6 + sub n, n, #2 + b L(mid) + +L(2): vld1.32 {d16}, [ap], r12 + vshl.u64 d4, d19, d6 + vld1.32 {d17}, [ap], r12 + vshl.u64 d1, d16, d7 + vshl.u64 d5, d16, d6 + subs n, n, #4 + blt L(end) + +L(top): vld1.32 {d16}, [ap], r12 + vorr d2, d4, d1 + vshl.u64 d0, d17, d7 + vshl.u64 d4, d17, d6 + vst1.32 {d2}, [rp:64], r12 +L(mid): vld1.32 {d17}, [ap], r12 + vorr d3, d5, d0 + vshl.u64 d1, d16, d7 + vshl.u64 d5, d16, d6 + vst1.32 {d3}, [rp:64], r12 + subs n, n, #4 + bge L(top) + +L(end): tst n, #1 + beq L(evn) + + vorr d2, d4, d1 + vst1.32 {d2}, [rp:64], r12 + b L(cj1) + +L(evn): vorr d2, d4, d1 + vshl.u64 d0, d17, d7 + vshl.u64 d16, d17, d6 + vst1.32 {d2}, [rp:64], r12 + vorr d2, d5, d0 + b L(cj2) + +C Load last 2 - 3 limbs, store last 4 - 5 limbs +L(two_or_three_more): + tst n, #1 + beq L(l2) + +L(l3): vshl.u64 d5, d19, d6 + vld1.32 {d17}, [ap], r12 +L(cj1): veor d16, d16, d16 +IFLSH(` add ap, ap, #4 ') + vld1.32 {d16[Y]}, [ap], r12 + vshl.u64 d0, d17, d7 + vshl.u64 d4, d17, d6 + vorr d3, d5, d0 + vshl.u64 d1, d16, d7 + vshl.u64 d5, d16, d6 + vst1.32 {d3}, [rp:64], r12 + vorr d2, d4, d1 + vst1.32 {d2}, [rp:64], r12 +IFLSH(` add rp, rp, #4 ') + vst1.32 {d5[Y]}, [rp] + vmov.32 r0, d18[X] + bx lr + +L(l2): vld1.32 {d16}, [ap], r12 + vshl.u64 d4, d19, d6 + vshl.u64 d1, d16, d7 + vshl.u64 d16, d16, d6 + vorr d2, d4, d1 +L(cj2): vst1.32 {d2}, [rp:64], r12 + vst1.32 {d16}, [rp] + vmov.32 r0, d18[X] + bx lr + + +define(`tnc', `r12') +L(base): + push {r4, r6, r7, r8} +ifdef(`OPERATION_lshift',` + ldr r4, [ap, #-4]! + rsb tnc, cnt, #32 + + mov r7, r4, lsl cnt + tst n, #1 + beq L(ev) C n even + +L(od): subs n, n, #2 + bcc L(ed1) C n = 1 + ldr r8, [ap, #-4]! + b L(md) C n = 3 + +L(ev): ldr r6, [ap, #-4]! + subs n, n, #2 + beq L(ed) C n = 3 + C n = 4 +L(tp): ldr r8, [ap, #-4]! + orr r7, r7, r6, lsr tnc + str r7, [rp, #-4]! + mov r7, r6, lsl cnt +L(md): ldr r6, [ap, #-4]! + orr r7, r7, r8, lsr tnc + str r7, [rp, #-4]! + mov r7, r8, lsl cnt + +L(ed): orr r7, r7, r6, lsr tnc + str r7, [rp, #-4]! + mov r7, r6, lsl cnt +L(ed1): str r7, [rp, #-4] + mov r0, r4, lsr tnc +') +ifdef(`OPERATION_rshift',` + ldr r4, [ap] + rsb tnc, cnt, #32 + + mov r7, r4, lsr cnt + tst n, #1 + beq L(ev) C n even + +L(od): subs n, n, #2 + bcc L(ed1) C n = 1 + ldr r8, [ap, #4]! + b L(md) C n = 3 + +L(ev): ldr r6, [ap, #4]! + subs n, n, #2 + beq L(ed) C n = 2 + C n = 4 + +L(tp): ldr r8, [ap, #4]! + orr r7, r7, r6, lsl tnc + str r7, [rp], #4 + mov r7, r6, lsr cnt +L(md): ldr r6, [ap, #4]! + orr r7, r7, r8, lsl tnc + str r7, [rp], #4 + mov r7, r8, lsr cnt + +L(ed): orr r7, r7, r6, lsl tnc + str r7, [rp], #4 + mov r7, r6, lsr cnt +L(ed1): str r7, [rp], #4 + mov r0, r4, lsl tnc +') + pop {r4, r6, r7, r8} + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/neon/lshiftc.asm b/gmp-6.3.0/mpn/arm/neon/lshiftc.asm new file mode 100644 index 0000000..f1bf0de --- /dev/null +++ b/gmp-6.3.0/mpn/arm/neon/lshiftc.asm @@ -0,0 +1,242 @@ +dnl ARM Neon mpn_lshiftc. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C StrongARM - - +C XScale - - +C Cortex-A7 ? ? +C Cortex-A8 ? ? +C Cortex-A9 3.5 3.5 Y +C Cortex-A15 1.75 1.75 Y + + +C We read 64 bits at a time at 32-bit aligned addresses, and except for the +C first and last store, we write using 64-bit aligned addresses. All shifting +C is done on 64-bit words in 'extension' registers. +C +C It should be possible to read also using 64-bit alignment, by manipulating +C the shift count for unaligned operands. Not done, since it does not seem to +C matter for A9 or A15. +C +C This will not work in big-endian mode. + +C TODO +C * Try using 128-bit operations. Note that Neon lacks pure 128-bit shifts, +C which might make it tricky. +C * Clean up and simplify. +C * Consider sharing most of the code for lshift and rshift, since the feed-in +C code, the loop, and most of the wind-down code are identical. +C * Replace the basecase code with code using 'extension' registers. +C * Optimise. It is not clear that this loop insn permutation is optimal for +C either A9 or A15. + +C INPUT PARAMETERS +define(`rp', `r0') +define(`ap', `r1') +define(`n', `r2') +define(`cnt', `r3') + +ASM_START(neon) + TEXT + ALIGN(64) +PROLOGUE(mpn_lshiftc) + mov r12, n, lsl #2 + add rp, rp, r12 + add ap, ap, r12 + + cmp n, #4 C SIMD code n limit + ble L(base) + + vdup.32 d6, r3 C left shift count is positive + sub r3, r3, #64 C right shift count is negative + vdup.32 d7, r3 + mov r12, #-8 C lshift pointer update offset + + sub ap, ap, #8 + vld1.32 {d19}, [ap], r12 C load initial 2 limbs + vshl.u64 d18, d19, d7 C retval + + tst rp, #4 C is rp 64-bit aligned already? 
+	beq	L(rp_aligned)		C yes, skip
+	vmvn	d19, d19
+	add	ap, ap, #4		C move back ap pointer
+	vshl.u64	d4, d19, d6
+	sub	n, n, #1		C first limb handled
+	sub	rp, rp, #4
+	vst1.32	{d4[1]}, [rp]		C store first limb, rp gets aligned
+	vld1.32	{d19}, [ap], r12	C load ap[1] and ap[2]
+
+L(rp_aligned):
+	sub	rp, rp, #8
+	subs	n, n, #6
+	vmvn	d19, d19
+	blt	L(two_or_three_more)
+	tst	n, #2
+	beq	L(2)
+
+L(1):	vld1.32	{d17}, [ap], r12
+	vshl.u64	d5, d19, d6
+	vmvn	d17, d17
+	vld1.32	{d16}, [ap], r12
+	vshl.u64	d0, d17, d7
+	vshl.u64	d4, d17, d6
+	sub	n, n, #2
+	b	L(mid)
+
+L(2):	vld1.32	{d16}, [ap], r12
+	vshl.u64	d4, d19, d6
+	vmvn	d16, d16
+	vld1.32	{d17}, [ap], r12
+	vshl.u64	d1, d16, d7
+	vshl.u64	d5, d16, d6
+	subs	n, n, #4
+	blt	L(end)
+
+L(top):	vmvn	d17, d17
+	vld1.32	{d16}, [ap], r12
+	vorr	d2, d4, d1
+	vshl.u64	d0, d17, d7
+	vshl.u64	d4, d17, d6
+	vst1.32	{d2}, [rp:64], r12
+L(mid):	vmvn	d16, d16
+	vld1.32	{d17}, [ap], r12
+	vorr	d3, d5, d0
+	vshl.u64	d1, d16, d7
+	vshl.u64	d5, d16, d6
+	vst1.32	{d3}, [rp:64], r12
+	subs	n, n, #4
+	bge	L(top)
+
+L(end):	tst	n, #1
+	beq	L(evn)
+
+	vorr	d2, d4, d1
+	vst1.32	{d2}, [rp:64], r12
+	b	L(cj1)
+
+L(evn):	vmvn	d17, d17
+	vorr	d2, d4, d1
+	vshl.u64	d0, d17, d7
+	vshl.u64	d4, d17, d6
+	vst1.32	{d2}, [rp:64], r12
+	vmov.u8	d17, #255
+	vorr	d2, d5, d0
+	vshl.u64	d0, d17, d7
+	vorr	d3, d4, d0
+	b	L(cj2)
+
+C Load last 2 - 3 limbs, store last 4 - 5 limbs
+L(two_or_three_more):
+	tst	n, #1
+	beq	L(l2)
+
+L(l3):	vshl.u64	d5, d19, d6
+	vld1.32	{d17}, [ap], r12
+L(cj1):	vmov.u8	d16, #0
+	add	ap, ap, #4
+	vmvn	d17, d17
+	vld1.32	{d16[1]}, [ap], r12
+	vshl.u64	d0, d17, d7
+	vshl.u64	d4, d17, d6
+	vmvn	d16, d16
+	vorr	d3, d5, d0
+	vshl.u64	d1, d16, d7
+	vshl.u64	d5, d16, d6
+	vst1.32	{d3}, [rp:64], r12
+	vorr	d2, d4, d1
+	vst1.32	{d2}, [rp:64], r12
+	add	rp, rp, #4
+	vst1.32	{d5[1]}, [rp]
+	vmov.32	r0, d18[0]
+	bx	lr
+
+L(l2):	vld1.32	{d16}, [ap], r12
+	vshl.u64	d4, d19, d6
+	vmvn	d16, d16
+	vshl.u64	d1, d16, d7
+	vshl.u64	d5, d16, d6
+	vmov.u8	d17, #255
+	vorr	d2, d4, d1
+	vshl.u64	d0, d17, d7
+	vorr	d3, d5, d0
+L(cj2):	vst1.32	{d2}, [rp:64], r12
+	vst1.32	{d3}, [rp]
+	vmov.32	r0, d18[0]
+	bx	lr
+
+
+define(`tnc', `r12')
+L(base):
+	push	{r4, r6, r7, r8}
+	ldr	r4, [ap, #-4]!
+	rsb	tnc, cnt, #32
+	mvn	r6, r4
+
+	mov	r7, r6, lsl cnt
+	tst	n, #1
+	beq	L(ev)			C n even
+
+L(od):	subs	n, n, #2
+	bcc	L(ed1)			C n = 1
+	ldr	r8, [ap, #-4]!
+	mvn	r8, r8
+	b	L(md)			C n = 3
+
+L(ev):	ldr	r6, [ap, #-4]!
+	mvn	r6, r6
+	subs	n, n, #2
+	beq	L(ed)			C n = 2
+					C n = 4
+L(tp):	ldr	r8, [ap, #-4]!
+	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mvn	r8, r8
+	mov	r7, r6, lsl cnt
+L(md):	ldr	r6, [ap, #-4]!
+	orr	r7, r7, r8, lsr tnc
+	str	r7, [rp, #-4]!
+	mvn	r6, r6
+	mov	r7, r8, lsl cnt
+
+L(ed):	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r6, lsl cnt
+L(ed1):	mvn	r6, #0
+	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]
+	mov	r0, r4, lsr tnc
+	pop	{r4, r6, r7, r8}
+	bx	r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/neon/popcount.asm b/gmp-6.3.0/mpn/arm/neon/popcount.asm
new file mode 100644
index 0000000..2f8f9af
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/neon/popcount.asm
@@ -0,0 +1,166 @@
+dnl  ARM Neon mpn_popcount -- mpn bit population count.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		cycles/limb
+C StrongARM:	     -
+C XScale	     -
+C Cortex-A7	     ?
+C Cortex-A8	     ?
+C Cortex-A9	     1.125
+C Cortex-A15	     0.56
+
+C TODO
+C  * Explore using vldr and vldm.  Does it help on A9?  (These loads do 64
+C    bits at a time, which would mix up limb order in big-endian mode.  That
+C    does not matter for popcount proper, though it might for the edge loads.)
+C  * Arrange to align the pointer, if that helps performance.  Use the same
+C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
+C    valgrind!)
+C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
+C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
+
+C INPUT PARAMETERS
+define(`ap', r0)
+define(`n', r1)
+
+C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
+C up with 8 16-bit counters.  Therefore, we can sum to 8(2^16-1) bits, or
+C 8(2^16-1)/32 = 0x3fff limbs.  We use a chunksize close to that, but one
+C that can be represented as an 8-bit ARM constant.
+C
+define(`chunksize',0x3f80)
+
+ASM_START()
+PROLOGUE(mpn_popcount)
+
+	cmp	n, #chunksize
+	bhi	L(gt16k)
+
+L(lt16k):
+	vmov.i64	q8, #0		C clear summation register
+	vmov.i64	q9, #0		C clear summation register
+
+	tst	n, #1
+	beq	L(xxx0)
+	vmov.i64	d0, #0
+	sub	n, n, #1
+	vld1.32	{d0[0]}, [ap]!		C load 1 limb
+	vcnt.8	d24, d0
+	vpadal.u8	d16, d24	C d16/q8 = 0; could just splat
+
+L(xxx0):tst	n, #2
+	beq	L(xx00)
+	sub	n, n, #2
+	vld1.32	{d0}, [ap]!		C load 2 limbs
+	vcnt.8	d24, d0
+	vpadal.u8	d16, d24
+
+L(xx00):tst	n, #4
+	beq	L(x000)
+	sub	n, n, #4
+	vld1.32	{q0}, [ap]!		C load 4 limbs
+	vcnt.8	q12, q0
+	vpadal.u8	q8, q12
+
+L(x000):tst	n, #8
+	beq	L(0000)
+
+	subs	n, n, #8
+	vld1.32	{q0,q1}, [ap]!		C load 8 limbs
+	bls	L(sum)
+
+L(gt8):	vld1.32	{q2,q3}, [ap]!		C load 8 limbs
+	sub	n, n, #8
+	vcnt.8	q12, q0
+	vcnt.8	q13, q1
+	b	L(mid)
+
+L(0000):subs	n, n, #16
+	blo	L(e0)
+
+	vld1.32	{q2,q3}, [ap]!		C load 8 limbs
+	vld1.32	{q0,q1}, [ap]!		C load 8 limbs
+	vcnt.8	q12, q2
+	vcnt.8	q13, q3
+	subs	n, n, #16
+	blo	L(end)
+
+L(top):	vld1.32	{q2,q3}, [ap]!		C load 8 limbs
+	vpadal.u8	q8, q12
+	vcnt.8	q12, q0
+	vpadal.u8	q9, q13
+	vcnt.8	q13, q1
+L(mid):	vld1.32	{q0,q1}, [ap]!
C load 8 limbs + subs n, n, #16 + vpadal.u8 q8, q12 + vcnt.8 q12, q2 + vpadal.u8 q9, q13 + vcnt.8 q13, q3 + bhs L(top) + +L(end): vpadal.u8 q8, q12 + vpadal.u8 q9, q13 +L(sum): vcnt.8 q12, q0 + vcnt.8 q13, q1 + vpadal.u8 q8, q12 + vpadal.u8 q9, q13 + vadd.i16 q8, q8, q9 + C we have 8 16-bit counts +L(e0): vpaddl.u16 q8, q8 C we have 4 32-bit counts + vpaddl.u32 q8, q8 C we have 2 64-bit counts + vmov.32 r0, d16[0] + vmov.32 r1, d17[0] + add r0, r0, r1 + bx lr + +C Code for large count. Splits operand and calls above code. +define(`ap2', r2) C caller-saves reg not used above +L(gt16k): + push {r4,r14} + mov ap2, ap + mov r3, n C full count + mov r4, #0 C total sum + +1: mov n, #chunksize C count for this invocation + bl L(lt16k) C could jump deep inside code + add ap2, ap2, #chunksize*4 C point at next chunk + add r4, r4, r0 + mov ap, ap2 C put chunk pointer in place for call + sub r3, r3, #chunksize + cmp r3, #chunksize + bhi 1b + + mov n, r3 C count for final invocation + bl L(lt16k) + add r0, r4, r0 + pop {r4,pc} +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/neon/sec_tabselect.asm b/gmp-6.3.0/mpn/arm/neon/sec_tabselect.asm new file mode 100644 index 0000000..69fceb0 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/neon/sec_tabselect.asm @@ -0,0 +1,140 @@ +dnl ARM Neon mpn_sec_tabselect. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C StrongARM - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 1.15 +C Cortex-A15 0.65 + +define(`rp', `r0') +define(`tp', `r1') +define(`n', `r2') +define(`nents', `r3') +C define(`which', on stack) + +define(`i', `r4') +define(`j', `r5') + +define(`maskq', `q10') +define(`maskd', `d20') + +ASM_START() +PROLOGUE(mpn_sec_tabselect) + push {r4-r5} + + add r4, sp, #8 + vld1.32 {d30[], d31[]}, [r4] C 4 `which' copies + vmov.i32 q14, #1 C 4 copies of 1 + + subs j, n, #8 + bmi L(outer_end) + +L(outer_top): + mov i, nents + mov r12, tp C preserve tp + veor q13, q13, q13 C 4 counter copies + veor q2, q2, q2 + veor q3, q3, q3 + ALIGN(16) +L(top): vceq.i32 maskq, q13, q15 C compare idx copies to `which' copies + vld1.32 {q0,q1}, [tp] + vadd.i32 q13, q13, q14 + vbit q2, q0, maskq + vbit q3, q1, maskq + add tp, tp, n, lsl #2 + subs i, i, #1 + bne L(top) + vst1.32 {q2,q3}, [rp]! 
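+C The 8 selected limbs for this slice have been stored; step every table
+C entry pointer on to the next 8-limb slice.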
+ add tp, r12, #32 C restore tp, point to next slice + subs j, j, #8 + bpl L(outer_top) +L(outer_end): + + tst n, #4 + beq L(b0xx) +L(b1xx):mov i, nents + mov r12, tp + veor q13, q13, q13 + veor q2, q2, q2 + ALIGN(16) +L(tp4): vceq.i32 maskq, q13, q15 + vld1.32 {q0}, [tp] + vadd.i32 q13, q13, q14 + vbit q2, q0, maskq + add tp, tp, n, lsl #2 + subs i, i, #1 + bne L(tp4) + vst1.32 {q2}, [rp]! + add tp, r12, #16 + +L(b0xx):tst n, #2 + beq L(b00x) +L(b01x):mov i, nents + mov r12, tp + veor d26, d26, d26 + veor d4, d4, d4 + ALIGN(16) +L(tp2): vceq.i32 maskd, d26, d30 + vld1.32 {d0}, [tp] + vadd.i32 d26, d26, d28 + vbit d4, d0, maskd + add tp, tp, n, lsl #2 + subs i, i, #1 + bne L(tp2) + vst1.32 {d4}, [rp]! + add tp, r12, #8 + +L(b00x):tst n, #1 + beq L(b000) +L(b001):mov i, nents + mov r12, tp + veor d26, d26, d26 + veor d4, d4, d4 + ALIGN(16) +L(tp1): vceq.i32 maskd, d26, d30 + vld1.32 {d0[0]}, [tp] + vadd.i32 d26, d26, d28 + vbit d4, d0, maskd + add tp, tp, n, lsl #2 + subs i, i, #1 + bne L(tp1) + vst1.32 {d4[0]}, [rp] + +L(b000):pop {r4-r5} + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/rsh1aors_n.asm b/gmp-6.3.0/mpn/arm/rsh1aors_n.asm new file mode 100644 index 0000000..f2e3006 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/rsh1aors_n.asm @@ -0,0 +1,124 @@ +dnl ARM mpn_rsh1add_n and mpn_rsh1sub_n. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 3.64-3.7 +C Cortex-A15 2.5 + +C TODO +C * Not optimised. 
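+
+C mpn_rsh1add_n/mpn_rsh1sub_n computes rp[] = (up[] {+,-} vp[]) >> 1 and
+C returns the bit shifted out at the low end (masked into r11 below).
+C The shift happens on the fly: "movs rX,rX,rrx" rotates the carry flag
+C into each limb's top bit, while "movs r12,...,rrx" and "cmn r12,r12"
+C save and restore the add/sub carry across the shifting.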
+ +define(`rp', `r0') +define(`up', `r1') +define(`vp', `r2') +define(`n', `r3') + +ifdef(`OPERATION_rsh1add_n', ` + define(`ADDSUB', adds) + define(`ADDSUBC', adcs) + define(`RSTCY', `cmn $1, $1') + define(`func', mpn_rsh1add_n) + define(`func_nc', mpn_rsh1add_nc)') +ifdef(`OPERATION_rsh1sub_n', ` + define(`ADDSUB', subs) + define(`ADDSUBC', sbcs) + define(`RSTCY', + `mvn $2, #0x80000000 + cmp $2, $1') + define(`func', mpn_rsh1sub_n) + define(`func_nc', mpn_rsh1sub_nc)') + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n) + +ASM_START() +PROLOGUE(func) + push {r4-r11} + ldr r4, [up], #4 + ldr r8, [vp], #4 + ADDSUB r4, r4, r8 + movs r12, r7, rrx + and r11, r4, #1 C return value + subs n, n, #4 + blo L(end) + +L(top): ldmia up!, {r5,r6,r7} + ldmia vp!, {r8,r9,r10} + cmn r12, r12 + ADDSUBC r5, r5, r8 + ADDSUBC r6, r6, r9 + ADDSUBC r7, r7, r10 + movs r12, r7, rrx + movs r6, r6, rrx + movs r5, r5, rrx + movs r4, r4, rrx + subs n, n, #3 + stmia rp!, {r4,r5,r6} + mov r4, r7 + bhs L(top) + +L(end): cmn n, #2 + bls L(e2) + ldm up, {r5,r6} + ldm vp, {r8,r9} + cmn r12, r12 + ADDSUBC r5, r5, r8 + ADDSUBC r6, r6, r9 + movs r12, r6, rrx + movs r5, r5, rrx + movs r4, r4, rrx + stmia rp!, {r4,r5} + mov r4, r6 + b L(e1) + +L(e2): bne L(e1) + ldr r5, [up, #0] + ldr r8, [vp, #0] + cmn r12, r12 + ADDSUBC r5, r5, r8 + movs r12, r5, rrx + movs r4, r4, rrx + str r4, [rp], #4 + mov r4, r5 + +L(e1): RSTCY( r12, r1) + mov r4, r4, rrx + str r4, [rp, #0] + mov r0, r11 + pop {r4-r11} + return r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/rshift.asm b/gmp-6.3.0/mpn/arm/rshift.asm new file mode 100644 index 0000000..9ddbc2e --- /dev/null +++ b/gmp-6.3.0/mpn/arm/rshift.asm @@ -0,0 +1,86 @@ +dnl ARM mpn_rshift. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 1997, 2000, 2001, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 3.5 +C Cortex-A15 ? + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`cnt', `r3') +define(`tnc', `r12') + +ASM_START() +PROLOGUE(mpn_rshift) + push {r4, r6, r7, r8} + ldr r4, [up] + rsb tnc, cnt, #32 + + mov r7, r4, lsr cnt + tst n, #1 + beq L(evn) C n even + +L(odd): subs n, n, #2 + bcc L(1) C n = 1 + ldr r8, [up, #4]! + b L(mid) + +L(evn): ldr r6, [up, #4]! + subs n, n, #2 + beq L(end) + +L(top): ldr r8, [up, #4]! + orr r7, r7, r6, lsl tnc + str r7, [rp], #4 + mov r7, r6, lsr cnt +L(mid): ldr r6, [up, #4]! 
+	orr	r7, r7, r8, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r8, lsr cnt
+	subs	n, n, #2
+	bgt	L(top)
+
+L(end):	orr	r7, r7, r6, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r6, lsr cnt
+L(1):	str	r7, [rp]
+	mov	r0, r4, lsl tnc
+	pop	{r4, r6, r7, r8}
+	return	r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/sec_tabselect.asm b/gmp-6.3.0/mpn/arm/sec_tabselect.asm
new file mode 100644
index 0000000..76a412b
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/sec_tabselect.asm
@@ -0,0 +1,131 @@
+dnl  ARM mpn_sec_tabselect
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		cycles/limb
+C StrongARM	     ?
+C XScale	     ?
+C Cortex-A7	     ?
+C Cortex-A8	     ?
+C Cortex-A9	     2.33
+C Cortex-A15	     2.2
+
+C TODO
+C  * Consider using special code for small nents, either swapping the inner
+C    and outer loops, or providing a few variants with completely unrolled
+C    inner loops.
+
+define(`rp', `r0')
+define(`tp', `r1')
+define(`n', `r2')
+define(`nents', `r3')
+C	which		on stack
+
+define(`i', `r11')
+define(`j', `r12')
+define(`c', `r14')
+define(`mask', `r7')
+
+ASM_START()
+PROLOGUE(mpn_sec_tabselect)
+	push	{r4-r11, r14}
+
+	subs	j, n, #3
+	bmi	L(outer_end)
+L(outer_top):
+	ldr	c, [sp, #36]
+	mov	i, nents
+	push	{tp}
+
+	mov	r8, #0
+	mov	r9, #0
+	mov	r10, #0
+
+L(top):	subs	c, c, #1
+	ldm	tp, {r4,r5,r6}
+	sbc	mask, mask, mask
+	subs	i, i, #1
+	add	tp, tp, n, lsl #2
+	and	r4, r4, mask
+	and	r5, r5, mask
+	and	r6, r6, mask
+	orr	r8, r8, r4
+	orr	r9, r9, r5
+	orr	r10, r10, r6
+	bge	L(top)
+
+	stmia	rp!, {r8,r9,r10}
+	pop	{tp}
+	add	tp, tp, #12
+	subs	j, j, #3
+	bpl	L(outer_top)
+L(outer_end):
+
+	cmp	j, #-1
+	bne	L(n2)
+
+	ldr	c, [sp, #36]
+	mov	i, nents
+	mov	r8, #0
+	mov	r9, #0
+L(tp2):	subs	c, c, #1
+	sbc	mask, mask, mask
+	ldm	tp, {r4,r5}
+	subs	i, i, #1
+	add	tp, tp, n, lsl #2
+	and	r4, r4, mask
+	and	r5, r5, mask
+	orr	r8, r8, r4
+	orr	r9, r9, r5
+	bge	L(tp2)
+	stmia	rp, {r8,r9}
+	pop	{r4-r11, r14}
+	return	lr
+
+L(n2):	cmp	j, #-2
+	bne	L(n1)
+
+	ldr	c, [sp, #36]
+	mov	i, nents
+	mov	r8, #0
+L(tp1):	subs	c, c, #1
+	sbc	mask, mask, mask
+	ldr	r4, [tp]
+	subs	i, i, #1
+	add	tp, tp, n, lsl #2
+	and	r4, r4, mask
+	orr	r8, r8, r4
+	bge	L(tp1)
+	str	r8, [rp]
+L(n1):	pop	{r4-r11, r14}
+	return	lr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/udiv.asm b/gmp-6.3.0/mpn/arm/udiv.asm
new file mode 100644
index 0000000..7c04789
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/udiv.asm
@@ -0,0 +1,104 @@
+dnl  ARM mpn_udiv_qrnnd -- divide a two limb dividend and a one limb divisor.
+dnl Return quotient and store remainder through a supplied pointer. + +dnl Copyright 2001, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +define(`rem_ptr',`r0') +define(`n1',`r1') +define(`n0',`r2') +define(`d',`r3') + +C divstep -- develop one quotient bit. Dividend in $1$2, divisor in $3. +C Quotient bit is shifted into $2. +define(`divstep', + `adcs $2, $2, $2 + adc $1, $1, $1 + cmp $1, $3 + subcs $1, $1, $3') + +ASM_START() +PROLOGUE(mpn_udiv_qrnnd) + mov r12, #8 C loop counter for both loops below + cmp d, #0x80000000 C check divisor msb and clear carry + bcs L(_large_divisor) + +L(oop): divstep(n1,n0,d) + divstep(n1,n0,d) + divstep(n1,n0,d) + divstep(n1,n0,d) + sub r12, r12, #1 + teq r12, #0 + bne L(oop) + + str n1, [rem_ptr] C store remainder + adc r0, n0, n0 C quotient: add last carry from divstep + return lr + +L(_large_divisor): + stmfd sp!, { r8, lr } + + and r8, n0, #1 C save lsb of dividend + mov lr, n1, lsl #31 + orrs n0, lr, n0, lsr #1 C n0 = lo(n1n0 >> 1) + mov n1, n1, lsr #1 C n1 = hi(n1n0 >> 1) + + and lr, d, #1 C save lsb of divisor + movs d, d, lsr #1 C d = floor(orig_d / 2) + adc d, d, #0 C d = ceil(orig_d / 2) + +L(oop2): + divstep(n1,n0,d) + divstep(n1,n0,d) + divstep(n1,n0,d) + divstep(n1,n0,d) + sub r12, r12, #1 + teq r12, #0 + bne L(oop2) + + adc n0, n0, n0 C shift and add last carry from divstep + add n1, r8, n1, lsl #1 C shift in omitted dividend lsb + tst lr, lr C test saved divisor lsb + beq L(_even_divisor) + + rsb d, lr, d, lsl #1 C restore orig d value + adds n1, n1, n0 C fix remainder for omitted divisor lsb + addcs n0, n0, #1 C adjust quotient if rem. fix carried + subcs n1, n1, d C adjust remainder accordingly + cmp n1, d C remainder >= divisor? + subcs n1, n1, d C adjust remainder + addcs n0, n0, #1 C adjust quotient + +L(_even_divisor): + str n1, [rem_ptr] C store remainder + mov r0, n0 C quotient + ldmfd sp!, { r8, pc } +EPILOGUE(mpn_udiv_qrnnd) diff --git a/gmp-6.3.0/mpn/arm/v5/gcd_11.asm b/gmp-6.3.0/mpn/arm/v5/gcd_11.asm new file mode 100644 index 0000000..3c2b48f --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v5/gcd_11.asm @@ -0,0 +1,70 @@ +dnl ARM v5 mpn_gcd_11. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for ARM by Torbjörn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/bit (approx) +C StrongARM - +C XScale ? +C Cortex-A5 6.45 obsolete +C Cortex-A7 6.41 obsolete +C Cortex-A8 5.0 obsolete +C Cortex-A9 5.9 obsolete +C Cortex-A15 4.40 obsolete +C Cortex-A17 5.68 obsolete +C Cortex-A53 4.37 obsolete +C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1 + +define(`u0', `r0') +define(`v0', `r1') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_gcd_11) + subs r3, u0, v0 C 0 + beq L(end) C + + ALIGN(16) +L(top): sub r2, v0, u0 C 0,5 + and r12, r2, r3 C 1 + clz r12, r12 C 2 + rsb r12, r12, #31 C 3 + rsbcc r3, r3, #0 C v = abs(u-v), even 1 + movcs u0, v0 C u = min(u,v) 1 + lsr v0, r3, r12 C 4 + subs r3, u0, v0 C 5 + bne L(top) C + +L(end): bx lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v5/gcd_22.asm b/gmp-6.3.0/mpn/arm/v5/gcd_22.asm new file mode 100644 index 0000000..0643b7c --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v5/gcd_22.asm @@ -0,0 +1,117 @@ +dnl ARM v5 mpn_gcd_22. + +dnl Copyright 2019, 2022 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C StrongARM - +C XScale - +C ARM11 13 +C Cortex-A5 ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 ? +C Cortex-A12 ? +C Cortex-A15 ? +C Cortex-A17 ? +C Cortex-A53 ? 
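+
+C The loop is a two-limb binary GCD step: t = |u - v| is computed with a
+C conditional negate, the trailing zeros of t are counted via clz on the
+C isolated lowest set bit, u is replaced by t shifted right by that count
+C and v by min(u,v).  Once both high limbs are zero, mpn_gcd_11 finishes
+C the job.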
+ + +define(`gp', `r0') + +define(`u1', `r1') +define(`u0', `r2') +define(`v1', `r3') +define(`v0', `r4') + +define(`t0', `r5') +define(`t1', `r6') +define(`cnt', `r7') + +ASM_START() +PROLOGUE(mpn_gcd_22) + push { r4-r7 } + + ldr v0, [sp,#16] C + +L(top): subs t0, u0, v0 C 0 7 + beq L(lowz) + sbcs t1, u1, v1 C 1 8 + + sub cnt, v0, u0 + and cnt, cnt, t0 + + negcc t0, t0 + mvncc t1, t1 +L(bck): movcc v0, u0 + movcc v1, u1 + + clz r12, cnt C 2 + rsb cnt, r12, #31 C 3 + add r12, r12, #1 + + lsr u0, t0, cnt C 3 + lsl r12, t1, r12 C 4 + lsr u1, t1, cnt C 3 + orr u0, u0, r12 C 5 + + orrs r12, u1, v1 + bne L(top) + + + str r12, [gp,#4] C high result limb <= 0 + + mov r6, gp + mov r0, u0 C pass 1st argument + mov r1, v0 C pass 2nd argument + mov r7, r14 C preserve link register + bl mpn_gcd_11 + str r0, [r6,#0] + mov r14, r7 + pop { r4-r7 } + bx r14 + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + subs t0, u1, v1 + beq L(end) + mov t1, #0 + sub cnt, v1, u1 + and cnt, cnt, t0 + negcc t0, t0 + b L(bck) + +L(end): str v0, [gp,#0] + str v1, [gp,#4] + pop { r4-r7 } + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v5/mod_1_1.asm b/gmp-6.3.0/mpn/arm/v5/mod_1_1.asm new file mode 100644 index 0000000..3cf0cd7 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v5/mod_1_1.asm @@ -0,0 +1,129 @@ +dnl ARM mpn_mod_1_1p + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM - +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 7 +C Cortex-A15 6 + +define(`ap', `r0') +define(`n', `r1') +define(`d', `r2') +define(`cps',`r3') + +ASM_START() +PROLOGUE(mpn_mod_1_1p) + push {r4-r10} + add r0, r0, r1, asl #2 + ldr r5, [r0, #-4]! + ldr r12, [r0, #-4]! + subs r1, r1, #2 + ble L(4) + ldr r8, [r3, #12] + mov r4, r12 + mov r10, r5 + umull r7, r5, r10, r8 + sub r1, r1, #1 + b L(mid) + +L(top): adds r12, r6, r7 + adcs r10, r4, r5 + sub r1, r1, #1 + mov r6, #0 + movcs r6, r8 + umull r7, r5, r10, r8 + adds r4, r12, r6 + subcs r4, r4, r2 +L(mid): ldr r6, [r0, #-4]! 
+ teq r1, #0 + bne L(top) + + adds r12, r6, r7 + adcs r5, r4, r5 + subcs r5, r5, r2 +L(4): ldr r1, [r3, #4] + cmp r1, #0 + beq L(7) + ldr r4, [r3, #8] + umull r0, r6, r5, r4 + adds r12, r0, r12 + addcs r6, r6, #1 + rsb r0, r1, #32 + mov r0, r12, lsr r0 + orr r5, r0, r6, asl r1 + mov r12, r12, asl r1 + b L(8) +L(7): cmp r5, r2 + subcs r5, r5, r2 +L(8): ldr r0, [r3, #0] + umull r4, r3, r5, r0 + add r5, r5, #1 + adds r0, r4, r12 + adc r5, r3, r5 + mul r5, r2, r5 + sub r12, r12, r5 + cmp r12, r0 + addhi r12, r12, r2 + cmp r2, r12 + subls r12, r12, r2 + mov r0, r12, lsr r1 + pop {r4-r10} + bx r14 +EPILOGUE() + +PROLOGUE(mpn_mod_1_1p_cps) + stmfd sp!, {r4, r5, r6, r14} + mov r5, r0 + clz r4, r1 + mov r0, r1, asl r4 + rsb r6, r0, #0 + bl mpn_invert_limb + str r0, [r5, #0] + str r4, [r5, #4] + cmp r4, #0 + beq L(2) + rsb r1, r4, #32 + mov r3, #1 + mov r3, r3, asl r4 + orr r3, r3, r0, lsr r1 + mul r3, r6, r3 + mov r4, r3, lsr r4 + str r4, [r5, #8] +L(2): mul r0, r6, r0 + str r0, [r5, #12] + ldmfd sp!, {r4, r5, r6, pc} +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v5/mod_1_2.asm b/gmp-6.3.0/mpn/arm/v5/mod_1_2.asm new file mode 100644 index 0000000..aa26ecb --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v5/mod_1_2.asm @@ -0,0 +1,156 @@ +dnl ARM mpn_mod_1s_2p + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM - +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? 
+C Cortex-A9 4.25 +C Cortex-A15 3 + +define(`ap', `r0') +define(`n', `r1') +define(`d', `r2') +define(`cps',`r3') + +ASM_START() +PROLOGUE(mpn_mod_1s_2p) + push {r4-r10} + tst n, #1 + add r7, r3, #8 + ldmia r7, {r7, r8, r12} C load B1, B2, B3 + add ap, ap, n, lsl #2 C put ap at operand end + beq L(evn) + +L(odd): subs n, n, #1 + beq L(1) + ldmdb ap!, {r4,r6,r9} + mov r10, #0 + umlal r4, r10, r6, r7 + umlal r4, r10, r9, r8 + b L(com) + +L(evn): ldmdb ap!, {r4,r10} +L(com): subs n, n, #2 + ble L(end) + ldmdb ap!, {r5,r6} + b L(mid) + +L(top): mov r9, #0 + umlal r5, r9, r6, r7 C B1 + umlal r5, r9, r4, r8 C B2 + ldmdb ap!, {r4,r6} + umlal r5, r9, r10, r12 C B3 + ble L(xit) + mov r10, #0 + umlal r4, r10, r6, r7 C B1 + umlal r4, r10, r5, r8 C B2 + ldmdb ap!, {r5,r6} + umlal r4, r10, r9, r12 C B3 +L(mid): subs n, n, #4 + bge L(top) + + mov r9, #0 + umlal r5, r9, r6, r7 C B1 + umlal r5, r9, r4, r8 C B2 + umlal r5, r9, r10, r12 C B3 + mov r4, r5 + +L(end): movge r9, r10 C executed iff coming via xit + ldr r6, [r3, #4] C cps[1] = cnt + mov r5, #0 + umlal r4, r5, r9, r7 + mov r7, r5, lsl r6 +L(x): rsb r1, r6, #32 + orr r8, r7, r4, lsr r1 + mov r9, r4, lsl r6 + ldr r5, [r3, #0] + add r0, r8, #1 + umull r12, r1, r8, r5 + adds r4, r12, r9 + adc r1, r1, r0 + mul r5, r2, r1 + sub r9, r9, r5 + cmp r9, r4 + addhi r9, r9, r2 + cmp r2, r9 + subls r9, r9, r2 + mov r0, r9, lsr r6 + pop {r4-r10} + bx r14 + +L(xit): mov r10, #0 + umlal r4, r10, r6, r7 C B1 + umlal r4, r10, r5, r8 C B2 + umlal r4, r10, r9, r12 C B3 + b L(end) + +L(1): ldr r6, [r3, #4] C cps[1] = cnt + ldr r4, [ap, #-4] C ap[0] + mov r7, #0 + b L(x) +EPILOGUE() + +PROLOGUE(mpn_mod_1s_2p_cps) + push {r4-r8, r14} + clz r4, r1 + mov r5, r1, lsl r4 C b <<= cnt + mov r6, r0 C r6 = cps + mov r0, r5 + bl mpn_invert_limb + rsb r3, r4, #32 + mov r3, r0, lsr r3 + mov r2, #1 + orr r3, r3, r2, lsl r4 + rsb r1, r5, #0 + mul r2, r1, r3 + umull r3, r12, r2, r0 + add r12, r2, r12 + mvn r12, r12 + mul r1, r5, r12 + cmp r1, r3 + addhi r1, r1, r5 + umull r12, r7, r1, r0 + add r7, r1, r7 + mvn r7, r7 + mul r3, r5, r7 + cmp r3, r12 + addhi r3, r3, r5 + mov r5, r2, lsr r4 + mov r7, r1, lsr r4 + mov r8, r3, lsr r4 + stmia r6, {r0,r4,r5,r7,r8} C fill cps + pop {r4-r8, pc} +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6/addmul_1.asm b/gmp-6.3.0/mpn/arm/v6/addmul_1.asm new file mode 100644 index 0000000..a38af58 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6/addmul_1.asm @@ -0,0 +1,112 @@ +dnl ARM mpn_addmul_1. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
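+
+dnl  mpn_addmul_1(rp, up, n, v0) adds {up,n} times the limb v0 to {rp,n}
+dnl  and returns the carry limb.  The loop leans on the ARMv6 umaal
+dnl  instruction: "umaal lo,hi,a,b" computes hi:lo = a*b + lo + hi, which
+dnl  folds the rp limb and the running carry into each multiply step.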
+ +include(`../config.m4') + +C cycles/limb +C StrongARM: - +C XScale - +C ARM11 6.4 +C Cortex-A7 5.25 +C Cortex-A8 7 +C Cortex-A9 3.25 +C Cortex-A15 4 + +C TODO +C * Micro-optimise feed-in code. +C * Optimise for n=1,2 by delaying register saving. +C * Try using ldm/stm. + +define(`rp',`r0') +define(`up',`r1') +define(`n', `r2') +define(`v0',`r3') + +ASM_START() +PROLOGUE(mpn_addmul_1) + stmfd sp!, { r4, r5, r6, r7 } + + ands r6, n, #3 + mov r12, #0 + beq L(fi0) + cmp r6, #2 + bcc L(fi1) + beq L(fi2) + +L(fi3): ldr r4, [up], #4 + ldr r6, [rp, #0] + ldr r5, [up], #4 + b L(lo3) + +L(fi0): ldr r5, [up], #4 + ldr r7, [rp], #4 + ldr r4, [up], #4 + b L(lo0) + +L(fi1): ldr r4, [up], #4 + ldr r6, [rp], #8 + subs n, n, #1 + beq L(1) + ldr r5, [up], #4 + b L(lo1) + +L(fi2): ldr r5, [up], #4 + ldr r7, [rp], #12 + ldr r4, [up], #4 + b L(lo2) + + ALIGN(16) +L(top): ldr r6, [rp, #-8] + ldr r5, [up], #4 + str r7, [rp, #-12] +L(lo1): umaal r6, r12, r4, v0 + ldr r7, [rp, #-4] + ldr r4, [up], #4 + str r6, [rp, #-8] +L(lo0): umaal r7, r12, r5, v0 + ldr r6, [rp, #0] + ldr r5, [up], #4 + str r7, [rp, #-4] +L(lo3): umaal r6, r12, r4, v0 + ldr r7, [rp, #4] + ldr r4, [up], #4 + str r6, [rp], #16 +L(lo2): umaal r7, r12, r5, v0 + subs n, n, #4 + bhi L(top) + + ldr r6, [rp, #-8] + str r7, [rp, #-12] +L(1): umaal r6, r12, r4, v0 + str r6, [rp, #-8] + mov r0, r12 + ldmfd sp!, { r4, r5, r6, r7 } + bx lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6/addmul_2.asm b/gmp-6.3.0/mpn/arm/v6/addmul_2.asm new file mode 100644 index 0000000..69d0b8f --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6/addmul_2.asm @@ -0,0 +1,125 @@ +dnl ARM mpn_addmul_2. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013, 2015 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
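+
+dnl  mpn_addmul_2(rp, up, n, vp) adds {up,n} times the two-limb multiplier
+dnl  {vp,2} to {rp,n}, writes the low n+1 result limbs to rp and returns
+dnl  the top limb.  Two umaal carry chains, cya and cyb, run in parallel,
+dnl  one per multiplier limb.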
+ +include(`../config.m4') + +C cycles/limb +C StrongARM: - +C XScale - +C ARM11 4.68 +C Cortex-A5 3.63 +C Cortex-A7 3.65 +C Cortex-A8 4.0 +C Cortex-A9 2.25 +C Cortex-A15 2.5 +C Cortex-A17 2.13 +C Cortex-A53 3.5 + +define(`rp',`r0') +define(`up',`r1') +define(`n', `r2') +define(`vp',`r3') + +define(`v0',`r6') +define(`v1',`r7') +define(`u0',`r3') +define(`u1',`r9') + +define(`cya',`r8') +define(`cyb',`r12') + + +ASM_START() +PROLOGUE(mpn_addmul_2) + push { r4-r9 } + + ldrd v0, v1, [vp, #0] + mov cya, #0 + mov cyb, #0 + + tst n, #1 + beq L(evn) + +L(odd): ldr u1, [up, #0] + ldr r4, [rp, #0] + tst n, #2 + beq L(fi1) +L(fi3): sub up, up, #8 + sub rp, rp, #8 + b L(lo3) +L(fi1): sub n, n, #1 + b L(top) + +L(evn): ldr u0, [up, #0] + ldr r5, [rp, #0] + tst n, #2 + bne L(fi2) +L(fi0): sub up, up, #4 + sub rp, rp, #4 + b L(lo0) +L(fi2): sub up, up, #12 + sub rp, rp, #12 + b L(lo2) + + ALIGN(16) +L(top): ldr r5, [rp, #4] + umaal r4, cya, u1, v0 + ldr u0, [up, #4] + umaal r5, cyb, u1, v1 + str r4, [rp, #0] +L(lo0): ldr r4, [rp, #8] + umaal r5, cya, u0, v0 + ldr u1, [up, #8] + umaal r4, cyb, u0, v1 + str r5, [rp, #4] +L(lo3): ldr r5, [rp, #12] + umaal r4, cya, u1, v0 + ldr u0, [up, #12] + umaal r5, cyb, u1, v1 + str r4, [rp, #8] +L(lo2): ldr r4, [rp, #16]! + umaal r5, cya, u0, v0 + ldr u1, [up, #16]! + umaal r4, cyb, u0, v1 + str r5, [rp, #-4] + subs n, n, #4 + bhi L(top) + +L(end): umaal r4, cya, u1, v0 + umaal cya, cyb, u1, v1 + str r4, [rp, #0] + str cya, [rp, #4] + mov r0, cyb + + pop { r4-r9 } + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6/addmul_3.asm b/gmp-6.3.0/mpn/arm/v6/addmul_3.asm new file mode 100644 index 0000000..d1490cd --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6/addmul_3.asm @@ -0,0 +1,191 @@ +dnl ARM mpn_addmul_3. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM: - +C XScale - +C ARM11 4.33 +C Cortex-A5 3.28 +C Cortex-A7 3.25 +C Cortex-A8 3.17 +C Cortex-A9 2.125 +C Cortex-A15 2 +C Cortex-A17 2.11 +C Cortex-A53 4.18 + +C TODO +C * Use a fast path for n <= KARATSUBA_MUL_THRESHOLD using a jump table, +C avoiding the current multiply. +C * Start the first multiply or multiplies early. 
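+C
+C The "tricky n mod 6" dispatch below: multiplying n by 3^{-1} mod 2^32
+C leaves n mod 3 encoded in the two top bits of w0 (00, 10, 01 for
+C residues 0, 1, 2) while bit 0 still holds n mod 2.  The and with
+C 0xc0000001 keeps just those three bits, and "ror $28" scales them into
+C an offset into the six-entry branch table.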
+ +define(`rp',`r0') +define(`up',`r1') +define(`n', `r2') +define(`vp',`r3') + +define(`v0',`r4') define(`v1',`r5') define(`v2',`r6') +define(`u0',`r3') define(`u1',`r14') +define(`w0',`r7') define(`w1',`r8') define(`w2',`r9') +define(`cy0',`r10') define(`cy1',`r11') define(`cy2',`r12') + + +ASM_START() +PROLOGUE(mpn_addmul_3) + push { r4-r11, r14 } + + ldr w0, =0xaaaaaaab C 3^{-1} mod 2^32 + ldm vp, { v0,v1,v2 } + mov cy0, #0 + mov cy1, #0 + mov cy2, #0 + +C Tricky n mod 6 + mul w0, w0, n C n * 3^{-1} mod 2^32 + and w0, w0, #0xc0000001 C pseudo-CRT mod 3,2 + sub n, n, #3 +ifdef(`PIC',` + add pc, pc, w0, ror $28 + nop + b L(b0) + b L(b2) + b L(b4) + .word 0xe7f000f0 C udf + b L(b3) + b L(b5) + b L(b1) +',` + ldr pc, [pc, w0, ror $28] + nop + .word L(b0), L(b2), L(b4), 0, L(b3), L(b5), L(b1) +') + +L(b5): add up, up, #-8 + ldr w1, [rp, #0] + ldr w2, [rp, #4] + ldr u1, [up, #8] + b L(lo5) + +L(b4): add rp, rp, #-4 + add up, up, #-12 + ldr w2, [rp, #4] + ldr w0, [rp, #8] + ldr u0, [up, #12] + b L(lo4) + +L(b3): add rp, rp, #-8 + add up, up, #-16 + ldr w0, [rp, #8] + ldr w1, [rp, #12] + ldr u1, [up, #16] + b L(lo3) + +L(b1): add rp, rp, #8 + ldr w2, [rp, #-8] + ldr w0, [rp, #-4] + ldr u1, [up, #0] + b L(lo1) + +L(b0): add rp, rp, #4 + add up, up, #-4 + ldr w0, [rp, #-4] + ldr w1, [rp, #0] + ldr u0, [up, #4] + b L(lo0) + +L(b2): add rp, rp, #12 + add up, up, #4 + ldr w1, [rp, #-12] + ldr w2, [rp, #-8] + ldr u0, [up, #-4] + + ALIGN(16) +L(top): ldr w0, [rp, #-4] + umaal w1, cy0, u0, v0 + ldr u1, [up, #0] + umaal w2, cy1, u0, v1 + str w1, [rp, #-12] + umaal w0, cy2, u0, v2 +L(lo1): ldr w1, [rp, #0] + umaal w2, cy0, u1, v0 + ldr u0, [up, #4] + umaal w0, cy1, u1, v1 + str w2, [rp, #-8] + umaal w1, cy2, u1, v2 +L(lo0): ldr w2, [rp, #4] + umaal w0, cy0, u0, v0 + ldr u1, [up, #8] + umaal w1, cy1, u0, v1 + str w0, [rp, #-4] + umaal w2, cy2, u0, v2 +L(lo5): ldr w0, [rp, #8] + umaal w1, cy0, u1, v0 + ldr u0, [up, #12] + umaal w2, cy1, u1, v1 + str w1, [rp, #0] + umaal w0, cy2, u1, v2 +L(lo4): ldr w1, [rp, #12] + umaal w2, cy0, u0, v0 + ldr u1, [up, #16] + umaal w0, cy1, u0, v1 + str w2, [rp, #4] + umaal w1, cy2, u0, v2 +L(lo3): ldr w2, [rp, #16] + umaal w0, cy0, u1, v0 + ldr u0, [up, #20] + umaal w1, cy1, u1, v1 + str w0, [rp, #8] + umaal w2, cy2, u1, v2 +L(lo2): subs n, n, #6 + add up, up, #24 + add rp, rp, #24 + bge L(top) + +L(end): umaal w1, cy0, u0, v0 + ldr u1, [up, #0] + umaal w2, cy1, u0, v1 + str w1, [rp, #-12] + mov w0, #0 + umaal w0, cy2, u0, v2 + umaal w2, cy0, u1, v0 + umaal w0, cy1, u1, v1 + str w2, [rp, #-8] + umaal cy1, cy2, u1, v2 + adds w0, w0, cy0 + str w0, [rp, #-4] + adcs w1, cy1, #0 + str w1, [rp, #0] + adc r0, cy2, #0 + + pop { r4-r11, pc } +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6/dive_1.asm b/gmp-6.3.0/mpn/arm/v6/dive_1.asm new file mode 100644 index 0000000..92de814 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6/dive_1.asm @@ -0,0 +1,149 @@ +dnl ARM v6 mpn_divexact_1 + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb +C norm unorm modexact_1c_odd +C StrongARM - - +C XScale - - +C Cortex-A7 ? ? +C Cortex-A8 ? ? +C Cortex-A9 9 10 9 +C Cortex-A15 7 7 7 + +C Architecture requirements: +C v5 - +C v5t clz +C v5te - +C v6 umaal +C v6t2 - +C v7a - + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`d', `r3') + +define(`cy', `r7') +define(`cnt', `r6') +define(`tnc', `r10') + +ASM_START() +PROLOGUE(mpn_divexact_1) + push {r4,r5,r6,r7,r8,r9} + + tst d, #1 + + rsb r4, d, #0 + and r4, r4, d + clz r4, r4 + rsb cnt, r4, #31 C count_trailing_zeros + mov d, d, lsr cnt + +C binvert limb + LEA( r4, binvert_limb_table) + and r12, d, #254 + ldrb r4, [r4, r12, lsr #1] + mul r12, r4, r4 + mul r12, d, r12 + rsb r12, r12, r4, lsl #1 + mul r4, r12, r12 + mul r4, d, r4 + rsb r4, r4, r12, lsl #1 C r4 = inverse + + ldr r5, [up], #4 C up[0] + mov cy, #0 + rsb r8, r4, #0 C r8 = -inverse + beq L(unnorm) + +L(norm): + subs n, n, #1 + mul r5, r5, r4 + beq L(end) + + ALIGN(16) +L(top): ldr r9, [up], #4 + mov r12, #0 + str r5, [rp], #4 + umaal r12, cy, r5, d + mul r5, r9, r4 + mla r5, cy, r8, r5 + subs n, n, #1 + bne L(top) + +L(end): str r5, [rp] + pop {r4,r5,r6,r7,r8,r9} + bx r14 + +L(unnorm): + push {r10,r11} + rsb tnc, cnt, #32 + mov r11, r5, lsr cnt + subs n, n, #1 + beq L(edx) + + ldr r12, [up], #4 + orr r9, r11, r12, lsl tnc + mov r11, r12, lsr cnt + mul r5, r9, r4 + subs n, n, #1 + beq L(edu) + + ALIGN(16) +L(tpu): ldr r12, [up], #4 + orr r9, r11, r12, lsl tnc + mov r11, r12, lsr cnt + mov r12, #0 + str r5, [rp], #4 + umaal r12, cy, r5, d + mul r5, r9, r4 + mla r5, cy, r8, r5 + subs n, n, #1 + bne L(tpu) + +L(edu): str r5, [rp], #4 + mov r12, #0 + umaal r12, cy, r5, d + mul r5, r11, r4 + mla r5, cy, r8, r5 + str r5, [rp] + pop {r10,r11} + pop {r4,r5,r6,r7,r8,r9} + bx r14 + +L(edx): mul r5, r11, r4 + str r5, [rp] + pop {r10,r11} + pop {r4,r5,r6,r7,r8,r9} + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6/gmp-mparam.h b/gmp-6.3.0/mpn/arm/v6/gmp-mparam.h new file mode 100644 index 0000000..35a7c55 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6/gmp-mparam.h @@ -0,0 +1,187 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 700 MHz ARM11 (raspberry pi) */ +/* FFT tuning limit = 8,088,775 */ +/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 19 +#define USE_PREINV_DIVREM_1 1 /* preinv always */ +#define DIV_QR_1N_PI1_METHOD 1 /* 71.61% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 38 + +#define DIV_1_VS_MUL_1_PERCENT 251 + +#define MUL_TOOM22_THRESHOLD 38 +#define MUL_TOOM33_THRESHOLD 134 +#define MUL_TOOM44_THRESHOLD 512 +#define MUL_TOOM6H_THRESHOLD 0 /* always */ +#define MUL_TOOM8H_THRESHOLD 620 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 209 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 625 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 209 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 211 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 300 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 55 +#define SQR_TOOM3_THRESHOLD 200 +#define SQR_TOOM4_THRESHOLD 470 +#define SQR_TOOM6_THRESHOLD 614 +#define SQR_TOOM8_THRESHOLD 882 + +#define MULMID_TOOM42_THRESHOLD 62 + +#define MULMOD_BNM1_THRESHOLD 23 +#define SQRMOD_BNM1_THRESHOLD 26 + +#define MUL_FFT_MODF_THRESHOLD 565 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 565, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 15, 5}, { 31, 6}, { 28, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 21, 6}, { 43, 7}, { 23, 6}, \ + { 47, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 43, 8}, { 23, 7}, { 51, 8}, \ + { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \ + { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ + { 71, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \ + { 99, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \ + { 191,10}, { 111,11}, { 63,10}, { 159,11}, \ + { 95,10}, { 207,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271,11}, { 159,10}, \ + { 351,11}, { 191,10}, { 399,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 287,10}, \ + { 607,11}, { 319,10}, { 639,11}, { 351,12}, \ + { 191,11}, { 415,13}, { 127,12}, { 255,11}, \ + { 575,12}, { 319,11}, { 671,12}, { 383,11}, \ + { 799,12}, { 447,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 703,13}, { 383,12}, { 895,14}, \ + { 255,13}, { 511,12}, { 1151,13}, { 639,12}, \ + { 1343,13}, { 767,12}, { 1599,13}, { 895,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 98 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 530 /* k = 5 */ +#define 
SQR_FFT_TABLE3 \ + { { 530, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 28, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 21, 6}, \ + { 43, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 49, 8}, { 27, 7}, { 55, 8}, \ + { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \ + { 55, 9}, { 31, 8}, { 71, 9}, { 39, 8}, \ + { 83, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 103,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 167,10}, { 95, 9}, { 191,10}, { 111,11}, \ + { 63,10}, { 143, 9}, { 287,10}, { 159,11}, \ + { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,10}, { 287,11}, { 159,10}, { 351,11}, \ + { 191,10}, { 415,11}, { 223,12}, { 127,11}, \ + { 255,10}, { 543,11}, { 287,10}, { 607,11}, \ + { 319,10}, { 639,11}, { 351,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,13}, { 127,12}, \ + { 255,11}, { 607,12}, { 319,11}, { 703,12}, \ + { 383,11}, { 799,12}, { 447,11}, { 895,13}, \ + { 255,12}, { 511,11}, { 1023,12}, { 703,13}, \ + { 383,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1151,13}, { 639,12}, { 1343,13}, { 767,12}, \ + { 1599,13}, { 895,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 104 +#define SQR_FFT_THRESHOLD 4416 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 51 +#define MULLO_MUL_N_THRESHOLD 11278 +#define SQRLO_BASECASE_THRESHOLD 10 +#define SQRLO_DC_THRESHOLD 55 +#define SQRLO_SQR_THRESHOLD 8648 + +#define DC_DIV_QR_THRESHOLD 36 +#define DC_DIVAPPR_Q_THRESHOLD 146 +#define DC_BDIV_QR_THRESHOLD 46 +#define DC_BDIV_Q_THRESHOLD 160 + +#define INV_MULMOD_BNM1_THRESHOLD 74 +#define INV_NEWTON_THRESHOLD 145 +#define INV_APPR_THRESHOLD 147 + +#define BINV_NEWTON_THRESHOLD 372 +#define REDC_1_TO_REDC_2_THRESHOLD 6 +#define REDC_2_TO_REDC_N_THRESHOLD 140 + +#define MU_DIV_QR_THRESHOLD 2801 +#define MU_DIVAPPR_Q_THRESHOLD 2801 +#define MUPI_DIV_QR_THRESHOLD 79 +#define MU_BDIV_QR_THRESHOLD 2541 +#define MU_BDIV_Q_THRESHOLD 2764 + +#define POWM_SEC_TABLE 3,20,139,734 + +#define GET_STR_DC_THRESHOLD 27 +#define GET_STR_PRECOMPUTE_THRESHOLD 45 +#define SET_STR_DC_THRESHOLD 342 +#define SET_STR_PRECOMPUTE_THRESHOLD 1290 + +#define FAC_DSC_THRESHOLD 390 +#define FAC_ODD_THRESHOLD 438 + +#define MATRIX22_STRASSEN_THRESHOLD 25 +#define HGCD2_DIV1_METHOD 5 /* 1.32% faster than 3 */ +#define HGCD_THRESHOLD 82 +#define HGCD_APPR_THRESHOLD 81 +#define HGCD_REDUCE_THRESHOLD 4633 +#define GCD_DC_THRESHOLD 345 +#define GCDEXT_DC_THRESHOLD 268 +#define JACOBI_BASE_METHOD 1 /* 3.30% faster than 2 */ + +/* Tuneup completed successfully, took 45018 seconds */ diff --git a/gmp-6.3.0/mpn/arm/v6/mode1o.asm b/gmp-6.3.0/mpn/arm/v6/mode1o.asm new file mode 100644 index 0000000..a2f77a6 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6/mode1o.asm @@ -0,0 +1,95 @@ +dnl ARM v6 mpn_modexact_1c_odd + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 9 +C Cortex-A15 7 + +C Architecture requirements: +C v5 - +C v5t - +C v5te smulbb +C v6 umaal +C v6t2 - +C v7a - + +define(`up', `r0') +define(`n', `r1') +define(`d', `r2') +define(`cy', `r3') + + .protected binvert_limb_table +ASM_START() +PROLOGUE(mpn_modexact_1c_odd) + stmfd sp!, {r4, r5, r6, r7} + + LEA( r4, binvert_limb_table) + + ldr r6, [up], #4 C up[0] + + and r12, d, #254 + ldrb r4, [r4, r12, lsr #1] + smulbb r12, r4, r4 + mul r12, d, r12 + rsb r12, r12, r4, asl #1 + mul r4, r12, r12 + mul r4, d, r4 + rsb r4, r4, r12, asl #1 C r4 = inverse + + subs n, n, #1 + sub r6, r6, cy + mul r6, r6, r4 + beq L(end) + + rsb r5, r4, #0 C r5 = -inverse + +L(top): ldr r7, [up], #4 + mov r12, #0 + umaal r12, cy, r6, d + mul r6, r7, r4 + mla r6, cy, r5, r6 + subs n, n, #1 + bne L(top) + +L(end): mov r12, #0 + umaal r12, cy, r6, d + mov r0, cy + + ldmfd sp!, {r4, r5, r6, r7} + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6/mul_1.asm b/gmp-6.3.0/mpn/arm/v6/mul_1.asm new file mode 100644 index 0000000..3c6ef99 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6/mul_1.asm @@ -0,0 +1,115 @@ +dnl ARM mpn_mul_1. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM: - +C XScale - +C ARM11 6.4 +C Cortex-A7 5.25 +C Cortex-A8 7 +C Cortex-A9 3.25 +C Cortex-A15 4 + +C TODO +C * Micro-optimise feed-in code. +C * Optimise for n=1,2 by delaying register saving. +C * Try using ldm/stm. 
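+C
+C mpn_mul_1(rp, up, n, v0) computes {rp,n} = {up,n} * v0 and returns the
+C high limb.  The structure mirrors addmul_1, except that each umaal
+C addend register is zeroed with "mov #0" instead of being loaded from
+C rp[].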
+ +define(`rp',`r0') +define(`up',`r1') +define(`n', `r2') +define(`v0',`r3') + +ASM_START() +PROLOGUE(mpn_mul_1) + stmfd sp!, { r4, r5, r6, r7 } + + ands r6, n, #3 + mov r12, #0 + beq L(fi0) + cmp r6, #2 + bcc L(fi1) + beq L(fi2) + +L(fi3): ldr r4, [up], #4 + mov r6, #0 + ldr r5, [up], #4 + b L(lo3) + +L(fi0): ldr r5, [up], #4 + add rp, rp, #4 + mov r7, #0 + ldr r4, [up], #4 + b L(lo0) + +L(fi1): ldr r4, [up], #4 + mov r6, #0 + add rp, rp, #8 + subs n, n, #1 + beq L(1) + ldr r5, [up], #4 + b L(lo1) + +L(fi2): ldr r5, [up], #4 + add rp, rp, #12 + mov r7, #0 + ldr r4, [up], #4 + b L(lo2) + + ALIGN(16) +L(top): mov r6, #0 + ldr r5, [up], #4 + str r7, [rp, #-12] +L(lo1): umaal r6, r12, r4, v0 + mov r7, #0 + ldr r4, [up], #4 + str r6, [rp, #-8] +L(lo0): umaal r7, r12, r5, v0 + mov r6, #0 + ldr r5, [up], #4 + str r7, [rp, #-4] +L(lo3): umaal r6, r12, r4, v0 + mov r7, #0 + ldr r4, [up], #4 + str r6, [rp], #16 +L(lo2): umaal r7, r12, r5, v0 + subs n, n, #4 + bhi L(top) + + mov r6, #0 + str r7, [rp, #-12] +L(1): umaal r6, r12, r4, v0 + str r6, [rp, #-8] + mov r0, r12 + ldmfd sp!, { r4, r5, r6, r7 } + bx lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6/mul_2.asm b/gmp-6.3.0/mpn/arm/v6/mul_2.asm new file mode 100644 index 0000000..edd27f3 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6/mul_2.asm @@ -0,0 +1,135 @@ +dnl ARM mpn_mul_2. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM: - +C XScale - +C ARM11 5.25 +C Cortex-A5 3.63 +C Cortex-A7 3.15 +C Cortex-A8 5.0 +C Cortex-A9 2.25 +C Cortex-A15 2.5 +C Cortex-A17 2.13 +C Cortex-A53 3.5 + +C TODO +C * This is a trivial edit of the addmul_2 code. Check for simplifications, +C and possible speedups to 2.0 c/l. 
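In the same spirit, a C sketch of the mul_2 semantics (ref_mul_2 is an illustrative name, and the store/return convention is inferred from the code below, where c0/c1 play the role of cya/cyb): {up,n} is multiplied by the two-limb {vp,2}, the low n+1 limbs go to rp, and the top limb is returned.

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    static limb_t
    ref_mul_2 (limb_t *rp, const limb_t *up, size_t n, const limb_t *vp)
    {
      limb_t c0 = 0, c1 = 0;          /* carries into positions i and i+1 */
      for (size_t i = 0; i < n; i++)
        {
          uint64_t t = (uint64_t) up[i] * vp[0] + c0;
          /* u*v1 + c1 + hi(t) <= (B-1)^2 + 2(B-1) = B^2 - 1: no overflow,
             the same bound the umaal instruction is built around.  */
          uint64_t s = (uint64_t) up[i] * vp[1] + c1 + (limb_t) (t >> 32);
          rp[i] = (limb_t) t;
          c0 = (limb_t) s;
          c1 = (limb_t) (s >> 32);
        }
      rp[n] = c0;
      return c1;
    }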
+ +define(`rp',`r0') +define(`up',`r1') +define(`n', `r2') +define(`vp',`r3') + +define(`v0',`r6') +define(`v1',`r7') +define(`u0',`r3') +define(`u1',`r9') + +define(`cya',`r8') +define(`cyb',`r12') + + +ASM_START() +PROLOGUE(mpn_mul_2) + push { r4, r5, r6, r7, r8, r9 } + + ldm vp, { v0, v1 } + mov cya, #0 + mov cyb, #0 + + tst n, #1 + beq L(evn) +L(odd): mov r5, #0 + ldr u0, [up, #0] + mov r4, #0 + tst n, #2 + beq L(fi1) +L(fi3): sub up, up, #12 + sub rp, rp, #16 + b L(lo3) +L(fi1): sub n, n, #1 + sub up, up, #4 + sub rp, rp, #8 + b L(lo1) +L(evn): mov r4, #0 + ldr u1, [up, #0] + mov r5, #0 + tst n, #2 + bne L(fi2) +L(fi0): sub up, up, #8 + sub rp, rp, #12 + b L(lo0) +L(fi2): subs n, n, #2 + sub rp, rp, #4 + bls L(end) + + ALIGN(16) +L(top): ldr u0, [up, #4] + umaal r4, cya, u1, v0 + str r4, [rp, #4] + mov r4, #0 + umaal r5, cyb, u1, v1 +L(lo1): ldr u1, [up, #8] + umaal r5, cya, u0, v0 + str r5, [rp, #8] + mov r5, #0 + umaal r4, cyb, u0, v1 +L(lo0): ldr u0, [up, #12] + umaal r4, cya, u1, v0 + str r4, [rp, #12] + mov r4, #0 + umaal r5, cyb, u1, v1 +L(lo3): ldr u1, [up, #16]! + umaal r5, cya, u0, v0 + str r5, [rp, #16]! + mov r5, #0 + umaal r4, cyb, u0, v1 + subs n, n, #4 + bhi L(top) + +L(end): umaal r4, cya, u1, v0 + ldr u0, [up, #4] + umaal r5, cyb, u1, v1 + str r4, [rp, #4] + umaal r5, cya, u0, v0 + umaal cya, cyb, u0, v1 + str r5, [rp, #8] + str cya, [rp, #12] + mov r0, cyb + + pop { r4, r5, r6, r7, r8, r9 } + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6/popham.asm b/gmp-6.3.0/mpn/arm/v6/popham.asm new file mode 100644 index 0000000..c254368 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6/popham.asm @@ -0,0 +1,139 @@ +dnl ARM mpn_popcount and mpn_hamdist. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C popcount hamdist +C cycles/limb cycles/limb +C StrongARM - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? 
+C Cortex-A9 8.94 9.47 +C Cortex-A15 5.67 6.44 + +C Architecture requirements: +C v5 - +C v5t - +C v5te ldrd strd +C v6 usada8 +C v6t2 - +C v7a - + +ifdef(`OPERATION_popcount',` + define(`func',`mpn_popcount') + define(`ap', `r0') + define(`n', `r1') + define(`a0', `r2') + define(`a1', `r3') + define(`s', `r5') + define(`b_01010101', `r6') + define(`b_00110011', `r7') + define(`b_00001111', `r8') + define(`zero', `r9') + define(`POPC', `$1') + define(`HAMD', `dnl') +') +ifdef(`OPERATION_hamdist',` + define(`func',`mpn_hamdist') + define(`ap', `r0') + define(`bp', `r1') + define(`n', `r2') + define(`a0', `r6') + define(`a1', `r7') + define(`b0', `r4') + define(`b1', `r5') + define(`s', `r11') + define(`b_01010101', `r8') + define(`b_00110011', `r9') + define(`b_00001111', `r10') + define(`zero', `r3') + define(`POPC', `dnl') + define(`HAMD', `$1') +') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) + +ASM_START() +PROLOGUE(func) +POPC(` push { r4-r9 } ') +HAMD(` push { r4-r11 } ') + + ldr b_01010101, =0x55555555 + mov r12, #0 + ldr b_00110011, =0x33333333 + mov zero, #0 + ldr b_00001111, =0x0f0f0f0f + + tst n, #1 + beq L(evn) + +L(odd): ldr a1, [ap], #4 C 1 x 32 1-bit accumulators, 0-1 +HAMD(` ldr b1, [bp], #4 ') C 1 x 32 1-bit accumulators, 0-1 +HAMD(` eor a1, a1, b1 ') + and r4, b_01010101, a1, lsr #1 + sub a1, a1, r4 + and r4, a1, b_00110011 + bic r5, a1, b_00110011 + add r5, r4, r5, lsr #2 C 8 4-bit accumulators, 0-4 + subs n, n, #1 + b L(mid) + +L(evn): mov s, #0 + +L(top): ldrd a0, a1, [ap], #8 C 2 x 32 1-bit accumulators, 0-1 +HAMD(` ldrd b0, b1, [bp], #8') +HAMD(` eor a0, a0, b0 ') +HAMD(` eor a1, a1, b1 ') + subs n, n, #2 + usada8 r12, s, zero, r12 + and r4, b_01010101, a0, lsr #1 + sub a0, a0, r4 + and r4, b_01010101, a1, lsr #1 + sub a1, a1, r4 + and r4, a0, b_00110011 + bic r5, a0, b_00110011 + add a0, r4, r5, lsr #2 C 8 4-bit accumulators, 0-4 + and r4, a1, b_00110011 + bic r5, a1, b_00110011 + add a1, r4, r5, lsr #2 C 8 4-bit accumulators, 0-4 + add r5, a0, a1 C 8 4-bit accumulators, 0-8 +L(mid): and r4, r5, b_00001111 + bic r5, r5, b_00001111 + add s, r4, r5, lsr #4 C 4 8-bit accumulators + bne L(top) + + usada8 r0, s, zero, r12 +POPC(` pop { r4-r9 } ') +HAMD(` pop { r4-r11 } ') + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6/sqr_basecase.asm b/gmp-6.3.0/mpn/arm/v6/sqr_basecase.asm new file mode 100644 index 0000000..0fc4f13 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6/sqr_basecase.asm @@ -0,0 +1,544 @@ +dnl ARM v6 mpn_sqr_basecase. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013, 2015 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C Code structure: +C +C +C m_2(0m4) m_2(2m4) m_2(1m4) m_2(3m4) +C | | | | +C | | | | +C | | | | +C \|/ \|/ \|/ \|/ +C ____________ ____________ +C / \ / \ +C \|/ \ \|/ \ +C am_2(3m4) am_2(1m4) am_2(0m4) am_2(2m4) +C \ /|\ \ /|\ +C \____________/ \____________/ +C \ / +C \ / +C \ / +C cor3 cor2 +C \ / +C \ / +C sqr_diag_addlsh1 + +C TODO +C * Align more labels. +C * Further tweak counter and updates in outer loops. (This could save +C perhaps 5n cycles). +C * Avoid sub-with-lsl in outer loops. We could keep n up-shifted, then +C initialise loop counter i with a right shift. +C * Try to use fewer register. Perhaps coalesce r9 branch target and n_saved. +C (This could save 2-3 cycles for n > 4.) +C * Optimise sqr_diag_addlsh1 loop. The current code uses old-style carry +C propagation. +C * Stop loops earlier suppressing writes of upper-most rp[] values. +C * The addmul_2 loops here runs well on all cores, but mul_2 runs poorly +C particularly on Cortex-A8. + + +define(`rp', r0) +define(`up', r1) +define(`n', r2) + +define(`v0', r3) +define(`v1', r6) +define(`i', r8) +define(`n_saved', r14) +define(`cya', r11) +define(`cyb', r12) +define(`u0', r7) +define(`u1', r9) + +ASM_START() +PROLOGUE(mpn_sqr_basecase) + and r12, n, #3 + cmp n, #4 + addgt r12, r12, #4 + add pc, pc, r12, lsl #2 + nop + b L(4) + b L(1) + b L(2) + b L(3) + b L(0m4) + b L(1m4) + b L(2m4) + b L(3m4) + + +L(1m4): push {r4-r11, r14} + mov n_saved, n + sub i, n, #4 + sub n, n, #2 + add r10, pc, #L(am2_2m4)-.-8 + ldm up, {v0,v1,u0} + sub up, up, #4 + mov cyb, #0 + mov r5, #0 + umull r4, cya, v1, v0 + str r4, [rp], #-12 + mov r4, #0 + b L(ko0) + +L(3m4): push {r4-r11, r14} + mov n_saved, n + sub i, n, #4 + sub n, n, #2 + add r10, pc, #L(am2_0m4)-.-8 + ldm up, {v0,v1,u0} + add up, up, #4 + mov cyb, #0 + mov r5, #0 + umull r4, cya, v1, v0 + str r4, [rp], #-4 + mov r4, #0 + b L(ko2) + +L(2m4): push {r4-r11, r14} + mov n_saved, n + sub i, n, #4 + sub n, n, #2 + add r10, pc, #L(am2_3m4)-.-8 + ldm up, {v0,v1,u1} + mov cyb, #0 + mov r4, #0 + umull r5, cya, v1, v0 + str r5, [rp], #-8 + mov r5, #0 + b L(ko1) + +L(0m4): push {r4-r11, r14} + mov n_saved, n + sub i, n, #4 + sub n, n, #2 + add r10, pc, #L(am2_1m4)-.-8 + ldm up, {v0,v1,u1} + mov cyb, #0 + mov r4, #0 + add up, up, #8 + umull r5, cya, v1, v0 + str r5, [rp, #0] + mov r5, #0 + +L(top): ldr u0, [up, #4] + umaal r4, cya, u1, v0 + str r4, [rp, #4] + mov r4, #0 + umaal r5, cyb, u1, v1 +L(ko2): ldr u1, [up, #8] + umaal r5, cya, u0, v0 + str r5, [rp, #8] + mov r5, #0 + umaal r4, cyb, u0, v1 +L(ko1): ldr u0, [up, #12] + umaal r4, cya, u1, v0 + str r4, [rp, #12] + mov r4, #0 + umaal r5, cyb, u1, v1 +L(ko0): ldr u1, [up, #16]! + umaal r5, cya, u0, v0 + str r5, [rp, #16]! 
+ mov r5, #0 + umaal r4, cyb, u0, v1 + subs i, i, #4 + bhi L(top) + + umaal r4, cya, u1, v0 + ldr u0, [up, #4] + umaal r5, cyb, u1, v1 + str r4, [rp, #4] + umaal r5, cya, u0, v0 + umaal cya, cyb, u0, v1 + str r5, [rp, #8] + str cya, [rp, #12] + str cyb, [rp, #16] + + add up, up, #4 + sub n, n, #1 + add rp, rp, #8 + bx r10 + +L(evnloop): + subs i, n, #6 + sub n, n, #2 + blt L(cor2) + ldm up, {v0,v1,u1} + add up, up, #8 + mov cya, #0 + mov cyb, #0 + ldr r4, [rp, #-4] + umaal r4, cya, v1, v0 + str r4, [rp, #-4] + ldr r4, [rp, #0] + + ALIGN(16) +L(ua2): ldr r5, [rp, #4] + umaal r4, cya, u1, v0 + ldr u0, [up, #4] + umaal r5, cyb, u1, v1 + str r4, [rp, #0] + ldr r4, [rp, #8] + umaal r5, cya, u0, v0 + ldr u1, [up, #8] + umaal r4, cyb, u0, v1 + str r5, [rp, #4] + ldr r5, [rp, #12] + umaal r4, cya, u1, v0 + ldr u0, [up, #12] + umaal r5, cyb, u1, v1 + str r4, [rp, #8] + ldr r4, [rp, #16]! + umaal r5, cya, u0, v0 + ldr u1, [up, #16]! + umaal r4, cyb, u0, v1 + str r5, [rp, #-4] + subs i, i, #4 + bhs L(ua2) + + umaal r4, cya, u1, v0 + umaal cya, cyb, u1, v1 + str r4, [rp, #0] + str cya, [rp, #4] + str cyb, [rp, #8] +L(am2_0m4): + sub rp, rp, n, lsl #2 + sub up, up, n, lsl #2 + add rp, rp, #8 + + sub i, n, #4 + sub n, n, #2 + ldm up, {v0,v1,u1} + mov cya, #0 + mov cyb, #0 + ldr r4, [rp, #4] + umaal r4, cya, v1, v0 + str r4, [rp, #4] + ldr r4, [rp, #8] + b L(lo0) + + ALIGN(16) +L(ua0): ldr r5, [rp, #4] + umaal r4, cya, u1, v0 + ldr u0, [up, #4] + umaal r5, cyb, u1, v1 + str r4, [rp, #0] + ldr r4, [rp, #8] + umaal r5, cya, u0, v0 + ldr u1, [up, #8] + umaal r4, cyb, u0, v1 + str r5, [rp, #4] +L(lo0): ldr r5, [rp, #12] + umaal r4, cya, u1, v0 + ldr u0, [up, #12] + umaal r5, cyb, u1, v1 + str r4, [rp, #8] + ldr r4, [rp, #16]! + umaal r5, cya, u0, v0 + ldr u1, [up, #16]! + umaal r4, cyb, u0, v1 + str r5, [rp, #-4] + subs i, i, #4 + bhs L(ua0) + + umaal r4, cya, u1, v0 + umaal cya, cyb, u1, v1 + str r4, [rp, #0] + str cya, [rp, #4] + str cyb, [rp, #8] +L(am2_2m4): + sub rp, rp, n, lsl #2 + sub up, up, n, lsl #2 + add rp, rp, #16 + b L(evnloop) + + +L(oddloop): + sub i, n, #5 + sub n, n, #2 + ldm up, {v0,v1,u0} + mov cya, #0 + mov cyb, #0 + ldr r5, [rp, #0] + umaal r5, cya, v1, v0 + str r5, [rp, #0] + ldr r5, [rp, #4] + add up, up, #4 + b L(lo1) + + ALIGN(16) +L(ua1): ldr r5, [rp, #4] + umaal r4, cya, u1, v0 + ldr u0, [up, #4] + umaal r5, cyb, u1, v1 + str r4, [rp, #0] +L(lo1): ldr r4, [rp, #8] + umaal r5, cya, u0, v0 + ldr u1, [up, #8] + umaal r4, cyb, u0, v1 + str r5, [rp, #4] + ldr r5, [rp, #12] + umaal r4, cya, u1, v0 + ldr u0, [up, #12] + umaal r5, cyb, u1, v1 + str r4, [rp, #8] + ldr r4, [rp, #16]! + umaal r5, cya, u0, v0 + ldr u1, [up, #16]! + umaal r4, cyb, u0, v1 + str r5, [rp, #-4] + subs i, i, #4 + bhs L(ua1) + + umaal r4, cya, u1, v0 + umaal cya, cyb, u1, v1 + str r4, [rp, #0] + str cya, [rp, #4] + str cyb, [rp, #8] +L(am2_3m4): + sub rp, rp, n, lsl #2 + sub up, up, n, lsl #2 + add rp, rp, #4 + + subs i, n, #3 + beq L(cor3) + sub n, n, #2 + ldm up, {v0,v1,u0} + mov cya, #0 + mov cyb, #0 + ldr r5, [rp, #8] + sub up, up, #4 + umaal r5, cya, v1, v0 + str r5, [rp, #8] + ldr r5, [rp, #12] + b L(lo3) + + ALIGN(16) +L(ua3): ldr r5, [rp, #4] + umaal r4, cya, u1, v0 + ldr u0, [up, #4] + umaal r5, cyb, u1, v1 + str r4, [rp, #0] + ldr r4, [rp, #8] + umaal r5, cya, u0, v0 + ldr u1, [up, #8] + umaal r4, cyb, u0, v1 + str r5, [rp, #4] + ldr r5, [rp, #12] + umaal r4, cya, u1, v0 + ldr u0, [up, #12] + umaal r5, cyb, u1, v1 + str r4, [rp, #8] +L(lo3): ldr r4, [rp, #16]! + umaal r5, cya, u0, v0 + ldr u1, [up, #16]! 
+ umaal r4, cyb, u0, v1 + str r5, [rp, #-4] + subs i, i, #4 + bhs L(ua3) + + umaal r4, cya, u1, v0 + umaal cya, cyb, u1, v1 + str r4, [rp, #0] + str cya, [rp, #4] + str cyb, [rp, #8] +L(am2_1m4): + sub rp, rp, n, lsl #2 + sub up, up, n, lsl #2 + add rp, rp, #12 + b L(oddloop) + + +L(cor3):ldm up, {v0,v1,u0} + ldr r5, [rp, #8] + mov cya, #0 + mov cyb, #0 + umaal r5, cya, v1, v0 + str r5, [rp, #8] + ldr r5, [rp, #12] + ldr r4, [rp, #16] + umaal r5, cya, u0, v0 + ldr u1, [up, #12] + umaal r4, cyb, u0, v1 + str r5, [rp, #12] + umaal r4, cya, u1, v0 + umaal cya, cyb, u1, v1 + str r4, [rp, #16] + str cya, [rp, #20] + str cyb, [rp, #24] + add up, up, #16 + mov cya, cyb + adds rp, rp, #36 C clear cy + mov cyb, #0 + umaal cya, cyb, u1, u0 + b L(sqr_diag_addlsh1) + +L(cor2): + ldm up!, {v0,v1,u0} + mov r4, cya + mov r5, cyb + mov cya, #0 + umaal r4, cya, v1, v0 + mov cyb, #0 + umaal r5, cya, u0, v0 + strd r4, r5, [rp, #-4] + umaal cya, cyb, u0, v1 + add rp, rp, #16 +C b L(sqr_diag_addlsh1) + + +define(`w0', r6) +define(`w1', r7) +define(`w2', r8) +define(`rbx', r9) + +L(sqr_diag_addlsh1): + str cya, [rp, #-12] + str cyb, [rp, #-8] + sub n, n_saved, #1 + sub up, up, n_saved, lsl #2 + sub rp, rp, n_saved, lsl #3 + ldr r3, [up], #4 + umull w1, r5, r3, r3 + mov w2, #0 + mov r10, #0 +C cmn r0, #0 C clear cy (already clear) + b L(lm) + +L(tsd): adds w0, w0, rbx + adcs w1, w1, r4 + str w0, [rp, #0] +L(lm): ldr w0, [rp, #4] + str w1, [rp, #4] + ldr w1, [rp, #8]! + add rbx, r5, w2 + adcs w0, w0, w0 + ldr r3, [up], #4 + adcs w1, w1, w1 + adc w2, r10, r10 + umull r4, r5, r3, r3 + subs n, n, #1 + bne L(tsd) + + adds w0, w0, rbx + adcs w1, w1, r4 + adc w2, r5, w2 + stm rp, {w0,w1,w2} + + pop {r4-r11, pc} + + +C Straight line code for n <= 4 + +L(1): ldr r3, [up, #0] + umull r1, r2, r3, r3 + stm rp, {r1,r2} + bx r14 + +L(2): push {r4-r5} + ldm up, {r5,r12} + umull r1, r2, r5, r5 + umull r3, r4, r12, r12 + umull r5, r12, r5, r12 + adds r5, r5, r5 + adcs r12, r12, r12 + adc r4, r4, #0 + adds r2, r2, r5 + adcs r3, r3, r12 + adc r4, r4, #0 + stm rp, {r1,r2,r3,r4} + pop {r4-r5} + bx r14 + +L(3): push {r4-r11} + ldm up, {r7,r8,r9} + umull r1, r2, r7, r7 + umull r3, r4, r8, r8 + umull r5, r6, r9, r9 + umull r10, r11, r7, r8 + mov r12, #0 + umlal r11, r12, r7, r9 + mov r7, #0 + umlal r12, r7, r8, r9 + adds r10, r10, r10 + adcs r11, r11, r11 + adcs r12, r12, r12 + adcs r7, r7, r7 + adc r6, r6, #0 + adds r2, r2, r10 + adcs r3, r3, r11 + adcs r4, r4, r12 + adcs r5, r5, r7 + adc r6, r6, #0 + stm rp, {r1,r2,r3,r4,r5,r6} + pop {r4-r11} + bx r14 + +L(4): push {r4-r11, r14} + ldm up, {r9,r10,r11,r12} + umull r1, r2, r9, r9 + umull r3, r4, r10, r10 + umull r5, r6, r11, r11 + umull r7, r8, r12, r12 + stm rp, {r1,r2,r3,r4,r5,r6,r7} + umull r1, r2, r9, r10 + mov r3, #0 + umlal r2, r3, r9, r11 + mov r4, #0 + umlal r3, r4, r9, r12 + mov r5, #0 + umlal r3, r5, r10, r11 + umaal r4, r5, r10, r12 + mov r6, #0 + umlal r5, r6, r11, r12 + adds r1, r1, r1 + adcs r2, r2, r2 + adcs r3, r3, r3 + adcs r4, r4, r4 + adcs r5, r5, r5 + adcs r6, r6, r6 + add rp, rp, #4 + adc r7, r8, #0 + ldm rp, {r8,r9,r10,r11,r12,r14} + adds r1, r1, r8 + adcs r2, r2, r9 + adcs r3, r3, r10 + adcs r4, r4, r11 + adcs r5, r5, r12 + adcs r6, r6, r14 + adc r7, r7, #0 + stm rp, {r1,r2,r3,r4,r5,r6,r7} + pop {r4-r11, pc} +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6/submul_1.asm b/gmp-6.3.0/mpn/arm/v6/submul_1.asm new file mode 100644 index 0000000..8a21733 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6/submul_1.asm @@ -0,0 +1,125 @@ +dnl ARM mpn_submul_1. 
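The header comment below states the on-the-fly complement identity without derivation, so spelled out: U' = B^n - 1 - U complements each limb individually, since B^n - 1 has every limb equal to B - 1, i.e. u'[i] = ~u[i] (the mvn instructions in the loop). Multiplying out,

    U'*v = (B^n - 1 - U)*v = B^n*v - v - U*v

and therefore

    R + U'*v + v - B^n*v = R - U*v.

In the code, the +v term enters as the initial carry (mov r12, v0 at entry), and the -B^n*v term is settled at the end, where the returned borrow is computed as sub r0, v0, r12 rather than r12 itself.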
+ +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM: - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 3.75 +C Cortex-A15 4.0 + +C This loop complements U on the fly, +C U' = B^n - 1 - U +C and then uses that +C R - U*v = R + U'*v + v - B^n v + +C TODO +C * Micro-optimise feed-in code. +C * Optimise for n=1,2 by delaying register saving. +C * Try using ldm/stm. + +define(`rp',`r0') +define(`up',`r1') +define(`n', `r2') +define(`v0',`r3') + +ASM_START() +PROLOGUE(mpn_submul_1) + stmfd sp!, { r4, r5, r6, r7 } + + ands r6, n, #3 + mov r12, v0 + beq L(fi0) + cmp r6, #2 + bcc L(fi1) + beq L(fi2) + +L(fi3): ldr r4, [up], #12 + mvn r4, r4 + ldr r6, [rp, #0] + ldr r5, [up, #-8] + b L(lo3) + +L(fi0): ldr r5, [up], #16 + mvn r5, r5 + ldr r7, [rp], #4 + ldr r4, [up, #-12] + b L(lo0) + +L(fi1): ldr r4, [up], #4 + mvn r4, r4 + ldr r6, [rp], #8 + subs n, n, #1 + beq L(1) + ldr r5, [up] + b L(lo1) + +L(fi2): ldr r5, [up], #8 + mvn r5, r5 + ldr r7, [rp], #12 + ldr r4, [up, #-4] + b L(lo2) + + ALIGN(16) +L(top): ldr r6, [rp, #-8] + ldr r5, [up] + str r7, [rp, #-12] +L(lo1): umaal r6, r12, r4, v0 + add up, up, #16 + mvn r5, r5 + ldr r7, [rp, #-4] + ldr r4, [up, #-12] + str r6, [rp, #-8] +L(lo0): umaal r7, r12, r5, v0 + mvn r4, r4 + ldr r6, [rp, #0] + ldr r5, [up, #-8] + str r7, [rp, #-4] +L(lo3): umaal r6, r12, r4, v0 + mvn r5, r5 + ldr r7, [rp, #4] + ldr r4, [up, #-4] + str r6, [rp], #16 +L(lo2): umaal r7, r12, r5, v0 + mvn r4, r4 + subs n, n, #4 + bhi L(top) + + ldr r6, [rp, #-8] + str r7, [rp, #-12] +L(1): umaal r6, r12, r4, v0 + str r6, [rp, #-8] + sub r0, v0, r12 + ldmfd sp!, { r4, r5, r6, r7 } + bx lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6t2/divrem_1.asm b/gmp-6.3.0/mpn/arm/v6t2/divrem_1.asm new file mode 100644 index 0000000..be24615 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6t2/divrem_1.asm @@ -0,0 +1,212 @@ +dnl ARM v6t2 mpn_divrem_1 and mpn_preinv_divrem_1. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C norm unorm frac +C StrongARM - - - +C XScale - - - +C Cortex-A7 ? ? ? +C Cortex-A8 ? ? ? +C Cortex-A9 13 14 13 +C Cortex-A15 11.4 11.8 11.1 + +C TODO +C * Optimise inner-loops better, they could likely run a cycle or two faster. +C * Decrease register usage, streamline non-loop code. + +define(`qp_arg', `r0') +define(`fn', `r1') +define(`up_arg', `r2') +define(`n_arg', `r3') +define(`d_arg', `0') +define(`dinv_arg',`4') +define(`cnt_arg', `8') + +define(`n', `r9') +define(`qp', `r5') +define(`up', `r6') +define(`cnt', `r7') +define(`tnc', `r10') +define(`dinv', `r0') +define(`d', `r4') + +ASM_START() +PROLOGUE(mpn_preinv_divrem_1) + stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, lr} + ldr d, [sp, #9*4+d_arg] + ldr cnt, [sp, #9*4+cnt_arg] + str r1, [sp, #9*4+d_arg] C reuse d stack slot for fn + sub n, r3, #1 + add r3, r1, n + cmp d, #0 + add qp, qp_arg, r3, lsl #2 C put qp at Q[] end + add up, up_arg, n, lsl #2 C put up at U[] end + ldr dinv, [sp, #9*4+dinv_arg] + blt L(nent) + b L(uent) +EPILOGUE() + +PROLOGUE(mpn_divrem_1) + stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub n, r3, #1 + ldr d, [sp, #9*4+d_arg] C d + str r1, [sp, #9*4+d_arg] C reuse d stack slot for fn + add r3, r1, n + cmp d, #0 + add qp, qp_arg, r3, lsl #2 C put qp at Q[] end + add up, up_arg, n, lsl #2 C put up at U[] end + blt L(normalised) + +L(unnorm): + clz cnt, d + mov r0, d, lsl cnt C pass d << cnt + bl mpn_invert_limb +L(uent): + mov d, d, lsl cnt C d <<= cnt + cmp n, #0 + mov r1, #0 C r + blt L(frac) + + ldr r11, [up, #0] + + rsb tnc, cnt, #32 + mov r1, r11, lsr tnc + mov r11, r11, lsl cnt + beq L(uend) + + ldr r3, [up, #-4]! + orr r2, r11, r3, lsr tnc + b L(mid) + +L(utop): + mls r1, d, r8, r11 + mov r11, r3, lsl cnt + ldr r3, [up, #-4]! 
+ cmp r1, r2 + addhi r1, r1, d + subhi r8, r8, #1 + orr r2, r11, r3, lsr tnc + cmp r1, d + bcs L(ufx) +L(uok): str r8, [qp], #-4 +L(mid): add r8, r1, #1 + mov r11, r2 + umlal r2, r8, r1, dinv + subs n, n, #1 + bne L(utop) + + mls r1, d, r8, r11 + mov r11, r3, lsl cnt + cmp r1, r2 + addhi r1, r1, d + subhi r8, r8, #1 + cmp r1, d + rsbcs r1, d, r1 + addcs r8, r8, #1 + str r8, [qp], #-4 + +L(uend):add r8, r1, #1 + mov r2, r11 + umlal r2, r8, r1, dinv + mls r1, d, r8, r11 + cmp r1, r2 + addhi r1, r1, d + subhi r8, r8, #1 + cmp r1, d + rsbcs r1, d, r1 + addcs r8, r8, #1 + str r8, [qp], #-4 +L(frac): + ldr r2, [sp, #9*4+d_arg] C fn + cmp r2, #0 + beq L(fend) + +L(ftop):mov r6, #0 + add r3, r1, #1 + umlal r6, r3, r1, dinv + mov r8, #0 + mls r1, d, r3, r8 + cmp r1, r6 + addhi r1, r1, d + subhi r3, r3, #1 + subs r2, r2, #1 + str r3, [qp], #-4 + bne L(ftop) + +L(fend):mov r11, r1, lsr cnt +L(rtn): mov r0, r11 + ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, pc} + +L(normalised): + mov r0, d + bl mpn_invert_limb +L(nent): + cmp n, #0 + mov r11, #0 C r + blt L(nend) + + ldr r11, [up, #0] + cmp r11, d + movlo r2, #0 C hi q limb + movhs r2, #1 C hi q limb + subhs r11, r11, d + + str r2, [qp], #-4 + cmp n, #0 + beq L(nend) + +L(ntop):ldr r1, [up, #-4]! + add r12, r11, #1 + umlal r1, r12, r11, dinv + ldr r3, [up, #0] + mls r11, d, r12, r3 + cmp r11, r1 + addhi r11, r11, d + subhi r12, r12, #1 + cmp d, r11 + bls L(nfx) +L(nok): str r12, [qp], #-4 + subs n, n, #1 + bne L(ntop) + +L(nend):mov r1, r11 C r + mov cnt, #0 C shift cnt + b L(frac) + +L(nfx): add r12, r12, #1 + rsb r11, d, r11 + b L(nok) +L(ufx): rsb r1, d, r1 + add r8, r8, #1 + b L(uok) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6t2/gcd_11.asm b/gmp-6.3.0/mpn/arm/v6t2/gcd_11.asm new file mode 100644 index 0000000..8a38351 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6t2/gcd_11.asm @@ -0,0 +1,65 @@ +dnl ARM v6t2 mpn_gcd_11. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2019 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
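The routine that follows is the subtraction-based binary GCD: both operands are odd (the mpn gcd framework arranges this), so u - v is always even, and shifting out its trailing zeros keeps both operands odd while preserving the gcd. A minimal C model, where ref_gcd_11 is an illustrative name and __builtin_ctz stands in for the rbit/clz pair:

    #include <stdint.h>

    static uint32_t
    ref_gcd_11 (uint32_t u, uint32_t v)      /* u, v odd and nonzero */
    {
      while (u != v)
        {
          uint32_t t = u > v ? u - v : v - u;  /* |u-v|, always even */
          if (u > v)
            u = v;                             /* u = min(u,v): the movcs */
          v = t >> __builtin_ctz (t);          /* strip 2s: rbit/clz/lsr */
        }
      return u;
    }

Each iteration shifts away at least one bit, so the running time scales with the operand bit count, which is why the table above is quoted in cycles per bit rather than cycles per limb.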
+ +include(`../config.m4') + +C cycles/bit (approx) +C StrongARM - +C XScale - +C Cortex-A5 5.2 +C Cortex-A7 5.04 +C Cortex-A8 3.59 +C Cortex-A9 9.5 +C Cortex-A15 3.2 +C Cortex-A17 5.25 +C Cortex-A53 3.57 + +define(`u0', `r0') +define(`v0', `r1') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_11) + subs r3, u0, v0 C 0 + beq L(end) C + + ALIGN(16) +L(top): rbit r12, r3 C 1,5 + clz r12, r12 C 2 + rsbcc r3, r3, #0 C v = abs(u-v), even 1 + movcs u0, v0 C u = min(u,v) 1 + lsr v0, r3, r12 C 3 + subs r3, u0, v0 C 4 + bne L(top) C + +L(end): bx lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6t2/gcd_22.asm b/gmp-6.3.0/mpn/arm/v6t2/gcd_22.asm new file mode 100644 index 0000000..3b23808 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6t2/gcd_22.asm @@ -0,0 +1,113 @@ +dnl ARM v6t2 mpn_gcd_22. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C StrongARM - +C XScale - +C Cortex-A5 10.1 +C Cortex-A7 9.1 +C Cortex-A8 6.3 +C Cortex-A9 ? +C Cortex-A12 7.7 +C Cortex-A15 5.7 +C Cortex-A17 ? +C Cortex-A53 7.0 + + +define(`gp', `r0') + +define(`u1', `r1') +define(`u0', `r2') +define(`v1', `r3') +define(`v0', `r4') + +define(`t0', `r5') +define(`t1', `r6') +define(`cnt', `r7') + +ASM_START() +PROLOGUE(mpn_gcd_22) + push { r4-r7 } + + ldr v0, [sp,#16] C + +L(top): subs t0, u0, v0 C 0 7 + beq L(lowz) + sbcs t1, u1, v1 C 1 8 + + rbit cnt, t0 C 1 + + negcc t0, t0 + mvncc t1, t1 +L(bck): movcc v0, u0 + movcc v1, u1 + + clz cnt, cnt C 2 + rsb r12, cnt, #32 C 3 + + lsr u0, t0, cnt C 3 + lsl r12, t1, r12 C 4 + lsr u1, t1, cnt C 3 + orr u0, u0, r12 C 5 + + orrs r12, u1, v1 + bne L(top) + + + str r12, [gp,#4] C high result limb <= 0 + + mov r6, gp + mov r0, u0 C pass 1st argument + mov r1, v0 C pass 2nd argument + mov r7, r14 C preserve link register + bl mpn_gcd_11 + str r0, [r6,#0] + mov r14, r7 + pop { r4-r7 } + bx r14 + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + subs t0, u1, v1 + beq L(end) + mov t1, #0 + rbit cnt, t0 C 1 + negcc t0, t0 + b L(bck) + +L(end): str v0, [gp,#0] + str v1, [gp,#4] + pop { r4-r7 } + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/addmul_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/addmul_1.asm new file mode 100644 index 0000000..c2277b3 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/addmul_1.asm @@ -0,0 +1,145 @@ +dnl ARM mpn_addmul_1 optimised for A15. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. 
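Semantics first, in minimal C (ref_addmul_1 is an illustrative name; limbs are 32 bits here):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    /* {rp,n} += {up,n} * v0, returning the carry limb out the top.  */
    static limb_t
    ref_addmul_1 (limb_t *rp, const limb_t *up, size_t n, limb_t v0)
    {
      limb_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          /* (B-1)^2 + (B-1) + (B-1) = B^2 - 1, so the product plus both
             addends always fits in exactly two limbs.  */
          uint64_t t = (uint64_t) up[i] * v0 + rp[i] + cy;
          rp[i] = (limb_t) t;
          cy = (limb_t) (t >> 32);
        }
      return cy;
    }

As the comments below explain, this A15 version folds the rp[] data in with umlal plus an adcs chain, keeping the carry recurrency off the multiply pipeline.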
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb best +C StrongARM: - +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 6 3.25 +C Cortex-A15 2 this + +C This code uses umlal for adding in the rp[] data, keeping the recurrency path +C separate from any multiply instructions. It performs well on A15, at umlal's +C bandwidth. +C +C An A9 variant should perhaps stick to 3-way unrolling, and use ldm and stm +C for all loads and stores. Alternatively, it could do 2-way or 4-way, but +C then alignment aware code will be necessary (adding O(1) bookkeeping +C overhead). +C +C We don't use r12 due to ldrd and strd limitations. + +C Architecture requirements: +C v5 - +C v5t - +C v5te ldrd strd +C v6 - +C v6t2 - +C v7a - + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`v0', `r3') + +define(`w0', `r10') define(`w1', `r11') +define(`u0', `r8') define(`u1', `r9') + +ASM_START() +PROLOGUE(mpn_addmul_1) + push { r4-r11 } + + ands r6, n, #3 + sub n, n, #3 + beq L(b00) + cmp r6, #2 + bcc L(b01) + beq L(b10) + +L(b11): mov r6, #0 + cmn r13, #0 C carry clear + ldr u1, [up], #-4 + ldr w1, [rp], #-4 + mov r7, #0 + b L(mid) + +L(b00): ldrd u0, u1, [up] + ldrd w0, w1, [rp] + mov r6, #0 + umlal w0, r6, u0, v0 + cmn r13, #0 C carry clear + mov r7, #0 + str w0, [rp] + b L(mid) + +L(b10): ldrd u0, u1, [up], #8 + ldrd w0, w1, [rp] + mov r4, #0 + umlal w0, r4, u0, v0 + cmn r13, #0 C carry clear + mov r5, #0 + str w0, [rp], #8 + umlal w1, r5, u1, v0 + tst n, n + bmi L(end) + b L(top) + +L(b01): mov r4, #0 + ldr u1, [up], #4 + ldr w1, [rp], #4 + mov r5, #0 + umlal w1, r5, u1, v0 + tst n, n + bmi L(end) + + ALIGN(16) +L(top): ldrd u0, u1, [up, #0] + adcs r4, r4, w1 + ldrd w0, w1, [rp, #0] + mov r6, #0 + umlal w0, r6, u0, v0 C 1 2 + adcs r5, r5, w0 + mov r7, #0 + strd r4, r5, [rp, #-4] +L(mid): umlal w1, r7, u1, v0 C 2 3 + ldrd u0, u1, [up, #8] + adcs r6, r6, w1 + ldrd w0, w1, [rp, #8] + mov r4, #0 + umlal w0, r4, u0, v0 C 3 4 + adcs r7, r7, w0 + mov r5, #0 + strd r6, r7, [rp, #4] + umlal w1, r5, u1, v0 C 0 1 + sub n, n, #4 + add up, up, #16 + add rp, rp, #16 + tst n, n + bpl L(top) + +L(end): adcs r4, r4, w1 + str r4, [rp, #-4] + adc r0, r5, #0 + pop { r4-r11 } + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/aors_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/aors_n.asm new file mode 100644 index 0000000..dc3f839 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/aors_n.asm @@ -0,0 +1,162 @@ +dnl ARM mpn_add_n/mpn_sub_n optimised for A15. 
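This file supplies both mpn_add_n and mpn_sub_n from one source. The add flavour in minimal C (illustrative name; the assembly retires four limbs per iteration using ldrd/strd pairs):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    /* {rp,n} = {up,n} + {vp,n}, returning the carry out (0 or 1).  */
    static limb_t
    ref_add_n (limb_t *rp, const limb_t *up, const limb_t *vp, size_t n)
    {
      limb_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          uint64_t t = (uint64_t) up[i] + vp[i] + cy;
          rp[i] = (limb_t) t;
          cy = (limb_t) (t >> 32);
        }
      return cy;
    }

The subtract flavour has the same shape with a borrow in place of the carry, which is what the ADDSUBC/SETCY/RETVAL m4 macros further down abstract over.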
+ +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb best +C StrongARM: - +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 3.55 2.5 +C Cortex-A15 1.27 this + +C This was a major improvement compared to the code we had before, but it might +C not be the best 8-way code possible. We've tried some permutations of auto- +C increments and separate pointer updates, but they all ran at the same speed +C on A15. + +C Architecture requirements: +C v5 - +C v5t - +C v5te ldrd strd +C v6 - +C v6t2 - +C v7a - + +define(`rp', `r0') +define(`up', `r1') +define(`vp', `r2') +define(`n', `r3') + +ifdef(`OPERATION_add_n', ` + define(`ADDSUBC', adcs) + define(`IFADD', `$1') + define(`SETCY', `cmp $1, #1') + define(`RETVAL', `adc r0, n, #0') + define(`RETVAL2', `adc r0, n, #1') + define(`func', mpn_add_n) + define(`func_nc', mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(`ADDSUBC', sbcs) + define(`IFADD', `') + define(`SETCY', `rsbs $1, $1, #0') + define(`RETVAL', `sbc r0, r0, r0 + and r0, r0, #1') + define(`RETVAL2', `RETVAL') + define(`func', mpn_sub_n) + define(`func_nc', mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ASM_START() +PROLOGUE(func_nc) + ldr r12, [sp] + b L(ent) +EPILOGUE() +PROLOGUE(func) + mov r12, #0 +L(ent): push { r4-r9 } + + ands r6, n, #3 + mov n, n, lsr #2 + beq L(b00) + cmp r6, #2 + bcc L(b01) + beq L(b10) + +L(b11): ldr r5, [up], #4 + ldr r7, [vp], #4 + SETCY( r12) + ADDSUBC r9, r5, r7 + ldrd r4, r5, [up, #0] + ldrd r6, r7, [vp, #0] + str r9, [rp], #-4 + b L(lo) + +L(b00): ldrd r4, r5, [up], #-8 + ldrd r6, r7, [vp], #-8 + SETCY( r12) + sub rp, rp, #16 + b L(mid) + +L(b01): ldr r5, [up], #-4 + ldr r7, [vp], #-4 + SETCY( r12) + ADDSUBC r9, r5, r7 + str r9, [rp], #-12 + tst n, n + beq L(wd1) +L(gt1): ldrd r4, r5, [up, #8] + ldrd r6, r7, [vp, #8] + b L(mid) + +L(b10): ldrd r4, r5, [up] + ldrd r6, r7, [vp] + SETCY( r12) + sub rp, rp, #8 + b L(lo) + + ALIGN(16) +L(top): ldrd r4, r5, [up, #8] + ldrd r6, r7, [vp, #8] + strd r8, r9, [rp, #8] +L(mid): ADDSUBC r8, r4, r6 + ADDSUBC r9, r5, r7 + ldrd r4, r5, [up, #16] + ldrd r6, r7, [vp, #16] + strd r8, r9, [rp, #16] + ADDSUBC r8, r4, r6 + ADDSUBC r9, r5, r7 + sub n, n, #2 + tst n, n + bmi L(dne) + ldrd r4, r5, [up, #24] + ldrd r6, r7, [vp, #24] + strd r8, r9, [rp, #24] + ADDSUBC r8, r4, r6 + ADDSUBC r9, r5, r7 + ldrd r4, r5, [up, #32]! + ldrd r6, r7, [vp, #32]! + strd r8, r9, [rp, #32]! 
+L(lo): ADDSUBC r8, r4, r6 + ADDSUBC r9, r5, r7 + tst n, n + bne L(top) + +L(end): strd r8, r9, [rp, #8] +L(wd1): RETVAL + pop { r4-r9 } + bx r14 +L(dne): strd r8, r9, [rp, #24] + RETVAL2 + pop { r4-r9 } + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/bdiv_q_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/bdiv_q_1.asm new file mode 100644 index 0000000..245b371 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/bdiv_q_1.asm @@ -0,0 +1,36 @@ +dnl ARM mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) +include_mpn(`arm/v7a/cora8/bdiv_q_1.asm') diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/cnd_aors_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/cnd_aors_n.asm new file mode 100644 index 0000000..b9e5cd3 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/cnd_aors_n.asm @@ -0,0 +1,158 @@ +dnl ARM mpn_cnd_add_n/mpn_cnd_sub_n optimised for A15. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb best +C StrongARM: - +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 3.75 3 +C Cortex-A15 1.78 this + +C This code does not run as well as one could have hoped, since 1.5 c/l seems +C realistic for this insn mix. 
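The point of cnd_add_n/cnd_sub_n is a conditional operation with a data-independent instruction and memory trace, for GMP's side-channel-silent code paths: rather than branching on cnd, the cmp/sbc pair below turns it into an all-zero or all-ones mask which bic applies to every v limb. A C sketch of the add flavour (illustrative name; note a C compiler is free to reintroduce a branch here, which the assembly deliberately avoids):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    /* rp = up + (cnd != 0 ? vp : 0), touching every limb either way.  */
    static limb_t
    ref_cnd_add_n (limb_t cnd, limb_t *rp, const limb_t *up,
                   const limb_t *vp, size_t n)
    {
      limb_t mask = (limb_t) ((cnd != 0) - 1);  /* 0, or all ones if cnd==0,
                                                   matching the cmp/sbc idiom */
      limb_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          uint64_t t = (uint64_t) up[i] + (vp[i] & ~mask) + cy;  /* the bic */
          rp[i] = (limb_t) t;
          cy = (limb_t) (t >> 32);
        }
      return cy;
    }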
+ +C Architecture requirements: +C v5 - +C v5t - +C v5te ldrd strd +C v6 - +C v6t2 - +C v7a - + +define(`cnd',`r0') +define(`rp', `r1') +define(`up', `r2') +define(`vp', `r3') +define(`n', `r12') + +ifdef(`OPERATION_cnd_add_n', ` + define(`ADDSUB', adds) + define(`ADDSUBC', adcs) + define(`IFADD', `$1') + define(`INITCY', `cmn r0, #0') + define(`RETVAL', `adc r0, n, #0') + define(`RETVAL2', `adc r0, n, #1') + define(`func', mpn_cnd_add_n) + define(`func_nc', mpn_add_nc)') +ifdef(`OPERATION_cnd_sub_n', ` + define(`ADDSUB', subs) + define(`ADDSUBC', sbcs) + define(`IFADD', `') + define(`INITCY', `cmp r0, #0') + define(`RETVAL', `sbc r0, r0, r0 + and r0, r0, #1') + define(`RETVAL2', `RETVAL') + define(`func', mpn_cnd_sub_n) + define(`func_nc', mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ASM_START() +PROLOGUE(func) + ldr n, [sp] + push { r4-r9 } + + cmp cnd, #1 + sbc cnd, cnd, cnd C conditionally set to 0xffffffff + + ands r6, n, #3 + mov n, n, lsr #2 + beq L(b00) + cmp r6, #2 + bcc L(b01) + beq L(b10) + +L(b11): ldr r5, [up], #4 + ldr r7, [vp], #4 + bic r7, r7, cnd + ADDSUB r9, r5, r7 + ldrd r4, r5, [up, #0] + ldrd r6, r7, [vp, #0] + bic r6, r6, cnd + bic r7, r7, cnd + str r9, [rp], #-4 + b L(lo) + +L(b00): ldrd r4, r5, [up], #-8 + ldrd r6, r7, [vp], #-8 + bic r6, r6, cnd + bic r7, r7, cnd + INITCY + sub rp, rp, #16 + b L(mid) + +L(b01): ldr r5, [up], #-4 + ldr r7, [vp], #-4 + bic r7, r7, cnd + ADDSUB r9, r5, r7 + str r9, [rp], #-12 + tst n, n + beq L(wd1) +L(gt1): ldrd r4, r5, [up, #8] + ldrd r6, r7, [vp, #8] + bic r6, r6, cnd + bic r7, r7, cnd + b L(mid) + +L(b10): ldrd r4, r5, [up] + ldrd r6, r7, [vp] + bic r6, r6, cnd + bic r7, r7, cnd + INITCY + sub rp, rp, #8 + b L(lo) + + ALIGN(16) +L(top): ldrd r6, r7, [vp, #8] + ldrd r4, r5, [up, #8] + bic r6, r6, cnd + bic r7, r7, cnd + strd r8, r9, [rp, #8] +L(mid): ADDSUBC r8, r4, r6 + ADDSUBC r9, r5, r7 + ldrd r6, r7, [vp, #16]! + ldrd r4, r5, [up, #16]! + bic r6, r6, cnd + bic r7, r7, cnd + sub n, n, #1 + strd r8, r9, [rp, #16]! +L(lo): ADDSUBC r8, r4, r6 + ADDSUBC r9, r5, r7 + tst n, n + bne L(top) + +L(end): strd r8, r9, [rp, #8] +L(wd1): RETVAL + pop { r4-r9 } + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/com.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/com.asm new file mode 100644 index 0000000..a258afe --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/com.asm @@ -0,0 +1,180 @@ +dnl ARM mpn_com optimised for A15. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 2.5 +C Cortex-A15 1.0 + +C This is great A15 core register code, but it is a bit large. +C We use FEEDIN_VARIANT 1 to save some space, but use 8-way unrolling. + +C Architecture requirements: +C v5 - +C v5t - +C v5te ldrd strd +C v6 - +C v6t2 - +C v7a - + +define(`FEEDIN_VARIANT', 1) C alternatives: 0 1 2 +define(`UNROLL', 4x2) C alternatives: 4 4x2 + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') + +ASM_START() +PROLOGUE(mpn_com) + push { r4-r5,r8-r9 } + +ifelse(FEEDIN_VARIANT,0,` + ands r12, n, #3 + mov n, n, lsr #2 + beq L(b00a) + tst r12, #1 + beq L(bx0) + ldr r5, [up], #4 + mvn r9, r5 + str r9, [rp], #4 + tst r12, #2 + beq L(b00) +L(bx0): ldrd r4, r5, [up, #0] + sub rp, rp, #8 + b L(lo) +L(b00): tst n, n + beq L(wd1) +L(b00a):ldrd r4, r5, [up], #-8 + sub rp, rp, #16 + b L(mid) +') +ifelse(FEEDIN_VARIANT,1,` + and r12, n, #3 + mov n, n, lsr #2 + tst r12, #1 + beq L(bx0) + ldr r5, [up], #4 + mvn r9, r5 + str r9, [rp], #4 +L(bx0): tst r12, #2 + beq L(b00) + ldrd r4, r5, [up, #0] + sub rp, rp, #8 + b L(lo) +L(b00): tst n, n + beq L(wd1) + ldrd r4, r5, [up], #-8 + sub rp, rp, #16 + b L(mid) +') +ifelse(FEEDIN_VARIANT,2,` + ands r12, n, #3 + mov n, n, lsr #2 + beq L(b00) + cmp r12, #2 + bcc L(b01) + beq L(b10) + +L(b11): ldr r5, [up], #4 + mvn r9, r5 + ldrd r4, r5, [up, #0] + str r9, [rp], #-4 + b L(lo) + +L(b00): ldrd r4, r5, [up], #-8 + sub rp, rp, #16 + b L(mid) + +L(b01): ldr r5, [up], #-4 + mvn r9, r5 + str r9, [rp], #-12 + tst n, n + beq L(wd1) +L(gt1): ldrd r4, r5, [up, #8] + b L(mid) + +L(b10): ldrd r4, r5, [up] + sub rp, rp, #8 + b L(lo) +') + ALIGN(16) +ifelse(UNROLL,4,` +L(top): ldrd r4, r5, [up, #8] + strd r8, r9, [rp, #8] +L(mid): mvn r8, r4 + mvn r9, r5 + ldrd r4, r5, [up, #16]! + strd r8, r9, [rp, #16]! + sub n, n, #1 +L(lo): mvn r8, r4 + mvn r9, r5 + tst n, n + bne L(top) +') +ifelse(UNROLL,4x2,` +L(top): ldrd r4, r5, [up, #8] + strd r8, r9, [rp, #8] +L(mid): mvn r8, r4 + mvn r9, r5 + ldrd r4, r5, [up, #16] + strd r8, r9, [rp, #16] + mvn r8, r4 + mvn r9, r5 + sub n, n, #2 + tst n, n + bmi L(dne) + ldrd r4, r5, [up, #24] + strd r8, r9, [rp, #24] + mvn r8, r4 + mvn r9, r5 + ldrd r4, r5, [up, #32]! + strd r8, r9, [rp, #32]! +L(lo): mvn r8, r4 + mvn r9, r5 + tst n, n + bne L(top) +') + +L(end): strd r8, r9, [rp, #8] +L(wd1): pop { r4-r5,r8-r9 } + bx r14 +ifelse(UNROLL,4x2,` +L(dne): strd r8, r9, [rp, #24] + pop { r4-r5,r8-r9 } + bx r14 +') +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/gmp-mparam.h b/gmp-6.3.0/mpn/arm/v7a/cora15/gmp-mparam.h new file mode 100644 index 0000000..409cbbb --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/gmp-mparam.h @@ -0,0 +1,212 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 2000 MHz Cortex-A15 with Neon (in spite of file position) */ +/* FFT tuning limit = 50,736,668 */ +/* Generated by tuneup.c, 2019-10-22, gcc 5.4 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 49.14% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 17 + +#define DIV_1_VS_MUL_1_PERCENT 267 + +#define MUL_TOOM22_THRESHOLD 28 +#define MUL_TOOM33_THRESHOLD 114 +#define MUL_TOOM44_THRESHOLD 178 +#define MUL_TOOM6H_THRESHOLD 238 +#define MUL_TOOM8H_THRESHOLD 597 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 113 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 115 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 115 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 115 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 154 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 38 +#define SQR_TOOM3_THRESHOLD 126 +#define SQR_TOOM4_THRESHOLD 336 +#define SQR_TOOM6_THRESHOLD 446 +#define SQR_TOOM8_THRESHOLD 650 + +#define MULMID_TOOM42_THRESHOLD 52 + +#define MULMOD_BNM1_THRESHOLD 23 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 575 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 575, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 19, 6}, \ + { 39, 7}, { 25, 6}, { 51, 7}, { 27, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 51, 8}, { 27, 9}, { 15, 8}, \ + { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \ + { 55,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 71, 8}, { 143, 9}, \ + { 87,10}, { 47, 9}, { 111,11}, { 31,10}, \ + { 63, 9}, { 143,10}, { 79, 9}, { 159,10}, \ + { 95,11}, { 63,10}, { 143, 9}, { 287,10}, \ + { 159,11}, { 95,10}, { 191,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,10}, { 287,11}, { 159,10}, { 335, 9}, \ + { 671,10}, { 367, 9}, { 735,11}, { 191,10}, \ + { 383, 9}, { 799,10}, { 415,11}, { 223,12}, \ + { 127,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 319,10}, { 639,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 543,10}, { 1087,11}, { 575,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,12}, { 383,11}, \ + { 831,12}, { 447,11}, { 959,13}, { 255,12}, \ + { 511,11}, { 1087,12}, { 575,11}, { 1151,12}, \ + { 639,11}, { 1343,12}, { 703,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 895,14}, { 255,13}, \ + { 511,12}, { 1087,13}, { 639,12}, { 1407,13}, \ + { 767,12}, { 1599,13}, { 895,14}, { 511,13}, \ + { 1023,12}, { 2111,13}, { 
1151,12}, { 2431,13}, \ + { 1279,12}, { 2559,13}, { 1407,14}, { 767,13}, \ + { 1535,12}, { 3135,13}, { 1663,15}, { 511,14}, \ + { 1023,13}, { 2303,14}, { 1279,13}, { 2559,12}, \ + { 5119,13}, { 2687,14}, { 1535,13}, { 3071,12}, \ + { 6143,13}, { 3199,12}, { 6399,14}, { 1791,15}, \ + { 1023,14}, { 2047,13}, { 4095,14}, { 2303,13}, \ + { 4607,12}, { 9215,13}, { 4863,12}, { 9727,14}, \ + { 2559,13}, { 5119,15}, { 1535,14}, { 3071,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 155 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 525 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 525, 5}, { 25, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 25, 6}, { 51, 7}, { 27, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 51, 8}, { 27, 7}, { 55, 9}, \ + { 15, 8}, { 31, 7}, { 63, 8}, { 39, 9}, \ + { 23, 8}, { 51,10}, { 15, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 99, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 95,11}, { 63,10}, { 143, 9}, \ + { 287, 8}, { 575, 9}, { 303,10}, { 159,11}, \ + { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,10}, { 287,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335, 9}, { 671,10}, { 351,11}, \ + { 191,10}, { 399, 9}, { 799,10}, { 415, 9}, \ + { 831,11}, { 223,10}, { 447,12}, { 127,10}, \ + { 543,11}, { 287,10}, { 575,11}, { 319,10}, \ + { 639,11}, { 351,10}, { 703,12}, { 191,11}, \ + { 383,10}, { 799,11}, { 415,10}, { 831,11}, \ + { 447,13}, { 127,11}, { 543,10}, { 1087,11}, \ + { 607,12}, { 319,11}, { 735,12}, { 383,11}, \ + { 831,12}, { 447,11}, { 959,12}, { 511,11}, \ + { 1023,12}, { 575,11}, { 1151,12}, { 639,11}, \ + { 1279,12}, { 703,13}, { 383,12}, { 767,11}, \ + { 1535,12}, { 831,11}, { 1663,12}, { 895,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 639,12}, \ + { 1343,13}, { 767,12}, { 1599,13}, { 895,14}, \ + { 511,13}, { 1023,12}, { 2111,13}, { 1151,12}, \ + { 2303,13}, { 1279,14}, { 767,13}, { 1535,12}, \ + { 3135,13}, { 1663,15}, { 511,14}, { 1023,13}, \ + { 2047,12}, { 4095,13}, { 2303,14}, { 1279,13}, \ + { 2559,12}, { 5119,14}, { 1535,13}, { 3071,12}, \ + { 6143,13}, { 3199,12}, { 6399,14}, { 1791,15}, \ + { 1023,14}, { 2047,13}, { 4095,14}, { 2303,13}, \ + { 4607,12}, { 9215,13}, { 4863,12}, { 9727,14}, \ + { 2559,15}, { 1535,14}, { 3071,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 154 +#define SQR_FFT_THRESHOLD 5312 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 38 +#define MULLO_MUL_N_THRESHOLD 10950 +#define SQRLO_BASECASE_THRESHOLD 10 +#define SQRLO_DC_THRESHOLD 35 +#define SQRLO_SQR_THRESHOLD 10323 + +#define DC_DIV_QR_THRESHOLD 57 +#define DC_DIVAPPR_Q_THRESHOLD 254 +#define DC_BDIV_QR_THRESHOLD 48 +#define DC_BDIV_Q_THRESHOLD 286 + +#define INV_MULMOD_BNM1_THRESHOLD 55 +#define INV_NEWTON_THRESHOLD 252 +#define INV_APPR_THRESHOLD 252 + +#define BINV_NEWTON_THRESHOLD 372 +#define REDC_1_TO_REDC_2_THRESHOLD 61 +#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ + +#define MU_DIV_QR_THRESHOLD 1858 +#define MU_DIVAPPR_Q_THRESHOLD 1787 +#define MUPI_DIV_QR_THRESHOLD 122 +#define MU_BDIV_QR_THRESHOLD 1528 +#define MU_BDIV_Q_THRESHOLD 1836 + +#define POWM_SEC_TABLE 1,14,200,480,1532 + +#define GET_STR_DC_THRESHOLD 16 +#define GET_STR_PRECOMPUTE_THRESHOLD 33 +#define SET_STR_DC_THRESHOLD 104 +#define SET_STR_PRECOMPUTE_THRESHOLD 1120 + +#define FAC_DSC_THRESHOLD 164 +#define 
FAC_ODD_THRESHOLD 27 + +#define MATRIX22_STRASSEN_THRESHOLD 19 +#define HGCD2_DIV1_METHOD 1 /* 3.70% faster than 3 */ +#define HGCD_THRESHOLD 137 +#define HGCD_APPR_THRESHOLD 157 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 610 +#define GCDEXT_DC_THRESHOLD 443 +#define JACOBI_BASE_METHOD 4 /* 12.66% faster than 1 */ + +/* Tuneup completed successfully, took 69757 seconds */ diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/logops_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/logops_n.asm new file mode 100644 index 0000000..0602614 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/logops_n.asm @@ -0,0 +1,253 @@ +dnl ARM mpn_and_n, mpn_andn_n. mpn_nand_n, etc, optimised for A15. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb +C and andn ior xor nand iorn nior xnor +C StrongARM ? ? +C XScale ? ? +C Cortex-A7 ? ? +C Cortex-A8 ? ? +C Cortex-A9 3.5 3.56 +C Cortex-A15 1.27 1.64 + +C This is great A15 core register code, but it is a bit large. +C We use FEEDIN_VARIANT 1 to save some space, but use 8-way unrolling. 
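All eight entry points are generated from this single source: the build defines one OPERATION_* symbol, and m4 selects the core instruction (LOGOP) plus, for the complemented operations, a trailing mvn (POSTOP). For example, with OPERATION_iorn_n the loop body amounts to the following C (illustrative name; ~(vp & ~up) == up | ~vp, which is why a bic followed by mvn suffices):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    /* iorn: {rp,n} = {up,n} | ~{vp,n}.  */
    static void
    ref_iorn_n (limb_t *rp, const limb_t *up, const limb_t *vp, size_t n)
    {
      for (size_t i = 0; i < n; i++)
        rp[i] = up[i] | ~vp[i];
    }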
+ +C Architecture requirements: +C v5 - +C v5t - +C v5te ldrd strd +C v6 - +C v6t2 - +C v7a - + +define(`FEEDIN_VARIANT', 1) C alternatives: 0 1 2 +define(`UNROLL', 4x2) C alternatives: 4 4x2 + +define(`rp', `r0') +define(`up', `r1') +define(`vp', `r2') +define(`n', `r3') + +define(`POSTOP') + +ifdef(`OPERATION_and_n',` + define(`func', `mpn_and_n') + define(`LOGOP', `and $1, $2, $3')') +ifdef(`OPERATION_andn_n',` + define(`func', `mpn_andn_n') + define(`LOGOP', `bic $1, $2, $3')') +ifdef(`OPERATION_nand_n',` + define(`func', `mpn_nand_n') + define(`POSTOP', `mvn $1, $1') + define(`LOGOP', `and $1, $2, $3')') +ifdef(`OPERATION_ior_n',` + define(`func', `mpn_ior_n') + define(`LOGOP', `orr $1, $2, $3')') +ifdef(`OPERATION_iorn_n',` + define(`func', `mpn_iorn_n') + define(`POSTOP', `mvn $1, $1') + define(`LOGOP', `bic $1, $3, $2')') +ifdef(`OPERATION_nior_n',` + define(`func', `mpn_nior_n') + define(`POSTOP', `mvn $1, $1') + define(`LOGOP', `orr $1, $2, $3')') +ifdef(`OPERATION_xor_n',` + define(`func', `mpn_xor_n') + define(`LOGOP', `eor $1, $2, $3')') +ifdef(`OPERATION_xnor_n',` + define(`func', `mpn_xnor_n') + define(`POSTOP', `mvn $1, $1') + define(`LOGOP', `eor $1, $2, $3')') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +ASM_START() +PROLOGUE(func) + push { r4-r9 } + +ifelse(FEEDIN_VARIANT,0,` + ands r6, n, #3 + mov n, n, lsr #2 + beq L(b00a) + tst r6, #1 + beq L(bx0) + ldr r5, [up], #4 + ldr r7, [vp], #4 + LOGOP( r9, r5, r7) + POSTOP( r9) + str r9, [rp], #4 + tst r6, #2 + beq L(b00) +L(bx0): ldrd r4, r5, [up, #0] + ldrd r6, r7, [vp, #0] + sub rp, rp, #8 + b L(lo) +L(b00): tst n, n + beq L(wd1) +L(b00a):ldrd r4, r5, [up], #-8 + ldrd r6, r7, [vp], #-8 + sub rp, rp, #16 + b L(mid) +') +ifelse(FEEDIN_VARIANT,1,` + and r6, n, #3 + mov n, n, lsr #2 + tst r6, #1 + beq L(bx0) + ldr r5, [up], #4 + ldr r7, [vp], #4 + LOGOP( r9, r5, r7) + POSTOP( r9) + str r9, [rp], #4 +L(bx0): tst r6, #2 + beq L(b00) + ldrd r4, r5, [up, #0] + ldrd r6, r7, [vp, #0] + sub rp, rp, #8 + b L(lo) +L(b00): tst n, n + beq L(wd1) + ldrd r4, r5, [up], #-8 + ldrd r6, r7, [vp], #-8 + sub rp, rp, #16 + b L(mid) +') +ifelse(FEEDIN_VARIANT,2,` + ands r6, n, #3 + mov n, n, lsr #2 + beq L(b00) + cmp r6, #2 + bcc L(b01) + beq L(b10) + +L(b11): ldr r5, [up], #4 + ldr r7, [vp], #4 + LOGOP( r9, r5, r7) + ldrd r4, r5, [up, #0] + ldrd r6, r7, [vp, #0] + POSTOP( r9) + str r9, [rp], #-4 + b L(lo) + +L(b00): ldrd r4, r5, [up], #-8 + ldrd r6, r7, [vp], #-8 + sub rp, rp, #16 + b L(mid) + +L(b01): ldr r5, [up], #-4 + ldr r7, [vp], #-4 + LOGOP( r9, r5, r7) + POSTOP( r9) + str r9, [rp], #-12 + tst n, n + beq L(wd1) +L(gt1): ldrd r4, r5, [up, #8] + ldrd r6, r7, [vp, #8] + b L(mid) + +L(b10): ldrd r4, r5, [up] + ldrd r6, r7, [vp] + sub rp, rp, #8 + b L(lo) +') + ALIGN(16) +ifelse(UNROLL,4,` +L(top): ldrd r4, r5, [up, #8] + ldrd r6, r7, [vp, #8] + POSTOP( r8) + POSTOP( r9) + strd r8, r9, [rp, #8] +L(mid): LOGOP( r8, r4, r6) + LOGOP( r9, r5, r7) + ldrd r4, r5, [up, #16]! + ldrd r6, r7, [vp, #16]! + POSTOP( r8) + POSTOP( r9) + strd r8, r9, [rp, #16]! 
+ sub n, n, #1 +L(lo): LOGOP( r8, r4, r6) + LOGOP( r9, r5, r7) + tst n, n + bne L(top) +') +ifelse(UNROLL,4x2,` +L(top): ldrd r4, r5, [up, #8] + ldrd r6, r7, [vp, #8] + POSTOP( r8) + POSTOP( r9) + strd r8, r9, [rp, #8] +L(mid): LOGOP( r8, r4, r6) + LOGOP( r9, r5, r7) + ldrd r4, r5, [up, #16] + ldrd r6, r7, [vp, #16] + POSTOP( r8) + POSTOP( r9) + strd r8, r9, [rp, #16] + LOGOP( r8, r4, r6) + LOGOP( r9, r5, r7) + sub n, n, #2 + tst n, n + bmi L(dne) + ldrd r4, r5, [up, #24] + ldrd r6, r7, [vp, #24] + POSTOP( r8) + POSTOP( r9) + strd r8, r9, [rp, #24] + LOGOP( r8, r4, r6) + LOGOP( r9, r5, r7) + ldrd r4, r5, [up, #32]! + ldrd r6, r7, [vp, #32]! + POSTOP( r8) + POSTOP( r9) + strd r8, r9, [rp, #32]! +L(lo): LOGOP( r8, r4, r6) + LOGOP( r9, r5, r7) + tst n, n + bne L(top) +') + +L(end): POSTOP( r8) + POSTOP( r9) + strd r8, r9, [rp, #8] +L(wd1): pop { r4-r9 } + bx r14 +ifelse(UNROLL,4x2,` +L(dne): POSTOP( r8) + POSTOP( r9) + strd r8, r9, [rp, #24] + pop { r4-r9 } + bx r14 +') +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/mul_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/mul_1.asm new file mode 100644 index 0000000..766ba5c --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/mul_1.asm @@ -0,0 +1,104 @@ +dnl ARM mpn_mul_1 optimised for A15. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb best +C StrongARM: - +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 5.25 3.25 +C Cortex-A15 2.25 this + + +C This runs well on A15 but very poorly on A9. By scheduling loads and adds +C it is possible to get good A9 performance as well, but at the cost of using +C many more (callee-saves) registers. + +C This is armv5 code, optimized for the armv7a cpu A15. Its location in the +C GMP file structure might be misleading. 
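
For orientation, the contract this file implements, sketched in portable C with 32-bit limbs assumed (mpn_mul_1 is the carry-in-zero case; mpn_mul_1c takes an explicit carry, which the asm loads from the stack into r12). The ref_mul_1c name is hypothetical; this is reference semantics, not GMP's implementation:

  #include <stddef.h>
  #include <stdint.h>

  static uint32_t
  ref_mul_1c (uint32_t *rp, const uint32_t *up, size_t n,
              uint32_t v0, uint32_t carry)
  {
    for (size_t i = 0; i < n; i++)
      {
        /* one umull plus the adds/adcs carry chain in the asm */
        uint64_t t = (uint64_t) up[i] * v0 + carry;
        rp[i] = (uint32_t) t;
        carry = (uint32_t) (t >> 32);
      }
    return carry;   /* the asm returns this in r0 */
  }

The four-way unrolled loop below computes exactly this, rotating the carry between r5 and r7 so that each umull's result feeds the next iteration's addition.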
+ + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`v0', `r3') + +ASM_START() +PROLOGUE(mpn_mul_1c) + ldr r12, [sp] + b L(ent) +EPILOGUE() +PROLOGUE(mpn_mul_1) + mov r12, #0 +L(ent): push {r4-r7} + + ldr r6, [up], #4 + tst n, #1 + beq L(bx0) + +L(bx1): umull r4, r7, r6, v0 + adds r4, r4, r12 + tst n, #2 + beq L(lo1) + b L(lo3) + +L(bx0): umull r4, r5, r6, v0 + adds r4, r4, r12 + tst n, #2 + beq L(lo0) + b L(lo2) + +L(top): ldr r6, [up], #4 + str r4, [rp], #4 + umull r4, r5, r6, v0 + adds r4, r4, r7 +L(lo0): ldr r6, [up], #4 + str r4, [rp], #4 + umull r4, r7, r6, v0 + adcs r4, r4, r5 +L(lo3): ldr r6, [up], #4 + str r4, [rp], #4 + umull r4, r5, r6, v0 + adcs r4, r4, r7 +L(lo2): ldr r6, [up], #4 + str r4, [rp], #4 + umull r4, r7, r6, v0 + adcs r4, r4, r5 +L(lo1): adc r7, r7, #0 + subs n, n, #4 + bgt L(top) + + str r4, [rp] + mov r0, r7 + pop {r4-r7} + bx lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm new file mode 100644 index 0000000..d8cfe3f --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm @@ -0,0 +1,43 @@ +dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 1) + +ifdef(`OPERATION_addlsh1_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n) + +include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm new file mode 100644 index 0000000..b48204d --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm @@ -0,0 +1,43 @@ +dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 2) + +ifdef(`OPERATION_addlsh2_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n) + +include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm new file mode 100644 index 0000000..51f93c1 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm @@ -0,0 +1,144 @@ +dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +C cycles/limb +C StrongARM - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 5.25 +C Cortex-A15 2.25 + +C TODO +C * Consider using 4-way feed-in code. +C * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps +C insufficiently for A7 and A8. + +define(`rp', `r0') +define(`up', `r1') +define(`vp', `r2') +define(`n', `r3') + +ifdef(`DO_add', ` + define(`ADCSBCS', `adcs $1, $2, $3') + define(`CLRCY', `cmn r13, #1') + define(`RETVAL', `adc r0, $1, #0') + define(`func', mpn_addlsh`'LSH`'_n)') +ifdef(`DO_sub', ` + define(`ADCSBCS', `sbcs $1, $2, $3') + define(`CLRCY', `cmp r13, #0') + define(`RETVAL', `sbc $2, $2, $2 + cmn $2, #1 + adc r0, $1, #0') + define(`func', mpn_sublsh`'LSH`'_n)') +ifdef(`DO_rsb', ` + define(`ADCSBCS', `sbcs $1, $3, $2') + define(`CLRCY', `cmp r13, #0') + define(`RETVAL', `sbc r0, $1, #0') + define(`func', mpn_rsblsh`'LSH`'_n)') + + +ASM_START() +PROLOGUE(func) + push {r4-r10} + vmov.i8 d0, #0 C could feed carry through here + CLRCY + tst n, #1 + beq L(bb0) + +L(bb1): vld1.32 {d3[0]}, [vp]! 
+ vsli.u32 d0, d3, #LSH + ldr r12, [up], #4 + vmov.32 r5, d0[0] + vshr.u32 d0, d3, #32-LSH + ADCSBCS( r12, r12, r5) + str r12, [rp], #4 + bics n, n, #1 + beq L(rtn) + +L(bb0): tst n, #2 + beq L(b00) + +L(b10): vld1.32 {d3}, [vp]! + vsli.u64 d0, d3, #LSH + ldmia up!, {r10,r12} + vmov r4, r5, d0 + vshr.u64 d0, d3, #64-LSH + ADCSBCS( r10, r10, r4) + ADCSBCS( r12, r12, r5) + stmia rp!, {r10,r12} + bics n, n, #2 + beq L(rtn) + +L(b00): vld1.32 {d2}, [vp]! + vsli.u64 d0, d2, #LSH + vshr.u64 d1, d2, #64-LSH + vld1.32 {d3}, [vp]! + vsli.u64 d1, d3, #LSH + vmov r6, r7, d0 + vshr.u64 d0, d3, #64-LSH + sub n, n, #4 + tst n, n + beq L(end) + + ALIGN(16) +L(top): ldmia up!, {r8,r9,r10,r12} + vld1.32 {d2}, [vp]! + vsli.u64 d0, d2, #LSH + vmov r4, r5, d1 + vshr.u64 d1, d2, #64-LSH + ADCSBCS( r8, r8, r6) + ADCSBCS( r9, r9, r7) + vld1.32 {d3}, [vp]! + vsli.u64 d1, d3, #LSH + vmov r6, r7, d0 + vshr.u64 d0, d3, #64-LSH + ADCSBCS( r10, r10, r4) + ADCSBCS( r12, r12, r5) + stmia rp!, {r8,r9,r10,r12} + sub n, n, #4 + tst n, n + bne L(top) + +L(end): ldmia up!, {r8,r9,r10,r12} + vmov r4, r5, d1 + ADCSBCS( r8, r8, r6) + ADCSBCS( r9, r9, r7) + ADCSBCS( r10, r10, r4) + ADCSBCS( r12, r12, r5) + stmia rp!, {r8,r9,r10,r12} +L(rtn): vmov.32 r0, d0[0] + RETVAL( r0, r1) + pop {r4-r10} + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm new file mode 100644 index 0000000..9e7a629 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm @@ -0,0 +1,97 @@ +dnl ARM Neon mpn_com optimised for A15. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A8 ? +C Cortex-A9 2.1 +C Cortex-A15 0.65 + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') + +ASM_START() +PROLOGUE(mpn_com) + cmp n, #7 + ble L(bc) + +C Perform a few initial operation until rp is 128-bit aligned + tst rp, #4 + beq L(al1) + vld1.32 {d0[0]}, [up]! + sub n, n, #1 + vmvn d0, d0 + vst1.32 {d0[0]}, [rp]! +L(al1): tst rp, #8 + beq L(al2) + vld1.32 {d0}, [up]! + sub n, n, #2 + vmvn d0, d0 + vst1.32 {d0}, [rp:64]! +L(al2): vld1.32 {q2}, [up]! + subs n, n, #12 + blt L(end) + + ALIGN(16) +L(top): vld1.32 {q0}, [up]! + vmvn q2, q2 + subs n, n, #8 + vst1.32 {q2}, [rp:128]! + vld1.32 {q2}, [up]! + vmvn q0, q0 + vst1.32 {q0}, [rp:128]! + bge L(top) + +L(end): vmvn q2, q2 + vst1.32 {q2}, [rp:128]! + +C Handle last 0-7 limbs. 
Note that rp is aligned after loop, but not when we +C arrive here via L(bc) +L(bc): tst n, #4 + beq L(tl1) + vld1.32 {q0}, [up]! + vmvn q0, q0 + vst1.32 {q0}, [rp]! +L(tl1): tst n, #2 + beq L(tl2) + vld1.32 {d0}, [up]! + vmvn d0, d0 + vst1.32 {d0}, [rp]! +L(tl2): tst n, #1 + beq L(tl3) + vld1.32 {d0[0]}, [up] + vmvn d0, d0 + vst1.32 {d0[0]}, [rp] +L(tl3): bx lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm new file mode 100644 index 0000000..98fe535 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm @@ -0,0 +1,110 @@ +dnl ARM Neon mpn_copyd optimised for A15. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 1.75 slower than core register code +C Cortex-A15 0.52 + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') + +ASM_START() +PROLOGUE(mpn_copyd) + add rp, rp, n, lsl #2 + add up, up, n, lsl #2 + + cmp n, #7 + ble L(bc) + +C Copy until rp is 128-bit aligned + tst rp, #4 + beq L(al1) + sub up, up, #4 + vld1.32 {d22[0]}, [up] + sub n, n, #1 + sub rp, rp, #4 + vst1.32 {d22[0]}, [rp] +L(al1): tst rp, #8 + beq L(al2) + sub up, up, #8 + vld1.32 {d22}, [up] + sub n, n, #2 + sub rp, rp, #8 + vst1.32 {d22}, [rp:64] +L(al2): sub up, up, #16 + vld1.32 {d26-d27}, [up] + subs n, n, #12 + sub rp, rp, #16 C offset rp for loop + blt L(end) + + sub up, up, #16 C offset up for loop + mov r12, #-16 + + ALIGN(16) +L(top): vld1.32 {d22-d23}, [up], r12 + vst1.32 {d26-d27}, [rp:128], r12 + vld1.32 {d26-d27}, [up], r12 + vst1.32 {d22-d23}, [rp:128], r12 + subs n, n, #8 + bge L(top) + + add up, up, #16 C undo up offset + C rp offset undoing folded +L(end): vst1.32 {d26-d27}, [rp:128] + +C Copy last 0-7 limbs. 
Note that rp is aligned after loop, but not when we +C arrive here via L(bc) +L(bc): tst n, #4 + beq L(tl1) + sub up, up, #16 + vld1.32 {d22-d23}, [up] + sub rp, rp, #16 + vst1.32 {d22-d23}, [rp] +L(tl1): tst n, #2 + beq L(tl2) + sub up, up, #8 + vld1.32 {d22}, [up] + sub rp, rp, #8 + vst1.32 {d22}, [rp] +L(tl2): tst n, #1 + beq L(tl3) + sub up, up, #4 + vld1.32 {d22[0]}, [up] + sub rp, rp, #4 + vst1.32 {d22[0]}, [rp] +L(tl3): bx lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm new file mode 100644 index 0000000..2e05afe --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm @@ -0,0 +1,90 @@ +dnl ARM Neon mpn_copyi optimised for A15. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 1.75 slower than core register code +C Cortex-A15 0.52 + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') + +ASM_START() +PROLOGUE(mpn_copyi) + cmp n, #7 + ble L(bc) + +C Copy until rp is 128-bit aligned + tst rp, #4 + beq L(al1) + vld1.32 {d22[0]}, [up]! + sub n, n, #1 + vst1.32 {d22[0]}, [rp]! +L(al1): tst rp, #8 + beq L(al2) + vld1.32 {d22}, [up]! + sub n, n, #2 + vst1.32 {d22}, [rp:64]! +L(al2): vld1.32 {d26-d27}, [up]! + subs n, n, #12 + blt L(end) + + ALIGN(16) +L(top): vld1.32 {d22-d23}, [up]! + vst1.32 {d26-d27}, [rp:128]! + vld1.32 {d26-d27}, [up]! + vst1.32 {d22-d23}, [rp:128]! + subs n, n, #8 + bge L(top) + +L(end): vst1.32 {d26-d27}, [rp:128]! + +C Copy last 0-7 limbs. Note that rp is aligned after loop, but not when we +C arrive here via L(bc) +L(bc): tst n, #4 + beq L(tl1) + vld1.32 {d22-d23}, [up]! + vst1.32 {d22-d23}, [rp]! +L(tl1): tst n, #2 + beq L(tl2) + vld1.32 {d22}, [up]! + vst1.32 {d22}, [rp]! +L(tl2): tst n, #1 + beq L(tl3) + vld1.32 {d22[0]}, [up] + vst1.32 {d22[0]}, [rp] +L(tl3): bx lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm new file mode 100644 index 0000000..2c11d6d --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm @@ -0,0 +1,177 @@ +dnl ARM Neon mpn_rsh1add_n, mpn_rsh1sub_n. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 4-5 +C Cortex-A15 2.5 + +C TODO +C * Try to make this smaller, its size (384 bytes) is excessive. +C * Try to reach 2.25 c/l on A15, to match the addlsh_1 family. +C * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps +C insufficiently for A7 and A8. + +define(`rp', `r0') +define(`up', `r1') +define(`vp', `r2') +define(`n', `r3') + +ifdef(`OPERATION_rsh1add_n', ` + define(`ADDSUBS', `adds $1, $2, $3') + define(`ADCSBCS', `adcs $1, $2, $3') + define(`IFADD', `$1') + define(`IFSUB', `') + define(`func', mpn_rsh1add_n)') +ifdef(`OPERATION_rsh1sub_n', ` + define(`ADDSUBS', `subs $1, $2, $3') + define(`ADCSBCS', `sbcs $1, $2, $3') + define(`IFADD', `') + define(`IFSUB', `$1') + define(`func', mpn_rsh1sub_n)') + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n) + +ASM_START() +PROLOGUE(func) + push {r4-r10} + + ands r4, n, #3 + beq L(b00) + cmp r4, #2 + blo L(b01) + beq L(b10) + +L(b11): ldmia up!, {r9,r10,r12} + ldmia vp!, {r5,r6,r7} + ADDSUBS( r9, r9, r5) + vmov d4, r9, r9 + ADCSBCS( r10, r10, r6) + ADCSBCS( r12, r12, r7) + vshr.u64 d3, d4, #1 + vmov d1, r10, r12 + vsli.u64 d3, d1, #31 + vshr.u64 d2, d1, #1 + vst1.32 d3[0], [rp]! + bics n, n, #3 + beq L(wd2) +L(gt3): ldmia up!, {r8,r9,r10,r12} + ldmia vp!, {r4,r5,r6,r7} + b L(mi0) + +L(b10): ldmia up!, {r10,r12} + ldmia vp!, {r6,r7} + ADDSUBS( r10, r10, r6) + ADCSBCS( r12, r12, r7) + vmov d4, r10, r12 + bics n, n, #2 + vshr.u64 d2, d4, #1 + beq L(wd2) +L(gt2): ldmia up!, {r8,r9,r10,r12} + ldmia vp!, {r4,r5,r6,r7} + b L(mi0) + +L(b01): ldr r12, [up], #4 + ldr r7, [vp], #4 + ADDSUBS( r12, r12, r7) + vmov d4, r12, r12 + bics n, n, #1 + bne L(gt1) + mov r5, r12, lsr #1 +IFADD(` adc r1, n, #0') +IFSUB(` adc r1, n, #1') + bfi r5, r1, #31, #1 + str r5, [rp] + and r0, r12, #1 + pop {r4-r10} + bx r14 +L(gt1): ldmia up!, {r8,r9,r10,r12} + ldmia vp!, {r4,r5,r6,r7} + vshr.u64 d2, d4, #1 + ADCSBCS( r8, r8, r4) + ADCSBCS( r9, r9, r5) + vmov d0, r8, r9 + ADCSBCS( r10, r10, r6) + ADCSBCS( r12, r12, r7) + vsli.u64 d2, d0, #31 + vshr.u64 d3, d0, #1 + vst1.32 d2[0], [rp]! + b L(mi1) + +L(b00): ldmia up!, {r8,r9,r10,r12} + ldmia vp!, {r4,r5,r6,r7} + ADDSUBS( r8, r8, r4) + ADCSBCS( r9, r9, r5) + vmov d4, r8, r9 + ADCSBCS( r10, r10, r6) + ADCSBCS( r12, r12, r7) + vshr.u64 d3, d4, #1 + b L(mi1) + + ALIGN(16) +L(top): ldmia up!, {r8,r9,r10,r12} + ldmia vp!, {r4,r5,r6,r7} + vsli.u64 d3, d1, #63 + vshr.u64 d2, d1, #1 + vst1.32 d3, [rp]! 
+L(mi0): ADCSBCS( r8, r8, r4) + ADCSBCS( r9, r9, r5) + vmov d0, r8, r9 + ADCSBCS( r10, r10, r6) + ADCSBCS( r12, r12, r7) + vsli.u64 d2, d0, #63 + vshr.u64 d3, d0, #1 + vst1.32 d2, [rp]! +L(mi1): vmov d1, r10, r12 + sub n, n, #4 + tst n, n + bne L(top) + +L(end): vsli.u64 d3, d1, #63 + vshr.u64 d2, d1, #1 + vst1.32 d3, [rp]! +L(wd2): vmov r4, r5, d2 +IFADD(` adc r1, n, #0') +IFSUB(` adc r1, n, #1') + bfi r5, r1, #31, #1 + stm rp, {r4,r5} + +L(rtn): vmov.32 r0, d4[0] + and r0, r0, #1 + pop {r4-r10} + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/submul_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/submul_1.asm new file mode 100644 index 0000000..ed7bfe8 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/submul_1.asm @@ -0,0 +1,159 @@ +dnl ARM mpn_submul_1 optimised for A15. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb best +C StrongARM: - +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 5.75 3.75 +C Cortex-A15 2.32 this + +C This code uses umlal and umaal for adding in the rp[] data, keeping the +C recurrency path separate from any multiply instructions. It performs well on +C A15, but not quite at the multiply bandwidth like the corresponding addmul_1 +C code. +C +C We don't use r12 due to ldrd and strd limitations. 
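
For reference, the contract of mpn_submul_1 in portable C, 32-bit limbs assumed (the hypothetical ref_submul_1 below is reference semantics only; the asm computes the same borrow via the complement trick described in the header comment that follows):

  #include <stddef.h>
  #include <stdint.h>

  /* rp[] -= up[] * v0 over n limbs; returns the final borrow limb. */
  static uint32_t
  ref_submul_1 (uint32_t *rp, const uint32_t *up, size_t n, uint32_t v0)
  {
    uint32_t borrow = 0;
    for (size_t i = 0; i < n; i++)
      {
        uint64_t p = (uint64_t) up[i] * v0 + borrow;
        uint32_t lo = (uint32_t) p;
        uint32_t r = rp[i];
        rp[i] = r - lo;
        borrow = (uint32_t) (p >> 32) + (r < lo);
      }
    return borrow;
  }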
+C +C This loop complements U on the fly, +C U' = B^n - 1 - U +C and then uses that +C R - U*v = R + U'*v + v - B^n v + +C Architecture requirements: +C v5 - +C v5t - +C v5te ldrd strd +C v6 umaal +C v6t2 - +C v7a - + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`v0', `r3') + +define(`w0', `r10') define(`w1', `r11') +define(`u0', `r8') define(`u1', `r9') + +ASM_START() +PROLOGUE(mpn_submul_1) + sub sp, sp, #32 + strd r10, r11, [sp, #24] + strd r8, r9, [sp, #16] + strd r6, r7, [sp, #8] + strd r4, r5, [sp, #0] +C push { r4-r11 } + + ands r6, n, #3 + sub n, n, #3 + beq L(b00) + cmp r6, #2 + bcc L(b01) + beq L(b10) + +L(b11): mov r6, #0 + ldr u1, [up], #-4 + ldr w1, [rp], #-16 + mvn u1, u1 + adds r7, v0, #0 + b L(mid) + +L(b00): ldrd u0, u1, [up] + ldrd w0, w1, [rp], #-12 + mvn u0, u0 + mvn u1, u1 + mov r6, v0 + umaal w0, r6, u0, v0 + cmn r13, #0 C carry clear + mov r7, #0 + str w0, [rp, #12] + b L(mid) + +L(b10): ldrd u0, u1, [up], #8 + ldrd w0, w1, [rp] + mvn u0, u0 + mvn u1, u1 + mov r4, v0 + umaal w0, r4, u0, v0 + mov r5, #0 + str w0, [rp], #-4 + umlal w1, r5, u1, v0 + adds n, n, #0 + bmi L(end) + b L(top) + +L(b01): ldr u1, [up], #4 + ldr w1, [rp], #-8 + mvn u1, u1 + mov r5, v0 + mov r4, #0 + umaal w1, r5, u1, v0 + tst n, n + bmi L(end) + +C ALIGN(16) +L(top): ldrd u0, u1, [up, #0] + adcs r4, r4, w1 + mvn u0, u0 + ldrd w0, w1, [rp, #12] + mvn u1, u1 + mov r6, #0 + umlal w0, r6, u0, v0 C 1 2 + adcs r5, r5, w0 + mov r7, #0 + strd r4, r5, [rp, #8] +L(mid): umaal w1, r7, u1, v0 C 2 3 + ldrd u0, u1, [up, #8] + add up, up, #16 + adcs r6, r6, w1 + mvn u0, u0 + ldrd w0, w1, [rp, #20] + mvn u1, u1 + mov r4, #0 + umlal w0, r4, u0, v0 C 3 4 + adcs r7, r7, w0 + mov r5, #0 + strd r6, r7, [rp, #16]! + sub n, n, #4 + umlal w1, r5, u1, v0 C 0 1 + tst n, n + bpl L(top) + +L(end): adcs r4, r4, w1 + str r4, [rp, #8] + adc r0, r5, #0 + sub r0, v0, r0 + pop { r4-r11 } + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora17/addmul_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora17/addmul_1.asm new file mode 100644 index 0000000..c11ed47 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora17/addmul_1.asm @@ -0,0 +1,34 @@ +dnl ARM mpn_addmul_1 + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
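
Circling back to the complement identity stated in the submul_1 header above: with U' = B^n - 1 - U we have U'*v = B^n*v - v - U*v, so R + U'*v + v - B^n*v = R - U*v exactly, as integers, and hence modulo any power of two. A minimal hedged check of this in C, taking B^n = 2^32 and letting unsigned 64-bit arithmetic supply the wraparound:

  #include <assert.h>
  #include <stdint.h>

  int
  main (void)
  {
    uint64_t Bn = 1ull << 32;               /* B^n with B = 2^32, n = 1 */
    uint64_t U  = 0x12345678u;
    uint64_t v  = 0x9abcdef0u;
    uint64_t R  = 0xfedcba9876543210ull;
    uint64_t Uc = Bn - 1 - U;               /* U' = B^n - 1 - U */
    /* Both sides wrap identically mod 2^64, so the assert holds. */
    assert (R - U * v == R + Uc * v + v - Bn * v);
    return 0;
  }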
+ +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_addmul_1) +include_mpn(`arm/v6/addmul_1.asm') diff --git a/gmp-6.3.0/mpn/arm/v7a/cora17/gmp-mparam.h b/gmp-6.3.0/mpn/arm/v7a/cora17/gmp-mparam.h new file mode 100644 index 0000000..143d4bc --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora17/gmp-mparam.h @@ -0,0 +1,233 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 1800 MHz Cortex-A17 with Neon (in spite of file position) */ +/* FFT tuning limit = 51243975 */ +/* Generated by tuneup.c, 2019-10-29, gcc 6.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 54.08% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 45 + +#define DIV_1_VS_MUL_1_PERCENT 248 + +#define MUL_TOOM22_THRESHOLD 38 +#define MUL_TOOM33_THRESHOLD 132 +#define MUL_TOOM44_THRESHOLD 200 +#define MUL_TOOM6H_THRESHOLD 303 +#define MUL_TOOM8H_THRESHOLD 478 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 137 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 179 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 132 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 145 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 191 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 62 +#define SQR_TOOM3_THRESHOLD 189 +#define SQR_TOOM4_THRESHOLD 354 +#define SQR_TOOM6_THRESHOLD 426 +#define SQR_TOOM8_THRESHOLD 608 + +#define MULMID_TOOM42_THRESHOLD 62 + +#define MULMOD_BNM1_THRESHOLD 21 +#define SQRMOD_BNM1_THRESHOLD 29 + +#define MUL_FFT_MODF_THRESHOLD 595 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 595, 5}, { 29, 6}, { 15, 5}, { 31, 6}, \ + { 16, 5}, { 33, 6}, { 29, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \ + { 39, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \ + { 27, 7}, { 55, 9}, { 15, 8}, { 31, 7}, \ + { 63, 8}, { 43, 9}, { 23, 8}, { 55, 9}, \ + { 31, 8}, { 63, 9}, { 39, 8}, { 83, 9}, \ + { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \ + { 79,10}, { 47, 9}, { 103,11}, { 31,10}, \ + { 
63, 9}, { 135,10}, { 79, 9}, { 159,10}, \ + { 95, 9}, { 191,10}, { 111,11}, { 63,10}, \ + { 143, 8}, { 575,10}, { 159,11}, { 95,10}, \ + { 191, 9}, { 383, 8}, { 767, 9}, { 399, 8}, \ + { 799,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511, 8}, { 1023, 9}, { 543, 8}, { 1087, 9}, \ + { 575,10}, { 303,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335, 9}, { 671,10}, { 351, 9}, \ + { 703,10}, { 367, 9}, { 735,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 399, 9}, { 799,10}, \ + { 415, 9}, { 831,10}, { 431, 9}, { 863,11}, \ + { 223,10}, { 447,12}, { 127,10}, { 511, 9}, \ + { 1023,10}, { 543, 9}, { 1087,10}, { 607, 9}, \ + { 1215,11}, { 319,10}, { 671, 9}, { 1343,11}, \ + { 351,10}, { 735,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 863,11}, { 447,10}, \ + { 895,13}, { 127,11}, { 511,10}, { 1023,11}, \ + { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \ + { 1471,12}, { 383,11}, { 799,10}, { 1599,11}, \ + { 863,10}, { 1727,12}, { 447,11}, { 991,10}, \ + { 1983,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1727,12}, { 959,11}, \ + { 1983,13}, { 511,12}, { 1087,11}, { 2239,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1471,11}, \ + { 2943,13}, { 767,12}, { 1727,13}, { 895,12}, \ + { 1983,14}, { 511,13}, { 1023,12}, { 2239,13}, \ + { 1151,12}, { 2495,13}, { 1279,12}, { 2623,13}, \ + { 1407,12}, { 2943,14}, { 767,13}, { 1535,12}, \ + { 3135,13}, { 1663,12}, { 3455,13}, { 1919,12}, \ + { 3839,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \ + { 3967,15}, { 1023,14}, { 2047,13}, { 4479,14}, \ + { 2303,13}, { 4991,12}, { 9983,14}, { 2559,13}, \ + { 5247,14}, { 2815,13}, { 5887,15}, { 1535,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 194 +#define MUL_FFT_THRESHOLD 6784 + +#define SQR_FFT_MODF_THRESHOLD 500 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 500, 5}, { 29, 6}, { 15, 5}, { 31, 6}, \ + { 16, 5}, { 33, 6}, { 29, 7}, { 15, 6}, \ + { 32, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \ + { 39, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 43, 9}, { 23, 8}, { 55,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \ + { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \ + { 95, 9}, { 191,10}, { 111,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383, 8}, { 767, 9}, { 399,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671,10}, { 351, 9}, { 703,10}, { 367, 9}, \ + { 735,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 399, 9}, { 799,10}, { 415, 9}, { 831,10}, \ + { 431, 9}, { 863,10}, { 447,12}, { 127,11}, \ + { 255,10}, { 511, 9}, { 1023,10}, { 543, 9}, \ + { 1087,11}, { 287,10}, { 607, 9}, { 1215,11}, \ + { 319,10}, { 671,11}, { 351,10}, { 735,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 863,11}, { 447,10}, { 895,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,11}, { 607,10}, { 1215,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,10}, { 1471,12}, \ + { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \ + { 447,11}, { 959,10}, { 1919,11}, { 991,13}, \ + 
{ 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1727,12}, { 959,11}, \ + { 1919,14}, { 255,13}, { 511,12}, { 1087,11}, \ + { 2239,12}, { 1215,11}, { 2431,13}, { 639,12}, \ + { 1471,11}, { 2943,13}, { 767,12}, { 1727,13}, \ + { 895,12}, { 1983,14}, { 511,13}, { 1023,12}, \ + { 2239,13}, { 1151,12}, { 2495,13}, { 1279,12}, \ + { 2623,13}, { 1407,12}, { 2943,14}, { 767,13}, \ + { 1535,12}, { 3071,13}, { 1663,12}, { 3455,13}, \ + { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \ + { 2175,12}, { 4479,13}, { 2431,14}, { 1279,13}, \ + { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \ + { 1791,13}, { 3967,15}, { 1023,14}, { 2047,13}, \ + { 4479,14}, { 2303,13}, { 4991,12}, { 9983,14}, \ + { 2559,13}, { 5119,14}, { 2815,13}, { 5887,15}, \ + { 1535,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 199 +#define SQR_FFT_THRESHOLD 4736 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 27 +#define MULLO_MUL_N_THRESHOLD 13463 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 26 +#define SQRLO_SQR_THRESHOLD 8907 + +#define DC_DIV_QR_THRESHOLD 38 +#define DC_DIVAPPR_Q_THRESHOLD 103 +#define DC_BDIV_QR_THRESHOLD 44 +#define DC_BDIV_Q_THRESHOLD 98 + +#define INV_MULMOD_BNM1_THRESHOLD 78 +#define INV_NEWTON_THRESHOLD 165 +#define INV_APPR_THRESHOLD 115 + +#define BINV_NEWTON_THRESHOLD 296 +#define REDC_1_TO_REDC_2_THRESHOLD 2 +#define REDC_2_TO_REDC_N_THRESHOLD 147 + +#define MU_DIV_QR_THRESHOLD 2089 +#define MU_DIVAPPR_Q_THRESHOLD 2089 +#define MUPI_DIV_QR_THRESHOLD 70 +#define MU_BDIV_QR_THRESHOLD 1718 +#define MU_BDIV_Q_THRESHOLD 2089 + +#define POWM_SEC_TABLE 7,19,107,480,1486 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 29 +#define SET_STR_DC_THRESHOLD 126 +#define SET_STR_PRECOMPUTE_THRESHOLD 541 + +#define FAC_DSC_THRESHOLD 132 +#define FAC_ODD_THRESHOLD 29 + +#define MATRIX22_STRASSEN_THRESHOLD 30 +#define HGCD2_DIV1_METHOD 1 /* 6.55% faster than 3 */ +#define HGCD_THRESHOLD 54 +#define HGCD_APPR_THRESHOLD 52 +#define HGCD_REDUCE_THRESHOLD 3524 +#define GCD_DC_THRESHOLD 303 +#define GCDEXT_DC_THRESHOLD 225 +#define JACOBI_BASE_METHOD 4 /* 9.73% faster than 1 */ + +/* Tuneup completed successfully, took 111418 seconds */ diff --git a/gmp-6.3.0/mpn/arm/v7a/cora17/mod_34lsub1.asm b/gmp-6.3.0/mpn/arm/v7a/cora17/mod_34lsub1.asm new file mode 100644 index 0000000..39e5a15 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora17/mod_34lsub1.asm @@ -0,0 +1,121 @@ +dnl ARM mpn_mod_34lsub1 -- remainder modulo 2^24-1. + +dnl Copyright 2012, 2013, 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A5 2.67 +C Cortex-A7 2.37 +C Cortex-A8 2.34 +C Cortex-A9 ? +C Cortex-A15 1.39 +C Cortex-A17 1.60 +C Cortex-A53 2.51 + +define(`ap', r0) +define(`n', r1) + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n) + +C TODO +C * Write cleverer summation code. +C * Consider loading 6 64-bit aligned registers at a time, to approach 1 c/l. + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mod_34lsub1) + push { r4, r5, r6, r7 } + + subs n, n, #3 + mov r7, #0 + blt L(le2) C n <= 2 + + ldmia ap!, { r2, r3, r12 } + subs n, n, #3 + blt L(sum) C n <= 5 + mov r7, #0 + b L(mid) + +L(top): adds r2, r2, r4 + adcs r3, r3, r5 + adcs r12, r12, r6 + adc r7, r7, #0 +L(mid): ldmia ap!, { r4, r5, r6 } + subs n, n, #3 + bpl L(top) + + adds r2, r2, r4 + adcs r3, r3, r5 + adcs r12, r12, r6 + adc r7, r7, #0 C r7 <= 1 + +L(sum): cmn n, #2 + movlo r4, #0 + ldrhs r4, [ap], #4 + movls r5, #0 + ldrhi r5, [ap], #4 + + adds r2, r2, r4 + adcs r3, r3, r5 + adcs r12, r12, #0 + adc r7, r7, #0 C r7 <= 2 + +L(sum2): + bic r0, r2, #0xff000000 + add r0, r0, r2, lsr #24 + add r0, r0, r7 + + mov r7, r3, lsl #8 + bic r2, r7, #0xff000000 + add r0, r0, r2 + add r0, r0, r3, lsr #16 + + mov r2, r12, lsl #16 + bic r1, r2, #0xff000000 + add r0, r0, r1 + add r0, r0, r12, lsr #8 + + pop { r4, r5, r6, r7 } + return lr + +L(le2): cmn n, #1 + bne L(1) + ldmia ap!, { r2, r3 } + mov r12, #0 + b L(sum2) +L(1): ldr r2, [ap] + bic r0, r2, #0xff000000 + add r0, r0, r2, lsr #24 + pop { r4, r5, r6, r7 } + return lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora17/mul_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora17/mul_1.asm new file mode 100644 index 0000000..d9b6042 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora17/mul_1.asm @@ -0,0 +1,34 @@ +dnl ARM mpn_mul_1 + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_mul_1) +include_mpn(`arm/v6/mul_1.asm') diff --git a/gmp-6.3.0/mpn/arm/v7a/cora17/submul_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora17/submul_1.asm new file mode 100644 index 0000000..f3e8139 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora17/submul_1.asm @@ -0,0 +1,34 @@ +dnl ARM mpn_submul_1 + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_submul_1) +include_mpn(`arm/v6/submul_1.asm') diff --git a/gmp-6.3.0/mpn/arm/v7a/cora5/gmp-mparam.h b/gmp-6.3.0/mpn/arm/v7a/cora5/gmp-mparam.h new file mode 100644 index 0000000..e3564e0 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora5/gmp-mparam.h @@ -0,0 +1,205 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 1500 MHz Cortex-A5 (odroid c1) */ +/* FFT tuning limit = 18,235,562 */ +/* Generated by tuneup.c, 2019-10-22, gcc 4.9 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 23 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 132.79% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 52 + +#define DIV_1_VS_MUL_1_PERCENT 213 + +#define MUL_TOOM22_THRESHOLD 48 +#define MUL_TOOM33_THRESHOLD 143 +#define MUL_TOOM44_THRESHOLD 262 +#define MUL_TOOM6H_THRESHOLD 414 +#define MUL_TOOM8H_THRESHOLD 527 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 153 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 168 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 152 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 180 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 226 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 66 +#define SQR_TOOM3_THRESHOLD 149 +#define SQR_TOOM4_THRESHOLD 348 +#define SQR_TOOM6_THRESHOLD 517 +#define SQR_TOOM8_THRESHOLD 608 + +#define MULMID_TOOM42_THRESHOLD 70 + +#define MULMOD_BNM1_THRESHOLD 26 +#define SQRMOD_BNM1_THRESHOLD 28 + +#define MUL_FFT_MODF_THRESHOLD 660 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 660, 5}, { 29, 6}, { 15, 5}, { 33, 6}, \ + { 17, 5}, { 35, 6}, { 29, 7}, { 15, 6}, \ + { 37, 7}, { 19, 6}, { 40, 7}, { 21, 6}, \ + { 43, 7}, { 37, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 51, 8}, { 27, 7}, { 55, 8}, \ + { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \ + { 55, 9}, { 31, 8}, { 71, 9}, { 39, 8}, \ + { 83, 9}, { 47, 8}, { 99, 9}, { 55,10}, \ + { 31, 9}, { 63, 8}, { 127, 9}, { 79,10}, \ + { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 167,10}, { 95, 9}, \ + { 191,10}, { 111,11}, { 63,10}, { 159,11}, \ + { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 399, 9}, { 799,10}, { 415,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 511, 9}, { 1023,10}, \ + { 543,11}, { 287,10}, { 607,11}, { 319,10}, \ + { 671,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 831,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,11}, { 575,10}, { 1151,11}, { 607,12}, \ + { 319,11}, { 703,12}, { 383,11}, { 831,12}, \ + { 447,11}, { 895,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1183,12}, { 639,11}, \ + { 1279,12}, { 703,13}, { 383,12}, { 767,11}, \ + { 1535,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1151,13}, { 639,12}, { 1407,13}, { 767,12}, \ + { 1599,13}, { 895,12}, { 1791,14}, { 511,13}, \ + { 1023,12}, { 2111,13}, { 1151,12}, { 2367,13}, \ + { 1279,12}, { 2559,13}, { 1407,14}, { 767,13}, \ + { 1535,12}, { 3071,13}, { 1663,12}, { 3327,13}, \ + { 1791,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4351,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 140 +#define MUL_FFT_THRESHOLD 7552 + +#define SQR_FFT_MODF_THRESHOLD 590 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 590, 5}, { 33, 6}, { 17, 5}, { 
35, 6}, \ + { 36, 7}, { 19, 6}, { 40, 7}, { 21, 6}, \ + { 43, 7}, { 23, 6}, { 47, 7}, { 37, 8}, \ + { 19, 7}, { 43, 8}, { 23, 7}, { 49, 8}, \ + { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \ + { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 167,10}, { 95, 9}, \ + { 191,10}, { 111,11}, { 63,10}, { 159,11}, \ + { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 335, 9}, { 671,10}, { 351,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 415,12}, { 127,11}, \ + { 255,10}, { 511, 9}, { 1023,10}, { 543, 9}, \ + { 1087,11}, { 287,10}, { 575, 9}, { 1151,10}, \ + { 607,11}, { 319,10}, { 671,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 831,13}, { 127,12}, { 255,11}, { 511,10}, \ + { 1023,11}, { 543,10}, { 1087,11}, { 575,10}, \ + { 1151,11}, { 607,12}, { 319,11}, { 735,12}, \ + { 383,11}, { 831,12}, { 447,11}, { 927,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1151,12}, { 639,11}, { 1279,12}, { 703,13}, \ + { 383,12}, { 767,11}, { 1535,12}, { 831,11}, \ + { 1663,12}, { 895,11}, { 1791,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \ + { 1151,13}, { 639,12}, { 1407,13}, { 767,12}, \ + { 1599,13}, { 895,12}, { 1791,14}, { 511,13}, \ + { 1023,12}, { 2111,13}, { 1151,12}, { 2367,13}, \ + { 1279,12}, { 2559,13}, { 1407,14}, { 767,13}, \ + { 1535,12}, { 3071,13}, { 1663,12}, { 3327,13}, \ + { 1791,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4351,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 144 +#define SQR_FFT_THRESHOLD 5760 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 39 +#define MULLO_MUL_N_THRESHOLD 14709 +#define SQRLO_BASECASE_THRESHOLD 8 +#define SQRLO_DC_THRESHOLD 33 +#define SQRLO_SQR_THRESHOLD 11278 + +#define DC_DIV_QR_THRESHOLD 36 +#define DC_DIVAPPR_Q_THRESHOLD 116 +#define DC_BDIV_QR_THRESHOLD 48 +#define DC_BDIV_Q_THRESHOLD 140 + +#define INV_MULMOD_BNM1_THRESHOLD 95 +#define INV_NEWTON_THRESHOLD 181 +#define INV_APPR_THRESHOLD 125 + +#define BINV_NEWTON_THRESHOLD 327 +#define REDC_1_TO_REDC_2_THRESHOLD 0 /* always */ +#define REDC_2_TO_REDC_N_THRESHOLD 152 + +#define MU_DIV_QR_THRESHOLD 2350 +#define MU_DIVAPPR_Q_THRESHOLD 2130 +#define MUPI_DIV_QR_THRESHOLD 98 +#define MU_BDIV_QR_THRESHOLD 1970 +#define MU_BDIV_Q_THRESHOLD 2172 + +#define POWM_SEC_TABLE 6,37,108,624,2351 + +#define GET_STR_DC_THRESHOLD 28 +#define GET_STR_PRECOMPUTE_THRESHOLD 44 +#define SET_STR_DC_THRESHOLD 309 +#define SET_STR_PRECOMPUTE_THRESHOLD 762 + +#define FAC_DSC_THRESHOLD 236 +#define FAC_ODD_THRESHOLD 29 + +#define MATRIX22_STRASSEN_THRESHOLD 25 +#define HGCD2_DIV1_METHOD 5 /* 2.92% faster than 3 */ +#define HGCD_THRESHOLD 70 +#define HGCD_APPR_THRESHOLD 59 +#define HGCD_REDUCE_THRESHOLD 4120 +#define GCD_DC_THRESHOLD 229 +#define GCDEXT_DC_THRESHOLD 233 +#define JACOBI_BASE_METHOD 1 /* 17.07% faster than 4 */ + +/* Tuneup completed successfully, took 47845 seconds */ diff --git a/gmp-6.3.0/mpn/arm/v7a/cora7/gmp-mparam.h b/gmp-6.3.0/mpn/arm/v7a/cora7/gmp-mparam.h new file mode 100644 index 0000000..78de045 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora7/gmp-mparam.h @@ -0,0 +1,202 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 900 MHz Cortex-A7 (raspberry pi2) */ +/* FFT tuning limit = 21,559,921 */ +/* Generated by tuneup.c, 2019-10-22, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 18 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 64.16% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 48 + +#define DIV_1_VS_MUL_1_PERCENT 216 + +#define MUL_TOOM22_THRESHOLD 39 +#define MUL_TOOM33_THRESHOLD 129 +#define MUL_TOOM44_THRESHOLD 196 +#define MUL_TOOM6H_THRESHOLD 327 +#define MUL_TOOM8H_THRESHOLD 478 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 129 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 183 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 132 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 144 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 190 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 52 +#define SQR_TOOM3_THRESHOLD 162 +#define SQR_TOOM4_THRESHOLD 268 +#define SQR_TOOM6_THRESHOLD 399 +#define SQR_TOOM8_THRESHOLD 547 + +#define MULMID_TOOM42_THRESHOLD 50 + +#define MULMOD_BNM1_THRESHOLD 21 +#define SQRMOD_BNM1_THRESHOLD 25 + +#define MUL_FFT_MODF_THRESHOLD 636 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 636, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \ + { 29, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 49, 8}, { 27, 7}, { 55, 8}, \ + { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \ + { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 83, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 103,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 95, 9}, { 191,10}, { 111,11}, \ + { 63,10}, { 159,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335, 9}, { 671,10}, { 351,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \ + { 799,10}, { 415,11}, { 223,12}, { 127,11}, \ + { 255,10}, { 511, 9}, { 1023,10}, { 543,11}, \ + { 287,10}, { 607,11}, { 319,10}, { 671,11}, \ + { 351,12}, { 191,11}, { 383,10}, { 799,11}, 
\ + { 415,10}, { 831,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,10}, { 1087,11}, \ + { 607,12}, { 319,11}, { 735,12}, { 383,11}, \ + { 863,12}, { 447,11}, { 959,13}, { 255,12}, \ + { 511,11}, { 1087,12}, { 575,11}, { 1215,12}, \ + { 639,11}, { 1279,12}, { 703,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 959,14}, { 255,13}, \ + { 511,12}, { 1215,13}, { 639,12}, { 1471,13}, \ + { 767,12}, { 1663,13}, { 895,12}, { 1855,14}, \ + { 511,13}, { 1023,12}, { 2111,13}, { 1151,12}, \ + { 2431,13}, { 1407,14}, { 767,13}, { 1663,12}, \ + { 3327,13}, { 1791,15}, { 511,14}, { 1023,13}, \ + { 2431,14}, { 1279,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 133 +#define MUL_FFT_THRESHOLD 6784 + +#define SQR_FFT_MODF_THRESHOLD 535 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 535, 5}, { 25, 6}, { 13, 5}, { 28, 6}, \ + { 15, 5}, { 31, 6}, { 29, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \ + { 39, 7}, { 29, 8}, { 15, 7}, { 37, 8}, \ + { 19, 7}, { 43, 8}, { 23, 7}, { 49, 8}, \ + { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \ + { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \ + { 191,10}, { 111,11}, { 63,10}, { 143, 9}, \ + { 287,10}, { 159,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \ + { 351,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 399, 9}, { 799,10}, { 415, 9}, { 831,11}, \ + { 223,12}, { 127,10}, { 543,11}, { 287,10}, \ + { 607,11}, { 319,10}, { 671,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 799,11}, \ + { 415,10}, { 831,13}, { 127,11}, { 511,10}, \ + { 1023,11}, { 543,10}, { 1087,11}, { 607,12}, \ + { 319,11}, { 735,12}, { 383,11}, { 863,12}, \ + { 447,11}, { 991,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,12}, { 639,11}, { 1279,12}, \ + { 703,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 831,11}, { 1663,12}, { 959,13}, { 511,12}, \ + { 1215,13}, { 639,12}, { 1471,13}, { 767,12}, \ + { 1663,13}, { 895,12}, { 1855,14}, { 511,13}, \ + { 1023,12}, { 2111,13}, { 1151,12}, { 2431,13}, \ + { 1407,14}, { 767,13}, { 1791,15}, { 511,14}, \ + { 1023,13}, { 2431,14}, { 1279,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 134 +#define SQR_FFT_THRESHOLD 4736 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 27 +#define MULLO_MUL_N_THRESHOLD 13463 +#define SQRLO_BASECASE_THRESHOLD 5 +#define SQRLO_DC_THRESHOLD 31 +#define SQRLO_SQR_THRESHOLD 9449 + +#define DC_DIV_QR_THRESHOLD 28 +#define DC_DIVAPPR_Q_THRESHOLD 90 +#define DC_BDIV_QR_THRESHOLD 32 +#define DC_BDIV_Q_THRESHOLD 110 + +#define INV_MULMOD_BNM1_THRESHOLD 78 +#define INV_NEWTON_THRESHOLD 134 +#define INV_APPR_THRESHOLD 98 + +#define BINV_NEWTON_THRESHOLD 278 +#define REDC_1_TO_REDC_2_THRESHOLD 4 +#define REDC_2_TO_REDC_N_THRESHOLD 123 + +#define MU_DIV_QR_THRESHOLD 1718 +#define MU_DIVAPPR_Q_THRESHOLD 1685 +#define MUPI_DIV_QR_THRESHOLD 62 +#define MU_BDIV_QR_THRESHOLD 1528 +#define MU_BDIV_Q_THRESHOLD 1718 + +#define POWM_SEC_TABLE 1,22,95,563,1955 + +#define GET_STR_DC_THRESHOLD 28 +#define GET_STR_PRECOMPUTE_THRESHOLD 51 +#define SET_STR_DC_THRESHOLD 182 +#define SET_STR_PRECOMPUTE_THRESHOLD 638 + +#define FAC_DSC_THRESHOLD 153 +#define FAC_ODD_THRESHOLD 56 + +#define MATRIX22_STRASSEN_THRESHOLD 25 +#define HGCD2_DIV1_METHOD 1 /* 5.04% 
faster than 3 */ +#define HGCD_THRESHOLD 55 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 153 +#define GCDEXT_DC_THRESHOLD 180 +#define JACOBI_BASE_METHOD 1 /* 30.60% faster than 4 */ + +/* Tuneup completed successfully, took 75202 seconds */ diff --git a/gmp-6.3.0/mpn/arm/v7a/cora8/bdiv_q_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora8/bdiv_q_1.asm new file mode 100644 index 0000000..e74b260 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora8/bdiv_q_1.asm @@ -0,0 +1,158 @@ +dnl ARM v6 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor. +dnl This is v6 code but it runs well on just the v7a Cortex-A8, A9, and A15. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
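
[Editorial note: for orientation, here is what the routine below computes, as a minimal C sketch. It assumes 32-bit limbs (matching GMP_LIMB_BITS above) and an odd divisor d dividing the input exactly; the seed (3*d) XOR 2 stands in for GMP's binvert_limb_table lookup, and the real code first strips trailing zero bits from an even d. This illustrates the Hensel-division idea only; it is not the GMP implementation.]

#include <stdint.h>

/* Inverse of odd d modulo 2^32 by Newton iteration; each step doubles
   the number of correct low bits.  The assembly instead seeds 8 bits
   from binvert_limb_table and needs only two refinement steps. */
static uint32_t binvert32 (uint32_t d)
{
  uint32_t x = (3 * d) ^ 2;   /* 5 correct bits */
  x *= 2 - d * x;             /* 10 bits */
  x *= 2 - d * x;             /* 20 bits */
  x *= 2 - d * x;             /* >= 32 bits: x * d == 1 (mod 2^32) */
  return x;
}

/* Hensel (exact) division of {up,n} by odd d, least significant limb
   first: one multiply recovers each quotient limb mod 2^32, and the
   high half of q*d is carried into the next limb as a borrow. */
static void bdiv_q_1_sketch (uint32_t *qp, const uint32_t *up,
                             long n, uint32_t d)
{
  uint32_t dinv = binvert32 (d), c = 0;
  for (long i = 0; i < n; i++)
    {
      uint32_t b = up[i] < c;        /* borrow out of the subtraction */
      uint32_t s = up[i] - c;
      uint32_t q = s * dinv;         /* unique q with q*d == s (mod 2^32) */
      qp[i] = q;
      c = (uint32_t) (((uint64_t) q * d) >> 32) + b;
    }
}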
+ +include(`../config.m4') + +C cycles/limb +C norm unorm +C 1176 - - +C Cortex-A5 9 13 +C Cortex-A7 12 18 +C Cortex-A8 13 14 +C Cortex-A9 9 10 not measured since latest edits +C Cortex-A15 7 7 +C Cortex-A53 16 24 + +C Architecture requirements: +C v5 - +C v5t clz +C v5te - +C v6 umaal +C v6t2 - +C v7a - + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`d', `r3') +define(`di_arg', `sp[0]') C just mpn_pi1_bdiv_q_1 +define(`cnt_arg', `sp[4]') C just mpn_pi1_bdiv_q_1 + +define(`cy', `r7') +define(`cnt', `r6') +define(`tnc', `r4') + +ASM_START() +PROLOGUE(mpn_bdiv_q_1) + push {r6-r11} + + rsb r10, d, #0 + and r10, r10, d + clz r10, r10 + rsbs cnt, r10, #31 C count_trailing_zeros + mov d, d, lsr cnt + +C binvert limb + LEA( r10, binvert_limb_table) + and r12, d, #254 + ldrb r10, [r10, r12, lsr #1] + mul r12, r10, r10 + mul r12, d, r12 + rsb r12, r12, r10, lsl #1 + mul r10, r12, r12 + mul r10, d, r10 + rsb r10, r10, r12, lsl #1 C r10 = inverse + b L(pi1) +EPILOGUE() + +PROLOGUE(mpn_pi1_bdiv_q_1) + push {r6-r11} + + ldr cnt, [sp, #28] + ldr r10, [sp, #24] + cmp cnt, #0 + +L(pi1): ldr r11, [up], #4 C up[0] + mov cy, #0 + rsb r8, r10, #0 C r8 = -inverse + bne L(unorm) + +L(norm): + subs n, n, #1 + mul r11, r11, r10 + beq L(edn) + + ALIGN(16) +L(tpn): ldr r9, [up], #4 + mov r12, #0 + str r11, [rp], #4 + umaal r12, cy, r11, d + mul r11, r9, r10 + mla r11, cy, r8, r11 + subs n, n, #1 + bne L(tpn) + +L(edn): str r11, [rp] + pop {r6-r11} + bx r14 + +L(unorm): + push {r4-r5} + rsb tnc, cnt, #32 + mov r5, r11, lsr cnt + subs n, n, #1 + beq L(ed1) + + ldr r12, [up], #4 + orr r9, r5, r12, lsl tnc + mov r5, r12, lsr cnt + mul r11, r9, r10 + subs n, n, #1 + beq L(edu) + + ALIGN(16) +L(tpu): ldr r12, [up], #4 + orr r9, r5, r12, lsl tnc + mov r5, r12, lsr cnt + mov r12, #0 + str r11, [rp], #4 + umaal r12, cy, r11, d + mul r11, r9, r10 + mla r11, cy, r8, r11 + subs n, n, #1 + bne L(tpu) + +L(edu): str r11, [rp], #4 + mov r12, #0 + umaal r12, cy, r11, d + mul r11, r5, r10 + mla r11, cy, r8, r11 + str r11, [rp] + pop {r4-r11} + bx r14 + +L(ed1): mul r11, r5, r10 + str r11, [rp] + pop {r4-r11} + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora8/gmp-mparam.h b/gmp-6.3.0/mpn/arm/v7a/cora8/gmp-mparam.h new file mode 100644 index 0000000..5864841 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora8/gmp-mparam.h @@ -0,0 +1,207 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 1000 MHz Cortex-A8 (beaglebone black) */ +/* FFT tuning limit = 9,464,348 */ +/* Generated by tuneup.c, 2019-10-23, gcc 6.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD MP_SIZE_T_MAX +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 50.65% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 31 + +#define DIV_1_VS_MUL_1_PERCENT 192 + +#define MUL_TOOM22_THRESHOLD 39 +#define MUL_TOOM33_THRESHOLD 129 +#define MUL_TOOM44_THRESHOLD 226 +#define MUL_TOOM6H_THRESHOLD 366 +#define MUL_TOOM8H_THRESHOLD 620 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 141 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 183 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 154 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 160 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 193 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 46 +#define SQR_TOOM3_THRESHOLD 145 +#define SQR_TOOM4_THRESHOLD 375 +#define SQR_TOOM6_THRESHOLD 0 /* always */ +#define SQR_TOOM8_THRESHOLD 547 + +#define MULMID_TOOM42_THRESHOLD 38 + +#define MULMOD_BNM1_THRESHOLD 22 +#define SQRMOD_BNM1_THRESHOLD 23 + +#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 476, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \ + { 28, 7}, { 15, 6}, { 33, 7}, { 19, 6}, \ + { 39, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 43, 8}, { 23, 7}, { 51, 8}, \ + { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \ + { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ + { 71, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \ + { 99, 9}, { 55,10}, { 31, 9}, { 87,10}, \ + { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 167,10}, { 95, 9}, \ + { 199,10}, { 111,11}, { 63,10}, { 127, 9}, \ + { 255,10}, { 143, 9}, { 287, 8}, { 575,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383, 8}, { 767, 9}, { 399,10}, { 207,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671,10}, { 351, 9}, { 703,10}, { 367,11}, \ + { 191,10}, { 399, 9}, { 799,10}, { 415,11}, \ + { 223,12}, { 127,11}, { 255,10}, { 543,11}, \ + { 287,10}, { 607, 9}, { 1215,11}, { 319,10}, \ + { 671,11}, { 351,10}, { 703,12}, { 191,11}, \ + { 383,10}, { 799,11}, { 415,10}, { 863,11}, \ + { 447,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 735,12}, { 383,11}, { 799,10}, \ + { 1599,11}, { 863,12}, { 447,11}, { 959,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,12}, { 639,11}, { 1343,12}, { 703,13}, \ + { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \ + { 1663,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1215,13}, { 639,12}, { 1407,13}, { 767,12}, \ + { 1663,13}, { 895,12}, { 1791,14}, { 511,13}, \ + { 1023,12}, { 2111,13}, { 1151,12}, { 4096,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 139 +#define MUL_FFT_THRESHOLD 6784 + +#define SQR_FFT_MODF_THRESHOLD 436 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 436, 5}, 
{ 25, 6}, { 13, 5}, { 27, 6}, \ + { 28, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \ + { 43, 9}, { 23, 8}, { 55,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \ + { 79,10}, { 47, 9}, { 103,11}, { 31,10}, \ + { 63, 9}, { 135,10}, { 79, 9}, { 159, 8}, \ + { 319, 9}, { 167,10}, { 95, 9}, { 191,10}, \ + { 111,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \ + { 575, 9}, { 303,10}, { 159, 9}, { 319,11}, \ + { 95,10}, { 191, 9}, { 383, 8}, { 767, 9}, \ + { 399,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287, 9}, \ + { 575,10}, { 303,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335, 9}, { 671,10}, { 351, 9}, \ + { 703,10}, { 367,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 399, 9}, { 799,10}, { 415, 9}, \ + { 831,11}, { 223,10}, { 447,12}, { 127,11}, \ + { 255,10}, { 511, 9}, { 1023,10}, { 543,11}, \ + { 287,10}, { 607,11}, { 319,10}, { 671,11}, \ + { 351,10}, { 735,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 863,11}, { 447,10}, \ + { 895,13}, { 127,12}, { 255,11}, { 511,10}, \ + { 1023,11}, { 543,10}, { 1087,11}, { 607,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 735,12}, \ + { 383,11}, { 863,12}, { 447,11}, { 959,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,12}, { 639,11}, { 1343,12}, { 703,11}, \ + { 1407,13}, { 383,12}, { 767,11}, { 1599,12}, \ + { 831,11}, { 1663,12}, { 959,14}, { 255,13}, \ + { 511,12}, { 1215,13}, { 639,12}, { 1471,13}, \ + { 767,12}, { 1663,13}, { 895,12}, { 1855,14}, \ + { 511,13}, { 1023,12}, { 2111,13}, { 1151,12}, \ + { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 152 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 21 +#define MULLO_DC_THRESHOLD 0 /* never mpn_mullo_basecase */ +#define MULLO_MUL_N_THRESHOLD 13463 +#define SQRLO_BASECASE_THRESHOLD 9 +#define SQRLO_DC_THRESHOLD 17 +#define SQRLO_SQR_THRESHOLD 7246 + +#define DC_DIV_QR_THRESHOLD 27 +#define DC_DIVAPPR_Q_THRESHOLD 74 +#define DC_BDIV_QR_THRESHOLD 21 +#define DC_BDIV_Q_THRESHOLD 64 + +#define INV_MULMOD_BNM1_THRESHOLD 78 +#define INV_NEWTON_THRESHOLD 31 +#define INV_APPR_THRESHOLD 37 + +#define BINV_NEWTON_THRESHOLD 167 +#define REDC_1_TO_REDC_2_THRESHOLD 4 +#define REDC_2_TO_REDC_N_THRESHOLD 198 + +#define MU_DIV_QR_THRESHOLD 1858 +#define MU_DIVAPPR_Q_THRESHOLD 1685 +#define MUPI_DIV_QR_THRESHOLD 43 +#define MU_BDIV_QR_THRESHOLD 1589 +#define MU_BDIV_Q_THRESHOLD 1685 + +#define POWM_SEC_TABLE 1,13,96,487,1378 + +#define GET_STR_DC_THRESHOLD 18 +#define GET_STR_PRECOMPUTE_THRESHOLD 36 +#define SET_STR_DC_THRESHOLD 145 +#define SET_STR_PRECOMPUTE_THRESHOLD 505 + +#define FAC_DSC_THRESHOLD 137 +#define FAC_ODD_THRESHOLD 29 + +#define MATRIX22_STRASSEN_THRESHOLD 24 +#define HGCD2_DIV1_METHOD 5 /* 4.29% faster than 4 */ +#define HGCD_THRESHOLD 39 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 3524 +#define GCD_DC_THRESHOLD 116 +#define GCDEXT_DC_THRESHOLD 124 +#define JACOBI_BASE_METHOD 4 /* 5.89% faster than 1 */ + +/* Tuneup completed successfully, took 48230 seconds */ diff --git a/gmp-6.3.0/mpn/arm/v7a/cora9/bdiv_q_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora9/bdiv_q_1.asm new file mode 100644 index 0000000..245b371 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora9/bdiv_q_1.asm @@ -0,0 +1,36 @@ +dnl ARM mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor. 
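
[Editorial note on how headers like the tuned one above are consumed: each *_THRESHOLD marks the operand size, in limbs, at which one algorithm overtakes another on the measured core, and the FFT tables map operand sizes to the transform split parameter k. Below is a minimal sketch of the dispatch pattern, assuming GMP's internal entry points from gmp-impl.h; the real selector in mpn/generic/mul_n.c also sizes scratch space with mpn_toom22_mul_itch and friends and covers many more crossover cases.]

#include "gmp-impl.h"   /* thresholds, BELOW_THRESHOLD, internal mpn_* */

/* Illustrative threshold-driven dispatch for an n x n limb multiply;
   `scratch' is assumed large enough for the Toom routines. */
static void
mul_n_sketch (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n,
              mp_ptr scratch)
{
  if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
    mpn_mul_basecase (rp, ap, n, bp, n);         /* schoolbook, O(n^2) */
  else if (BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))
    mpn_toom22_mul (rp, ap, n, bp, n, scratch);  /* Karatsuba */
  else if (BELOW_THRESHOLD (n, MUL_FFT_THRESHOLD))
    mpn_toom33_mul (rp, ap, n, bp, n, scratch);  /* Toom-3 (and up) */
  else
    mpn_mul_fft_full (rp, ap, n, bp, n);         /* Schoenhage-Strassen */
}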
+ +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) +include_mpn(`arm/v7a/cora8/bdiv_q_1.asm') diff --git a/gmp-6.3.0/mpn/arm/v7a/cora9/gmp-mparam.h b/gmp-6.3.0/mpn/arm/v7a/cora9/gmp-mparam.h new file mode 100644 index 0000000..5c54012 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora9/gmp-mparam.h @@ -0,0 +1,211 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010, 2012-2015 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 1000 MHz Cortex-A9 */ +/* FFT tuning limit = 25 M */ +/* Generated by tuneup.c, 2014-03-12, gcc 4.6 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD 5 +#define DIV_QR_1_UNNORM_THRESHOLD 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 20 + +#define DIV_1_VS_MUL_1_PERCENT 190 + +#define MUL_TOOM22_THRESHOLD 45 +#define MUL_TOOM33_THRESHOLD 129 +#define MUL_TOOM44_THRESHOLD 387 +#define MUL_TOOM6H_THRESHOLD 537 +#define MUL_TOOM8H_THRESHOLD 774 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 141 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 237 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 141 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 258 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 211 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 64 +#define SQR_TOOM3_THRESHOLD 189 +#define SQR_TOOM4_THRESHOLD 517 +#define SQR_TOOM6_THRESHOLD 656 +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 62 + +#define MULMOD_BNM1_THRESHOLD 23 +#define SQRMOD_BNM1_THRESHOLD 28 + +#define MUL_FFT_MODF_THRESHOLD 630 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 630, 5}, { 29, 6}, { 15, 5}, { 33, 6}, \ + { 17, 5}, { 35, 6}, { 36, 7}, { 19, 6}, \ + { 40, 7}, { 21, 6}, { 43, 7}, { 23, 6}, \ + { 47, 7}, { 25, 6}, { 51, 7}, { 27, 6}, \ + { 55, 7}, { 29, 8}, { 15, 7}, { 37, 8}, \ + { 19, 7}, { 43, 8}, { 23, 7}, { 51, 8}, \ + { 27, 7}, { 57, 9}, { 15, 8}, { 31, 7}, \ + { 65, 8}, { 35, 7}, { 71, 8}, { 43, 9}, \ + { 23, 8}, { 55, 9}, { 31, 8}, { 71, 9}, \ + { 39, 8}, { 83, 9}, { 47, 8}, { 99, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 103,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 167,10}, { 95, 9}, { 191,10}, \ + { 111,11}, { 63,10}, { 159,11}, { 95,10}, \ + { 191, 9}, { 383,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 399, 9}, { 799,10}, { 415,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 511, 9}, { 1023,10}, \ + { 543,11}, { 287,10}, { 607,11}, { 319,10}, \ + { 671,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 831,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,11}, { 607,12}, { 319,11}, { 735,12}, \ + { 383,11}, { 831,12}, { 447,11}, { 927,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,12}, { 639,11}, { 1343,12}, { 703,13}, \ + { 383,12}, { 767,11}, { 1535,12}, { 831,11}, \ + { 1663,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1023,11}, { 2047,12}, { 1151,13}, { 639,12}, \ + { 1407,13}, { 767,12}, { 1663,13}, { 895,12}, \ + { 1791,14}, { 511,13}, { 1023,12}, { 2111,13}, \ + { 1151,12}, { 2431,13}, { 1279,12}, { 2559,13}, \ + { 1407,14}, { 767,13}, { 1535,12}, { 3071,13}, \ + { 1663,12}, { 3455,13}, { 1791,15}, { 511,14}, \ + { 1023,13}, { 2047,12}, { 4095,13}, { 2175,12}, \ + { 4479,13}, { 2431,14}, { 1279,13}, { 2559,12}, \ + { 5119,13}, { 2815,12}, { 5631,14}, { 16384,15}, \ + { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 157 +#define 
MUL_FFT_THRESHOLD 6784 + +#define SQR_FFT_MODF_THRESHOLD 565 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 565, 5}, { 19, 4}, { 40, 5}, { 21, 4}, \ + { 43, 5}, { 28, 6}, { 15, 5}, { 35, 6}, \ + { 29, 7}, { 15, 6}, { 37, 7}, { 19, 6}, \ + { 39, 7}, { 21, 6}, { 43, 7}, { 23, 6}, \ + { 47, 7}, { 29, 8}, { 15, 7}, { 37, 8}, \ + { 19, 7}, { 43, 8}, { 23, 7}, { 51, 8}, \ + { 27, 7}, { 55, 9}, { 15, 8}, { 31, 7}, \ + { 65, 8}, { 35, 7}, { 71, 8}, { 43, 9}, \ + { 23, 8}, { 55,10}, { 15, 9}, { 31, 8}, \ + { 71, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \ + { 191,10}, { 111,11}, { 63,10}, { 159,11}, \ + { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511, 8}, { 1023, 9}, \ + { 527,10}, { 271, 9}, { 543,10}, { 287,11}, \ + { 159, 9}, { 639,10}, { 335, 9}, { 671,10}, \ + { 351,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 399, 9}, { 799,10}, { 415,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 511, 9}, { 1023,10}, \ + { 543,11}, { 287,10}, { 671,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 831,13}, { 127,12}, { 255,11}, { 511,10}, \ + { 1023,11}, { 543,10}, { 1087,11}, { 735,12}, \ + { 383,11}, { 831,12}, { 447,11}, { 927,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1151,12}, { 639,11}, { 1343,12}, { 703,13}, \ + { 383,12}, { 767,11}, { 1535,12}, { 831,11}, \ + { 1663,12}, { 959,13}, { 511,12}, { 1023,11}, \ + { 2047,12}, { 1151,13}, { 639,12}, { 1407,13}, \ + { 767,12}, { 1599,13}, { 895,12}, { 1791,14}, \ + { 511,13}, { 1023,12}, { 2111,13}, { 1151,12}, \ + { 2431,13}, { 1279,12}, { 2559,13}, { 1407,14}, \ + { 767,13}, { 1535,12}, { 3071,13}, { 1663,12}, \ + { 3455,13}, { 1791,15}, { 511,14}, { 1023,13}, \ + { 2047,12}, { 4095,13}, { 2175,12}, { 4479,13}, \ + { 2303,14}, { 1279,13}, { 2559,12}, { 5119,13}, \ + { 2815,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 155 +#define SQR_FFT_THRESHOLD 5568 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 37 +#define MULLO_MUL_N_THRESHOLD 13463 +#define SQRLO_BASECASE_THRESHOLD 12 +#define SQRLO_DC_THRESHOLD 22 +#define SQRLO_SQR_THRESHOLD 10950 + +#define DC_DIV_QR_THRESHOLD 32 +#define DC_DIVAPPR_Q_THRESHOLD 99 +#define DC_BDIV_QR_THRESHOLD 43 +#define DC_BDIV_Q_THRESHOLD 102 + +#define INV_MULMOD_BNM1_THRESHOLD 88 +#define INV_NEWTON_THRESHOLD 141 +#define INV_APPR_THRESHOLD 111 + +#define BINV_NEWTON_THRESHOLD 312 +#define REDC_1_TO_REDC_2_THRESHOLD 6 +#define REDC_2_TO_REDC_N_THRESHOLD 140 + +#define MU_DIV_QR_THRESHOLD 2492 +#define MU_DIVAPPR_Q_THRESHOLD 2130 +#define MUPI_DIV_QR_THRESHOLD 55 +#define MU_BDIV_QR_THRESHOLD 2130 +#define MU_BDIV_Q_THRESHOLD 2172 + +#define POWM_SEC_TABLE 40,53,56,71,1985 + +#define GET_STR_DC_THRESHOLD 16 +#define GET_STR_PRECOMPUTE_THRESHOLD 33 +#define SET_STR_DC_THRESHOLD 172 +#define SET_STR_PRECOMPUTE_THRESHOLD 671 + +#define FAC_DSC_THRESHOLD 309 +#define FAC_ODD_THRESHOLD 29 + +#define MATRIX22_STRASSEN_THRESHOLD 24 +#define HGCD_THRESHOLD 61 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 4120 +#define GCD_DC_THRESHOLD 408 +#define GCDEXT_DC_THRESHOLD 303 +#define JACOBI_BASE_METHOD 4 diff --git a/gmp-6.3.0/mpn/arm64/aors_n.asm b/gmp-6.3.0/mpn/arm64/aors_n.asm new file mode 100644 index 0000000..b4a6da6 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/aors_n.asm @@ -0,0 +1,125 @@ +dnl ARM64 mpn_add_n and mpn_sub_n + +dnl Contributed to the GNU project by Torbjörn Granlund. 
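
[Editorial note: reference semantics for the file that follows, as a small C sketch with 64-bit limbs. mpn_add_n returns the final carry; mpn_sub_n has the same shape with borrows, and the _nc variants additionally take a carry-in. The assembly keeps the carry alive in the CPU flags across a 4-way unrolled ldp/stp loop instead of rebuilding it per limb, which is where its speed comes from.]

#include <stdint.h>

/* rp[] = up[] + vp[], returning the carry out of the top limb (0 or 1). */
static uint64_t add_n_sketch (uint64_t *rp, const uint64_t *up,
                              const uint64_t *vp, long n)
{
  uint64_t cy = 0;
  for (long i = 0; i < n; i++)
    {
      uint64_t s = up[i] + cy;
      cy = s < cy;              /* carry from adding the carry-in */
      s += vp[i];
      cy += s < vp[i];          /* carry from the main add; total <= 1 */
      rp[i] = s;
    }
  return cy;
}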
+ +dnl Copyright 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Cortex-A53 2.75-3.25 +C Cortex-A57 1.5 +C X-Gene 2.0 + +changecom(blah) + +define(`rp', `x0') +define(`up', `x1') +define(`vp', `x2') +define(`n', `x3') + +ifdef(`OPERATION_add_n', ` + define(`ADDSUBC', adcs) + define(`CLRCY', `cmn xzr, xzr') + define(`SETCY', `cmp $1, #1') + define(`RETVAL', `cset x0, cs') + define(`func_n', mpn_add_n) + define(`func_nc', mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(`ADDSUBC', sbcs) + define(`CLRCY', `cmp xzr, xzr') + define(`SETCY', `cmp xzr, $1') + define(`RETVAL', `cset x0, cc') + define(`func_n', mpn_sub_n) + define(`func_nc', mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ASM_START() +PROLOGUE(func_nc) + SETCY( x4) + b L(ent) +EPILOGUE() +PROLOGUE(func_n) + CLRCY +L(ent): lsr x17, n, #2 + tbz n, #0, L(bx0) + +L(bx1): ldr x7, [up] + ldr x11, [vp] + ADDSUBC x13, x7, x11 + str x13, [rp],#8 + tbnz n, #1, L(b11) + +L(b01): cbz x17, L(ret) + ldp x4, x5, [up,#8] + ldp x8, x9, [vp,#8] + sub up, up, #8 + sub vp, vp, #8 + b L(mid) + +L(b11): ldp x6, x7, [up,#8] + ldp x10, x11, [vp,#8] + add up, up, #8 + add vp, vp, #8 + cbz x17, L(end) + b L(top) + +L(bx0): tbnz n, #1, L(b10) + +L(b00): ldp x4, x5, [up] + ldp x8, x9, [vp] + sub up, up, #16 + sub vp, vp, #16 + b L(mid) + +L(b10): ldp x6, x7, [up] + ldp x10, x11, [vp] + cbz x17, L(end) + + ALIGN(16) +L(top): ldp x4, x5, [up,#16] + ldp x8, x9, [vp,#16] + ADDSUBC x12, x6, x10 + ADDSUBC x13, x7, x11 + stp x12, x13, [rp],#16 +L(mid): ldp x6, x7, [up,#32]! + ldp x10, x11, [vp,#32]! + ADDSUBC x12, x4, x8 + ADDSUBC x13, x5, x9 + stp x12, x13, [rp],#16 + sub x17, x17, #1 + cbnz x17, L(top) + +L(end): ADDSUBC x12, x6, x10 + ADDSUBC x13, x7, x11 + stp x12, x13, [rp] +L(ret): RETVAL + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/aorsmul_1.asm b/gmp-6.3.0/mpn/arm64/aorsmul_1.asm new file mode 100644 index 0000000..81ec1da --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/aorsmul_1.asm @@ -0,0 +1,145 @@ +dnl ARM64 mpn_addmul_1 and mpn_submul_1 + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013, 2015, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
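
[Editorial note: the same kind of reference sketch for the file below. mpn_addmul_1 accumulates a 1-limb multiple into rp and returns the carry limb; mpn_submul_1 subtracts instead and returns the borrow limb. unsigned __int128 is a GCC/Clang extension used only to keep the sketch short; the assembly forms the same double-limb product with mul/umulh pairs and, as its NOTES say, folds the flags carry into a register with csinc so the add and sub variants stay symmetric.]

#include <stdint.h>
typedef unsigned __int128 u128;

/* rp[] += up[] * v0, returning the high limb that falls off the end. */
static uint64_t addmul_1_sketch (uint64_t *rp, const uint64_t *up,
                                 long n, uint64_t v0)
{
  uint64_t cy = 0;
  for (long i = 0; i < n; i++)
    {
      u128 t = (u128) up[i] * v0 + rp[i] + cy;  /* fits in 128 bits */
      rp[i] = (uint64_t) t;
      cy = (uint64_t) (t >> 64);
    }
  return cy;
}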
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C addmul_1 submul_1 +C cycles/limb cycles/limb +C Cortex-A53 9.3-9.8 9.3-9.8 +C Cortex-A55 9.0-9.5 9.3-9.8 +C Cortex-A57 7 7 +C Cortex-A72 +C Cortex-A73 6 6 +C X-Gene 5 5 +C Apple M1 1.75 1.75 + +C NOTES +C * It is possible to keep the carry chain alive between the addition blocks +C and thus avoid csinc, but only for addmul_1. Since that saves no time +C on the tested pipelines, we keep addmul_1 and submul_1 similar. +C * We could separate feed-in into 4 blocks, one for each residue (mod 4). +C That is likely to save a few cycles. + +changecom(blah) + +define(`rp', `x0') +define(`up', `x1') +define(`n', `x2') +define(`v0', `x3') + +ifdef(`OPERATION_addmul_1', ` + define(`ADDSUB', adds) + define(`ADDSUBC', adcs) + define(`COND', `cc') + define(`func', mpn_addmul_1)') +ifdef(`OPERATION_submul_1', ` + define(`ADDSUB', subs) + define(`ADDSUBC', sbcs) + define(`COND', `cs') + define(`func', mpn_submul_1)') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +PROLOGUE(func) + adds x15, xzr, xzr + + tbz n, #0, L(1) + + ldr x4, [up],#8 + mul x8, x4, v0 + umulh x12, x4, v0 + ldr x4, [rp] + ADDSUB x8, x4, x8 + csinc x15, x12, x12, COND + str x8, [rp],#8 + +L(1): tbz n, #1, L(2) + + ldp x4, x5, [up],#16 + mul x8, x4, v0 + umulh x12, x4, v0 + mul x9, x5, v0 + umulh x13, x5, v0 + adds x8, x8, x15 + adcs x9, x9, x12 + ldp x4, x5, [rp] + adc x15, x13, xzr + ADDSUB x8, x4, x8 + ADDSUBC x9, x5, x9 + csinc x15, x15, x15, COND + stp x8, x9, [rp],#16 + +L(2): lsr n, n, #2 + cbz n, L(le3) + ldp x4, x5, [up],#32 + ldp x6, x7, [up,#-16] + b L(mid) +L(le3): mov x0, x15 + ret + + ALIGN(16) +L(top): ldp x4, x5, [up],#32 + ldp x6, x7, [up,#-16] + ADDSUB x8, x16, x8 + ADDSUBC x9, x17, x9 + stp x8, x9, [rp],#32 + ADDSUBC x10, x12, x10 + ADDSUBC x11, x13, x11 + stp x10, x11, [rp,#-16] + csinc x15, x15, x15, COND +L(mid): sub n, n, #1 + mul x8, x4, v0 + umulh x12, x4, v0 + mul x9, x5, v0 + umulh x13, x5, v0 + adds x8, x8, x15 + mul x10, x6, v0 + umulh x14, x6, v0 + adcs x9, x9, x12 + mul x11, x7, v0 + umulh x15, x7, v0 + adcs x10, x10, x13 + ldp x16, x17, [rp] + adcs x11, x11, x14 + ldp x12, x13, [rp,#16] + adc x15, x15, xzr + cbnz n, L(top) + + ADDSUB x8, x16, x8 + ADDSUBC x9, x17, x9 + ADDSUBC x10, x12, x10 + ADDSUBC x11, x13, x11 + stp x8, x9, [rp] + stp x10, x11, [rp,#16] + csinc x0, x15, x15, COND + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/aorsorrlsh1_n.asm b/gmp-6.3.0/mpn/arm64/aorsorrlsh1_n.asm new file mode 100644 index 0000000..c617a67 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/aorsorrlsh1_n.asm @@ -0,0 +1,43 
@@ +dnl ARM64 mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH, 1) +define(RSH, 63) + +ifdef(`OPERATION_addlsh1_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n) + +include_mpn(`arm64/aorsorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/arm64/aorsorrlsh2_n.asm b/gmp-6.3.0/mpn/arm64/aorsorrlsh2_n.asm new file mode 100644 index 0000000..852d117 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/aorsorrlsh2_n.asm @@ -0,0 +1,43 @@ +dnl ARM64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH, 2) +define(RSH, 62) + +ifdef(`OPERATION_addlsh2_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n) + +include_mpn(`arm64/aorsorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/arm64/aorsorrlshC_n.asm b/gmp-6.3.0/mpn/arm64/aorsorrlshC_n.asm new file mode 100644 index 0000000..1718b77 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/aorsorrlshC_n.asm @@ -0,0 +1,139 @@ +dnl ARM64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2017 Free Software Foundation, Inc. 
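
[Editorial note: the shared code that follows implements three operations selected by the wrapper files above: addlshC (rp = up + vp*2^C), sublshC (rp = up - vp*2^C) and rsblshC (rp = vp*2^C - up), with C = LSH and RSH = 64 - C. Here is a minimal C sketch of the add variant, 64-bit limbs assumed; the extr instructions in the assembly form the same neighbour-limb shift that `out` threads through below.]

#include <stdint.h>
typedef unsigned __int128 u128;

/* rp[] = up[] + (vp[] << c), returning the top limb of the result,
   i.e. the final carry plus the bits shifted out of vp[n-1]. */
static uint64_t addlshC_n_sketch (uint64_t *rp, const uint64_t *up,
                                  const uint64_t *vp, long n, int c)
{
  uint64_t out = 0;               /* bits shifted out of the previous limb */
  u128 acc = 0;
  for (long i = 0; i < n; i++)
    {
      uint64_t w = (vp[i] << c) | out;
      out = vp[i] >> (64 - c);
      acc += (u128) up[i] + w;
      rp[i] = (uint64_t) acc;
      acc >>= 64;
    }
  return (uint64_t) acc + out;    /* matches the RETVAL/adc epilogue */
}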
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Cortex-A53 3.25-3.75 +C Cortex-A57 2.18 +C X-Gene 2.5 + +changecom(blah) + +define(`rp', `x0') +define(`up', `x1') +define(`vp', `x2') +define(`n', `x3') + +ifdef(`DO_add', ` + define(`ADDSUB', `adds $1, $2, $3') + define(`ADDSUBC', `adcs $1, $2, $3') + define(`CLRRCY', `adds $1, xzr, xzr') + define(`RETVAL', `adc x0, $1, xzr') + define(`func_n', mpn_addlsh`'LSH`'_n)') +ifdef(`DO_sub', ` + define(`ADDSUB', `subs $1, $3, $2') + define(`ADDSUBC', `sbcs $1, $3, $2') + define(`CLRRCY', `subs $1, xzr, xzr') + define(`RETVAL', `cinc x0, $1, cc') + define(`func_n', mpn_sublsh`'LSH`'_n)') +ifdef(`DO_rsb', ` + define(`ADDSUB', `subs $1, $2, $3') + define(`ADDSUBC', `sbcs $1, $2, $3') + define(`CLRRCY', `subs $1, xzr, xzr') + define(`RETVAL', `sbc x0, $1, xzr') + define(`func_n', mpn_rsblsh`'LSH`'_n)') + +ASM_START() +PROLOGUE(func_n) + lsr x6, n, #2 + tbz n, #0, L(bx0) + +L(bx1): ldr x5, [up] + tbnz n, #1, L(b11) + +L(b01): ldr x11, [vp] + cbz x6, L(1) + ldp x8, x9, [vp,#8] + lsl x13, x11, #LSH + ADDSUB( x15, x13, x5) + str x15, [rp],#8 + sub up, up, #24 + sub vp, vp, #8 + b L(mid) + +L(1): lsl x13, x11, #LSH + ADDSUB( x15, x13, x5) + str x15, [rp] + lsr x0, x11, RSH + RETVAL( x0, x1) + ret + +L(b11): ldr x9, [vp] + ldp x10, x11, [vp,#8]! + lsl x13, x9, #LSH + ADDSUB( x17, x13, x5) + str x17, [rp],#8 + sub up, up, #8 + cbz x6, L(end) + b L(top) + +L(bx0): tbnz n, #1, L(b10) + +L(b00): CLRRCY( x11) + ldp x8, x9, [vp],#-16 + sub up, up, #32 + b L(mid) + +L(b10): CLRRCY( x9) + ldp x10, x11, [vp] + sub up, up, #16 + cbz x6, L(end) + + ALIGN(16) +L(top): ldp x4, x5, [up,#16] + extr x12, x10, x9, #RSH + ldp x8, x9, [vp,#16] + extr x13, x11, x10, #RSH + ADDSUBC(x14, x12, x4) + ADDSUBC(x15, x13, x5) + stp x14, x15, [rp],#16 +L(mid): ldp x4, x5, [up,#32]! + extr x12, x8, x11, #RSH + ldp x10, x11, [vp,#32]! + extr x13, x9, x8, #RSH + ADDSUBC(x16, x12, x4) + ADDSUBC(x17, x13, x5) + stp x16, x17, [rp],#16 + sub x6, x6, #1 + cbnz x6, L(top) + +L(end): ldp x4, x5, [up,#16] + extr x12, x10, x9, #RSH + extr x13, x11, x10, #RSH + ADDSUBC(x14, x12, x4) + ADDSUBC(x15, x13, x5) + stp x14, x15, [rp] + lsr x0, x11, RSH + RETVAL( x0, x1) + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/applem1/addaddmul_1msb0.asm b/gmp-6.3.0/mpn/arm64/applem1/addaddmul_1msb0.asm new file mode 100644 index 0000000..03cbf97 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/applem1/addaddmul_1msb0.asm @@ -0,0 +1,92 @@ +dnl ARM64 mpn_addaddmul_1msb0, R = Au + Bv, u,v < 2^63. 
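
[Editorial note: in C terms, mpn_addaddmul_1msb0 computes R = A*u + B*v over n limbs, where the "msb0" restriction u, v < 2^63 guarantees that the two per-limb high products plus a carry still fit in one limb, so a single carry chain per limb pair suffices. A sketch with 64-bit limbs follows; unsigned __int128 is a GCC/Clang extension standing in for the mul/umulh pairs.]

#include <stdint.h>
typedef unsigned __int128 u128;

/* rp[] = ap[]*u + bp[]*v, returning the carry limb.  With u,v < 2^63,
   ap[i]*u + bp[i]*v + carry < 2^128, so one accumulator per limb is
   enough; that is the point of the msb0 restriction. */
static uint64_t addaddmul_1msb0_sketch (uint64_t *rp, const uint64_t *ap,
                                        const uint64_t *bp, long n,
                                        uint64_t u, uint64_t v)
{
  u128 cy = 0;
  for (long i = 0; i < n; i++)
    {
      u128 t = (u128) ap[i] * u + (u128) bp[i] * v + cy;
      rp[i] = (uint64_t) t;
      cy = t >> 64;
    }
  return (uint64_t) cy;
}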
+ +dnl Copyright 2021 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Cortex-A53 +C Cortex-A55 +C Cortex-A57 +C Cortex-A72 +C Cortex-A73 +C X-Gene +C Apple M1 2.0 + +changecom(blah) + +define(`rp', x0) +define(`ap', x1) +define(`bp', x2) +define(`n', x3) +define(`u0', x4) +define(`v0', x5) + +C TODO +C * Use fewer distinct registers, should be trivial. + +PROLOGUE(mpn_addaddmul_1msb0) + lsr x7, n, #1 + adds x6, xzr, xzr + tbz n, #0, L(top) + + ldr x11, [ap], #8 C 0 + ldr x15, [bp], #8 C 0 + mul x10, x11, u0 C 0 + umulh x11, x11, u0 C 1 + mul x14, x15, v0 C 0 + umulh x15, x15, v0 C 1 + adds x10, x10, x14 C 0 + adcs x6, x11, x15 C 1 + str x10, [rp], #8 C 0 + cbz x7, L(end) + +L(top): ldp x11, x13, [ap], #16 C 0 1 + ldp x15, x17, [bp], #16 C 0 1 + mul x10, x11, u0 C 0 + umulh x11, x11, u0 C 1 + mul x14, x15, v0 C 0 + umulh x15, x15, v0 C 1 + adcs x10, x10, x14 C 0 + adc x11, x11, x15 C 1 + adds x10, x10, x6 C 0 + mul x12, x13, u0 C 1 + umulh x13, x13, u0 C 2 + mul x14, x17, v0 C 1 + umulh x17, x17, v0 C 2 + adcs x12, x12, x14 C 1 + adc x6, x13, x17 C 2 + adds x11, x12, x11 C 1 + stp x10, x11, [rp], #16 C 0 1 + sub x7, x7, #1 + cbnz x7, L(top) + +L(end): adc x0, x6, xzr + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/applem1/aorsmul_1.asm b/gmp-6.3.0/mpn/arm64/applem1/aorsmul_1.asm new file mode 100644 index 0000000..aa87c2a --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/applem1/aorsmul_1.asm @@ -0,0 +1,161 @@ +dnl ARM64 mpn_addmul_1 and mpn_submul_1. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2020 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Cortex-A53 +C Cortex-A55 +C Cortex-A57 +C Cortex-A72 +C Cortex-A73 +C X-Gene +C Apple M1 1.25 + +changecom(blah) + +define(`rp', x0) +define(`up', x1) +define(`n', x2) +define(`v0', x3) +define(`cin',x4) + +define(`CY',x17) + +ifdef(`OPERATION_addmul_1', ` + define(`ADDSUB', adds) + define(`ADDSUBC', adcs) + define(`COND', `cc') + define(`func', mpn_addmul_1)') +ifdef(`OPERATION_submul_1', ` + define(`ADDSUB', subs) + define(`ADDSUBC', sbcs) + define(`COND', `cs') + define(`func', mpn_submul_1)') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1 mpn_addmul_1c) + +ifdef(`OPERATION_addmul_1', ` +PROLOGUE(mpn_addmul_1c) + mov CY, cin + b L(ent) +EPILOGUE() +') + +PROLOGUE(func) + mov CY, #0 C W0 +L(ent): lsr x16, n, #2 + tbz n, #0, L(bx0) + +L(bx1): ldr x4, [up], #8 + mul x8, x4, v0 + umulh x4, x4, v0 + tbz n, #1, L(b01) + +L(b11): ldp x5,x6, [up], #16 + ldp x12,x13, [rp] + ldr x14, [rp,#16] + mul x9, x5, v0 + umulh x5, x5, v0 + mul x10, x6, v0 + umulh x6, x6, v0 + ADDSUB x8, x12, x8 + ADDSUBC x4, x13, x4 + ADDSUBC x5, x14, x5 + csinc x6, x6, x6, COND + ADDSUB x8, x8, CY + ADDSUBC x4, x4, x9 + ADDSUBC x5, x5, x10 + csinc CY, x6, x6, COND + stp x8, x4, [rp], #16 + str x5, [rp], #8 + cbnz x16, L(top) + mov x0, CY + ret + +L(b01): ldr x12, [rp] + ADDSUB x8, x12, x8 + csinc x4, x4, x4, COND + ADDSUB x8, x8, CY + csinc CY, x4, x4, COND + str x8, [rp], #8 + cbnz x16, L(top) + mov x0, CY + ret + +L(bx0): ldp x4,x5, [up], #16 + tbz n, #1, L(top)+4 + +L(b10): ldp x12,x13, [rp] + mul x8, x4, v0 + umulh x4, x4, v0 + mul x9, x5, v0 + umulh x5, x5, v0 + ADDSUB x8, x12, x8 + ADDSUBC x4, x13, x4 + csinc x5, x5, x5, COND + ADDSUB x8, x8, CY + ADDSUBC x4, x4, x9 + csinc CY, x5, x5, COND + stp x8, x4, [rp], #16 + cbz x16, L(done) + +L(top): ldp x4,x5, [up], #16 C W0 W1 + ldp x6,x7, [up], #16 C W2 W3 + ldp x12,x13, [rp] C W0 W1 + ldp x14,x15, [rp,#16] C W2 W3 + mul x8, x4, v0 C W0 + umulh x4, x4, v0 C W1 + mul x9, x5, v0 C W1 + umulh x5, x5, v0 C W2 + mul x10, x6, v0 C W2 + umulh x6, x6, v0 C W3 + mul x11, x7, v0 C W3 + umulh x7, x7, v0 C W4 + ADDSUB x8, x12, x8 C W0 + ADDSUBC x4, x13, x4 C W1 + ADDSUBC x5, x14, x5 C W2 + ADDSUBC x6, x15, x6 C W3 + csinc x7, x7, x7, COND C W4 + ADDSUB x8, x8, CY C W0 carry-in + ADDSUBC x4, x4, x9 C W1 + ADDSUBC x5, x5, x10 C W2 + ADDSUBC x6, x6, x11 C W2 + csinc CY, x7, x7, COND C W3 carry-out + stp x8, x4, [rp], #16 + stp x5, x6, [rp], #16 + sub x16, x16, #1 + cbnz x16, L(top) + +L(done):mov x0, CY + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/applem1/gmp-mparam.h b/gmp-6.3.0/mpn/arm64/applem1/gmp-mparam.h new file mode 100644 index 0000000..d08262f --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/applem1/gmp-mparam.h @@ -0,0 +1,187 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2020 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. 
+ +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3200 MHz Apple M1 */ +/* FFT tuning limit = 1 M */ +/* Generated by tuneup.c, 2020-12-25, gcc 4.2 */ + +#define MOD_1_1P_METHOD 2 /* 42.96% faster than 1 */ +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 +#define USE_PREINV_DIVREM_1 1 /* native */ +/* From m1.gmplib.org, 2023-07-21 */ +#define DIV_QR_1N_PI1_METHOD 3 /* 13.35% faster than 1 */ +#define DIV_QR_1_NORM_THRESHOLD 2 +#define DIV_QR_1_UNNORM_THRESHOLD 1 +#define DIV_QR_2_PI2_THRESHOLD 9 +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 28 + +#define DIV_1_VS_MUL_1_PERCENT 659 + +#define MUL_TOOM22_THRESHOLD 26 +#define MUL_TOOM33_THRESHOLD 77 +#define MUL_TOOM44_THRESHOLD 153 +#define MUL_TOOM6H_THRESHOLD 446 +#define MUL_TOOM8H_THRESHOLD 626 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 94 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 81 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 41 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 99 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 133 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 47 +#define SQR_TOOM3_THRESHOLD 74 +#define SQR_TOOM4_THRESHOLD 372 +#define SQR_TOOM6_THRESHOLD 462 +#define SQR_TOOM8_THRESHOLD 592 + +#define MULMID_TOOM42_THRESHOLD 44 + +#define MULMOD_BNM1_THRESHOLD 9 +#define SQRMOD_BNM1_THRESHOLD 11 + +#define MUL_FFT_MODF_THRESHOLD 216 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 216, 5}, { 7, 4}, { 19, 5}, { 19, 6}, \ + { 10, 5}, { 21, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \ + { 63,10}, { 39, 9}, { 79,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511,11}, \ + { 79,10}, { 159, 9}, { 319, 8}, { 639,11}, \ + { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511, 8}, { 1023,10}, \ + { 271, 9}, { 543, 8}, { 1087,11}, { 143,10}, \ + { 287, 9}, { 575, 8}, { 1151,11}, { 159,10}, \ + { 319, 9}, { 639,12}, { 95,11}, { 191,10}, \ + { 383,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511, 9}, { 1023,11}, { 271,10}, { 543, 9}, \ + { 1087, 8}, { 2175,11}, { 287,10}, { 575, 9}, \ + { 1151,12}, { 159,11}, { 319,10}, { 639, 9}, \ + { 1279,11}, { 351,10}, { 703, 9}, { 1407,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ + { 223,11}, { 447,10}, { 895,11}, { 479,10}, \ + { 959,13}, { 8192,14}, { 16384,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 104 +#define MUL_FFT_THRESHOLD 2368 + +#define SQR_FFT_MODF_THRESHOLD 304 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 304, 5}, { 10, 4}, { 
21, 5}, { 11, 4}, \ + { 23, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \ + { 11, 7}, { 24, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \ + { 63,10}, { 39, 9}, { 79,10}, { 47,11}, \ + { 31,10}, { 79,11}, { 47,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511,11}, \ + { 79,10}, { 159, 9}, { 319, 8}, { 639,11}, \ + { 95,10}, { 191, 9}, { 383,12}, { 63,10}, \ + { 255, 9}, { 511, 8}, { 1023,10}, { 271, 9}, \ + { 543, 8}, { 1087,10}, { 287, 9}, { 575, 8}, \ + { 1151,11}, { 159,10}, { 319, 9}, { 639,11}, \ + { 175,12}, { 95,11}, { 191,10}, { 383, 9}, \ + { 767,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511, 9}, { 1023,11}, { 271,10}, { 543, 9}, \ + { 1087, 8}, { 2175,10}, { 575, 9}, { 1151,11}, \ + { 303,12}, { 159,11}, { 319,10}, { 639, 9}, \ + { 1279,11}, { 351,10}, { 703, 9}, { 1407,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831, 9}, { 1663,12}, { 223,11}, { 447,10}, \ + { 895,11}, { 479,10}, { 959, 9}, { 1919,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 111 +#define SQR_FFT_THRESHOLD 1856 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 76 +#define MULLO_MUL_N_THRESHOLD 4292 +#define SQRLO_BASECASE_THRESHOLD 6 +#define SQRLO_DC_THRESHOLD 186 +#define SQRLO_SQR_THRESHOLD 3688 + +#define DC_DIV_QR_THRESHOLD 67 +#define DC_DIVAPPR_Q_THRESHOLD 242 +#define DC_BDIV_QR_THRESHOLD 68 +#define DC_BDIV_Q_THRESHOLD 129 + +#define INV_MULMOD_BNM1_THRESHOLD 82 +#define INV_NEWTON_THRESHOLD 157 +#define INV_APPR_THRESHOLD 157 + +#define BINV_NEWTON_THRESHOLD 99 +#define REDC_1_TO_REDC_N_THRESHOLD 68 + +#define MU_DIV_QR_THRESHOLD 979 +#define MU_DIVAPPR_Q_THRESHOLD 1210 +#define MUPI_DIV_QR_THRESHOLD 76 +#define MU_BDIV_QR_THRESHOLD 942 +#define MU_BDIV_Q_THRESHOLD 1341 + +#define POWM_SEC_TABLE 11,75,137,712,2177 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 18 +#define SET_STR_DC_THRESHOLD 632 +#define SET_STR_PRECOMPUTE_THRESHOLD 1215 + +#define FAC_DSC_THRESHOLD 252 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 9 +#define HGCD2_DIV1_METHOD 1 /* 8.52% faster than 3 */ +#define HGCD_THRESHOLD 131 +#define HGCD_APPR_THRESHOLD 144 +#define HGCD_REDUCE_THRESHOLD 1962 +#define GCD_DC_THRESHOLD 435 +#define GCDEXT_DC_THRESHOLD 199 +#define JACOBI_BASE_METHOD 4 /* 0.80% faster than 1 */ diff --git a/gmp-6.3.0/mpn/arm64/applem1/sqr_basecase.asm b/gmp-6.3.0/mpn/arm64/applem1/sqr_basecase.asm new file mode 100644 index 0000000..22246cf --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/applem1/sqr_basecase.asm @@ -0,0 +1,318 @@ +dnl ARM64 mpn_sqr_basecase + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2020 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +dnl TODO +dnl * Replace the mul_1 code with less scheduled and thus simpler code. If +dnl we base it on the addmul_1 loop, the corner code could benefit from +dnl similar incoming register state, which could eliminate some loads. +dnl * Handle n = 4 early. +dnl * Duplicate addmul loop into 4 loops which fall into each other. Perhaps +dnl stick to one mul_1 loop, but do the (mod 4) stuff at its end instead of +dnl its beginning. + +define(`rp', `x0') +define(`up', `x1') +define(`n', `x2') + +define(`v0', `x3') +define(`CY', `x17') + +PROLOGUE(mpn_sqr_basecase) + cmp n, #3 + b.ls L(le3) + + ldr v0, [up],#8 + sub n, n, #1 + mul x6, v0, v0 + umulh x4, v0, v0 + str x6, [rp],#8 + lsl v0, v0, 1 + lsl n, n, #3 + lsr x16, n, #5 + tbnz n, #3, L(mbx1) + +L(mbx0):adds x11, x4, xzr C move and clear cy + tbz n, #4, L(mb00) + +L(mb10):ldp x4, x5, [up],#16 + mul x8, x4, v0 + umulh x10, x4, v0 + cbz x16, L(m2e) + ldp x6, x7, [up],#16 + mul x9, x5, v0 + b L(mmid)-8 + +L(mbx1):ldr x7, [up],#8 + mul x9, x7, v0 + umulh x11, x7, v0 + adds x9, x9, x4 + str x9, [rp],#8 + tbnz n, #4, L(mb10) +L(mb00):ldp x6, x7, [up],#16 + mul x8, x6, v0 + umulh x10, x6, v0 + ldp x4, x5, [up],#16 + mul x9, x7, v0 + adcs x12, x8, x11 + umulh x11, x7, v0 + sub x16, x16, #1 + cbz x16, L(mend) + + ALIGN(16) +L(mtop):mul x8, x4, v0 + ldp x6, x7, [up],#16 + adcs x13, x9, x10 + umulh x10, x4, v0 + mul x9, x5, v0 + stp x12, x13, [rp],#16 + adcs x12, x8, x11 + umulh x11, x5, v0 +L(mmid):mul x8, x6, v0 + ldp x4, x5, [up],#16 + adcs x13, x9, x10 + umulh x10, x6, v0 + mul x9, x7, v0 + stp x12, x13, [rp],#16 + adcs x12, x8, x11 + umulh x11, x7, v0 + sub x16, x16, #1 + cbnz x16, L(mtop) + +L(mend):mul x8, x4, v0 + adcs x13, x9, x10 + umulh x10, x4, v0 + stp x12, x13, [rp],#16 +L(m2e): mul x9, x5, v0 + adcs x12, x8, x11 + umulh x11, x5, v0 + adcs x13, x9, x10 + stp x12, x13, [rp],#16 + adc x11, x11, xzr + str x11, [rp],#8 + +L(outer): + sub n, n, #8 + sub rp, rp, n + sub up, up, n + ldp x6, x7, [up,#-16] + ldr v0, [rp,#-8] + and x8, x7, x6, asr 63 + mul x9, x7, x7 + adds v0, v0, x8 + umulh x4, x7, x7 + adc x4, x4, xzr + adds v0, v0, x9 + str v0, [rp,#-8] + adc CY, x4, xzr + adds xzr, x6, x6 + adc v0, x7, x7 + cmp n, #16 + beq L(cor2) + + lsr x16, n, #5 + tbz n, #3, L(bx0) + +L(bx1): ldr x4, [up],#8 + mul x8, x4, v0 + umulh x4, x4, v0 + tbz n, #4, L(b01) + +L(b11): ldp x5, x6, [up],#16 + ldp x12, x13, [rp] + ldr x14, [rp,#16] + mul x9, x5, v0 + umulh x5, x5, v0 + mul x10, x6, v0 + umulh x6, x6, v0 + adds x8, x12, x8 + adcs x4, x13, x4 + adcs x5, x14, x5 + adc x6, x6, xzr + adds x8, x8, CY + adcs x4, x4, x9 + adcs x5, x5, x10 + adc CY, x6, xzr + stp x8, x4, [rp],#16 + str x5, [rp],#8 + cbnz x16, L(top) + b L(end) + +L(b01): ldr x12, [rp] + adds x8, x12, x8 + adc x4, x4, xzr + adds x8, x8, CY + adc CY, x4, xzr + str x8, [rp],#8 + b L(top) + 
+L(bx0): ldp x4, x5, [up],#16 + tbz n, #4, L(top)+4 + +L(b10): ldp x12, x13, [rp] + mul x8, x4, v0 + umulh x4, x4, v0 + mul x9, x5, v0 + umulh x5, x5, v0 + adds x8, x12, x8 + adcs x4, x13, x4 + adc x5, x5, xzr + adds x8, x8, CY + adcs x4, x4, x9 + adc CY, x5, xzr + stp x8, x4, [rp],#16 + + ALIGN(16) +L(top): ldp x4, x5, [up],#16 + ldp x6, x7, [up],#16 + ldp x12, x13, [rp] + ldp x14, x15, [rp,#16] + mul x8, x4, v0 + umulh x4, x4, v0 + mul x9, x5, v0 + umulh x5, x5, v0 + mul x10, x6, v0 + umulh x6, x6, v0 + mul x11, x7, v0 + umulh x7, x7, v0 + adds x8, x12, x8 + adcs x4, x13, x4 + adcs x5, x14, x5 + adcs x6, x15, x6 + adc x7, x7, xzr + adds x8, x8, CY + adcs x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc CY, x7, xzr + stp x8, x4, [rp],#16 + stp x5, x6, [rp],#16 + sub x16, x16, #1 + cbnz x16, L(top) + +L(end): str CY, [rp],#8 + b L(outer) + +L(cor2):ldp x10, x11, [up] + ldp x12, x13, [rp] + mul x8, x10, v0 + umulh x4, x10, v0 + mul x9, x11, v0 + umulh x5, x11, v0 + adds x8, x12, x8 + adcs x4, x13, x4 + adc x5, x5, xzr + adds x8, x8, CY + adcs x13, x4, x9 + adc x12, x5, xzr + str x8, [rp] + and x8, x10, x7, asr 63 + mul x9, x10, x10 + adds x13, x13, x8 + umulh x4, x10, x10 + adc x4, x4, xzr + adds x13, x13, x9 + adc CY, x4, xzr + adds xzr, x7, x7 + adc v0, x10, x10 + mul x8, x11, v0 + umulh x4, x11, v0 + adds x8, x12, x8 + adc x4, x4, xzr + adds x8, x8, CY + adc v0, x4, xzr + stp x13, x8, [rp,#8] + and x2, x11, x10, asr 63 + mul x5, x11, x11 + adds v0, v0, x2 + umulh x4, x11, x11 + adc x4, x4, xzr + adds v0, v0, x5 + adc x4, x4, xzr + stp v0, x4, [rp,#24] + ret + +L(le3): ldr v0, [up] + mul x4, v0, v0 C W0 + umulh x5, v0, v0 C W1 + cmp n, #2 + b.hs L(2o3) + stp x4, x5, [rp] + ret + +L(2o3): ldr x6, [up,#8] + mul x7, x6, x6 C W2 + umulh x8, x6, x6 C W3 + mul x9, v0, x6 C W1+1/64 + umulh x10, v0, x6 C W2+1/64 + b.hi L(3) + adds x5, x5, x9 C W1 + adcs x7, x7, x10 C W2 + adc x8, x8, xzr C W3 + adds x5, x5, x9 C W1 + adcs x7, x7, x10 C W2 + adc x8, x8, xzr C W3 + stp x4, x5, [rp] + stp x7, x8, [rp,#16] + ret + +L(3): ldr x11, [up,#16] + mul x12, x11, x11 C W4 + umulh x13, x11, x11 C W5 + mul x14, v0, x11 C W2+1/64 + umulh x15, v0, x11 C W3+1/64 + mul x16, x6, x11 C W3+1/64 + umulh x17, x6, x11 C W4+1/64 + adds x5, x5, x9 + adcs x7, x7, x10 + adcs x8, x8, x15 + adcs x12, x12, x17 + adc x13, x13, xzr + adds x5, x5, x9 + adcs x7, x7, x10 + adcs x8, x8, x15 + adcs x12, x12, x17 + adc x13, x13, xzr + adds x7, x7, x14 + adcs x8, x8, x16 + adcs x12, x12, xzr + adc x13, x13, xzr + adds x7, x7, x14 + adcs x8, x8, x16 + adcs x12, x12, xzr + adc x13, x13, xzr + stp x4, x5, [rp] + stp x7, x8, [rp,#16] + stp x12, x13, [rp,#32] + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/arm64-defs.m4 b/gmp-6.3.0/mpn/arm64/arm64-defs.m4 new file mode 100644 index 0000000..46149f7 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/arm64-defs.m4 @@ -0,0 +1,53 @@ +divert(-1) + +dnl m4 macros for ARM64 ELF assembler. + +dnl Copyright 2020 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl Standard commenting is with @, the default m4 # is for constants and we +dnl don't want to disable macro expansions in or after them. + +changecom + + +dnl LEA_HI(reg,gmp_symbol), LEA_LO(reg,gmp_symbol) +dnl +dnl Load the address of gmp_symbol into a register. We split this into two +dnl parts to allow separation for manual insn scheduling. + +ifdef(`PIC',`dnl +define(`LEA_HI', `adrp $1, :got:$2')dnl +define(`LEA_LO', `ldr $1, [$1, #:got_lo12:$2]')dnl +',`dnl +define(`LEA_HI', `adrp $1, $2')dnl +define(`LEA_LO', `add $1, $1, :lo12:$2')dnl +')dnl + +divert`'dnl diff --git a/gmp-6.3.0/mpn/arm64/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/arm64/bdiv_dbm1c.asm new file mode 100644 index 0000000..78984b4 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/bdiv_dbm1c.asm @@ -0,0 +1,111 @@ +dnl ARM64 mpn_bdiv_dbm1c. + +dnl Copyright 2008, 2011, 2012, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
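[Annotation] The four-way unrolled loop below (L(fi0)..L(fi3) dispatch on n mod 4 into L(top)..L(lo2)) performs one limb step per stage: multiply a source limb by bd, subtract the low product half from the running value with borrow into the high half, and store the low result as a quotient limb. A C sketch of that recurrence, written from memory after the generic mpn/generic/bdiv_dbm1c.c, so treat it as illustrative rather than the shipped code (it assumes a compiler with unsigned __int128):

  #include <stddef.h>
  #include <stdint.h>

  /* One limb per iteration of the recurrence the asm unrolls four ways. */
  uint64_t bdiv_dbm1c_sketch (uint64_t *qp, const uint64_t *up, size_t n,
                              uint64_t bd, uint64_t h)
  {
    for (size_t i = 0; i < n; i++)
      {
        unsigned __int128 p = (unsigned __int128) up[i] * bd;
        uint64_t p0 = (uint64_t) p;          /* mul   x12, x5, bd */
        uint64_t p1 = (uint64_t) (p >> 64);  /* umulh x13, x5, bd */
        uint64_t b = h < p0;                 /* borrow out of the subtract */
        h -= p0;                             /* subs  x4, x4, x12 */
        qp[i] = h;                           /* str   x4, [qp], #8 */
        h = h - p1 - b;                      /* sbc   x4, x4, x13 */
      }
    return h;
  }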
+ +include(`../config.m4') + +C cycles/limb +C Cortex-A53 8 +C Cortex-A57 7 +C X-Gene 4.25 + +define(`qp', `x0') +define(`up', `x1') +define(`n', `x2') +define(`bd', `x3') +define(`cy', `x4') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_bdiv_dbm1c) + ldr x5, [up], #8 + ands x6, n, #3 + b.eq L(fi0) + cmp x6, #2 + b.cc L(fi1) + b.eq L(fi2) + +L(fi3): mul x12, x5, bd + umulh x13, x5, bd + ldr x5, [up], #8 + b L(lo3) + +L(fi0): mul x10, x5, bd + umulh x11, x5, bd + ldr x5, [up], #8 + b L(lo0) + +L(fi1): subs n, n, #1 + mul x12, x5, bd + umulh x13, x5, bd + b.ls L(wd1) + ldr x5, [up], #8 + b L(lo1) + +L(fi2): mul x10, x5, bd + umulh x11, x5, bd + ldr x5, [up], #8 + b L(lo2) + +L(top): ldr x5, [up], #8 + subs x4, x4, x10 + str x4, [qp], #8 + sbc x4, x4, x11 +L(lo1): mul x10, x5, bd + umulh x11, x5, bd + ldr x5, [up], #8 + subs x4, x4, x12 + str x4, [qp], #8 + sbc x4, x4, x13 +L(lo0): mul x12, x5, bd + umulh x13, x5, bd + ldr x5, [up], #8 + subs x4, x4, x10 + str x4, [qp], #8 + sbc x4, x4, x11 +L(lo3): mul x10, x5, bd + umulh x11, x5, bd + ldr x5, [up], #8 + subs x4, x4, x12 + str x4, [qp], #8 + sbc x4, x4, x13 +L(lo2): subs n, n, #4 + mul x12, x5, bd + umulh x13, x5, bd + b.hi L(top) + +L(wd2): subs x4, x4, x10 + str x4, [qp], #8 + sbc x4, x4, x11 +L(wd1): subs x4, x4, x12 + str x4, [qp] + sbc x0, x4, x13 + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/bdiv_q_1.asm b/gmp-6.3.0/mpn/arm64/bdiv_q_1.asm new file mode 100644 index 0000000..7fffc93 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/bdiv_q_1.asm @@ -0,0 +1,122 @@ +dnl ARM64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C norm unorm +C Cortex-A53 12 15 +C Cortex-A57 12 12 +C Cortex-A72 +C Cortex-A73 +C X-Gene 11 11 + +C TODO +C * Scheduling of umulh later in the unorm loop brings A53 time to 12 c/l. +C Unfortunately, that requires software pipelining. 
+ +define(`rp', `x0') +define(`up', `x1') +define(`n', `x2') +define(`d', `x3') +define(`di', `x4') C just mpn_pi1_bdiv_q_1 +define(`cnt', `x5') C just mpn_pi1_bdiv_q_1 + +define(`cy', `r7') +define(`tnc', `x8') + +ASM_START() +PROLOGUE(mpn_bdiv_q_1) + + rbit x6, d + clz cnt, x6 + lsr d, d, cnt + + LEA_HI( x7, binvert_limb_table) + ubfx x6, d, 1, 7 + LEA_LO( x7, binvert_limb_table) + ldrb w6, [x7, x6] + ubfiz x7, x6, 1, 8 + umull x6, w6, w6 + msub x6, x6, d, x7 + lsl x7, x6, 1 + mul x6, x6, x6 + msub x6, x6, d, x7 + lsl x7, x6, 1 + mul x6, x6, x6 + msub di, x6, d, x7 + + b GSYM_PREFIX`'mpn_pi1_bdiv_q_1 +EPILOGUE() + +PROLOGUE(mpn_pi1_bdiv_q_1) + sub n, n, #1 + subs x6, x6, x6 C clear r6 and C flag + ldr x9, [up],#8 + cbz cnt, L(norm) + +L(unorm): + lsr x12, x9, cnt + cbz n, L(eu1) + sub tnc, xzr, cnt + +L(tpu): ldr x9, [up],#8 + lsl x7, x9, tnc + orr x7, x7, x12 + sbcs x6, x7, x6 + mul x7, x6, di + str x7, [rp],#8 + lsr x12, x9, cnt + umulh x6, x7, d + sub n, n, #1 + cbnz n, L(tpu) + +L(eu1): sbcs x6, x12, x6 + mul x6, x6, di + str x6, [rp] + ret + +L(norm): + mul x5, x9, di + str x5, [rp],#8 + cbz n, L(en1) + +L(tpn): ldr x9, [up],#8 + umulh x5, x5, d + sbcs x5, x9, x5 + mul x5, x5, di + str x5, [rp],#8 + sub n, n, #1 + cbnz n, L(tpn) + +L(en1): ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/cnd_aors_n.asm b/gmp-6.3.0/mpn/arm64/cnd_aors_n.asm new file mode 100644 index 0000000..397aa51 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/cnd_aors_n.asm @@ -0,0 +1,129 @@ +dnl ARM64 mpn_cnd_add_n, mpn_cnd_sub_n + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C Cortex-A53 3.87-4.37 +C Cortex-A57 1.75 +C X-Gene 2.0 + +changecom(blah) + +define(`cnd', `x0') +define(`rp', `x1') +define(`up', `x2') +define(`vp', `x3') +define(`n', `x4') + +ifdef(`OPERATION_cnd_add_n', ` + define(`ADDSUBC', adcs) + define(`CLRCY', `cmn xzr, xzr') + define(`RETVAL', `cset x0, cs') + define(`func', mpn_cnd_add_n)') +ifdef(`OPERATION_cnd_sub_n', ` + define(`ADDSUBC', sbcs) + define(`CLRCY', `cmp xzr, xzr') + define(`RETVAL', `cset x0, cc') + define(`func', mpn_cnd_sub_n)') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ASM_START() +PROLOGUE(func) + cmp cnd, #1 + sbc cnd, cnd, cnd + + CLRCY + + lsr x17, n, #2 + tbz n, #0, L(bx0) + +L(bx1): ldr x13, [vp] + ldr x11, [up] + bic x7, x13, cnd + ADDSUBC x9, x11, x7 + str x9, [rp] + tbnz n, #1, L(b11) + +L(b01): cbz x17, L(rt) + ldp x12, x13, [vp,#8] + ldp x10, x11, [up,#8] + sub up, up, #8 + sub vp, vp, #8 + sub rp, rp, #24 + b L(mid) + +L(b11): ldp x12, x13, [vp,#8]! + ldp x10, x11, [up,#8]! + sub rp, rp, #8 + cbz x17, L(end) + b L(top) + +L(bx0): ldp x12, x13, [vp] + ldp x10, x11, [up] + tbnz n, #1, L(b10) + +L(b00): sub up, up, #16 + sub vp, vp, #16 + sub rp, rp, #32 + b L(mid) + +L(b10): sub rp, rp, #16 + cbz x17, L(end) + + ALIGN(16) +L(top): bic x6, x12, cnd + bic x7, x13, cnd + ldp x12, x13, [vp,#16] + ADDSUBC x8, x10, x6 + ADDSUBC x9, x11, x7 + ldp x10, x11, [up,#16] + stp x8, x9, [rp,#16] +L(mid): bic x6, x12, cnd + bic x7, x13, cnd + ldp x12, x13, [vp,#32]! + ADDSUBC x8, x10, x6 + ADDSUBC x9, x11, x7 + ldp x10, x11, [up,#32]! + stp x8, x9, [rp,#32]! + sub x17, x17, #1 + cbnz x17, L(top) + +L(end): bic x6, x12, cnd + bic x7, x13, cnd + ADDSUBC x8, x10, x6 + ADDSUBC x9, x11, x7 + stp x8, x9, [rp,#16] +L(rt): RETVAL + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/com.asm b/gmp-6.3.0/mpn/arm64/com.asm new file mode 100644 index 0000000..d594943 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/com.asm @@ -0,0 +1,92 @@ +dnl ARM64 mpn_com. + +dnl Copyright 2013, 2020 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
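[Annotation] mpn_com is one's complement of each limb. The asm below peels a single limb to 16-byte-align the destination, runs a four-limb unrolled main loop, and finishes a 0-3 limb tail. The same structure as a C sketch (illustrative, not the shipped code):

  #include <stddef.h>
  #include <stdint.h>

  void com_sketch (uint64_t *rp, const uint64_t *up, size_t n)
  {
    if (n > 3)
      {
        if ((uintptr_t) rp & 8)            /* align rp to 16 bytes */
          { *rp++ = ~*up++; n--; }
        for (; n >= 4; n -= 4)
          {
            rp[0] = ~up[0]; rp[1] = ~up[1];
            rp[2] = ~up[2]; rp[3] = ~up[3];
            rp += 4; up += 4;
          }
      }
    while (n--)                            /* 0-3 remaining limbs */
      *rp++ = ~*up++;
  }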
+ +include(`../config.m4') + +C cycles/limb +C Cortex-A53 +C Cortex-A55 +C Cortex-A57 +C Cortex-A72 +C Cortex-A73 +C X-Gene +C Apple M1 + +changecom(blah) + +define(`rp', `x0') +define(`up', `x1') +define(`n', `x2') + +ASM_START() +PROLOGUE(mpn_com) + cmp n, #3 + b.le L(bc) + +C Copy until rp is 128-bit aligned + tbz rp, #3, L(al2) + ldr x4, [up],#8 + sub n, n, #1 + mvn x4, x4 + str x4, [rp],#8 + +L(al2): ldp x4,x5, [up],#16 + sub n, n, #6 + tbnz n, #63, L(end) + + ALIGN(16) +L(top): ldp x6,x7, [up],#32 + mvn x4, x4 + mvn x5, x5 + stp x4,x5, [rp],#32 + ldp x4,x5, [up,#-16] + mvn x6, x6 + mvn x7, x7 + stp x6,x7, [rp,#-16] + sub n, n, #4 + tbz n, #63, L(top) + +L(end): mvn x4, x4 + mvn x5, x5 + stp x4,x5, [rp],#16 + +C Copy last 0-3 limbs. Note that rp is aligned after loop, but not when we +C arrive here via L(bc) +L(bc): tbz n, #1, L(tl1) + ldp x4,x5, [up],#16 + mvn x4, x4 + mvn x5, x5 + stp x4,x5, [rp],#16 +L(tl1): tbz n, #0, L(tl2) + ldr x4, [up] + mvn x4, x4 + str x4, [rp] +L(tl2): ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/copyd.asm b/gmp-6.3.0/mpn/arm64/copyd.asm new file mode 100644 index 0000000..d542970 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/copyd.asm @@ -0,0 +1,85 @@ +dnl ARM64 mpn_copyd. + +dnl Copyright 2013, 2020 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Cortex-A53 1.8 +C Cortex-A55 1.28 +C Cortex-A57 +C Cortex-A72 1 +C Cortex-A73 1.1-1.35 (alignment dependent) +C X-Gene 1 +C Apple M1 0.31 + +changecom(blah) + +define(`rp', `x0') +define(`up', `x1') +define(`n', `x2') + +ASM_START() +PROLOGUE(mpn_copyd) + add rp, rp, n, lsl #3 + add up, up, n, lsl #3 + + cmp n, #3 + b.le L(bc) + +C Copy until rp is 128-bit aligned + tbz rp, #3, L(al2) + ldr x4, [up,#-8]! + sub n, n, #1 + str x4, [rp,#-8]! + +L(al2): ldp x4,x5, [up,#-16]! + sub n, n, #6 + tbnz n, #63, L(end) + + ALIGN(16) +L(top): ldp x6,x7, [up,#-16] + stp x4,x5, [rp,#-16] + ldp x4,x5, [up,#-32]! + stp x6,x7, [rp,#-32]! + sub n, n, #4 + tbz n, #63, L(top) + +L(end): stp x4,x5, [rp,#-16]! + +C Copy last 0-3 limbs. Note that rp is aligned after loop, but not when we +C arrive here via L(bc) +L(bc): tbz n, #1, L(tl1) + ldp x4,x5, [up,#-16]! + stp x4,x5, [rp,#-16]! +L(tl1): tbz n, #0, L(tl2) + ldr x4, [up,#-8] + str x4, [rp,#-8] +L(tl2): ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/copyi.asm b/gmp-6.3.0/mpn/arm64/copyi.asm new file mode 100644 index 0000000..0de40c5 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/copyi.asm @@ -0,0 +1,82 @@ +dnl ARM64 mpn_copyi. 
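[Annotation] copyd above walks from the top limb downward, the direction that tolerates overlapping operands with rp >= up; mpn_copyi below walks upward for the complementary rp <= up case. Reference sketches of the two directions (plain C, not the shipped code):

  #include <stddef.h>
  #include <stdint.h>

  void copyi_sketch (uint64_t *rp, const uint64_t *up, size_t n)
  {
    for (size_t i = 0; i < n; i++)   /* ascending: safe when rp <= up */
      rp[i] = up[i];
  }

  void copyd_sketch (uint64_t *rp, const uint64_t *up, size_t n)
  {
    while (n-- > 0)                  /* descending: safe when rp >= up */
      rp[n] = up[n];
  }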
+ +dnl Copyright 2013, 2020 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Cortex-A53 1.8 +C Cortex-A55 1.28 +C Cortex-A57 +C Cortex-A72 1 +C Cortex-A73 1.1-1.35 (alignment dependent) +C X-Gene 1 +C Apple M1 0.31 + +changecom(blah) + +define(`rp', `x0') +define(`up', `x1') +define(`n', `x2') + +ASM_START() +PROLOGUE(mpn_copyi) + cmp n, #3 + b.le L(bc) + +C Copy until rp is 128-bit aligned + tbz rp, #3, L(al2) + ldr x4, [up],#8 + sub n, n, #1 + str x4, [rp],#8 + +L(al2): ldp x4,x5, [up],#16 + sub n, n, #6 + tbnz n, #63, L(end) + + ALIGN(16) +L(top): ldp x6,x7, [up],#32 + stp x4,x5, [rp],#32 + ldp x4,x5, [up,#-16] + stp x6,x7, [rp,#-16] + sub n, n, #4 + tbz n, #63, L(top) + +L(end): stp x4,x5, [rp],#16 + +C Copy last 0-3 limbs. Note that rp is aligned after loop, but not when we +C arrive here via L(bc) +L(bc): tbz n, #1, L(tl1) + ldp x4,x5, [up],#16 + stp x4,x5, [rp],#16 +L(tl1): tbz n, #0, L(tl2) + ldr x4, [up] + str x4, [rp] +L(tl2): ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/cora53/cnd_aors_n.asm b/gmp-6.3.0/mpn/arm64/cora53/cnd_aors_n.asm new file mode 100644 index 0000000..1b227da --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/cora53/cnd_aors_n.asm @@ -0,0 +1,99 @@ +dnl ARM64 mpn_cnd_add_n, mpn_cnd_sub_n + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C Cortex-A53 3.5-4 +C Cortex-A57 2.25 +C X-Gene 3.5 + +changecom(blah) + +define(`cnd', `x0') +define(`rp', `x1') +define(`up', `x2') +define(`vp', `x3') +define(`n', `x4') + +ifdef(`OPERATION_cnd_add_n', ` + define(`ADDSUBC', adcs) + define(`CLRCY', `cmn xzr, xzr') + define(`RETVAL', `cset x0, cs') + define(`func', mpn_cnd_add_n)') +ifdef(`OPERATION_cnd_sub_n', ` + define(`ADDSUBC', sbcs) + define(`CLRCY', `cmp xzr, xzr') + define(`RETVAL', `cset x0, cc') + define(`func', mpn_cnd_sub_n)') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ASM_START() +PROLOGUE(func) + cmp cnd, #1 + sbc cnd, cnd, cnd + + CLRCY C really only needed for n = 0 (mod 4) + + tbz n, #0, L(1) + ldr x10, [up], #8 + ldr x12, [vp], #8 + bic x6, x12, cnd + ADDSUBC x8, x10, x6 + sub n, n, #1 + str x8, [rp], #8 + cbz n, L(rt) + +L(1): ldp x10, x11, [up], #16 + ldp x12, x13, [vp], #16 + sub n, n, #2 + cbz n, L(end) + +L(top): bic x6, x12, cnd + bic x7, x13, cnd + ldp x12, x13, [vp], #16 + ADDSUBC x8, x10, x6 + ADDSUBC x9, x11, x7 + ldp x10, x11, [up], #16 + sub n, n, #2 + stp x8, x9, [rp], #16 + cbnz n, L(top) + +L(end): bic x6, x12, cnd + bic x7, x13, cnd + ADDSUBC x8, x10, x6 + ADDSUBC x9, x11, x7 + stp x8, x9, [rp] +L(rt): RETVAL + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/cora53/gmp-mparam.h b/gmp-6.3.0/mpn/arm64/cora53/gmp-mparam.h new file mode 100644 index 0000000..f4e258d --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/cora53/gmp-mparam.h @@ -0,0 +1,242 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file for a53. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
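[Annotation] The MUL_FFT_TABLE3/SQR_FFT_TABLE3 data below are {size, k} rows consumed by the FFT multiply's best-k search: each row's effective threshold is its size scaled by the k currently in force, and the huge trailing rows act as sentinels that stop the scan. A sketch of the lookup, reconstructed from memory after mpn_fft_best_k in gmp-impl.h, so treat both the logic and the names as illustrative:

  #include <stddef.h>

  struct nk { size_t n; int k; };    /* hypothetical mirror of one table row */

  int fft_best_k_sketch (size_t n, const struct nk *tab)
  {
    int last_k = tab[0].k;
    for (const struct nk *t = tab + 1; ; t++)
      {
        size_t thres = t->n << last_k;   /* row size scaled by current k */
        if (n <= thres)
          return last_k;
        last_k = t->k;
      }
    /* terminates via the sentinel rows at the end of the table */
  }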
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 1536 MHz Cortex-A53 */ +/* FFT tuning limit = 21,583,800 */ +/* Generated by tuneup.c, 2019-10-22, gcc 5.4 */ + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 4 +#define MOD_1_1P_METHOD 2 /* 4.84% faster than 1 */ +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 22 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 1 /* 39.05% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 21 +#define DIV_QR_1_UNNORM_THRESHOLD 21 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 38 + +#define DIV_1_VS_MUL_1_PERCENT 161 + +#define MUL_TOOM22_THRESHOLD 14 +#define MUL_TOOM33_THRESHOLD 49 +#define MUL_TOOM44_THRESHOLD 73 +#define MUL_TOOM6H_THRESHOLD 173 +#define MUL_TOOM8H_THRESHOLD 236 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 77 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 81 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 65 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 18 +#define SQR_TOOM3_THRESHOLD 68 +#define SQR_TOOM4_THRESHOLD 183 +#define SQR_TOOM6_THRESHOLD 230 +#define SQR_TOOM8_THRESHOLD 357 + +#define MULMID_TOOM42_THRESHOLD 23 + +#define MULMOD_BNM1_THRESHOLD 9 +#define SQRMOD_BNM1_THRESHOLD 11 + +#define MUL_FFT_MODF_THRESHOLD 316 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 316, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 13, 7}, { 7, 6}, { 15, 7}, { 8, 6}, \ + { 17, 7}, { 9, 6}, { 19, 7}, { 17, 8}, \ + { 9, 7}, { 20, 8}, { 11, 7}, { 23, 8}, \ + { 13, 9}, { 7, 8}, { 19, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 83,10}, { 47, 9}, { 99,10}, { 55,11}, \ + { 31,10}, { 63, 8}, { 255,10}, { 71, 8}, \ + { 287,10}, { 79, 9}, { 159, 8}, { 319,10}, \ + { 87,11}, { 47,10}, { 95, 9}, { 191, 8}, \ + { 383,10}, { 103, 9}, { 207, 8}, { 415,10}, \ + { 111, 9}, { 223,12}, { 31,11}, { 63, 9}, \ + { 255, 8}, { 511,10}, { 135, 9}, { 287, 8}, \ + { 575,11}, { 79,10}, { 159, 9}, { 319, 8}, \ + { 639,10}, { 175, 9}, { 351, 8}, { 703,11}, \ + { 95,10}, { 191, 9}, { 383, 8}, { 767,10}, \ + { 207, 9}, { 415, 8}, { 831,10}, { 223, 9}, \ + { 447,12}, { 63,10}, { 255, 9}, { 511, 8}, \ + { 1023, 9}, { 543,10}, { 287, 9}, { 575, 8}, \ + { 1151,11}, { 159,10}, { 319, 9}, { 639,11}, \ + { 175,10}, { 351, 9}, { 703, 8}, { 1407,12}, \ + { 95,11}, { 191,10}, { 383, 9}, { 767,11}, \ + { 207,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447,13}, { 63,11}, { 255,10}, { 543,11}, \ + { 287,10}, { 575, 9}, { 1151,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 351,10}, { 703, 9}, \ + { 1407, 8}, { 2815,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,12}, { 223,11}, \ + { 447,10}, { 895,11}, { 479,10}, { 959, 9}, \ + { 1919,12}, { 255,11}, { 511,10}, { 1023,11}, \ + { 543,10}, { 1087,12}, { 287,11}, { 575,10}, \ + { 1151,12}, { 319,11}, { 639,12}, { 351,11}, \ + { 703,10}, { 1407, 9}, { 2815,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,10}, \ + { 1663,12}, { 447,11}, { 895,10}, { 1791,12}, \ + { 479,11}, { 959,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 
1087,12}, { 575,11}, \ + { 1151,13}, { 319,12}, { 703,11}, { 1407,10}, \ + { 2815,13}, { 383,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 895,11}, { 1791,12}, { 959,11}, \ + { 1919,14}, { 255,13}, { 511,12}, { 1087,13}, \ + { 575,12}, { 1151,13}, { 703,12}, { 1407,11}, \ + { 2815,14}, { 383,13}, { 831,12}, { 1663,13}, \ + { 895,12}, { 1791,13}, { 959,12}, { 1919,15}, \ + { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \ + { 1215,14}, { 639,13}, { 1407,12}, { 2815,14}, \ + { 767,13}, { 1663,14}, { 895,13}, { 1919,12}, \ + { 3839,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 16384,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 217 +#define MUL_FFT_THRESHOLD 3200 + +#define SQR_FFT_MODF_THRESHOLD 276 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 276, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 17, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \ + { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ + { 15, 7}, { 31, 8}, { 19, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95, 8}, { 191,10}, \ + { 55,11}, { 31,10}, { 63, 8}, { 255,10}, \ + { 71, 9}, { 143, 8}, { 287,10}, { 79, 9}, \ + { 159,11}, { 47,10}, { 95, 9}, { 191, 8}, \ + { 383, 7}, { 767,10}, { 103,12}, { 31,11}, \ + { 63, 9}, { 255, 8}, { 511, 7}, { 1023,10}, \ + { 143, 9}, { 287,11}, { 79,10}, { 159, 9}, \ + { 319, 8}, { 639,10}, { 175, 9}, { 351, 8}, \ + { 703,11}, { 95,10}, { 191, 9}, { 383, 8}, \ + { 767,10}, { 207, 9}, { 415, 8}, { 831,10}, \ + { 223, 9}, { 447,12}, { 63,10}, { 255, 9}, \ + { 511, 8}, { 1023,11}, { 143,10}, { 287, 9}, \ + { 575, 8}, { 1151,11}, { 159,10}, { 319, 9}, \ + { 639,11}, { 175,10}, { 351, 9}, { 703,12}, \ + { 95,11}, { 191,10}, { 383, 9}, { 767,11}, \ + { 207,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447,13}, { 63,11}, { 255,10}, { 511, 9}, \ + { 1023,11}, { 287,10}, { 575, 9}, { 1151,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 351,10}, \ + { 703, 9}, { 1407,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,12}, { 223,11}, \ + { 447,10}, { 895,11}, { 479,10}, { 959,12}, \ + { 255,11}, { 511,10}, { 1023,12}, { 287,11}, \ + { 575,10}, { 1151,12}, { 319,11}, { 639,12}, \ + { 351,11}, { 703,10}, { 1407,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,10}, \ + { 1663,12}, { 447,11}, { 895,12}, { 479,11}, \ + { 959,10}, { 1919,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \ + { 1151,13}, { 319,12}, { 703,11}, { 1407,10}, \ + { 2815,13}, { 383,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 895,11}, { 1791,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1151,13}, { 703,12}, { 1407,11}, { 2815,14}, \ + { 383,13}, { 831,12}, { 1663,13}, { 895,12}, \ + { 1791,13}, { 959,12}, { 1919,15}, { 255,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1151,14}, \ + { 639,13}, { 1407,12}, { 2815,14}, { 767,13}, \ + { 1663,14}, { 895,13}, { 1919,12}, { 3839,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2431,12}, { 4863,14}, { 16384,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 204 +#define SQR_FFT_THRESHOLD 2688 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 38 +#define MULLO_MUL_N_THRESHOLD 6253 
+#define SQRLO_BASECASE_THRESHOLD 4 +#define SQRLO_DC_THRESHOLD 67 +#define SQRLO_SQR_THRESHOLD 5240 + +#define DC_DIV_QR_THRESHOLD 43 +#define DC_DIVAPPR_Q_THRESHOLD 155 +#define DC_BDIV_QR_THRESHOLD 39 +#define DC_BDIV_Q_THRESHOLD 89 + +#define INV_MULMOD_BNM1_THRESHOLD 34 +#define INV_NEWTON_THRESHOLD 163 +#define INV_APPR_THRESHOLD 161 + +#define BINV_NEWTON_THRESHOLD 196 +#define REDC_1_TO_REDC_N_THRESHOLD 43 + +#define MU_DIV_QR_THRESHOLD 998 +#define MU_DIVAPPR_Q_THRESHOLD 998 +#define MUPI_DIV_QR_THRESHOLD 91 +#define MU_BDIV_QR_THRESHOLD 807 +#define MU_BDIV_Q_THRESHOLD 924 + +#define POWM_SEC_TABLE 6,30,125,579,1730 + +#define GET_STR_DC_THRESHOLD 15 +#define GET_STR_PRECOMPUTE_THRESHOLD 30 +#define SET_STR_DC_THRESHOLD 802 +#define SET_STR_PRECOMPUTE_THRESHOLD 1815 + +#define FAC_DSC_THRESHOLD 258 +#define FAC_ODD_THRESHOLD 24 + +#define MATRIX22_STRASSEN_THRESHOLD 10 +#define HGCD2_DIV1_METHOD 1 /* 7.05% faster than 3 */ +#define HGCD_THRESHOLD 107 +#define HGCD_APPR_THRESHOLD 112 +#define HGCD_REDUCE_THRESHOLD 1679 +#define GCD_DC_THRESHOLD 324 +#define GCDEXT_DC_THRESHOLD 242 +#define JACOBI_BASE_METHOD 4 /* 22.41% faster than 1 */ + +/* Tuneup completed successfully, took 66624 seconds */ diff --git a/gmp-6.3.0/mpn/arm64/cora57/gmp-mparam.h b/gmp-6.3.0/mpn/arm64/cora57/gmp-mparam.h new file mode 100644 index 0000000..e034f02 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/cora57/gmp-mparam.h @@ -0,0 +1,188 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file for a57, a72-a75. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
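[Annotation] Size thresholds such as MUL_TOOM22_THRESHOLD in the header below gate which multiplication algorithm runs for a given operand size. Conceptually the dispatcher looks like this hypothetical, simplified sketch; the helper names are placeholders, not GMP's internal entry points, and the macros are the ones #defined further down in this file (10 and 41 for this CPU):

  #include <stddef.h>
  #include <stdint.h>

  typedef uint64_t mp_limb_t;

  extern void mul_basecase (mp_limb_t *, const mp_limb_t *, const mp_limb_t *, size_t);
  extern void mul_toom22 (mp_limb_t *, const mp_limb_t *, const mp_limb_t *, size_t);
  extern void mul_toom33 (mp_limb_t *, const mp_limb_t *, const mp_limb_t *, size_t);

  void mul_n_sketch (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
                     size_t n)
  {
    if (n < MUL_TOOM22_THRESHOLD)        /* below this: schoolbook, O(n^2) */
      mul_basecase (rp, up, vp, n);
    else if (n < MUL_TOOM33_THRESHOLD)   /* then Karatsuba */
      mul_toom22 (rp, up, vp, n);
    else                                 /* then Toom-3, and so on upward */
      mul_toom33 (rp, up, vp, n);
  }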
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 1800 MHz Cortex-A72 */ +/* FFT tuning limit = 0.5 M */ +/* Generated by tuneup.c, 2019-10-02, gcc 7.4 */ + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 4 +#define MOD_1_1P_METHOD 1 /* 2.21% faster than 2 */ +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 42 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 15 +#define USE_PREINV_DIVREM_1 1 +/* From gcc117.osuosl.org, 2023-07-27 */ +#define DIV_QR_1N_PI1_METHOD 4 /* 8.57% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 5 +#define DIV_QR_1_UNNORM_THRESHOLD 5 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 33 + +#define DIV_1_VS_MUL_1_PERCENT 168 + +#define MUL_TOOM22_THRESHOLD 10 +#define MUL_TOOM33_THRESHOLD 41 +#define MUL_TOOM44_THRESHOLD 99 +#define MUL_TOOM6H_THRESHOLD 142 +#define MUL_TOOM8H_THRESHOLD 199 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 65 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 69 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 63 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 66 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 55 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 18 +#define SQR_TOOM3_THRESHOLD 65 +#define SQR_TOOM4_THRESHOLD 166 +#define SQR_TOOM6_THRESHOLD 222 +#define SQR_TOOM8_THRESHOLD 309 + +#define MULMID_TOOM42_THRESHOLD 22 + +#define MULMOD_BNM1_THRESHOLD 7 +#define SQRMOD_BNM1_THRESHOLD 12 + +#define MUL_FFT_MODF_THRESHOLD 276 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 276, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 13, 7}, { 7, 6}, { 15, 7}, { 8, 6}, \ + { 17, 7}, { 9, 6}, { 19, 7}, { 13, 8}, \ + { 7, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \ + { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ + { 21, 9}, { 11, 8}, { 25,10}, { 7, 9}, \ + { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \ + { 23, 8}, { 49, 9}, { 27,10}, { 15, 9}, \ + { 39,10}, { 23, 9}, { 51,11}, { 15,10}, \ + { 31, 9}, { 67,10}, { 39, 9}, { 79,10}, \ + { 47, 9}, { 99,10}, { 55,11}, { 31,10}, \ + { 63, 8}, { 255,10}, { 71, 9}, { 143, 8}, \ + { 287,10}, { 79, 9}, { 159, 8}, { 319,11}, \ + { 47,10}, { 95, 9}, { 191,10}, { 103,12}, \ + { 31,11}, { 63, 9}, { 255, 8}, { 511,10}, \ + { 143, 8}, { 575,11}, { 79,10}, { 159, 9}, \ + { 319,10}, { 175, 9}, { 351, 8}, { 703,11}, \ + { 95,10}, { 191, 9}, { 383,10}, { 207, 9}, \ + { 415,10}, { 223, 9}, { 447, 8}, { 895,12}, \ + { 63,10}, { 255, 9}, { 511, 8}, { 1023, 9}, \ + { 543,11}, { 143,10}, { 287, 9}, { 575, 8}, \ + { 1151,10}, { 319, 9}, { 639,11}, { 175,10}, \ + { 351, 9}, { 703,12}, { 95,10}, { 383, 9}, \ + { 767,11}, { 207, 9}, { 831,11}, { 223,10}, \ + { 447, 9}, { 895,13}, { 8192,14}, { 16384,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 109 +#define MUL_FFT_THRESHOLD 3200 + +#define SQR_FFT_MODF_THRESHOLD 244 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 244, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 8, 5}, { 17, 6}, { 17, 7}, { 9, 6}, \ + { 19, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \ + { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ + { 19, 9}, { 11, 8}, { 25,10}, { 7, 9}, \ + { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \ + { 27,10}, { 15, 9}, { 39,10}, { 23, 9}, \ + { 47,11}, { 15,10}, { 31, 9}, { 67,10}, \ + { 39, 9}, { 79,10}, { 47, 9}, { 99,10}, \ + { 55,11}, { 
31,10}, { 63, 9}, { 127, 8}, \ + { 255,10}, { 71, 8}, { 287, 7}, { 575, 9}, \ + { 159, 8}, { 319,11}, { 47,10}, { 95, 9}, \ + { 191, 8}, { 383,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 143, 9}, { 287, 8}, \ + { 575,11}, { 79,10}, { 159, 9}, { 319, 8}, \ + { 639, 9}, { 351,10}, { 191, 9}, { 383,10}, \ + { 207, 9}, { 415,10}, { 239,12}, { 63,10}, \ + { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \ + { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 351, 9}, { 703,11}, { 191,10}, \ + { 383, 9}, { 767,11}, { 207,10}, { 415, 9}, \ + { 831,11}, { 223,13}, { 8192,14}, { 16384,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 97 +#define SQR_FFT_THRESHOLD 2496 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 39 +#define MULLO_MUL_N_THRESHOLD 6253 +#define SQRLO_BASECASE_THRESHOLD 4 +#define SQRLO_DC_THRESHOLD 56 +#define SQRLO_SQR_THRESHOLD 4940 + +#define DC_DIV_QR_THRESHOLD 41 +#define DC_DIVAPPR_Q_THRESHOLD 136 +#define DC_BDIV_QR_THRESHOLD 39 +#define DC_BDIV_Q_THRESHOLD 89 + +#define INV_MULMOD_BNM1_THRESHOLD 22 +#define INV_NEWTON_THRESHOLD 154 +#define INV_APPR_THRESHOLD 141 + +#define BINV_NEWTON_THRESHOLD 182 +#define REDC_1_TO_REDC_N_THRESHOLD 39 + +#define MU_DIV_QR_THRESHOLD 979 +#define MU_DIVAPPR_Q_THRESHOLD 1078 +#define MUPI_DIV_QR_THRESHOLD 75 +#define MU_BDIV_QR_THRESHOLD 872 +#define MU_BDIV_Q_THRESHOLD 942 + +#define POWM_SEC_TABLE 1,19,117,539,1730 + +#define GET_STR_DC_THRESHOLD 10 +#define GET_STR_PRECOMPUTE_THRESHOLD 21 +#define SET_STR_DC_THRESHOLD 572 +#define SET_STR_PRECOMPUTE_THRESHOLD 1036 + +#define FAC_DSC_THRESHOLD 142 +#define FAC_ODD_THRESHOLD 23 + +#define MATRIX22_STRASSEN_THRESHOLD 11 +#define HGCD2_DIV1_METHOD 1 /* 8.83% faster than 3 */ +#define HGCD_THRESHOLD 80 +#define HGCD_APPR_THRESHOLD 70 +#define HGCD_REDUCE_THRESHOLD 1962 +#define GCD_DC_THRESHOLD 273 +#define GCDEXT_DC_THRESHOLD 198 +#define JACOBI_BASE_METHOD 1 /* 7.49% faster than 4 */ diff --git a/gmp-6.3.0/mpn/arm64/cora72/gmp-mparam.h b/gmp-6.3.0/mpn/arm64/cora72/gmp-mparam.h new file mode 100644 index 0000000..fc66fd3 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/cora72/gmp-mparam.h @@ -0,0 +1,242 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file for a72. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 1800 MHz Cortex-A72 */ +/* FFT tuning limit = 50,811,960 */ +/* Generated by tuneup.c, 2019-10-22, gcc 7.3 */ + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 3 +#define MOD_1_1P_METHOD 2 /* 12.09% faster than 1 */ +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 3 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 26 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 15 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 1 /* 13.42% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 4 +#define DIV_QR_1_UNNORM_THRESHOLD 4 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 38 + +#define DIV_1_VS_MUL_1_PERCENT 168 + +#define MUL_TOOM22_THRESHOLD 8 +#define MUL_TOOM33_THRESHOLD 57 +#define MUL_TOOM44_THRESHOLD 153 +#define MUL_TOOM6H_THRESHOLD 222 +#define MUL_TOOM8H_THRESHOLD 333 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 57 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 108 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 104 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 56 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 82 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 16 +#define SQR_TOOM3_THRESHOLD 73 +#define SQR_TOOM4_THRESHOLD 154 +#define SQR_TOOM6_THRESHOLD 206 +#define SQR_TOOM8_THRESHOLD 333 + +#define MULMID_TOOM42_THRESHOLD 18 + +#define MULMOD_BNM1_THRESHOLD 8 +#define SQRMOD_BNM1_THRESHOLD 10 + +#define MUL_FFT_MODF_THRESHOLD 268 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 268, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \ + { 15, 7}, { 13, 8}, { 7, 7}, { 16, 8}, \ + { 9, 7}, { 19, 8}, { 11, 7}, { 23, 8}, \ + { 13, 9}, { 7, 8}, { 15, 7}, { 31, 8}, \ + { 19, 9}, { 11, 8}, { 27,10}, { 7, 9}, \ + { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \ + { 27,10}, { 15, 9}, { 39,10}, { 23, 9}, \ + { 51,11}, { 15,10}, { 31, 9}, { 71,10}, \ + { 39, 9}, { 79, 8}, { 159, 7}, { 319, 9}, \ + { 83,10}, { 47, 9}, { 95, 7}, { 383, 9}, \ + { 99,10}, { 55,11}, { 31,10}, { 63, 8}, \ + { 255, 7}, { 511, 9}, { 131,10}, { 71, 9}, \ + { 143, 8}, { 287, 7}, { 575, 6}, { 1151,10}, \ + { 79, 8}, { 319, 7}, { 639,10}, { 87, 8}, \ + { 351,11}, { 47,10}, { 95, 8}, { 383, 7}, \ + { 767,10}, { 103, 8}, { 415, 7}, { 831, 6}, \ + { 1663, 9}, { 223, 8}, { 447,12}, { 31,11}, \ + { 63, 9}, { 255, 8}, { 511, 7}, { 1023, 9}, \ + { 287, 8}, { 575, 7}, { 1151, 6}, { 2303, 7}, \ + { 1215,11}, { 79, 9}, { 319, 8}, { 639, 7}, \ + { 1279, 9}, { 351, 8}, { 703, 7}, { 1407, 6}, \ + { 2815, 9}, { 383, 8}, { 831, 7}, { 1663, 9}, \ + { 447, 8}, { 895, 7}, { 1791, 6}, { 3583, 8}, \ + { 959, 6}, { 3839, 5}, { 7679, 9}, { 511, 8}, \ + { 1023, 7}, { 2175, 9}, { 575, 8}, { 1151, 7}, \ + { 2303, 8}, { 1215,10}, { 351, 9}, { 703, 7}, \ + { 3071, 8}, { 1663, 9}, { 895, 8}, { 1791, 7}, \ + { 3583, 8}, { 1919, 6}, { 7679, 7}, { 3967, 9}, \ + { 1023,10}, { 575, 9}, { 1151, 8}, { 2559,10}, \ + { 703, 8}, { 2815, 9}, { 1471, 7}, { 5887,10}, \ + { 767,11}, { 415, 9}, { 1791, 8}, { 3583,11}, \ + { 479,10}, { 959, 8}, { 3967,11}, { 511, 9}, \ + { 2175,10}, { 1151, 8}, { 4607, 9}, { 2815,10}, \ + { 1471, 9}, { 2943,11}, { 767,10}, { 1535,11}, \ + { 831,10}, { 1791,11}, { 959,10}, { 1919, 9}, \ + { 3839, 8}, { 7679,10}, { 1983,12}, { 511,10}, \ + { 2047,11}, { 1215,12}, { 639,11}, { 1407,10}, \ + { 2815,11}, { 1471,12}, { 767,11}, { 1663,12}, \ + { 
895,11}, { 1791,12}, { 959,11}, { 1919,10}, \ + { 3839,14}, { 255,13}, { 511,12}, { 1023,11}, \ + { 2047,12}, { 1215,13}, { 639,12}, { 1279,13}, \ + { 703,12}, { 1407,11}, { 2815,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1663,13}, { 895,12}, \ + { 1791,11}, { 3583,13}, { 959,12}, { 1919,11}, \ + { 3839,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1215,12}, { 2431,14}, { 639,13}, { 1407,12}, \ + { 2815,13}, { 1471,12}, { 2943,14}, { 767,13}, \ + { 1535,12}, { 3071,13}, { 1791,12}, { 3583,13}, \ + { 1919,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2431,14}, { 1279,13}, { 2559,15}, { 767,14}, \ + { 1791,13}, { 3839,15}, { 1023,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,15}, { 1535,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 218 +#define MUL_FFT_THRESHOLD 2688 + +#define SQR_FFT_MODF_THRESHOLD 236 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 236, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 15, 7}, { 8, 6}, { 17, 7}, { 13, 8}, \ + { 7, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \ + { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ + { 19, 9}, { 11, 8}, { 25,10}, { 7, 9}, \ + { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \ + { 27,10}, { 15, 9}, { 39,10}, { 23, 9}, \ + { 47,11}, { 15,10}, { 31, 9}, { 67,10}, \ + { 39, 9}, { 79, 8}, { 159,10}, { 47, 9}, \ + { 95, 8}, { 191, 7}, { 383,10}, { 55,11}, \ + { 31,10}, { 63, 9}, { 127, 8}, { 255, 7}, \ + { 511,10}, { 71, 9}, { 143, 8}, { 287, 7}, \ + { 575,10}, { 79, 8}, { 319, 7}, { 639,11}, \ + { 47,10}, { 95, 8}, { 383, 7}, { 767, 8}, \ + { 415,12}, { 31,11}, { 63,10}, { 127, 9}, \ + { 255, 8}, { 543, 9}, { 287, 8}, { 575, 7}, \ + { 1151, 9}, { 319, 8}, { 639, 9}, { 351, 8}, \ + { 703, 7}, { 1407, 6}, { 2815,10}, { 191, 9}, \ + { 383, 8}, { 767, 9}, { 415, 8}, { 831, 7}, \ + { 1663,10}, { 223, 9}, { 447, 8}, { 895, 7}, \ + { 1791, 9}, { 479, 8}, { 959,12}, { 63,11}, \ + { 127, 9}, { 543, 8}, { 1087,10}, { 287, 9}, \ + { 575, 8}, { 1151,10}, { 319, 9}, { 639,10}, \ + { 351, 9}, { 703, 8}, { 1407, 7}, { 2815, 8}, \ + { 1471, 5}, { 11775, 9}, { 767, 8}, { 1535,10}, \ + { 415, 9}, { 895, 8}, { 1919, 6}, { 7679, 7}, \ + { 3967,11}, { 255,10}, { 543, 9}, { 1087, 8}, \ + { 2175,10}, { 575, 9}, { 1151, 8}, { 2431,10}, \ + { 639, 9}, { 1279,10}, { 703, 9}, { 1407, 8}, \ + { 2943,11}, { 383,10}, { 767,11}, { 447,10}, \ + { 895,11}, { 479,10}, { 959, 9}, { 1919, 8}, \ + { 3839,10}, { 1023, 9}, { 2175,10}, { 1215, 9}, \ + { 2431,11}, { 703, 9}, { 2815,10}, { 1471,11}, \ + { 767,10}, { 1663,11}, { 895,10}, { 1791,11}, \ + { 959, 9}, { 3839,12}, { 511,11}, { 1087,10}, \ + { 2175,11}, { 1215,10}, { 2431,12}, { 639,11}, \ + { 1279,12}, { 703,11}, { 1471,12}, { 767,11}, \ + { 1663,12}, { 895,11}, { 1919,10}, { 3839,13}, \ + { 511,12}, { 1087,11}, { 2175,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1279,13}, { 703,12}, \ + { 1407,13}, { 767,12}, { 1535,13}, { 831,12}, \ + { 1791,13}, { 1151,12}, { 2303,13}, { 1215,14}, \ + { 639,12}, { 2559,13}, { 1407,14}, { 767,12}, \ + { 3071,14}, { 895,13}, { 1919,12}, { 3839,14}, \ + { 1023,13}, { 2175,14}, { 1151,12}, { 4607,14}, \ + { 1279,13}, { 2559,14}, { 1407,13}, { 2943,15}, \ + { 767,14}, { 1663,13}, { 3583,14}, { 1919,15}, \ + { 1023,14}, { 2047,13}, { 4095,14}, { 2943,15}, \ + { 1535,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 203 +#define SQR_FFT_THRESHOLD 2176 + 
+#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 33 +#define MULLO_MUL_N_THRESHOLD 5240 +#define SQRLO_BASECASE_THRESHOLD 6 +#define SQRLO_DC_THRESHOLD 45 +#define SQRLO_SQR_THRESHOLD 4265 + +#define DC_DIV_QR_THRESHOLD 38 +#define DC_DIVAPPR_Q_THRESHOLD 108 +#define DC_BDIV_QR_THRESHOLD 36 +#define DC_BDIV_Q_THRESHOLD 71 + +#define INV_MULMOD_BNM1_THRESHOLD 14 +#define INV_NEWTON_THRESHOLD 132 +#define INV_APPR_THRESHOLD 124 + +#define BINV_NEWTON_THRESHOLD 199 +#define REDC_1_TO_REDC_N_THRESHOLD 34 + +#define MU_DIV_QR_THRESHOLD 979 +#define MU_DIVAPPR_Q_THRESHOLD 979 +#define MUPI_DIV_QR_THRESHOLD 61 +#define MU_BDIV_QR_THRESHOLD 734 +#define MU_BDIV_Q_THRESHOLD 942 + +#define POWM_SEC_TABLE 6,30,110,579,1730 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 19 +#define SET_STR_DC_THRESHOLD 458 +#define SET_STR_PRECOMPUTE_THRESHOLD 875 + +#define FAC_DSC_THRESHOLD 153 +#define FAC_ODD_THRESHOLD 24 + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD2_DIV1_METHOD 1 /* 8.41% faster than 3 */ +#define HGCD_THRESHOLD 81 +#define HGCD_APPR_THRESHOLD 80 +#define HGCD_REDUCE_THRESHOLD 1494 +#define GCD_DC_THRESHOLD 268 +#define GCDEXT_DC_THRESHOLD 189 +#define JACOBI_BASE_METHOD 1 /* 10.80% faster than 4 */ + +/* Tuneup completed successfully, took 96906 seconds */ diff --git a/gmp-6.3.0/mpn/arm64/cora73/gmp-mparam.h b/gmp-6.3.0/mpn/arm64/cora73/gmp-mparam.h new file mode 100644 index 0000000..7fc7f4e --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/cora73/gmp-mparam.h @@ -0,0 +1,225 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file for a73. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 1800 MHz Cortex-A72 */ +/* FFT tuning limit = 48,820,337 */ +/* Generated by tuneup.c, 2019-10-22, gcc 7.4 */ + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 3 +#define MOD_1_1P_METHOD 1 /* 2.28% faster than 2 */ +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 44 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 16 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 1 /* 35.13% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 5 +#define DIV_QR_1_UNNORM_THRESHOLD 5 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 33 + +#define DIV_1_VS_MUL_1_PERCENT 168 + +#define MUL_TOOM22_THRESHOLD 10 +#define MUL_TOOM33_THRESHOLD 57 +#define MUL_TOOM44_THRESHOLD 89 +#define MUL_TOOM6H_THRESHOLD 141 +#define MUL_TOOM8H_THRESHOLD 199 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 61 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 69 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 65 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 66 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 58 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 18 +#define SQR_TOOM3_THRESHOLD 62 +#define SQR_TOOM4_THRESHOLD 166 +#define SQR_TOOM6_THRESHOLD 222 +#define SQR_TOOM8_THRESHOLD 309 + +#define MULMID_TOOM42_THRESHOLD 22 + +#define MULMOD_BNM1_THRESHOLD 8 +#define SQRMOD_BNM1_THRESHOLD 11 + +#define MUL_FFT_MODF_THRESHOLD 276 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 276, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 15, 7}, { 8, 6}, { 17, 7}, { 9, 6}, \ + { 19, 7}, { 13, 8}, { 7, 7}, { 17, 8}, \ + { 9, 7}, { 19, 8}, { 11, 7}, { 23, 8}, \ + { 13, 9}, { 7, 8}, { 19, 9}, { 11, 8}, \ + { 27,10}, { 7, 9}, { 15, 8}, { 33, 9}, \ + { 19, 8}, { 39, 9}, { 23, 8}, { 47, 9}, \ + { 27,10}, { 15, 9}, { 43,10}, { 23, 9}, \ + { 51,11}, { 15,10}, { 31, 9}, { 67,10}, \ + { 39, 9}, { 83,10}, { 47, 9}, { 99,10}, \ + { 55,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 131,10}, { 71, 9}, { 143, 8}, \ + { 287,10}, { 79, 9}, { 159, 8}, { 319,11}, \ + { 47, 9}, { 191, 8}, { 383, 7}, { 767, 8}, \ + { 415,12}, { 31,11}, { 63, 9}, { 255, 8}, \ + { 511,10}, { 143, 9}, { 287, 8}, { 575,11}, \ + { 79,10}, { 159, 9}, { 319,10}, { 175, 9}, \ + { 351, 8}, { 703,11}, { 95,10}, { 191, 9}, \ + { 383, 8}, { 767,10}, { 207, 9}, { 415,10}, \ + { 223, 9}, { 447,12}, { 63,10}, { 255, 9}, \ + { 511, 8}, { 1023, 9}, { 543,11}, { 143, 9}, \ + { 575,10}, { 319, 9}, { 639,10}, { 351, 9}, \ + { 703,12}, { 95,11}, { 191,10}, { 383,11}, \ + { 207,10}, { 415,11}, { 223,10}, { 447, 9}, \ + { 895,13}, { 63,11}, { 255,10}, { 511,11}, \ + { 287,10}, { 575,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 351,10}, { 703, 9}, { 1407,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ + { 223,11}, { 447,10}, { 895,11}, { 479,10}, \ + { 959,12}, { 255,11}, { 543,10}, { 1087,11}, \ + { 575,12}, { 319,11}, { 639,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,11}, { 895,12}, \ + { 479,13}, { 255,12}, { 511,11}, { 1023,12}, \ + { 575,13}, { 319,12}, { 703,13}, { 383,12}, \ + { 831,13}, { 447,12}, { 959,14}, { 255,13}, \ + { 511,12}, { 1023,13}, { 575,12}, { 1151,13}, \ + { 703,12}, { 1407,14}, { 383,13}, { 831,12}, \ + { 1663,13}, { 959,15}, { 255,14}, { 511,13}, \ + { 1151,14}, { 639,13}, { 1407,14}, { 
767,13}, \ + { 1663,14}, { 895,13}, { 1791,15}, { 511,14}, \ + { 1023,13}, { 2047,14}, { 1151,13}, { 2431,14}, \ + { 1407,15}, { 767,14}, { 1791,16}, { 511,15}, \ + { 1023,14}, { 2431,15}, { 1279,14}, { 2815,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 185 +#define MUL_FFT_THRESHOLD 3200 + +#define SQR_FFT_MODF_THRESHOLD 244 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 244, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 17, 7}, { 9, 6}, { 19, 7}, { 17, 8}, \ + { 9, 7}, { 20, 8}, { 11, 7}, { 23, 8}, \ + { 13, 9}, { 7, 8}, { 19, 9}, { 11, 8}, \ + { 25,10}, { 7, 9}, { 15, 8}, { 31, 9}, \ + { 19, 8}, { 39, 9}, { 23, 8}, { 47,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \ + { 15,10}, { 31, 9}, { 63,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 63, 8}, { 255,10}, { 71, 9}, \ + { 143, 8}, { 287,10}, { 79, 9}, { 159, 8}, \ + { 319,11}, { 47,10}, { 95, 9}, { 191, 8}, \ + { 383,12}, { 31,11}, { 63,10}, { 127, 9}, \ + { 287, 8}, { 575,11}, { 79,10}, { 159, 9}, \ + { 319, 8}, { 639,10}, { 175, 9}, { 351, 8}, \ + { 703,11}, { 95, 9}, { 383, 8}, { 767,10}, \ + { 207, 9}, { 415,10}, { 223, 8}, { 895,10}, \ + { 239,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319, 9}, { 639,11}, { 175,10}, { 351, 9}, \ + { 703,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415,11}, { 223,10}, { 479,11}, { 255,10}, \ + { 511,11}, { 287,10}, { 575,12}, { 159,11}, \ + { 351,12}, { 191,11}, { 383,10}, { 767,12}, \ + { 223,11}, { 447,10}, { 895,11}, { 479,13}, \ + { 127,12}, { 255,11}, { 511,12}, { 287,10}, \ + { 1151,12}, { 319,11}, { 639,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,11}, { 895,12}, \ + { 479,11}, { 959,12}, { 511,11}, { 1023,12}, \ + { 575,11}, { 1151,13}, { 319,12}, { 639,11}, \ + { 1279,13}, { 383,12}, { 831,13}, { 447,12}, \ + { 895,14}, { 255,13}, { 511,12}, { 1023,13}, \ + { 703,14}, { 383,13}, { 831,12}, { 1663,13}, \ + { 895,15}, { 255,14}, { 511,13}, { 1151,14}, \ + { 639,13}, { 1407,14}, { 767,13}, { 1535,14}, \ + { 895,15}, { 511,14}, { 1151,13}, { 2431,14}, \ + { 1407,15}, { 767,14}, { 1791,16}, { 511,15}, \ + { 1023,14}, { 2431,15}, { 1279,14}, { 2815,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 165 +#define SQR_FFT_THRESHOLD 2496 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 39 +#define MULLO_MUL_N_THRESHOLD 6253 +#define SQRLO_BASECASE_THRESHOLD 4 +#define SQRLO_DC_THRESHOLD 56 +#define SQRLO_SQR_THRESHOLD 4940 + +#define DC_DIV_QR_THRESHOLD 36 +#define DC_DIVAPPR_Q_THRESHOLD 136 +#define DC_BDIV_QR_THRESHOLD 35 +#define DC_BDIV_Q_THRESHOLD 88 + +#define INV_MULMOD_BNM1_THRESHOLD 30 +#define INV_NEWTON_THRESHOLD 149 +#define INV_APPR_THRESHOLD 139 + +#define BINV_NEWTON_THRESHOLD 166 +#define REDC_1_TO_REDC_N_THRESHOLD 38 + +#define MU_DIV_QR_THRESHOLD 1120 +#define MU_DIVAPPR_Q_THRESHOLD 1078 +#define MUPI_DIV_QR_THRESHOLD 68 +#define MU_BDIV_QR_THRESHOLD 889 +#define MU_BDIV_Q_THRESHOLD 942 + +#define POWM_SEC_TABLE 4,22,102,473,1730 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 22 +#define SET_STR_DC_THRESHOLD 381 +#define SET_STR_PRECOMPUTE_THRESHOLD 1042 + +#define FAC_DSC_THRESHOLD 140 +#define FAC_ODD_THRESHOLD 23 + +#define MATRIX22_STRASSEN_THRESHOLD 11 +#define 
HGCD2_DIV1_METHOD 1 /* 7.84% faster than 3 */ +#define HGCD_THRESHOLD 80 +#define HGCD_APPR_THRESHOLD 80 +#define HGCD_REDUCE_THRESHOLD 1679 +#define GCD_DC_THRESHOLD 273 +#define GCDEXT_DC_THRESHOLD 201 +#define JACOBI_BASE_METHOD 1 /* 1.03% faster than 4 */ + +/* Tuneup completed successfully, took 64972 seconds */ diff --git a/gmp-6.3.0/mpn/arm64/darwin.m4 b/gmp-6.3.0/mpn/arm64/darwin.m4 new file mode 100644 index 0000000..36e72fe --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/darwin.m4 @@ -0,0 +1,50 @@ +divert(-1) + +dnl m4 macros for ARM64 Darwin assembler. + +dnl Copyright 2020 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl Standard commenting is with @, the default m4 # is for constants and we +dnl don't want to disable macro expansions in or after them. + +changecom + + +dnl LEA_HI(reg,gmp_symbol), LEA_LO(reg,gmp_symbol) +dnl +dnl Load the address of gmp_symbol into a register. We split this into two +dnl parts to allow separation for manual insn scheduling. TODO: Darwin allows +dnl for relaxing these two insns into an adr and a nop, but that requires the +dnl .loh pseudo for connecting them. + +define(`LEA_HI',`adrp $1, $2@GOTPAGE')dnl +define(`LEA_LO',`ldr $1, [$1, $2@GOTPAGEOFF]')dnl + +divert`'dnl diff --git a/gmp-6.3.0/mpn/arm64/divrem_1.asm b/gmp-6.3.0/mpn/arm64/divrem_1.asm new file mode 100644 index 0000000..9d5bb59 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/divrem_1.asm @@ -0,0 +1,231 @@ +dnl ARM64 mpn_divrem_1 and mpn_preinv_divrem_1. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2020 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
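The quotient loop of the divrem_1 code that follows is the Möller-Granlund 2-by-1 division with a precomputed reciprocal: for a normalized divisor d (top bit set) and B = 2^64, dinv = floor((B^2-1)/d) - B is what mpn_invert_limb returns, and each step turns a two-limb partial remainder nh:nl into one quotient limb plus a one-limb remainder. A C sketch of one such step under those definitions, assuming unsigned __int128 support; the function name is illustrative, not GMP's:

#include <stdint.h>

/* One quotient-limb step, mirroring the mul/umulh/adds/adc/msub/csel
   sequence in the L(utop)/L(ntop) loops below.  Requires nh < d and d
   normalized; dinv = floor((2^128 - 1) / d) - 2^64.  Wrapping of the
   128-bit sum is harmless, exactly as in the two-limb asm arithmetic.  */
static uint64_t
div_qr_1_step (uint64_t *q, uint64_t nh, uint64_t nl,
               uint64_t d, uint64_t dinv)
{
  unsigned __int128 p = (unsigned __int128) nh * dinv;
  p += ((unsigned __int128) (nh + 1) << 64) + nl;  /* add (nh+1):nl */
  uint64_t qh = (uint64_t) (p >> 64);
  uint64_t ql = (uint64_t) p;
  uint64_t r  = nl - qh * d;        /* mod 2^64, like msub in the asm */
  if (r > ql)                       /* the cmp/csel/sbc correction */
    {
      qh--;
      r += d;
    }
  if (r >= d)                       /* rare second fixup, L(ufx)/L(nfx) */
    {
      qh++;
      r -= d;
    }
  *q = qh;
  return r;
}

For an unnormalised divisor the entry code first shifts divisor and dividend left by the leading-zero count, which is what the clz/lsl prologue at L(unnorm) below arranges.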
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +dnl TODO +dnl * Handle the most significant quotient limb for the unnormalised case +dnl specially, just like in the C code. (It is very often 0.) + +define(`qp_arg', x0) +define(`fn_arg', x1) +define(`np_arg', x2) +define(`n_arg', x3) +define(`d_arg', x4) +define(`dinv_arg', x5) +define(`cnt_arg', x6) + +define(`qp', x19) +define(`np', x20) +define(`n', x21) +define(`d', x22) +define(`fn', x24) +define(`dinv', x0) +define(`cnt', x23) +define(`tnc', x8) + +dnl mp_limb_t +dnl mpn_divrem_1 (mp_ptr qp, mp_size_t fn, +dnl mp_srcptr np, mp_size_t n, +dnl mp_limb_t d_unnorm) + +dnl mp_limb_t +dnl mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn, +dnl mp_srcptr np, mp_size_t n, +dnl mp_limb_t d_unnorm, mp_limb_t dinv, int cnt) + +ASM_START() + +PROLOGUE(mpn_preinv_divrem_1) + cbz n_arg, L(fz) + stp x29, x30, [sp, #-80]! + mov x29, sp + stp x19, x20, [sp, #16] + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + + sub n, n_arg, #1 + add x7, n, fn_arg + add np, np_arg, n, lsl #3 + add qp, qp_arg, x7, lsl #3 + mov fn, fn_arg + mov d, d_arg + mov dinv, dinv_arg + tbnz d_arg, #63, L(nentry) + mov cnt, cnt_arg + b L(uentry) +EPILOGUE() + +PROLOGUE(mpn_divrem_1) + cbz n_arg, L(fz) + stp x29, x30, [sp, #-80]! + mov x29, sp + stp x19, x20, [sp, #16] + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + + sub n, n_arg, #1 + add x7, n, fn_arg + add np, np_arg, n, lsl #3 + add qp, qp_arg, x7, lsl #3 + mov fn, fn_arg + mov d, d_arg + tbnz d_arg, #63, L(normalised) + +L(unnorm): + clz cnt, d + lsl x0, d, cnt + bl GSYM_PREFIX`'MPN(invert_limb) +L(uentry): + lsl d, d, cnt + ldr x7, [np], #-8 + sub tnc, xzr, cnt + lsr x11, x7, tnc C r + lsl x1, x7, cnt + cbz n, L(uend) + +L(utop):ldr x7, [np], #-8 + add x2, x11, #1 + mul x10, x11, dinv + umulh x17, x11, dinv + lsr x9, x7, tnc + orr x1, x1, x9 + adds x10, x1, x10 + adc x2, x2, x17 + msub x11, d, x2, x1 + lsl x1, x7, cnt + cmp x10, x11 + add x14, x11, d + csel x11, x14, x11, cc + sbc x2, x2, xzr + cmp x11, d + bcs L(ufx) +L(uok): str x2, [qp], #-8 + sub n, n, #1 + cbnz n, L(utop) + +L(uend):add x2, x11, #1 + mul x10, x11, dinv + umulh x17, x11, dinv + adds x10, x1, x10 + adc x2, x2, x17 + msub x11, d, x2, x1 + cmp x10, x11 + add x14, x11, d + csel x11, x14, x11, cc + sbc x2, x2, xzr + subs x14, x11, d + adc x2, x2, xzr + csel x11, x14, x11, cs + str x2, [qp], #-8 + + cbnz fn, L(ftop) + lsr x0, x11, cnt + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp x29, x30, [sp], #80 + ret + +L(ufx): add x2, x2, #1 + sub x11, x11, d + b L(uok) + + +L(normalised): + mov x0, d + bl GSYM_PREFIX`'MPN(invert_limb) +L(nentry): + ldr x7, [np], #-8 + subs x14, x7, d + adc x2, xzr, xzr C hi q limb + csel x11, x14, x7, cs + b L(nok) + +L(ntop):ldr x1, [np], #-8 + add x2, x11, #1 + mul x10, x11, dinv + umulh x17, x11, dinv + adds x10, x1, x10 + adc x2, x2, x17 + msub x11, d, x2, x1 + cmp x10, x11 + add x14, x11, d + csel x11, x14, x11, cc C remainder + sbc x2, x2, xzr + cmp x11, d + bcs L(nfx) +L(nok): str x2, [qp], #-8 + sub n, n, #1 + tbz n, #63, L(ntop) + +L(nend):cbnz fn, L(frac) + mov x0, x11 + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp x29, x30, [sp], #80 + ret + +L(nfx): add x2, x2, #1 + sub x11, x11, d + b L(nok) + +L(frac):mov cnt, #0 +L(ftop):add x2, x11, #1 + mul x10, x11, dinv + 
umulh x17, x11, dinv + add x2, x2, x17 + msub x11, d, x2, xzr + cmp x10, x11 + add x14, x11, d + csel x11, x14, x11, cc C remainder + sbc x2, x2, xzr + str x2, [qp], #-8 + sub fn, fn, #1 + cbnz fn, L(ftop) + + lsr x0, x11, cnt + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp x29, x30, [sp], #80 + ret + +C Block zero. We need this for the degenerated case of n = 0, fn != 0. +L(fz): cbz fn_arg, L(zend) +L(ztop):str xzr, [qp_arg], #8 + sub fn_arg, fn_arg, #1 + cbnz fn_arg, L(ztop) +L(zend):mov x0, #0 + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/gcd_11.asm b/gmp-6.3.0/mpn/arm64/gcd_11.asm new file mode 100644 index 0000000..d8cc3e2 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/gcd_11.asm @@ -0,0 +1,70 @@ +dnl ARM v8a mpn_gcd_11. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for ARM by Torbjorn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +changecom(blah) + +C cycles/bit (approx) +C Cortex-A35 ? +C Cortex-A53 ? +C Cortex-A55 ? +C Cortex-A57 ? +C Cortex-A72 ? +C Cortex-A73 ? +C Cortex-A75 ? +C Cortex-A76 ? +C Cortex-A77 ? +C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1 + +define(`u0', `x0') +define(`v0', `x1') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_gcd_11) + subs x3, u0, v0 C 0 + b.eq L(end) C + + ALIGN(16) +L(top): rbit x12, x3 C 1,5 + clz x12, x12 C 2 + csneg x3, x3, x3, cs C v = abs(u-v), even 1 + csel u0, v0, u0, cs C u = min(u,v) 1 + lsr v0, x3, x12 C 3 + subs x3, u0, v0 C 4 + b.ne L(top) C + +L(end): ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/gcd_22.asm b/gmp-6.3.0/mpn/arm64/gcd_22.asm new file mode 100644 index 0000000..5367fea --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/gcd_22.asm @@ -0,0 +1,112 @@ +dnl ARM v8a mpn_gcd_22. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
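The mpn_gcd_11 loop above is a binary-style GCD on two single limbs: replace the pair by (min(u,v), |u-v|) and strip the trailing zeros of the difference, where the rbit/clz pair is AArch64's way of counting trailing zeros. A portable C model, with __builtin_ctzll (GCC/Clang) standing in for rbit+clz, assuming both inputs are odd and nonzero as the gcd_11 calling convention expects:

#include <stdint.h>

/* Model of the L(top) loop in gcd_11 above; u and v must be odd.  */
static uint64_t
gcd_11_model (uint64_t u, uint64_t v)
{
  while (u != v)
    {
      uint64_t d = u > v ? u - v : v - u;  /* csneg: |u - v|, even */
      if (v < u)
        u = v;                             /* csel: u = min(u, v)  */
      v = d >> __builtin_ctzll (d);        /* rbit+clz+lsr: make odd */
    }
  return u;                                /* common value is the gcd */
}

The gcd_22 code that follows applies the same reduction to two-limb operands, dropping into the single-limb loop once the high limbs become zero.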
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +changecom(blah) + +C cycles/bit (approx) +C Cortex-A35 ? +C Cortex-A53 7.26 +C Cortex-A55 ? +C Cortex-A57 ? +C Cortex-A72 5.72 +C Cortex-A73 6.43 +C Cortex-A75 ? +C Cortex-A76 ? +C Cortex-A77 ? + + +define(`u1', `x0') +define(`u0', `x1') +define(`v1', `x2') +define(`v0', `x3') + +define(`t0', `x5') +define(`t1', `x6') +define(`cnt', `x7') +define(`tnc', `x8') + +ASM_START() +PROLOGUE(mpn_gcd_22) + + ALIGN(16) +L(top): subs t0, u0, v0 C 0 6 + cbz t0, L(lowz) + sbcs t1, u1, v1 C 1 7 + + rbit cnt, t0 C 1 + + cneg t0, t0, cc C 2 + cinv t1, t1, cc C 2 u = |u - v| +L(bck): csel v0, v0, u0, cs C 2 + csel v1, v1, u1, cs C 2 v = min(u,v) + + clz cnt, cnt C 2 + sub tnc, xzr, cnt C 3 + + lsr u0, t0, cnt C 3 + lsl x14, t1, tnc C 4 + lsr u1, t1, cnt C 3 + orr u0, u0, x14 C 5 + + orr x11, u1, v1 + cbnz x11, L(top) + + + subs x4, u0, v0 C 0 + b.eq L(end1) C + + ALIGN(16) +L(top1):rbit x12, x4 C 1,5 + clz x12, x12 C 2 + csneg x4, x4, x4, cs C v = abs(u-v), even 1 + csel u0, v0, u0, cs C u = min(u,v) 1 + lsr v0, x4, x12 C 3 + subs x4, u0, v0 C 4 + b.ne L(top1) C +L(end1):mov x0, u0 + mov x1, #0 + ret + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + subs t0, u1, v1 + b.eq L(end) + mov t1, #0 + rbit cnt, t0 C 1 + cneg t0, t0, cc C 2 + b L(bck) C FIXME: make conditional + +L(end): mov x0, v0 + mov x1, v1 + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/gmp-mparam.h b/gmp-6.3.0/mpn/arm64/gmp-mparam.h new file mode 100644 index 0000000..7c0c193 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/gmp-mparam.h @@ -0,0 +1,192 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 1536 MHz Cortex-A53 */ +/* FFT tuning limit = 0.5 M */ +/* Generated by tuneup.c, 2019-09-29, gcc 5.4 */ + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 4 +#define MOD_1_1P_METHOD 2 /* 2.08% faster than 1 */ +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 20 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 21 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 1 /* 38.26% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 13 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 40 + +#define DIV_1_VS_MUL_1_PERCENT 159 + +#define MUL_TOOM22_THRESHOLD 14 +#define MUL_TOOM33_THRESHOLD 49 +#define MUL_TOOM44_THRESHOLD 82 +#define MUL_TOOM6H_THRESHOLD 173 +#define MUL_TOOM8H_THRESHOLD 236 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 76 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 81 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 74 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 18 +#define SQR_TOOM3_THRESHOLD 67 +#define SQR_TOOM4_THRESHOLD 166 +#define SQR_TOOM6_THRESHOLD 222 +#define SQR_TOOM8_THRESHOLD 333 + +#define MULMID_TOOM42_THRESHOLD 20 + +#define MULMOD_BNM1_THRESHOLD 10 +#define SQRMOD_BNM1_THRESHOLD 11 + +#define MUL_FFT_MODF_THRESHOLD 316 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 316, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 13, 7}, { 7, 6}, { 15, 7}, { 8, 6}, \ + { 17, 7}, { 9, 6}, { 19, 7}, { 17, 8}, \ + { 9, 7}, { 20, 8}, { 11, 7}, { 23, 8}, \ + { 13, 9}, { 7, 8}, { 19, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 83,10}, { 47, 9}, { 99,10}, { 55,11}, \ + { 31,10}, { 63, 9}, { 127, 8}, { 255, 9}, \ + { 131,10}, { 71, 8}, { 287,10}, { 79, 9}, \ + { 159, 8}, { 319,10}, { 87,11}, { 47,10}, \ + { 95, 9}, { 191, 8}, { 383,10}, { 103, 9}, \ + { 207, 8}, { 415,10}, { 111, 9}, { 223,12}, \ + { 31,11}, { 63, 9}, { 255, 8}, { 511,10}, \ + { 135, 9}, { 287, 8}, { 575,11}, { 79,10}, \ + { 159, 9}, { 319, 8}, { 639,10}, { 175, 9}, \ + { 351, 8}, { 703,11}, { 95,10}, { 191, 9}, \ + { 383, 8}, { 767,10}, { 207, 9}, { 415,11}, \ + { 111,10}, { 223, 9}, { 447,12}, { 63,10}, \ + { 255, 9}, { 511, 8}, { 1023, 9}, { 543,10}, \ + { 287, 9}, { 575, 8}, { 1151,11}, { 159,10}, \ + { 319, 9}, { 639,11}, { 175,10}, { 351, 9}, \ + { 703, 8}, { 1407,12}, { 95,11}, { 191,10}, \ + { 383, 9}, { 767,11}, { 207,10}, { 415, 9}, \ + { 831,11}, { 223,10}, { 447,13}, { 8192,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 118 +#define MUL_FFT_THRESHOLD 3200 + +#define SQR_FFT_MODF_THRESHOLD 272 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 272, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 8, 5}, { 17, 6}, { 17, 7}, { 17, 8}, \ + { 9, 7}, { 19, 8}, { 11, 7}, { 23, 8}, \ + { 13, 9}, { 7, 8}, { 15, 7}, { 31, 8}, \ + { 19, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + 
{ 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \ + { 95, 8}, { 191,10}, { 55,11}, { 31,10}, \ + { 63, 8}, { 255,10}, { 71, 9}, { 143, 8}, \ + { 287,10}, { 79, 9}, { 159,11}, { 47,10}, \ + { 95, 9}, { 191, 8}, { 383, 7}, { 767,10}, \ + { 103, 9}, { 207,12}, { 31,11}, { 63, 9}, \ + { 255, 8}, { 511, 7}, { 1023, 9}, { 271,10}, \ + { 143, 9}, { 287,11}, { 79,10}, { 159, 9}, \ + { 319, 8}, { 639,10}, { 175, 9}, { 351, 8}, \ + { 703,11}, { 95,10}, { 191, 9}, { 383, 8}, \ + { 767,10}, { 207, 9}, { 415, 8}, { 831,10}, \ + { 223,12}, { 63,10}, { 255, 9}, { 511, 8}, \ + { 1023,10}, { 271,11}, { 143,10}, { 287, 9}, \ + { 575, 8}, { 1151,11}, { 159,10}, { 319, 9}, \ + { 639,11}, { 175,10}, { 351, 9}, { 703,12}, \ + { 95,11}, { 191,10}, { 383, 9}, { 767,11}, \ + { 207,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447,13}, { 8192,14}, { 16384,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 112 +#define SQR_FFT_THRESHOLD 2688 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 38 +#define MULLO_MUL_N_THRESHOLD 6253 +#define SQRLO_BASECASE_THRESHOLD 4 +#define SQRLO_DC_THRESHOLD 67 +#define SQRLO_SQR_THRESHOLD 5240 + +#define DC_DIV_QR_THRESHOLD 42 +#define DC_DIVAPPR_Q_THRESHOLD 152 +#define DC_BDIV_QR_THRESHOLD 39 +#define DC_BDIV_Q_THRESHOLD 93 + +#define INV_MULMOD_BNM1_THRESHOLD 37 +#define INV_NEWTON_THRESHOLD 163 +#define INV_APPR_THRESHOLD 162 + +#define BINV_NEWTON_THRESHOLD 194 +#define REDC_1_TO_REDC_N_THRESHOLD 43 + +#define MU_DIV_QR_THRESHOLD 998 +#define MU_DIVAPPR_Q_THRESHOLD 998 +#define MUPI_DIV_QR_THRESHOLD 98 +#define MU_BDIV_QR_THRESHOLD 807 +#define MU_BDIV_Q_THRESHOLD 924 + +#define POWM_SEC_TABLE 6,30,194,579,1730 + +#define GET_STR_DC_THRESHOLD 15 +#define GET_STR_PRECOMPUTE_THRESHOLD 29 +#define SET_STR_DC_THRESHOLD 788 +#define SET_STR_PRECOMPUTE_THRESHOLD 1816 + +#define FAC_DSC_THRESHOLD 236 +#define FAC_ODD_THRESHOLD 24 + +#define MATRIX22_STRASSEN_THRESHOLD 10 +#define HGCD2_DIV1_METHOD 1 /* 7.05% faster than 3 */ +#define HGCD_THRESHOLD 101 +#define HGCD_APPR_THRESHOLD 104 +#define HGCD_REDUCE_THRESHOLD 1679 +#define GCD_DC_THRESHOLD 330 +#define GCDEXT_DC_THRESHOLD 242 +#define JACOBI_BASE_METHOD 4 /* 20.00% faster than 1 */ diff --git a/gmp-6.3.0/mpn/arm64/hamdist.asm b/gmp-6.3.0/mpn/arm64/hamdist.asm new file mode 100644 index 0000000..c72ca55 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/hamdist.asm @@ -0,0 +1,181 @@ +dnl ARM64 Neon mpn_hamdist -- mpn bit hamming distance. + +dnl Copyright 2013, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Cortex-A53 4.5 +C Cortex-A57 1.9 +C X-Gene 4.36 + +C TODO +C * Consider greater unrolling. +C * Arrange to align the pointer, if that helps performance. Use the same +C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry +C valgrind!) +C * Explore if explicit align directives, e.g., "[ptr:128]" help. +C * See rth's gmp-devel 2013-02/03 messages about final summation tricks. + +changecom(blah) + +C INPUT PARAMETERS +define(`ap', x0) +define(`bp', x1) +define(`n', x2) + +C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end +C up with 8 16-bit counters. Therefore, we can sum to 8(2^16-1) bits, or +C (8*2^16-1)/64 = 0x1fff limbs. We use a chunksize close to that, but which +C allows the huge count code to jump deep into the code (at L(chu)). + +define(`maxsize', 0x1fff) +define(`chunksize',0x1ff0) + +ASM_START() +PROLOGUE(mpn_hamdist) + + mov x11, #maxsize + cmp n, x11 + b.hi L(gt8k) + +L(lt8k): + movi v4.16b, #0 C clear summation register + movi v5.16b, #0 C clear summation register + + tbz n, #0, L(xx0) + sub n, n, #1 + ld1 {v0.1d}, [ap], #8 C load 1 limb + ld1 {v16.1d}, [bp], #8 C load 1 limb + eor v0.16b, v0.16b, v16.16b + cnt v6.16b, v0.16b + uadalp v4.8h, v6.16b C could also splat + +L(xx0): tbz n, #1, L(x00) + sub n, n, #2 + ld1 {v0.2d}, [ap], #16 C load 2 limbs + ld1 {v16.2d}, [bp], #16 C load 2 limbs + eor v0.16b, v0.16b, v16.16b + cnt v6.16b, v0.16b + uadalp v4.8h, v6.16b + +L(x00): tbz n, #2, L(000) + subs n, n, #4 + ld1 {v0.2d,v1.2d}, [ap], #32 C load 4 limbs + ld1 {v16.2d,v17.2d}, [bp], #32 C load 4 limbs + b.ls L(sum) + +L(gt4): ld1 {v2.2d,v3.2d}, [ap], #32 C load 4 limbs + ld1 {v18.2d,v19.2d}, [bp], #32 C load 4 limbs + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + sub n, n, #4 + cnt v6.16b, v0.16b + cnt v7.16b, v1.16b + b L(mid) + +L(000): subs n, n, #8 + b.lo L(e0) + +L(chu): ld1 {v2.2d,v3.2d}, [ap], #32 C load 4 limbs + ld1 {v0.2d,v1.2d}, [ap], #32 C load 4 limbs + ld1 {v18.2d,v19.2d}, [bp], #32 C load 4 limbs + ld1 {v16.2d,v17.2d}, [bp], #32 C load 4 limbs + eor v2.16b, v2.16b, v18.16b + eor v3.16b, v3.16b, v19.16b + cnt v6.16b, v2.16b + cnt v7.16b, v3.16b + subs n, n, #8 + b.lo L(end) + +L(top): ld1 {v2.2d,v3.2d}, [ap], #32 C load 4 limbs + ld1 {v18.2d,v19.2d}, [bp], #32 C load 4 limbs + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + uadalp v4.8h, v6.16b + cnt v6.16b, v0.16b + uadalp v5.8h, v7.16b + cnt v7.16b, v1.16b +L(mid): ld1 {v0.2d,v1.2d}, [ap], #32 C load 4 limbs + ld1 {v16.2d,v17.2d}, [bp], #32 C load 4 limbs + eor v2.16b, v2.16b, v18.16b + eor v3.16b, v3.16b, v19.16b + subs n, n, #8 + uadalp v4.8h, v6.16b + cnt v6.16b, v2.16b + uadalp v5.8h, v7.16b + cnt v7.16b, v3.16b + b.hs L(top) + +L(end): uadalp v4.8h, v6.16b + uadalp v5.8h, v7.16b +L(sum): eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v17.16b + cnt v6.16b, v0.16b + cnt v7.16b, v1.16b + uadalp v4.8h, v6.16b + uadalp v5.8h, v7.16b + add v4.8h, v4.8h, v5.8h + C we have 8 16-bit counts +L(e0): uaddlp v4.4s, v4.8h C we have 4 32-bit counts + uaddlp v4.2d, v4.4s C we have 2 64-bit counts + mov x0, v4.d[0] + mov x1, v4.d[1] + add x0, x0, x1 + ret + +C Code for count > maxsize. Splits operand and calls above code. 
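The cap above exists because the sixteen 16-bit lanes, once folded to eight counters, can absorb at most 8*(2^16-1) set bits, just under 0x2000 limbs of worst-case input; larger operands are cut into chunksize-limb blocks whose partial counts are accumulated in a plain 64-bit total, which is what the L(gt8k) code just below does. The same splitting in portable C, with __builtin_popcountll standing in for the Neon cnt/uadalp ladder; function names are illustrative:

#include <stddef.h>
#include <stdint.h>

#define MAXSIZE 0x1fff   /* limbs one inner pass can count safely */
#define CHUNK   0x1ff0   /* chunk size actually used */

/* Stand-in for the L(lt8k) kernel: hamming distance of <= MAXSIZE limbs. */
static uint64_t
hamdist_small (const uint64_t *ap, const uint64_t *bp, size_t n)
{
  uint64_t sum = 0;
  for (size_t i = 0; i < n; i++)
    sum += (uint64_t) __builtin_popcountll (ap[i] ^ bp[i]);
  return sum;
}

/* Outer loop modelling L(gt8k): peel CHUNK-limb blocks, then finish. */
uint64_t
hamdist_model (const uint64_t *ap, const uint64_t *bp, size_t n)
{
  uint64_t total = 0;
  while (n > MAXSIZE)
    {
      total += hamdist_small (ap, bp, CHUNK);
      ap += CHUNK;
      bp += CHUNK;
      n -= CHUNK;
    }
  return total + hamdist_small (ap, bp, n);
}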
+define(`ap2', x5) C caller-saves reg not used above +define(`bp2', x6) C caller-saves reg not used above +L(gt8k): + mov x8, x30 + mov x7, n C full count (caller-saves reg not used above) + mov x4, #0 C total sum (caller-saves reg not used above) + mov x9, #chunksize*8 C caller-saves reg not used above + mov x10, #chunksize C caller-saves reg not used above + +1: add ap2, ap, x9 C point at subsequent block + add bp2, bp, x9 C point at subsequent block + mov n, #chunksize-8 C count for this invocation, adjusted for entry pt + movi v4.16b, #0 C clear chunk summation register + movi v5.16b, #0 C clear chunk summation register + bl L(chu) C jump deep inside code + add x4, x4, x0 + mov ap, ap2 C put chunk pointer in place for calls + mov bp, bp2 C put chunk pointer in place for calls + sub x7, x7, x10 + cmp x7, x11 + b.hi 1b + + mov n, x7 C count for final invocation + bl L(lt8k) + add x0, x4, x0 + mov x30, x8 + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/invert_limb.asm b/gmp-6.3.0/mpn/arm64/invert_limb.asm new file mode 100644 index 0000000..6a99bf0 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/invert_limb.asm @@ -0,0 +1,83 @@ +dnl ARM64 mpn_invert_limb -- Invert a normalized limb. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Cortex-A53 ? +C Cortex-A57 ? + +C Compiler generated, mildly edited. Could surely be further optimised. + +ASM_START() +PROLOGUE(mpn_invert_limb) + lsr x2, x0, #54 + LEA_HI( x1, approx_tab) + and x2, x2, #0x1fe + LEA_LO( x1, approx_tab) + ldrh w3, [x1,x2] + lsr x4, x0, #24 + add x4, x4, #1 + ubfiz x2, x3, #11, #16 + umull x3, w3, w3 + mul x3, x3, x4 + sub x2, x2, #1 + sub x2, x2, x3, lsr #40 + lsl x3, x2, #60 + mul x1, x2, x2 + msub x1, x1, x4, x3 + lsl x2, x2, #13 + add x1, x2, x1, lsr #47 + and x2, x0, #1 + neg x3, x2 + and x3, x3, x1, lsr #1 + add x2, x2, x0, lsr #1 + msub x2, x1, x2, x3 + umulh x2, x2, x1 + lsl x1, x1, #31 + add x1, x1, x2, lsr #1 + mul x3, x1, x0 + umulh x2, x1, x0 + adds x4, x3, x0 + adc x0, x2, x0 + sub x0, x1, x0 + ret +EPILOGUE() + + RODATA + ALIGN(2) + TYPE( approx_tab, object) + SIZE( approx_tab, 512) +approx_tab: +forloop(i,256,512-1,dnl +` .hword eval(0x7fd00/i) +')dnl diff --git a/gmp-6.3.0/mpn/arm64/logops_n.asm b/gmp-6.3.0/mpn/arm64/logops_n.asm new file mode 100644 index 0000000..e959abc --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/logops_n.asm @@ -0,0 +1,139 @@ +dnl ARM64 mpn_and_n, mpn_andn_n, mpn_nand_n, etc.
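The file that follows builds all eight bitwise mpn entry points from one template: the m4 LOGOP macro expands to the matching AArch64 instruction (and, bic, orr, orn, eor, eon) and POSTOP to an optional trailing mvn for nand and nior, which have no single-instruction form. The limb-level semantics of the eight functions, as a C sketch of what each generated routine computes:

#include <stddef.h>
#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Limb-wise semantics of the eight generated entry points.  */
static inline mp_limb_t op_and  (mp_limb_t a, mp_limb_t b) { return a & b; }
static inline mp_limb_t op_andn (mp_limb_t a, mp_limb_t b) { return a & ~b; }   /* bic */
static inline mp_limb_t op_nand (mp_limb_t a, mp_limb_t b) { return ~(a & b); } /* and + mvn */
static inline mp_limb_t op_ior  (mp_limb_t a, mp_limb_t b) { return a | b; }
static inline mp_limb_t op_iorn (mp_limb_t a, mp_limb_t b) { return a | ~b; }   /* orn */
static inline mp_limb_t op_nior (mp_limb_t a, mp_limb_t b) { return ~(a | b); } /* orr + mvn */
static inline mp_limb_t op_xor  (mp_limb_t a, mp_limb_t b) { return a ^ b; }
static inline mp_limb_t op_xnor (mp_limb_t a, mp_limb_t b) { return ~(a ^ b); } /* eon */

/* Shape shared by all eight: rp[i] = POSTOP(LOGOP(up[i], vp[i])).  */
static void
logop_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp, size_t n,
         mp_limb_t (*op) (mp_limb_t, mp_limb_t))
{
  for (size_t i = 0; i < n; i++)
    rp[i] = op (up[i], vp[i]);
}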
+ +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb +C nand,nior all other +C Cortex-A53 3.25-3.5 2.75-3 +C Cortex-A57 2.0 1.5 +C X-Gene 2.14 2.0 + +changecom(blah) + +define(`rp', `x0') +define(`up', `x1') +define(`vp', `x2') +define(`n', `x3') + +define(`POSTOP', `dnl') + +ifdef(`OPERATION_and_n',` + define(`func', `mpn_and_n') + define(`LOGOP', `and $1, $2, $3')') +ifdef(`OPERATION_andn_n',` + define(`func', `mpn_andn_n') + define(`LOGOP', `bic $1, $2, $3')') +ifdef(`OPERATION_nand_n',` + define(`func', `mpn_nand_n') + define(`POSTOP', `mvn $1, $1') + define(`LOGOP', `and $1, $2, $3')') +ifdef(`OPERATION_ior_n',` + define(`func', `mpn_ior_n') + define(`LOGOP', `orr $1, $2, $3')') +ifdef(`OPERATION_iorn_n',` + define(`func', `mpn_iorn_n') + define(`LOGOP', `orn $1, $2, $3')') +ifdef(`OPERATION_nior_n',` + define(`func', `mpn_nior_n') + define(`POSTOP', `mvn $1, $1') + define(`LOGOP', `orr $1, $2, $3')') +ifdef(`OPERATION_xor_n',` + define(`func', `mpn_xor_n') + define(`LOGOP', `eor $1, $2, $3')') +ifdef(`OPERATION_xnor_n',` + define(`func', `mpn_xnor_n') + define(`LOGOP', `eon $1, $2, $3')') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +ASM_START() +PROLOGUE(func) + lsr x17, n, #2 + tbz n, #0, L(bx0) + +L(bx1): ldr x7, [up] + ldr x11, [vp] + LOGOP( x15, x7, x11) + POSTOP( x15) + str x15, [rp],#8 + tbnz n, #1, L(b11) + +L(b01): cbz x17, L(ret) + ldp x4, x5, [up,#8] + ldp x8, x9, [vp,#8] + sub up, up, #8 + sub vp, vp, #8 + b L(mid) + +L(b11): ldp x6, x7, [up,#8] + ldp x10, x11, [vp,#8] + add up, up, #8 + add vp, vp, #8 + cbz x17, L(end) + b L(top) + +L(bx0): tbnz n, #1, L(b10) + +L(b00): ldp x4, x5, [up],#-16 + ldp x8, x9, [vp],#-16 + b L(mid) + +L(b10): ldp x6, x7, [up] + ldp x10, x11, [vp] + cbz x17, L(end) + + ALIGN(16) +L(top): ldp x4, x5, [up,#16] + ldp x8, x9, [vp,#16] + LOGOP( x12, x6, x10) + LOGOP( x13, x7, x11) + POSTOP( x12) + POSTOP( x13) + stp x12, x13, [rp],#16 +L(mid): ldp x6, x7, [up,#32]! + ldp x10, x11, [vp,#32]! 
+ LOGOP( x12, x4, x8) + LOGOP( x13, x5, x9) + POSTOP( x12) + POSTOP( x13) + stp x12, x13, [rp],#16 + sub x17, x17, #1 + cbnz x17, L(top) + +L(end): LOGOP( x12, x6, x10) + LOGOP( x13, x7, x11) + POSTOP( x12) + POSTOP( x13) + stp x12, x13, [rp] +L(ret): ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/lshift.asm b/gmp-6.3.0/mpn/arm64/lshift.asm new file mode 100644 index 0000000..fe8a1aa --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/lshift.asm @@ -0,0 +1,138 @@ +dnl ARM64 mpn_lshift. + +dnl Copyright 2013, 2014, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb assumed optimal c/l +C Cortex-A53 3.5-4.0 3.25 +C Cortex-A57 2.0 2.0 +C X-Gene 2.67 2.5 + +C TODO +C * The feed-in code used 1 ldr for odd sized and 2 ldr for even sizes. These +C numbers should be 1 and 0, respectively. The str in wind-down should also +C go. +C * Using extr and with 63 separate loops we might reach 1.25 c/l on A57. +C * A53's speed depends on alignment, tune/speed -w1 gives 3.5, -w0 gives 4.0. + +changecom(blah) + +define(`rp_arg', `x0') +define(`up', `x1') +define(`n', `x2') +define(`cnt', `x3') + +define(`rp', `x16') + +define(`tnc',`x8') + +define(`PSHIFT', lsl) +define(`NSHIFT', lsr) + +ASM_START() +PROLOGUE(mpn_lshift) + add rp, rp_arg, n, lsl #3 + add up, up, n, lsl #3 + sub tnc, xzr, cnt + lsr x17, n, #2 + tbz n, #0, L(bx0) + +L(bx1): ldr x4, [up,#-8] + tbnz n, #1, L(b11) + +L(b01): NSHIFT x0, x4, tnc + PSHIFT x2, x4, cnt + cbnz x17, L(gt1) + str x2, [rp,#-8] + ret +L(gt1): ldp x4, x5, [up,#-24] + sub up, up, #8 + add rp, rp, #16 + b L(lo2) + +L(b11): NSHIFT x0, x4, tnc + PSHIFT x2, x4, cnt + ldp x6, x7, [up,#-24]! + b L(lo3) + +L(bx0): ldp x4, x5, [up,#-16] + tbz n, #1, L(b00) + +L(b10): NSHIFT x0, x5, tnc + PSHIFT x13, x5, cnt + NSHIFT x10, x4, tnc + PSHIFT x2, x4, cnt + cbnz x17, L(gt2) + orr x10, x10, x13 + stp x2, x10, [rp,#-16] + ret +L(gt2): ldp x4, x5, [up,#-32] + orr x10, x10, x13 + str x10, [rp,#-8] + sub up, up, #16 + add rp, rp, #8 + b L(lo2) + +L(b00): NSHIFT x0, x5, tnc + PSHIFT x13, x5, cnt + NSHIFT x10, x4, tnc + PSHIFT x2, x4, cnt + ldp x6, x7, [up,#-32]! + orr x10, x10, x13 + str x10, [rp,#-8]! + b L(lo0) + + ALIGN(16) +L(top): ldp x4, x5, [up,#-16] + orr x10, x10, x13 + orr x11, x12, x2 + stp x10, x11, [rp,#-16] + PSHIFT x2, x6, cnt +L(lo2): NSHIFT x10, x4, tnc + PSHIFT x13, x5, cnt + NSHIFT x12, x5, tnc + ldp x6, x7, [up,#-32]! + orr x10, x10, x13 + orr x11, x12, x2 + stp x10, x11, [rp,#-32]! 
+ PSHIFT x2, x4, cnt +L(lo0): sub x17, x17, #1 +L(lo3): NSHIFT x10, x6, tnc + PSHIFT x13, x7, cnt + NSHIFT x12, x7, tnc + cbnz x17, L(top) + +L(end): orr x10, x10, x13 + orr x11, x12, x2 + PSHIFT x2, x6, cnt + stp x10, x11, [rp,#-16] + str x2, [rp,#-24] + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/lshiftc.asm b/gmp-6.3.0/mpn/arm64/lshiftc.asm new file mode 100644 index 0000000..6bf5844 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/lshiftc.asm @@ -0,0 +1,141 @@ +dnl ARM64 mpn_lshiftc. + +dnl Copyright 2013, 2014, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb assumed optimal c/l +C Cortex-A53 3.5-4.0 3.25 +C Cortex-A57 2.0 2.0 +C X-Gene 2.67 2.5 + +C TODO +C * The feed-in code used 1 ldr for odd sized and 2 ldr for even sizes. These +C numbers should be 1 and 0, respectively. The str in wind-down should also +C go. +C * Using extr and with 63 separate loops we might reach 1.5 c/l on A57. +C * A53's speed depends on alignment, tune/speed -w1 gives 3.5, -w0 gives 4.0. + +changecom(blah) + +define(`rp_arg', `x0') +define(`up', `x1') +define(`n', `x2') +define(`cnt', `x3') + +define(`rp', `x16') + +define(`tnc',`x8') + +define(`PSHIFT', lsl) +define(`NSHIFT', lsr) + +ASM_START() +PROLOGUE(mpn_lshiftc) + add rp, rp_arg, n, lsl #3 + add up, up, n, lsl #3 + sub tnc, xzr, cnt + lsr x17, n, #2 + tbz n, #0, L(bx0) + +L(bx1): ldr x4, [up,#-8] + tbnz n, #1, L(b11) + +L(b01): NSHIFT x0, x4, tnc + PSHIFT x2, x4, cnt + cbnz x17, L(gt1) + mvn x2, x2 + str x2, [rp,#-8] + ret +L(gt1): ldp x4, x5, [up,#-24] + sub up, up, #8 + add rp, rp, #16 + b L(lo2) + +L(b11): NSHIFT x0, x4, tnc + PSHIFT x2, x4, cnt + ldp x6, x7, [up,#-24]! + b L(lo3) + +L(bx0): ldp x4, x5, [up,#-16] + tbz n, #1, L(b00) + +L(b10): NSHIFT x0, x5, tnc + PSHIFT x13, x5, cnt + NSHIFT x10, x4, tnc + PSHIFT x2, x4, cnt + cbnz x17, L(gt2) + eon x10, x10, x13 + mvn x2, x2 + stp x2, x10, [rp,#-16] + ret +L(gt2): ldp x4, x5, [up,#-32] + eon x10, x10, x13 + str x10, [rp,#-8] + sub up, up, #16 + add rp, rp, #8 + b L(lo2) + +L(b00): NSHIFT x0, x5, tnc + PSHIFT x13, x5, cnt + NSHIFT x10, x4, tnc + PSHIFT x2, x4, cnt + ldp x6, x7, [up,#-32]! + eon x10, x10, x13 + str x10, [rp,#-8]! + b L(lo0) + + ALIGN(16) +L(top): ldp x4, x5, [up,#-16] + eon x10, x10, x13 + eon x11, x12, x2 + stp x10, x11, [rp,#-16] + PSHIFT x2, x6, cnt +L(lo2): NSHIFT x10, x4, tnc + PSHIFT x13, x5, cnt + NSHIFT x12, x5, tnc + ldp x6, x7, [up,#-32]! + eon x10, x10, x13 + eon x11, x12, x2 + stp x10, x11, [rp,#-32]! 
+ PSHIFT x2, x4, cnt +L(lo0): sub x17, x17, #1 +L(lo3): NSHIFT x10, x6, tnc + PSHIFT x13, x7, cnt + NSHIFT x12, x7, tnc + cbnz x17, L(top) + +L(end): eon x10, x10, x13 + eon x11, x12, x2 + PSHIFT x2, x6, cnt + stp x10, x11, [rp,#-16] + mvn x2, x2 + str x2, [rp,#-24] + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/mod_34lsub1.asm b/gmp-6.3.0/mpn/arm64/mod_34lsub1.asm new file mode 100644 index 0000000..7945fe7 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/mod_34lsub1.asm @@ -0,0 +1,124 @@ +dnl ARM64 mpn_mod_34lsub1 -- remainder modulo 2^48-1. + +dnl Copyright 2012-2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Cortex-A53 2 +C Cortex-A57 1 +C X-Gene 1.45 + +define(`ap', x0) +define(`n', x1) + +changecom(blah) + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n) + +C TODO +C * An alternative inner loop which could run at 0.722 c/l on A57: +C adds x8, x8, x2 +C adcs x9, x9, x3 +C ldp x2, x3, [ap, #-32] +C adcs x10, x10, x4 +C adc x12, x12, xzr +C adds x8, x8, x5 +C ldp x4, x5, [ap, #-16] +C sub n, n, #6 +C adcs x9, x9, x6 +C adcs x10, x10, x7 +C ldp x6, x7, [ap], #48 +C adc x12, x12, xzr +C tbz n, #63, L(top) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mod_34lsub1) + subs n, n, #3 + mov x8, #0 + b.lt L(le2) C n <= 2 + + ldp x2, x3, [ap, #0] + ldr x4, [ap, #16] + add ap, ap, #24 + subs n, n, #3 + b.lt L(sum) C n <= 5 + cmn x0, #0 C clear carry + +L(top): ldp x5, x6, [ap, #0] + ldr x7, [ap, #16] + add ap, ap, #24 + sub n, n, #3 + adcs x2, x2, x5 + adcs x3, x3, x6 + adcs x4, x4, x7 + tbz n, #63, L(top) + + adc x8, xzr, xzr C x8 <= 1 + +L(sum): cmn n, #2 + mov x5, #0 + b.lo 1f + ldr x5, [ap], #8 +1: mov x6, #0 + b.ls 1f + ldr x6, [ap], #8 +1: adds x2, x2, x5 + adcs x3, x3, x6 + adcs x4, x4, xzr + adc x8, x8, xzr C x8 <= 2 + +L(sum2): + and x0, x2, #0xffffffffffff + add x0, x0, x2, lsr #48 + add x0, x0, x8 + + lsl x8, x3, #16 + and x1, x8, #0xffffffffffff + add x0, x0, x1 + add x0, x0, x3, lsr #32 + + lsl x8, x4, #32 + and x1, x8, #0xffffffffffff + add x0, x0, x1 + add x0, x0, x4, lsr #16 + ret + +L(le2): cmn n, #1 + b.ne L(1) + ldp x2, x3, [ap] + mov x4, #0 + b L(sum2) +L(1): ldr x2, [ap] + and x0, x2, #0xffffffffffff + add x0, x0, x2, lsr #48 + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/mul_1.asm b/gmp-6.3.0/mpn/arm64/mul_1.asm new file mode 100644 index 0000000..fb965ef --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/mul_1.asm @@ -0,0 +1,128 @@ +dnl ARM64 mpn_mul_1 + +dnl Contributed to the GNU project by Torbjörn Granlund. 
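The mod_34lsub1 routine above computes a residue modulo 2^48-1 cheaply: since 2^64 = 2^16 (mod 2^48-1), limb weights cycle through 1, 2^16 and 2^32 with period three, so the loop adds limbs into three accumulators and the wind-down folds them at shifts of 0, 16 and 32 bits. A C model, fully reduced for clarity (the asm only promises a result congruent to the input); assumes unsigned __int128 and n below 2^31:

#include <stddef.h>
#include <stdint.h>

#define M48 0xffffffffffffULL              /* 2^48 - 1 */

uint64_t
mod_34lsub1_model (const uint64_t *up, size_t n)
{
  /* 2^64 = 2^16 (mod 2^48-1), so limb weights cycle 1, 2^16, 2^32.  */
  unsigned __int128 acc = 0;
  for (size_t i = 0; i < n; i++)
    acc += (unsigned __int128) up[i] << (16 * (i % 3));
  while (acc >> 48)
    acc = (acc & M48) + (acc >> 48);       /* fold, since 2^48 = 1 */
  return (uint64_t) acc;
}

This cheap residue is what mpn_mul and friends use for cross-checking in the 3/4-limb self-test style computations, hence the name: a remainder usable for casting out 2^48-1.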
+ +dnl Copyright 2013, 2015, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Cortex-A53 7.5-8 +C Cortex-A57 7 +C Cortex-A72 +C X-Gene 4 +C Apple M1 1 + +C TODO +C * Start first multiply earlier. + +changecom(blah) + +define(`rp', `x0') +define(`up', `x1') +define(`n', `x2') +define(`v0', `x3') + + +PROLOGUE(mpn_mul_1c) + adds xzr, xzr, xzr C clear cy flag + b L(com) +EPILOGUE() + +PROLOGUE(mpn_mul_1) + adds x4, xzr, xzr C clear register and cy flag +L(com): lsr x17, n, #2 + tbnz n, #0, L(bx1) + +L(bx0): mov x11, x4 + tbz n, #1, L(b00) + +L(b10): ldp x4, x5, [up] + mul x8, x4, v0 + umulh x10, x4, v0 + cbz x17, L(2) + ldp x6, x7, [up,#16]! + mul x9, x5, v0 + b L(mid)-8 + +L(2): mul x9, x5, v0 + b L(2e) + +L(bx1): ldr x7, [up],#8 + mul x9, x7, v0 + umulh x11, x7, v0 + adds x9, x9, x4 + str x9, [rp],#8 + tbnz n, #1, L(b10) + +L(b01): cbz x17, L(1) + +L(b00): ldp x6, x7, [up] + mul x8, x6, v0 + umulh x10, x6, v0 + ldp x4, x5, [up,#16] + mul x9, x7, v0 + adcs x12, x8, x11 + umulh x11, x7, v0 + add rp, rp, #16 + sub x17, x17, #1 + cbz x17, L(end) + + ALIGN(16) +L(top): mul x8, x4, v0 + ldp x6, x7, [up,#32]! + adcs x13, x9, x10 + umulh x10, x4, v0 + mul x9, x5, v0 + stp x12, x13, [rp,#-16] + adcs x12, x8, x11 + umulh x11, x5, v0 +L(mid): mul x8, x6, v0 + ldp x4, x5, [up,#16] + adcs x13, x9, x10 + umulh x10, x6, v0 + mul x9, x7, v0 + stp x12, x13, [rp],#32 + adcs x12, x8, x11 + umulh x11, x7, v0 + sub x17, x17, #1 + cbnz x17, L(top) + +L(end): mul x8, x4, v0 + adcs x13, x9, x10 + umulh x10, x4, v0 + mul x9, x5, v0 + stp x12, x13, [rp,#-16] +L(2e): adcs x12, x8, x11 + umulh x11, x5, v0 + adcs x13, x9, x10 + stp x12, x13, [rp] +L(1): adc x0, x11, xzr + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/popcount.asm b/gmp-6.3.0/mpn/arm64/popcount.asm new file mode 100644 index 0000000..74de3fc --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/popcount.asm @@ -0,0 +1,157 @@ +dnl ARM64 Neon mpn_popcount -- mpn bit population count. + +dnl Copyright 2013, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
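mpn_mul_1 above multiplies an n-limb operand by a single limb, stores the n low product limbs, and returns the final high limb; mpn_mul_1c is the same with a carry-in, and the asm funnels both into one body at L(com). A limb-level C model, using unsigned __int128 where the asm uses the mul/umulh pair:

#include <stddef.h>
#include <stdint.h>

/* rp[] = up[] * v0 + carry, returning the carry-out limb.
   Model of mpn_mul_1c; mpn_mul_1 is the carry = 0 case.  */
uint64_t
mul_1c_model (uint64_t *rp, const uint64_t *up, size_t n,
              uint64_t v0, uint64_t carry)
{
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * v0 + carry;
      rp[i]  = (uint64_t) p;           /* low limb, as mul in the asm */
      carry  = (uint64_t) (p >> 64);   /* high limb, as umulh */
    }
  return carry;
}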
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Cortex-A53 2.5 +C Cortex-A57 1.14 +C X-Gene 3 + +C TODO +C * Consider greater unrolling. +C * Arrange to align the pointer, if that helps performance. Use the same +C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry +C valgrind!) +C * Explore if explicit align directives, e.g., "[ptr:128]" help. +C * See rth's gmp-devel 2013-02/03 messages about final summation tricks. + +changecom(blah) + +C INPUT PARAMETERS +define(`ap', x0) +define(`n', x1) + +C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end +C up with 8 16-bit counters. Therefore, we can sum to 8(2^16-1) bits, or +C (8*2^16-1)/64 = 0x1fff limbs. We use a chunksize close to that, but which +C allows the huge count code to jump deep into the code (at L(chu)). + +define(`maxsize', 0x1fff) +define(`chunksize',0x1ff0) + +ASM_START() +PROLOGUE(mpn_popcount) + + mov x11, #maxsize + cmp n, x11 + b.hi L(gt8k) + +L(lt8k): + movi v4.16b, #0 C clear summation register + movi v5.16b, #0 C clear summation register + + tbz n, #0, L(xx0) + sub n, n, #1 + ld1 {v0.1d}, [ap], #8 C load 1 limb + cnt v6.16b, v0.16b + uadalp v4.8h, v6.16b C could also splat + +L(xx0): tbz n, #1, L(x00) + sub n, n, #2 + ld1 {v0.2d}, [ap], #16 C load 2 limbs + cnt v6.16b, v0.16b + uadalp v4.8h, v6.16b + +L(x00): tbz n, #2, L(000) + subs n, n, #4 + ld1 {v0.2d,v1.2d}, [ap], #32 C load 4 limbs + b.ls L(sum) + +L(gt4): ld1 {v2.2d,v3.2d}, [ap], #32 C load 4 limbs + sub n, n, #4 + cnt v6.16b, v0.16b + cnt v7.16b, v1.16b + b L(mid) + +L(000): subs n, n, #8 + b.lo L(e0) + +L(chu): ld1 {v2.2d,v3.2d}, [ap], #32 C load 4 limbs + ld1 {v0.2d,v1.2d}, [ap], #32 C load 4 limbs + cnt v6.16b, v2.16b + cnt v7.16b, v3.16b + subs n, n, #8 + b.lo L(end) + +L(top): ld1 {v2.2d,v3.2d}, [ap], #32 C load 4 limbs + uadalp v4.8h, v6.16b + cnt v6.16b, v0.16b + uadalp v5.8h, v7.16b + cnt v7.16b, v1.16b +L(mid): ld1 {v0.2d,v1.2d}, [ap], #32 C load 4 limbs + subs n, n, #8 + uadalp v4.8h, v6.16b + cnt v6.16b, v2.16b + uadalp v5.8h, v7.16b + cnt v7.16b, v3.16b + b.hs L(top) + +L(end): uadalp v4.8h, v6.16b + uadalp v5.8h, v7.16b +L(sum): cnt v6.16b, v0.16b + cnt v7.16b, v1.16b + uadalp v4.8h, v6.16b + uadalp v5.8h, v7.16b + add v4.8h, v4.8h, v5.8h + C we have 8 16-bit counts +L(e0): uaddlp v4.4s, v4.8h C we have 4 32-bit counts + uaddlp v4.2d, v4.4s C we have 2 64-bit counts + mov x0, v4.d[0] + mov x1, v4.d[1] + add x0, x0, x1 + ret + +C Code for count > maxsize. Splits operand and calls above code. 
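As in hamdist earlier, the kernel above counts bits per byte with cnt and then widens stepwise: uadalp folds byte counts into 16-bit lanes, and the final uaddlp pair continues to 32- and 64-bit lanes, always widening before a counter can wrap. The same widen-before-overflow idea in portable SWAR form, for one 64-bit word; the splitting for huge counts then proceeds below exactly as in hamdist:

#include <stdint.h>

/* Pairwise-widening bit count of one 64-bit word: 1-bit fields into
   2-bit sums, 2-bit into 4-bit, then byte sums gathered by a multiply.
   Each step widens the per-field counters before they can overflow,
   the same role the cnt/uadalp/uaddlp ladder plays above.  */
static unsigned
popcount64_swar (uint64_t x)
{
  x = x - ((x >> 1) & 0x5555555555555555ULL);                           /* 2-bit sums */
  x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); /* 4-bit sums */
  x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;                           /* byte sums  */
  return (unsigned) ((x * 0x0101010101010101ULL) >> 56);                /* total      */
}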
+define(`ap2', x5) C caller-saves reg not used above +L(gt8k): + mov x8, x30 + mov x7, n C full count (caller-saves reg not used above) + mov x4, #0 C total sum (caller-saves reg not used above) + mov x9, #chunksize*8 C caller-saves reg not used above + mov x10, #chunksize C caller-saves reg not used above + +1: add ap2, ap, x9 C point at subsequent block + mov n, #chunksize-8 C count for this invocation, adjusted for entry pt + movi v4.16b, #0 C clear chunk summation register + movi v5.16b, #0 C clear chunk summation register + bl L(chu) C jump deep inside code + add x4, x4, x0 + mov ap, ap2 C put chunk pointer in place for calls + sub x7, x7, x10 + cmp x7, x11 + b.hi 1b + + mov n, x7 C count for final invocation + bl L(lt8k) + add x0, x4, x0 + mov x30, x8 + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/rsh1aors_n.asm b/gmp-6.3.0/mpn/arm64/rsh1aors_n.asm new file mode 100644 index 0000000..afd3d5b --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/rsh1aors_n.asm @@ -0,0 +1,168 @@ +dnl ARM64 mpn_rsh1add_n and mpn_rsh1sub_n. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
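mpn_rsh1add_n and mpn_rsh1sub_n, defined below, compute (a + b) / 2 and (a - b) / 2 over n limbs: the full sum or difference is formed with carry propagation, each result limb is assembled from adjacent sum limbs with extr, the final carry becomes the top bit, and the bit shifted out at the bottom is returned. A C model of the add flavour, assuming unsigned __int128 and n >= 1:

#include <stddef.h>
#include <stdint.h>

/* rp[] = (up[] + vp[]) >> 1, returning the bit shifted out.  Model of
   mpn_rsh1add_n; the sub flavour differs only in the operation and in
   propagating borrow instead of carry.  Requires n >= 1.  */
uint64_t
rsh1add_n_model (uint64_t *rp, const uint64_t *up, const uint64_t *vp,
                 size_t n)
{
  unsigned __int128 s = (unsigned __int128) up[0] + vp[0];
  uint64_t retval = (uint64_t) s & 1;          /* the bit shifted out */
  uint64_t prev = (uint64_t) s;
  uint64_t cy = (uint64_t) (s >> 64);
  for (size_t i = 1; i < n; i++)
    {
      s = (unsigned __int128) up[i] + vp[i] + cy;
      uint64_t cur = (uint64_t) s;
      cy = (uint64_t) (s >> 64);
      rp[i - 1] = (prev >> 1) | (cur << 63);   /* extr in the asm */
      prev = cur;
    }
  rp[n - 1] = (prev >> 1) | (cy << 63);        /* final carry to top bit */
  return retval;
}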
+ +include(`../config.m4') + +C cycles/limb assumed optimal c/l +C Cortex-A53 3.25-3.75 3.0 steady +C Cortex-A57 2.15 1.75 +C X-Gene 2.75 2.5 + +changecom(blah) + +define(`rp', `x0') +define(`up', `x1') +define(`vp', `x2') +define(`n', `x3') + +ifdef(`OPERATION_rsh1add_n', ` + define(`ADDSUB', adds) + define(`ADDSUBC', adcs) + define(`COND', `cs') + define(`func_n', mpn_rsh1add_n)') +ifdef(`OPERATION_rsh1sub_n', ` + define(`ADDSUB', subs) + define(`ADDSUBC', sbcs) + define(`COND', `cc') + define(`func_n', mpn_rsh1sub_n)') + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n) + +ASM_START() +PROLOGUE(func_n) + lsr x6, n, #2 + + tbz n, #0, L(bx0) + +L(bx1): ldr x5, [up],#8 + ldr x9, [vp],#8 + tbnz n, #1, L(b11) + +L(b01): ADDSUB x13, x5, x9 + and x10, x13, #1 + cbz x6, L(1) + ldp x4, x5, [up],#48 + ldp x8, x9, [vp],#48 + ADDSUBC x14, x4, x8 + ADDSUBC x15, x5, x9 + ldp x4, x5, [up,#-32] + ldp x8, x9, [vp,#-32] + extr x17, x14, x13, #1 + ADDSUBC x12, x4, x8 + ADDSUBC x13, x5, x9 + str x17, [rp], #24 + sub x6, x6, #1 + cbz x6, L(end) + b L(top) + +L(1): cset x14, COND + extr x17, x14, x13, #1 + str x17, [rp] + mov x0, x10 + ret + +L(b11): ADDSUB x15, x5, x9 + and x10, x15, #1 + + ldp x4, x5, [up],#32 + ldp x8, x9, [vp],#32 + ADDSUBC x12, x4, x8 + ADDSUBC x13, x5, x9 + cbz x6, L(3) + ldp x4, x5, [up,#-16] + ldp x8, x9, [vp,#-16] + extr x17, x12, x15, #1 + ADDSUBC x14, x4, x8 + ADDSUBC x15, x5, x9 + str x17, [rp], #8 + b L(mid) + +L(3): extr x17, x12, x15, #1 + str x17, [rp], #8 + b L(2) + +L(bx0): tbz n, #1, L(b00) + +L(b10): ldp x4, x5, [up],#32 + ldp x8, x9, [vp],#32 + ADDSUB x12, x4, x8 + ADDSUBC x13, x5, x9 + and x10, x12, #1 + cbz x6, L(2) + ldp x4, x5, [up,#-16] + ldp x8, x9, [vp,#-16] + ADDSUBC x14, x4, x8 + ADDSUBC x15, x5, x9 + b L(mid) + +L(b00): ldp x4, x5, [up],#48 + ldp x8, x9, [vp],#48 + ADDSUB x14, x4, x8 + ADDSUBC x15, x5, x9 + and x10, x14, #1 + ldp x4, x5, [up,#-32] + ldp x8, x9, [vp,#-32] + ADDSUBC x12, x4, x8 + ADDSUBC x13, x5, x9 + add rp, rp, #16 + sub x6, x6, #1 + cbz x6, L(end) + + ALIGN(16) +L(top): ldp x4, x5, [up,#-16] + ldp x8, x9, [vp,#-16] + extr x16, x15, x14, #1 + extr x17, x12, x15, #1 + ADDSUBC x14, x4, x8 + ADDSUBC x15, x5, x9 + stp x16, x17, [rp,#-16] +L(mid): ldp x4, x5, [up],#32 + ldp x8, x9, [vp],#32 + extr x16, x13, x12, #1 + extr x17, x14, x13, #1 + ADDSUBC x12, x4, x8 + ADDSUBC x13, x5, x9 + stp x16, x17, [rp],#32 + sub x6, x6, #1 + cbnz x6, L(top) + +L(end): extr x16, x15, x14, #1 + extr x17, x12, x15, #1 + stp x16, x17, [rp,#-16] +L(2): cset x14, COND + extr x16, x13, x12, #1 + extr x17, x14, x13, #1 + stp x16, x17, [rp] + +L(ret): mov x0, x10 + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/rshift.asm b/gmp-6.3.0/mpn/arm64/rshift.asm new file mode 100644 index 0000000..90187ad --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/rshift.asm @@ -0,0 +1,136 @@ +dnl ARM64 mpn_rshift. + +dnl Copyright 2013, 2014, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb assumed optimal c/l +C Cortex-A53 3.5-4.0 3.25 +C Cortex-A57 2.0 2.0 +C X-Gene 2.67 2.5 + +C TODO +C * The feed-in code used 1 ldr for odd sized and 2 ldr for even sizes. These +C numbers should be 1 and 0, respectively. The str in wind-down should also +C go. +C * Using extr and with 63 separate loops we might reach 1.25 c/l on A57. +C * A53's speed depends on alignment, but not as simply as for lshift/lshiftc. + +changecom(blah) + +define(`rp_arg', `x0') +define(`up', `x1') +define(`n', `x2') +define(`cnt', `x3') + +define(`rp', `x16') + +define(`tnc',`x8') + +define(`PSHIFT', lsr) +define(`NSHIFT', lsl) + +ASM_START() +PROLOGUE(mpn_rshift) + mov rp, rp_arg + sub tnc, xzr, cnt + lsr x17, n, #2 + tbz n, #0, L(bx0) + +L(bx1): ldr x5, [up] + tbnz n, #1, L(b11) + +L(b01): NSHIFT x0, x5, tnc + PSHIFT x2, x5, cnt + cbnz x17, L(gt1) + str x2, [rp] + ret +L(gt1): ldp x4, x5, [up,#8] + sub up, up, #8 + sub rp, rp, #32 + b L(lo2) + +L(b11): NSHIFT x0, x5, tnc + PSHIFT x2, x5, cnt + ldp x6, x7, [up,#8]! + sub rp, rp, #16 + b L(lo3) + +L(bx0): ldp x4, x5, [up] + tbz n, #1, L(b00) + +L(b10): NSHIFT x0, x4, tnc + PSHIFT x13, x4, cnt + NSHIFT x10, x5, tnc + PSHIFT x2, x5, cnt + cbnz x17, L(gt2) + orr x10, x10, x13 + stp x10, x2, [rp] + ret +L(gt2): ldp x4, x5, [up,#16] + orr x10, x10, x13 + str x10, [rp],#-24 + b L(lo2) + +L(b00): NSHIFT x0, x4, tnc + PSHIFT x13, x4, cnt + NSHIFT x10, x5, tnc + PSHIFT x2, x5, cnt + ldp x6, x7, [up,#16]! + orr x10, x10, x13 + str x10, [rp],#-8 + b L(lo0) + + ALIGN(16) +L(top): ldp x4, x5, [up,#16] + orr x10, x10, x13 + orr x11, x12, x2 + stp x11, x10, [rp,#16] + PSHIFT x2, x7, cnt +L(lo2): NSHIFT x10, x5, tnc + NSHIFT x12, x4, tnc + PSHIFT x13, x4, cnt + ldp x6, x7, [up,#32]! + orr x10, x10, x13 + orr x11, x12, x2 + stp x11, x10, [rp,#32]! + PSHIFT x2, x5, cnt +L(lo0): sub x17, x17, #1 +L(lo3): NSHIFT x10, x7, tnc + NSHIFT x12, x6, tnc + PSHIFT x13, x6, cnt + cbnz x17, L(top) + +L(end): orr x10, x10, x13 + orr x11, x12, x2 + PSHIFT x2, x7, cnt + stp x11, x10, [rp,#16] + str x2, [rp,#32] + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/sec_tabselect.asm b/gmp-6.3.0/mpn/arm64/sec_tabselect.asm new file mode 100644 index 0000000..18a268a --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/sec_tabselect.asm @@ -0,0 +1,122 @@ +dnl ARM64 Neon mpn_sec_tabselect. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2011-2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
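mpn_rshift above shifts n limbs right by cnt bits, 1 <= cnt <= 63, and returns the bits shifted out of the low end, placed in the most significant bits of the return limb; mpn_lshift and mpn_lshiftc earlier are the mirror image, with lshiftc additionally complementing the result. A limb-level C model of the right shift:

#include <stddef.h>
#include <stdint.h>

/* rp[] = up[] >> cnt for 1 <= cnt <= 63; the bits shifted out of limb 0
   come back in the high end of the return value, as mpn_rshift does.  */
uint64_t
rshift_model (uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
{
  unsigned tnc = 64 - cnt;                 /* the asm's tnc register */
  uint64_t retval = up[0] << tnc;          /* NSHIFT x0, x5, tnc above */
  for (size_t i = 0; i + 1 < n; i++)
    rp[i] = (up[i] >> cnt) | (up[i + 1] << tnc);
  rp[n - 1] = up[n - 1] >> cnt;
  return retval;
}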
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C Cortex-A53 2.25 +C Cortex-A57 1.33 +C X-Gene 2 + +C void +C mpn_sec_tabselect (mp_ptr rp, mp_srcptr *tab, +C mp_size_t n, mp_size_t nents, mp_size_t which) + +changecom(blah) + +define(`rp', `x0') +define(`tp', `x1') +define(`n', `x2') +define(`nents', `x3') +define(`which', `x4') + +define(`i', `x5') +define(`j', `x6') + +define(`maskq', `v4') + +ASM_START() +PROLOGUE(mpn_sec_tabselect) + dup v7.2d, x4 C 2 `which' copies + + mov x10, #1 + dup v6.2d, x10 C 2 copies of 1 + + subs j, n, #4 + b.mi L(outer_end) + +L(outer_top): + mov i, nents + mov x12, tp C preserve tp + movi v5.16b, #0 C zero 2 counter copies + movi v2.16b, #0 + movi v3.16b, #0 + ALIGN(16) +L(tp4): cmeq maskq.2d, v5.2d, v7.2d C compare idx copies to `which' copies + ld1 {v0.2d,v1.2d}, [tp] + add v5.2d, v5.2d, v6.2d + bit v2.16b, v0.16b, maskq.16b + bit v3.16b, v1.16b, maskq.16b + add tp, tp, n, lsl #3 + sub i, i, #1 + cbnz i, L(tp4) + st1 {v2.2d,v3.2d}, [rp], #32 + add tp, x12, #32 C restore tp, point to next slice + subs j, j, #4 + b.pl L(outer_top) +L(outer_end): + + tbz n, #1, L(b0x) + mov i, nents + mov x12, tp + movi v5.16b, #0 C zero 2 counter copies + movi v2.16b, #0 + ALIGN(16) +L(tp2): cmeq maskq.2d, v5.2d, v7.2d + ld1 {v0.2d}, [tp] + add v5.2d, v5.2d, v6.2d + bit v2.16b, v0.16b, maskq.16b + add tp, tp, n, lsl #3 + sub i, i, #1 + cbnz i, L(tp2) + st1 {v2.2d}, [rp], #16 + add tp, x12, #16 + +L(b0x): tbz n, #0, L(b00) + mov i, nents + mov x12, tp + movi v5.16b, #0 C zero 2 counter copies + movi v2.16b, #0 + ALIGN(16) +L(tp1): cmeq maskq.2d, v5.2d, v7.2d + ld1 {v0.1d}, [tp] + add v5.2d, v5.2d, v6.2d C FIXME size should be `1d' + bit v2.8b, v0.8b, maskq.8b + add tp, tp, n, lsl #3 + sub i, i, #1 + cbnz i, L(tp1) + st1 {v2.1d}, [rp], #8 + add tp, x12, #8 + +L(b00): ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/sqr_diag_addlsh1.asm b/gmp-6.3.0/mpn/arm64/sqr_diag_addlsh1.asm new file mode 100644 index 0000000..39f1cb1 --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/sqr_diag_addlsh1.asm @@ -0,0 +1,102 @@ +dnl ARM64 mpn_sqr_diag_addlsh1. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2016, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Cortex-A53 5.65 +C Cortex-A57 3.5 +C X-Gene 3.38 + +changecom(blah) + +define(`rp', `x0') +define(`tp', `x1') +define(`up', `x2') +define(`n', `x3') + +ASM_START() +PROLOGUE(mpn_sqr_diag_addlsh1) + ldr x15, [up],#8 + lsr x14, n, #1 + tbz n, #0, L(bx0) + +L(bx1): adds x7, xzr, xzr + mul x12, x15, x15 + ldr x16, [up],#8 + ldp x4, x5, [tp],#16 + umulh x11, x15, x15 + b L(mid) + +L(bx0): adds x5, xzr, xzr + mul x12, x15, x15 + ldr x17, [up],#16 + ldp x6, x7, [tp],#32 + umulh x11, x15, x15 + sub x14, x14, #1 + cbz x14, L(end) + + ALIGN(16) +L(top): extr x9, x6, x5, #63 + mul x10, x17, x17 + ldr x16, [up,#-8] + adcs x13, x9, x11 + ldp x4, x5, [tp,#-16] + umulh x11, x17, x17 + extr x8, x7, x6, #63 + stp x12, x13, [rp],#16 + adcs x12, x8, x10 +L(mid): extr x9, x4, x7, #63 + mul x10, x16, x16 + ldr x17, [up],#16 + adcs x13, x9, x11 + ldp x6, x7, [tp],#32 + umulh x11, x16, x16 + extr x8, x5, x4, #63 + stp x12, x13, [rp],#16 + adcs x12, x8, x10 + sub x14, x14, #1 + cbnz x14, L(top) + +L(end): extr x9, x6, x5, #63 + mul x10, x17, x17 + adcs x13, x9, x11 + umulh x11, x17, x17 + extr x8, x7, x6, #63 + stp x12, x13, [rp] + adcs x12, x8, x10 + extr x9, xzr, x7, #63 + adcs x13, x9, x11 + stp x12, x13, [rp,#16] + + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm64/xgene1/gmp-mparam.h b/gmp-6.3.0/mpn/arm64/xgene1/gmp-mparam.h new file mode 100644 index 0000000..c8020bb --- /dev/null +++ b/gmp-6.3.0/mpn/arm64/xgene1/gmp-mparam.h @@ -0,0 +1,182 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 2400 MHz AppliedMicro X-Gene */ +/* FFT tuning limit = 0.5 M */ +/* Generated by tuneup.c, 2019-09-28, gcc 4.8 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 1 /* 2.00% faster than 2 */ +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 +#define USE_PREINV_DIVREM_1 1 +/* From gcc185.osuosl.org, 2023-07-26 */ +#define DIV_QR_1N_PI1_METHOD 3 /* 5.60% faster than 4 */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD 1 +#define DIV_QR_2_PI2_THRESHOLD 14 +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 27 + +#define DIV_1_VS_MUL_1_PERCENT 249 + +#define MUL_TOOM22_THRESHOLD 18 +#define MUL_TOOM33_THRESHOLD 61 +#define MUL_TOOM44_THRESHOLD 112 +#define MUL_TOOM6H_THRESHOLD 242 +#define MUL_TOOM8H_THRESHOLD 321 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 109 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 72 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 28 +#define SQR_TOOM3_THRESHOLD 81 +#define SQR_TOOM4_THRESHOLD 154 +#define SQR_TOOM6_THRESHOLD 214 +#define SQR_TOOM8_THRESHOLD 284 + +#define MULMID_TOOM42_THRESHOLD 46 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 13 + +#define MUL_FFT_MODF_THRESHOLD 412 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 412, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ + { 19, 7}, { 12, 6}, { 25, 7}, { 17, 8}, \ + { 9, 7}, { 20, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 39,10}, \ + { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 99,10}, { 55,11}, { 31,10}, { 63, 9}, \ + { 127,10}, { 71, 9}, { 143,10}, { 79,11}, \ + { 47,10}, { 103,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135, 9}, { 271,10}, \ + { 143,11}, { 79, 9}, { 319,10}, { 167, 9}, \ + { 351,11}, { 95, 9}, { 383, 8}, { 767,10}, \ + { 207, 9}, { 415,11}, { 111,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,11}, { 143,10}, { 287, 9}, { 575,10}, \ + { 319, 9}, { 639,10}, { 351,12}, { 95,10}, \ + { 383, 9}, { 767,11}, { 207,10}, { 415, 9}, \ + { 831,11}, { 223,10}, { 447,13}, { 8192,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 98 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 19, 7}, { 10, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95, 9}, { 
191,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \ + { 135, 9}, { 271,11}, { 79, 9}, { 319, 8}, \ + { 639,10}, { 175,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,10}, { 319, 9}, { 639,11}, \ + { 175,10}, { 351,12}, { 95,11}, { 191,10}, \ + { 383, 9}, { 767,11}, { 207,10}, { 415,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 87 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 45 +#define MULLO_MUL_N_THRESHOLD 8648 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 108 +#define SQRLO_SQR_THRESHOLD 6461 + +#define DC_DIV_QR_THRESHOLD 64 +#define DC_DIVAPPR_Q_THRESHOLD 222 +#define DC_BDIV_QR_THRESHOLD 63 +#define DC_BDIV_Q_THRESHOLD 132 + +#define INV_MULMOD_BNM1_THRESHOLD 38 +#define INV_NEWTON_THRESHOLD 242 +#define INV_APPR_THRESHOLD 222 + +#define BINV_NEWTON_THRESHOLD 254 +#define REDC_1_TO_REDC_N_THRESHOLD 66 + +#define MU_DIV_QR_THRESHOLD 1234 +#define MU_DIVAPPR_Q_THRESHOLD 1234 +#define MUPI_DIV_QR_THRESHOLD 122 +#define MU_BDIV_QR_THRESHOLD 1210 +#define MU_BDIV_Q_THRESHOLD 1234 + +#define POWM_SEC_TABLE 3,23,194,712,2499 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 22 +#define SET_STR_DC_THRESHOLD 381 +#define SET_STR_PRECOMPUTE_THRESHOLD 2503 + +#define FAC_DSC_THRESHOLD 216 +#define FAC_ODD_THRESHOLD 26 + +#define MATRIX22_STRASSEN_THRESHOLD 14 +#define HGCD2_DIV1_METHOD 5 /* 2.01% faster than 3 */ +#define HGCD_THRESHOLD 122 +#define HGCD_APPR_THRESHOLD 171 +#define HGCD_REDUCE_THRESHOLD 2479 +#define GCD_DC_THRESHOLD 541 +#define GCDEXT_DC_THRESHOLD 386 +#define JACOBI_BASE_METHOD 4 /* 7.46% faster than 1 */ diff --git a/gmp-6.3.0/mpn/asm-defs.m4 b/gmp-6.3.0/mpn/asm-defs.m4 new file mode 100644 index 0000000..1f2d9fe --- /dev/null +++ b/gmp-6.3.0/mpn/asm-defs.m4 @@ -0,0 +1,1766 @@ +divert(-1) +dnl +dnl m4 macros for gmp assembly code, shared by all CPUs. + +dnl Copyright 1999-2006, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl These macros are designed for use with any m4 and have been used on +dnl GNU, FreeBSD, NetBSD, OpenBSD and SysV. +dnl +dnl GNU m4 and OpenBSD 2.7 m4 will give filenames and line numbers in error +dnl messages. 
+dnl +dnl +dnl Macros: +dnl +dnl Most new m4 specific macros have an "m4_" prefix to emphasise they're +dnl m4 expansions. But new defining things like deflit() and defreg() are +dnl named like the builtin define(), and forloop() is named following the +dnl GNU m4 example on which it's based. +dnl +dnl GNU m4 with the -P option uses "m4_" as a prefix for builtins, but that +dnl option isn't going to be used, so there's no conflict or confusion. +dnl +dnl +dnl Comments in output: +dnl +dnl The m4 comment delimiters are left at # and \n, the normal assembler +dnl commenting for most CPUs. m4 passes comment text through without +dnl expanding macros in it, which is generally a good thing since it stops +dnl unexpected expansions and possible resultant errors. +dnl +dnl But note that when a quoted string is being read, a # isn't special, so +dnl apostrophes in comments in quoted strings must be avoided or they'll be +dnl interpreted as a closing quote mark. But when the quoted text is +dnl re-read # will still act like a normal comment, suppressing macro +dnl expansion. +dnl +dnl For example, +dnl +dnl # apostrophes in comments that're outside quotes are ok +dnl # and using macro names like PROLOGUE is ok too +dnl ... +dnl ifdef(`PIC',` +dnl # but apostrophes aren't ok inside quotes +dnl # ^--wrong +dnl ... +dnl # though macro names like PROLOGUE are still ok +dnl ... +dnl ') +dnl +dnl If macro expansion in a comment is wanted, use `#' in the .asm (ie. a +dnl quoted hash symbol), which will turn into # in the .s but get +dnl expansions done on that line. This can make the .s more readable to +dnl humans, but it won't make a blind bit of difference to the assembler. +dnl +dnl All the above applies, mutatis mutandis, when changecom() is used to +dnl select @ ! ; or whatever other commenting. +dnl +dnl +dnl Variations in m4 affecting gmp: +dnl +dnl $# - When a macro is called as "foo" with no brackets, BSD m4 sets $# +dnl to 1, whereas GNU or SysV m4 set it to 0. In all cases though +dnl "foo()" sets $# to 1. This is worked around in various places. +dnl +dnl len() - When "len()" is given an empty argument, BSD m4 evaluates to +dnl nothing, whereas GNU, SysV, and the new OpenBSD, evaluate to 0. +dnl See m4_length() below which works around this. +dnl +dnl translit() - GNU m4 accepts character ranges like A-Z, and the new +dnl OpenBSD m4 does under option -g, but basic BSD and SysV don't. +dnl +dnl popdef() - in BSD and SysV m4 popdef() takes multiple arguments and +dnl pops each, but GNU m4 only takes one argument. +dnl +dnl push back - BSD m4 has some limits on the amount of text that can be +dnl pushed back. The limit is reasonably big and so long as macros +dnl don't gratuitously duplicate big arguments it isn't a problem. +dnl Normally an error message is given, but sometimes it just hangs. +dnl +dnl eval() &,|,^ - GNU and SysV m4 have bitwise operators &,|,^ available, +dnl but BSD m4 doesn't (contrary to what the man page suggests) and +dnl instead ^ is exponentiation. +dnl +dnl eval() ?: - The C ternary operator "?:" is available in BSD m4, but not +dnl in SysV or GNU m4 (as of GNU m4 1.4 and betas of 1.5). +dnl +dnl eval() -2^31 - BSD m4 has a bug where an eval() resulting in -2^31 +dnl (ie. -2147483648) gives "-(". Using -2147483648 within an +dnl expression is ok, it just can't be a final result. "-(" will of +dnl course upset parsing, with all sorts of strange effects. 
+dnl +dnl eval() <<,>> - SysV m4 doesn't support shift operators in eval() (on +dnl Solaris 7 /usr/xpg4/m4 has them but /usr/ccs/m4 doesn't). See +dnl m4_lshift() and m4_rshift() below for workarounds. +dnl +dnl ifdef() - OSF 4.0 m4 considers a macro defined to a zero value `0' or +dnl `00' etc as not defined. See m4_ifdef below for a workaround. +dnl +dnl m4wrap() sequence - in BSD m4, m4wrap() replaces any previous m4wrap() +dnl string, in SysV m4 it appends to it, and in GNU m4 it prepends. +dnl See m4wrap_prepend() below which brings uniformity to this. +dnl +dnl m4wrap() 0xFF - old versions of BSD m4 store EOF in a C "char" under an +dnl m4wrap() and on systems where char is unsigned by default a +dnl spurious 0xFF is output. This has been observed on recent Cray +dnl Unicos Alpha, Apple MacOS X, and HPUX 11 systems. An autoconf +dnl test is used to check for this, see the m4wrap handling below. It +dnl might work to end the m4wrap string with a dnl to consume the +dnl 0xFF, but that probably induces the offending m4's to read from an +dnl already closed "FILE *", which could be bad on a glibc style +dnl stdio. +dnl +dnl __file__,__line__ - GNU m4 and OpenBSD 2.7 m4 provide these, and +dnl they're used here to make error messages more informative. GNU m4 +dnl gives an unhelpful "NONE 0" in an m4wrap(), but that's worked +dnl around. +dnl +dnl __file__ quoting - OpenBSD m4, unlike GNU m4, doesn't quote the +dnl filename in __file__, so care should be taken that no macro has +dnl the same name as a file, or an unwanted expansion will occur when +dnl printing an error or warning. +dnl +dnl changecom() - BSD m4 changecom doesn't quite work like the man page +dnl suggests, in particular "changecom" or "changecom()" doesn't +dnl disable the comment feature, and multi-character comment sequences +dnl don't seem to work. If the default `#' and newline aren't +dnl suitable it's necessary to change it to something else, +dnl eg. changecom(;). +dnl +dnl OpenBSD 2.6 m4 - in this m4, eval() rejects decimal constants containing +dnl an 8 or 9, making it pretty much unusable. The bug is confined to +dnl version 2.6 (it's not in 2.5, and was fixed in 2.7). +dnl +dnl SunOS /usr/bin/m4 - this m4 lacks a number of desired features, +dnl including $# and $@, defn(), m4exit(), m4wrap(), pushdef(), +dnl popdef(). /usr/5bin/m4 is a SysV style m4 which should always be +dnl available, and "configure" will reject /usr/bin/m4 in favour of +dnl /usr/5bin/m4 (if necessary). +dnl +dnl The sparc code actually has modest m4 requirements currently and +dnl could manage with /usr/bin/m4, but there's no reason to put our +dnl macros through contortions when /usr/5bin/m4 is available or GNU +dnl m4 can be installed. + + +ifdef(`__ASM_DEFS_M4_INCLUDED__', +`m4_error(`asm-defs.m4 already included, dont include it twice +')m4exit(1)') +define(`__ASM_DEFS_M4_INCLUDED__') + + +dnl Detect and give a message about the unsuitable OpenBSD 2.6 m4. + +ifelse(eval(89),89,, +`errprint( +`This m4 doesnt accept 8 and/or 9 in constants in eval(), making it unusable. +This is probably OpenBSD 2.6 m4 (September 1999). Upgrade to OpenBSD 2.7, +or get a bug fix from the CVS (expr.c rev 1.9), or get GNU m4. Dont forget +to configure with M4=/wherever/m4 if you install one of these in a directory +not in $PATH. +')m4exit(1)') + + +dnl Detect and give a message about the unsuitable SunOS /usr/bin/m4. 
+dnl +dnl Unfortunately this test doesn't work when m4 is run in the normal way +dnl from mpn/Makefile with "m4 -DOPERATION_foo foo.asm", since the bad m4 +dnl takes "-" in "-D..." to mean read stdin, so it will look like it just +dnl hangs. But running "m4 asm-defs.m4" to try it out will work. +dnl +dnl We'd like to abort immediately on finding a problem, but unfortunately +dnl the bad m4 doesn't have an m4exit(), nor does an invalid eval() kill +dnl it. Unexpanded $#'s in some m4_assert_numargs() later on will comment +dnl out some closing parentheses and kill it with "m4: arg stack overflow". + +define(m4_dollarhash_works_test,``$#'') +ifelse(m4_dollarhash_works_test(x),1,, +`errprint( +`This m4 doesnt support $# and cant be used for GMP asm processing. +If this is on SunOS, ./configure should choose /usr/5bin/m4 if you have that +or can get it, otherwise install GNU m4. Dont forget to configure with +M4=/wherever/m4 if you install in a directory not in $PATH. +')') +undefine(`m4_dollarhash_works_test') + + +dnl -------------------------------------------------------------------------- +dnl Basic error handling things. + + +dnl Usage: m4_dollarhash_1_if_noparen_p +dnl +dnl Expand to 1 if a call "foo" gives $# set to 1 (as opposed to 0 like GNU +dnl and SysV m4 give). + +define(m4_dollarhash_1_if_noparen_test,`$#') +define(m4_dollarhash_1_if_noparen_p, +eval(m4_dollarhash_1_if_noparen_test==1)) +undefine(`m4_dollarhash_1_if_noparen_test') + + +dnl Usage: m4wrap_prepend(string) +dnl +dnl Prepend the given string to what will be expanded under m4wrap at the +dnl end of input. +dnl +dnl This macro exists to work around variations in m4wrap() behaviour in +dnl the various m4s (notes at the start of this file). Don't use m4wrap() +dnl directly since it will interfere with this scheme. + +define(m4wrap_prepend, +m4_assert_numargs(1) +`define(`m4wrap_string',`$1'defn(`m4wrap_string'))') + +define(m4wrap_string,`') + +define(m4wrap_works_p, +`ifelse(M4WRAP_SPURIOUS,yes,0,1)') + +ifelse(m4wrap_works_p,1, +`m4wrap(`m4wrap_string')') + + +dnl Usage: m4_file_and_line +dnl +dnl Expand to the current file and line number, if the GNU m4 extensions +dnl __file__ and __line__ are available. +dnl +dnl In GNU m4 1.4 at the end of input when m4wrap text is expanded, +dnl __file__ is NONE and __line__ is 0, which is not a helpful thing to +dnl print. If m4_file_seen() has been called to note the last file seen, +dnl then that file at a big line number is used, otherwise "end of input" +dnl is used (although "end of input" won't parse as an error message). + +define(m4_file_and_line, +`ifdef(`__file__', +`ifelse(__file__`'__line__,`NONE0', +`ifdef(`m4_file_seen_last',`m4_file_seen_last: 999999: ',`end of input: ')', +`__file__: __line__: ')')') + + +dnl Usage: m4_errprint_commas(arg,...) +dnl +dnl The same as errprint(), but commas are printed between arguments +dnl instead of spaces. + +define(m4_errprint_commas, +`errprint(`$1')dnl +ifelse(eval($#>1),1,`errprint(`,')m4_errprint_commas(shift($@))')') + + +dnl Usage: m4_error(args...) +dnl m4_warning(args...) +dnl +dnl Print an error message, using m4_errprint_commas, prefixed with the +dnl current filename and line number (if available). m4_error sets up to +dnl give an error exit at the end of processing, m4_warning just prints. +dnl These macros are the recommended way to print errors. +dnl +dnl The arguments here should be quoted in the usual way to prevent them +dnl being expanded when the macro call is read. 
(m4_error takes care not +dnl to do any further expansion.) +dnl +dnl For example, +dnl +dnl m4_error(`some error message +dnl ') +dnl +dnl which prints +dnl +dnl foo.asm:123: some error message +dnl +dnl or if __file__ and __line__ aren't available +dnl +dnl some error message +dnl +dnl The "file:line:" format is a basic style, used by gcc and GNU m4, so +dnl emacs and other editors will recognise it in their normal error message +dnl parsing. + +define(m4_warning, +`m4_errprint_commas(m4_file_and_line`'$@)') + +define(m4_error, +`define(`m4_error_occurred',1)m4_warning($@)dnl +ifelse(m4wrap_works_p,0,`m4exit(1)')') + +define(`m4_error_occurred',0) + +dnl This m4wrap_prepend() is first, so it'll be executed last. +m4wrap_prepend( +`ifelse(m4_error_occurred,1, +`m4_error(`Errors occurred during m4 processing +')m4exit(1)')') + + +dnl Usage: m4_assert_numargs(num) +dnl +dnl Put this unquoted on a line on its own at the start of a macro +dnl definition to add some code to check that num many arguments get passed +dnl to the macro. For example, +dnl +dnl define(foo, +dnl m4_assert_numargs(2) +dnl `something `$1' and `$2' blah blah') +dnl +dnl Then a call like foo(one,two,three) will provoke an error like +dnl +dnl file:10: foo expected 2 arguments, got 3 arguments +dnl +dnl Here are some calls and how many arguments they're interpreted as passing. +dnl +dnl foo(abc,def) 2 +dnl foo(xyz) 1 +dnl foo() 0 +dnl foo -1 +dnl +dnl The -1 for no parentheses at all means a macro that's meant to be used +dnl that way can be checked with m4_assert_numargs(-1). For example, +dnl +dnl define(SPECIAL_SUFFIX, +dnl m4_assert_numargs(-1) +dnl `ifdef(`FOO',`_foo',`_bar')') +dnl +dnl But as an alternative see also deflit() below where parenthesized +dnl expressions following a macro are passed through to the output. +dnl +dnl Note that in BSD m4 there's no way to differentiate calls "foo" and +dnl "foo()", so in BSD m4 the distinction between the two isn't enforced. +dnl (In GNU and SysV m4 it can be checked, and is.) + + +dnl m4_assert_numargs is able to check its own arguments by calling +dnl assert_numargs_internal directly. +dnl +dnl m4_doublequote($`'0) expands to ``$0'', whereas ``$`'0'' would expand +dnl to `$`'0' and do the wrong thing, and likewise for $1. The same is +dnl done in other assert macros. +dnl +dnl $`#' leaves $# in the new macro being defined, and stops # being +dnl interpreted as a comment character. +dnl +dnl `dnl ' means an explicit dnl isn't necessary when m4_assert_numargs is +dnl used. The space means that if there is a dnl it'll still work. + +dnl Usage: m4_doublequote(x) expands to ``x'' +define(m4_doublequote, +`m4_assert_numargs_internal(`$0',1,$#,len(`$1'))``$1''') + +define(m4_assert_numargs, +`m4_assert_numargs_internal(`$0',1,$#,len(`$1'))dnl +`m4_assert_numargs_internal'(m4_doublequote($`'0),$1,$`#',`len'(m4_doublequote($`'1)))`dnl '') + +dnl Called: m4_assert_numargs_internal(`macroname',wantargs,$#,len(`$1')) +define(m4_assert_numargs_internal, +`m4_assert_numargs_internal_check(`$1',`$2',m4_numargs_count(`$3',`$4'))') + +dnl Called: m4_assert_numargs_internal_check(`macroname',wantargs,gotargs) +dnl +dnl If m4_dollarhash_1_if_noparen_p (BSD m4) then gotargs can be 0 when it +dnl should be -1. If wantargs is -1 but gotargs is 0 and the two can't be +dnl distinguished then it's allowed to pass. 
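+dnl
+dnl For instance a macro declared with m4_assert_numargs(-1), ie. one meant
+dnl to be used without parentheses, accepts a plain "foo" call everywhere,
+dnl and under BSD m4 the indistinguishable "foo()" is let through as well
+dnl rather than provoking a spurious error.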
+dnl +define(m4_assert_numargs_internal_check, +`ifelse(eval($2 == $3 + || ($2==-1 && $3==0 && m4_dollarhash_1_if_noparen_p)),0, +`m4_error(`$1 expected 'm4_Narguments(`$2')`, got 'm4_Narguments(`$3') +)')') + +dnl Called: m4_numargs_count($#,len(`$1')) +dnl If $#==0 then -1 args, if $#==1 but len(`$1')==0 then 0 args, otherwise +dnl $# args. +define(m4_numargs_count, +`ifelse($1,0, -1, +`ifelse(eval($1==1 && $2-0==0),1, 0, $1)')') + +dnl Usage: m4_Narguments(N) +dnl "$1 argument" or "$1 arguments" with the plural according to $1. +define(m4_Narguments, +`$1 argument`'ifelse(`$1',1,,s)') + + +dnl -------------------------------------------------------------------------- +dnl Additional error checking things. + + +dnl Usage: m4_file_seen() +dnl +dnl Record __file__ for the benefit of m4_file_and_line in m4wrap text. +dnl +dnl The basic __file__ macro comes out quoted in GNU m4, like `foo.asm', +dnl and m4_file_seen_last is defined like that too. +dnl +dnl This is used by PROLOGUE, since that's normally in the main .asm file, +dnl and in particular it sets up m4wrap error checks for missing EPILOGUE. + +define(m4_file_seen, +m4_assert_numargs(0) +`ifelse(__file__,`NONE',, +`define(`m4_file_seen_last',m4_doublequote(__file__))')') + + +dnl Usage: m4_assert_onearg() +dnl +dnl Put this, unquoted, at the start of a macro definition to add some code +dnl to check that one argument is passed to the macro, but with that +dnl argument allowed to be empty. For example, +dnl +dnl define(foo, +dnl m4_assert_onearg() +dnl `blah blah $1 blah blah') +dnl +dnl Calls "foo(xyz)" or "foo()" are accepted. A call "foo(xyz,abc)" fails. +dnl A call "foo" fails too, but BSD m4 can't detect this case (GNU and SysV +dnl m4 can). + +define(m4_assert_onearg, +m4_assert_numargs(0) +`m4_assert_onearg_internal'(m4_doublequote($`'0),$`#')`dnl ') + +dnl Called: m4_assert_onearg(`macroname',$#) +define(m4_assert_onearg_internal, +`ifelse($2,1,, +`m4_error(`$1 expected 1 argument, got 'm4_Narguments(`$2') +)')') + + +dnl Usage: m4_assert_numargs_range(low,high) +dnl +dnl Put this, unquoted, at the start of a macro definition to add some code +dnl to check that between low and high many arguments get passed to the +dnl macro. For example, +dnl +dnl define(foo, +dnl m4_assert_numargs_range(3,5) +dnl `mandatory $1 $2 $3 optional $4 $5 end') +dnl +dnl See m4_assert_numargs() for more info. + +define(m4_assert_numargs_range, +m4_assert_numargs(2) +``m4_assert_numargs_range_internal'(m4_doublequote($`'0),$1,$2,$`#',`len'(m4_doublequote($`'1)))`dnl '') + +dnl Called: m4_assert_numargs_range_internal(`name',low,high,$#,len(`$1')) +define(m4_assert_numargs_range_internal, +m4_assert_numargs(5) +`m4_assert_numargs_range_check(`$1',`$2',`$3',m4_numargs_count(`$4',`$5'))') + +dnl Called: m4_assert_numargs_range_check(`name',low,high,gotargs) +dnl +dnl If m4_dollarhash_1_if_noparen_p (BSD m4) then gotargs can be 0 when it +dnl should be -1. To ensure a `high' of -1 works, a fudge is applied to +dnl gotargs if it's 0 and the 0 and -1 cases can't be distinguished. +dnl +define(m4_assert_numargs_range_check, +m4_assert_numargs(4) +`ifelse(eval($2 <= $4 && + ($4 - ($4==0 && m4_dollarhash_1_if_noparen_p) <= $3)),0, +`m4_error(`$1 expected $2 to $3 arguments, got 'm4_Narguments(`$4') +)')') + + +dnl Usage: m4_assert_defined(symbol) +dnl +dnl Put this unquoted on a line of its own at the start of a macro +dnl definition to add some code to check that the given symbol is defined +dnl when the macro is used. 
For example,
+dnl
+dnl define(foo,
+dnl m4_assert_defined(`FOO_PREFIX')
+dnl `FOO_PREFIX whatever')
+dnl
+dnl This is a convenient way to check that the user or ./configure or
+dnl whatever has defined the things needed by a macro, as opposed to
+dnl silently generating garbage.
+
+define(m4_assert_defined,
+m4_assert_numargs(1)
+``m4_assert_defined_internal'(m4_doublequote($`'0),``$1'')`dnl '')
+
+dnl Called: m4_assert_defined_internal(`macroname',`define_required')
+define(m4_assert_defined_internal,
+m4_assert_numargs(2)
+`m4_ifdef(`$2',,
+`m4_error(`$1 needs $2 defined
+')')')
+
+
+dnl Usage: m4_not_for_expansion(`SYMBOL')
+dnl define_not_for_expansion(`SYMBOL')
+dnl
+dnl m4_not_for_expansion turns SYMBOL, if defined, into something which
+dnl will give an error if expanded. For example,
+dnl
+dnl m4_not_for_expansion(`PIC')
+dnl
+dnl define_not_for_expansion is the same, but always makes a definition.
+dnl
+dnl These are for symbols that should be tested with ifdef(`FOO',...)
+dnl rather than be expanded as such. They guard against accidentally
+dnl omitting the quotes, as in ifdef(FOO,...). Note though that they only
+dnl catch this when FOO is defined, so be sure to test code both with and
+dnl without each definition.
+
+define(m4_not_for_expansion,
+m4_assert_numargs(1)
+`ifdef(`$1',`define_not_for_expansion(`$1')')')
+
+define(define_not_for_expansion,
+m4_assert_numargs(1)
+`ifelse(defn(`$1'),,,
+`m4_error(``$1' has a non-empty value, maybe it shouldnt be munged with m4_not_for_expansion()
+')')dnl
+define(`$1',`m4_not_for_expansion_internal(`$1')')')
+
+define(m4_not_for_expansion_internal,
+`m4_error(``$1' is not meant to be expanded, perhaps you mean `ifdef(`$1',...)'
+')')
+
+
+dnl --------------------------------------------------------------------------
+dnl Various generic m4 things.
+
+
+dnl Usage: m4_unquote(macro)
+dnl
+dnl Allow the argument text to be re-evaluated. This is useful for "token
+dnl pasting" like m4_unquote(foo`'bar).
+
+define(m4_unquote,
+m4_assert_onearg()
+`$1')
+
+
+dnl Usage: m4_ifdef(name,yes[,no])
+dnl
+dnl Expand to the yes argument if name is defined, or to the no argument if
+dnl not.
+dnl
+dnl This is the same as the builtin "ifdef", but avoids an OSF 4.0 m4 bug
+dnl in which a macro with a zero value `0' or `00' etc is considered not
+dnl defined.
+dnl
+dnl There's no particular need to use this everywhere, only if there might
+dnl be a zero value.
+
+define(m4_ifdef,
+m4_assert_numargs_range(2,3)
+`ifelse(eval(ifdef(`$1',1,0)+m4_length(defn(`$1'))),0,
+`$3',`$2')')
+
+
+dnl Usage: m4_ifdef_anyof_p(`symbol',...)
+dnl
+dnl Expand to 1 if any of the symbols in the argument list are defined, or
+dnl to 0 if not.
+
+define(m4_ifdef_anyof_p,
+`ifelse(eval($#<=1 && m4_length(`$1')==0),1, 0,
+`ifdef(`$1', 1,
+`m4_ifdef_anyof_p(shift($@))')')')
+
+
+dnl Usage: m4_length(string)
+dnl
+dnl Determine the length of a string. This is the same as len(), but
+dnl always expands to a number, working around the BSD len() which
+dnl evaluates to nothing given an empty argument.
+
+define(m4_length,
+m4_assert_onearg()
+`eval(len(`$1')-0)')
+
+
+dnl Usage: m4_stringequal_p(x,y)
+dnl
+dnl Expand to 1 or 0 according as strings x and y are equal or not.
+
+define(m4_stringequal_p,
+`ifelse(`$1',`$2',1,0)')
+
+
+dnl Usage: m4_incr_or_decr(n,last)
+dnl
+dnl Do an incr(n) or decr(n), whichever is in the direction of "last".
+dnl Both n and last must be numbers of course.
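+dnl
+dnl For example, m4_incr_or_decr(3,10) expands to 4, counting up towards
+dnl 10, and m4_incr_or_decr(7,2) expands to 6, counting down towards 2.
+dnl This is what lets forloop() below run from first to last in either
+dnl direction with a single loop body.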
+ +define(m4_incr_or_decr, +m4_assert_numargs(2) +`ifelse(eval($1<$2),1,incr($1),decr($1))') + + +dnl Usage: forloop(i, first, last, statement) +dnl +dnl Based on GNU m4 examples/forloop.m4, but extended. +dnl +dnl statement is expanded repeatedly, with i successively defined as +dnl +dnl first, first+1, ..., last-1, last +dnl +dnl Or if first > last, then it's +dnl +dnl first, first-1, ..., last+1, last +dnl +dnl If first == last, then one expansion is done. +dnl +dnl A pushdef/popdef of i is done to preserve any previous definition (or +dnl lack of definition). first and last are eval()ed and so can be +dnl expressions. +dnl +dnl forloop_first is defined to 1 on the first iteration, 0 on the rest. +dnl forloop_last is defined to 1 on the last iteration, 0 on the others. +dnl Nested forloops are allowed, in which case forloop_first and +dnl forloop_last apply to the innermost loop that's open. +dnl +dnl A simple example, +dnl +dnl forloop(i, 1, 2*2+1, `dnl +dnl iteration number i ... ifelse(forloop_first,1,FIRST) +dnl ') + + +dnl "i" and "statement" are carefully quoted, but "first" and "last" are +dnl just plain numbers once eval()ed. + +define(`forloop', +m4_assert_numargs(4) +`pushdef(`$1',eval(`$2'))dnl +pushdef(`forloop_first',1)dnl +pushdef(`forloop_last',0)dnl +forloop_internal(`$1',eval(`$3'),`$4')`'dnl +popdef(`forloop_first')dnl +popdef(`forloop_last')dnl +popdef(`$1')') + +dnl Called: forloop_internal(`var',last,statement) +define(`forloop_internal', +m4_assert_numargs(3) +`ifelse($1,$2, +`define(`forloop_last',1)$3', +`$3`'dnl +define(`forloop_first',0)dnl +define(`$1',m4_incr_or_decr($1,$2))dnl +forloop_internal(`$1',$2,`$3')')') + + +dnl Usage: foreach(var,body, item1,item2,...,itemN) +dnl +dnl For each "item" argument, define "var" to that value and expand "body". +dnl For example, +dnl +dnl foreach(i, `something i +dnl ', one, two) +dnl gives +dnl something one +dnl something two +dnl +dnl Any previous definition of "var", or lack thereof, is saved and +dnl restored. Empty "item"s are not allowed. + +define(foreach, +m4_assert_numargs_range(2,1000) +`ifelse(`$3',,, +`pushdef(`$1',`$3')$2`'popdef(`$1')dnl +foreach(`$1',`$2',shift(shift(shift($@))))')') + + +dnl Usage: m4_toupper(x) +dnl m4_tolower(x) +dnl +dnl Convert the argument string to upper or lower case, respectively. +dnl Only one argument accepted. +dnl +dnl BSD m4 doesn't take ranges like a-z in translit(), so the full alphabet +dnl is written out. + +define(m4_alphabet_lower, `abcdefghijklmnopqrstuvwxyz') +define(m4_alphabet_upper, `ABCDEFGHIJKLMNOPQRSTUVWXYZ') + +define(m4_toupper, +m4_assert_onearg() +`translit(`$1', m4_alphabet_lower, m4_alphabet_upper)') + +define(m4_tolower, +m4_assert_onearg() +`translit(`$1', m4_alphabet_upper, m4_alphabet_lower)') + + +dnl Usage: m4_empty_if_zero(x) +dnl +dnl Evaluate to x, or to nothing if x is 0. x is eval()ed and so can be an +dnl expression. +dnl +dnl This is useful for x86 addressing mode displacements since forms like +dnl (%ebx) are one byte shorter than 0(%ebx). A macro `foo' for use as +dnl foo(%ebx) could be defined with the following so it'll be empty if the +dnl expression comes out zero. +dnl +dnl deflit(`foo', `m4_empty_if_zero(a+b*4-c)') +dnl +dnl Naturally this shouldn't be done if, say, a computed jump depends on +dnl the code being a particular size. + +define(m4_empty_if_zero, +m4_assert_onearg() +`ifelse(eval($1),0,,eval($1))') + + +dnl Usage: m4_log2(x) +dnl +dnl Calculate a logarithm to base 2. 
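+dnl For example, m4_log2(16) expands to 4; the UNROLL_LOG2 and CHUNK_LOG2
+dnl conveniences further down are derived this way.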
+dnl x must be an integral power of 2, between 2**0 and 2**30. +dnl x is eval()ed, so it can be an expression. +dnl An error results if x is invalid. +dnl +dnl 2**31 isn't supported, because an unsigned 2147483648 is out of range +dnl of a 32-bit signed int. Also, the bug in BSD m4 where an eval() +dnl resulting in 2147483648 (or -2147483648 as the case may be) gives `-(' +dnl means tests like eval(1<<31==(x)) would be necessary, but that then +dnl gives an unattractive explosion of eval() error messages if x isn't +dnl numeric. + +define(m4_log2, +m4_assert_numargs(1) +`m4_log2_internal(0,1,eval(`$1'))') + +dnl Called: m4_log2_internal(n,2**n,target) +define(m4_log2_internal, +m4_assert_numargs(3) +`ifelse($2,$3,$1, +`ifelse($1,30, +`m4_error(`m4_log2() argument too big or not a power of two: $3 +')', +`m4_log2_internal(incr($1),eval(2*$2),$3)')')') + + +dnl Usage: m4_div2_towards_zero +dnl +dnl m4 division is probably whatever a C signed division is, and C doesn't +dnl specify what rounding gets used on negatives, so this expression forces +dnl a rounding towards zero. + +define(m4_div2_towards_zero, +m4_assert_numargs(1) +`eval((($1) + ((($1)<0) & ($1))) / 2)') + + +dnl Usage: m4_lshift(n,count) +dnl m4_rshift(n,count) +dnl +dnl Calculate n shifted left or right by count many bits. Both n and count +dnl are eval()ed and so can be expressions. +dnl +dnl Negative counts are allowed and mean a shift in the opposite direction. +dnl Negative n is allowed and right shifts will be arithmetic (meaning +dnl divide by 2**count, rounding towards zero, also meaning the sign bit is +dnl duplicated). +dnl +dnl Use these macros instead of << and >> in eval() since the basic ccs +dnl SysV m4 doesn't have those operators. + +define(m4_rshift, +m4_assert_numargs(2) +`m4_lshift(`$1',-(`$2'))') + +define(m4_lshift, +m4_assert_numargs(2) +`m4_lshift_internal(eval(`$1'),eval(`$2'))') + +define(m4_lshift_internal, +m4_assert_numargs(2) +`ifelse(eval($2-0==0),1,$1, +`ifelse(eval($2>0),1, +`m4_lshift_internal(eval($1*2),decr($2))', +`m4_lshift_internal(m4_div2_towards_zero($1),incr($2))')')') + + +dnl Usage: m4_popcount(n) +dnl +dnl Expand to the number 1 bits in n. + +define(m4_popcount, +m4_assert_numargs(1) +`m4_popcount_internal(0,eval(`$1'))') + +dnl Called: m4_popcount_internal(count,rem) +define(m4_popcount_internal, +m4_assert_numargs(2) +`ifelse($2,0,$1, +`m4_popcount_internal(eval($1+($2%2)),eval($2/2))')') + + +dnl Usage: m4_count_trailing_zeros(N) +dnl +dnl Determine the number of trailing zero bits on N. N is eval()ed and so +dnl can be an expression. If N is zero an error is generated. + +define(m4_count_trailing_zeros, +m4_assert_numargs(1) +`m4_count_trailing_zeros_internal(eval(`$1'),0)') + +dnl Called: m4_count_trailing_zeros_internal(val,count) +define(m4_count_trailing_zeros_internal, +m4_assert_numargs(2) +`ifelse($1,0, +`m4_error(`m4_count_trailing_zeros() given a zero value')', +`ifelse(eval(($1)%2),1,`$2', +`m4_count_trailing_zeros_internal(eval($1/2),incr($2))')')') + + +dnl Usage: deflit(name,value) +dnl +dnl Like define(), but "name" expands like a literal, rather than taking +dnl arguments. For example "name(%eax)" expands to "value(%eax)". +dnl +dnl Limitations: +dnl +dnl $ characters in the value part must have quotes to stop them looking +dnl like macro parameters. For example, deflit(reg,`123+$`'4+567'). See +dnl defreg() below for handling simple register definitions like $7 etc. +dnl +dnl "name()" is turned into "name", unfortunately. 
In GNU and SysV m4 an +dnl error is generated when this happens, but in BSD m4 it will happen +dnl silently. The problem is that in BSD m4 $# is 1 in both "name" or +dnl "name()", so there's no way to differentiate them. Because we want +dnl plain "name" to turn into plain "value", we end up with "name()" +dnl turning into plain "value" too. +dnl +dnl "name(foo)" will lose any whitespace after commas in "foo", for example +dnl "disp(%eax, %ecx)" would become "128(%eax,%ecx)". +dnl +dnl These parentheses oddities shouldn't matter in assembler text, but if +dnl they do the suggested workaround is to write "name ()" or "name (foo)" +dnl to stop the parentheses looking like a macro argument list. If a space +dnl isn't acceptable in the output, then write "name`'()" or "name`'(foo)". +dnl The `' is stripped when read, but again stops the parentheses looking +dnl like parameters. + +dnl Quoting for deflit_emptyargcheck is similar to m4_assert_numargs. The +dnl stuff in the ifelse gives a $#, $1 and $@ evaluated in the new macro +dnl created, not in deflit. +define(deflit, +m4_assert_numargs(2) +`define(`$1', +`deflit_emptyargcheck'(``$1'',$`#',m4_doublequote($`'1))`dnl +$2`'dnl +ifelse(eval($'`#>1 || m4_length('m4_doublequote($`'1)`)!=0),1,($'`@))')') + +dnl Called: deflit_emptyargcheck(macroname,$#,`$1') +define(deflit_emptyargcheck, +`ifelse(eval($2==1 && !m4_dollarhash_1_if_noparen_p && m4_length(`$3')==0),1, +`m4_error(`dont use a deflit as $1() because it loses the brackets (see deflit in asm-defs.m4 for more information) +')')') + + +dnl Usage: m4_assert(`expr') +dnl +dnl Test a compile-time requirement with an m4 expression. The expression +dnl should be quoted, and will be eval()ed and expected to be non-zero. +dnl For example, +dnl +dnl m4_assert(`FOO*2+6 < 14') + +define(m4_assert, +m4_assert_numargs(1) +`ifelse(eval($1),1,, +`m4_error(`assertion failed: $1 +')')') + + +dnl Usage: m4_repeat(count,text) +dnl +dnl Expand to the given repetitions of the given text. A zero count is +dnl allowed, and expands to nothing. + +define(m4_repeat, +m4_assert_numargs(2) +`m4_repeat_internal(eval($1),`$2')') + +define(m4_repeat_internal, +m4_assert_numargs(2) +`ifelse(`$1',0,, +`forloop(m4_repeat_internal_counter,1,$1,``$2'')')') + + +dnl Usage: m4_hex_lowmask(bits) +dnl +dnl Generate a hex constant which is a low mask of the given number of +dnl bits. For example m4_hex_lowmask(10) would give 0x3ff. + +define(m4_hex_lowmask, +m4_assert_numargs(1) +`m4_cpu_hex_constant(m4_hex_lowmask_internal1(eval(`$1')))') + +dnl Called: m4_hex_lowmask_internal1(bits) +define(m4_hex_lowmask_internal1, +m4_assert_numargs(1) +`ifelse($1,0,`0', +`m4_hex_lowmask_internal2(eval(($1)%4),eval(($1)/4))')') + +dnl Called: m4_hex_lowmask_internal(remainder,digits) +define(m4_hex_lowmask_internal2, +m4_assert_numargs(2) +`ifelse($1,1,`1', +`ifelse($1,2,`3', +`ifelse($1,3,`7')')')dnl +m4_repeat($2,`f')') + + +dnl -------------------------------------------------------------------------- +dnl The following m4_list functions take a list as multiple arguments. +dnl Arguments are evaluated multiple times, there's no attempt at strict +dnl quoting. Empty list elements are not allowed, since an empty final +dnl argument is ignored. These restrictions don't affect the current uses, +dnl and make the implementation easier. + + +dnl Usage: m4_list_quote(list,...) +dnl +dnl Produce a list with quoted commas, so it can be a single argument +dnl string. 
For instance m4_list_quote(a,b,c) gives +dnl +dnl a`,'b`,'c`,' +dnl +dnl This can be used to put a list in a define, +dnl +dnl define(foolist, m4_list_quote(a,b,c)) +dnl +dnl Which can then be used for instance as +dnl +dnl m4_list_find(target, foolist) + +define(m4_list_quote, +`ifelse(`$1',,, +`$1`,'m4_list_quote(shift($@))')') + + +dnl Usage: m4_list_find(key,list,...) +dnl +dnl Evaluate to 1 or 0 according to whether key is in the list elements. + +define(m4_list_find, +m4_assert_numargs_range(1,1000) +`ifelse(`$2',,0, +`ifelse(`$1',`$2',1, +`m4_list_find(`$1',shift(shift($@)))')')') + + +dnl Usage: m4_list_remove(key,list,...) +dnl +dnl Evaluate to the given list with `key' removed (if present). + +define(m4_list_remove, +m4_assert_numargs_range(1,1000) +`ifelse(`$2',,, +`ifelse(`$1',`$2',,`$2,')dnl +m4_list_remove(`$1',shift(shift($@)))')') + + +dnl Usage: m4_list_first(list,...) +dnl +dnl Evaluate to the first element of the list (if any). + +define(m4_list_first,`$1') + + +dnl Usage: m4_list_count(list,...) +dnl +dnl Evaluate to the number of elements in the list. This can't just use $# +dnl because the last element might be empty. + +define(m4_list_count, +`m4_list_count_internal(0,$@)') + +dnl Called: m4_list_internal(count,list,...) +define(m4_list_count_internal, +m4_assert_numargs_range(1,1000) +`ifelse(`$2',,$1, +`m4_list_count_internal(eval($1+1),shift(shift($@)))')') + + +dnl -------------------------------------------------------------------------- +dnl Various assembler things, not specific to any particular CPU. +dnl + + +dnl Usage: include_mpn(`filename') +dnl +dnl Like include(), but adds a path to the mpn source directory. For +dnl example, +dnl +dnl include_mpn(`sparc64/addmul_1h.asm') + +define(include_mpn, +m4_assert_numargs(1) +m4_assert_defined(`CONFIG_TOP_SRCDIR') +`include(CONFIG_TOP_SRCDIR`/mpn/$1')') + + +dnl Usage: C comment ... +dnl +dnl This works like a FORTRAN-style comment character. It can be used for +dnl comments to the right of assembly instructions, where just dnl would +dnl remove the newline and concatenate adjacent lines. +dnl +dnl C and/or dnl are useful when an assembler doesn't support comments, or +dnl where different assemblers for a particular CPU need different styles. +dnl The intermediate ".s" files will end up with no comments, just code. +dnl +dnl Using C is not intended to cause offence to anyone who doesn't like +dnl FORTRAN; but if that happens it's an unexpected bonus. +dnl +dnl During development, if comments are wanted in the .s files to help see +dnl what's expanding where, C can be redefined with something like +dnl +dnl define(`C',`#') + +define(C, ` +dnl') + + +dnl Normally PIC is defined (or not) by libtool, but it doesn't set it on +dnl systems which are always PIC. PIC_ALWAYS established in config.m4 +dnl identifies these for us. + +ifelse(PIC_ALWAYS,`yes',`define(`PIC')') + + +dnl Various possible defines passed from the Makefile that are to be tested +dnl with ifdef() rather than be expanded. 
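+dnl
+dnl A multi-function source file tests one of these to select the routine
+dnl being built, in the style of rsh1aors_n.asm above:
+dnl
+dnl ifdef(`OPERATION_rsh1add_n', `
+dnl define(`ADDSUB', adds)
+dnl define(`func_n', mpn_rsh1add_n)')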
+ +m4_not_for_expansion(`PIC') +m4_not_for_expansion(`DLL_EXPORT') + +dnl aors_n +m4_not_for_expansion(`OPERATION_add_n') +m4_not_for_expansion(`OPERATION_sub_n') + +dnl aors_err1_n +m4_not_for_expansion(`OPERATION_add_err1_n') +m4_not_for_expansion(`OPERATION_sub_err1_n') + +dnl aors_err2_n +m4_not_for_expansion(`OPERATION_add_err2_n') +m4_not_for_expansion(`OPERATION_sub_err2_n') + +dnl aors_err3_n +m4_not_for_expansion(`OPERATION_add_err3_n') +m4_not_for_expansion(`OPERATION_sub_err3_n') + +dnl aorsmul_1 +m4_not_for_expansion(`OPERATION_addmul_1') +m4_not_for_expansion(`OPERATION_submul_1') + +dnl logops_n +m4_not_for_expansion(`OPERATION_and_n') +m4_not_for_expansion(`OPERATION_andn_n') +m4_not_for_expansion(`OPERATION_nand_n') +m4_not_for_expansion(`OPERATION_ior_n') +m4_not_for_expansion(`OPERATION_iorn_n') +m4_not_for_expansion(`OPERATION_nior_n') +m4_not_for_expansion(`OPERATION_xor_n') +m4_not_for_expansion(`OPERATION_xnor_n') + +dnl popham +m4_not_for_expansion(`OPERATION_popcount') +m4_not_for_expansion(`OPERATION_hamdist') + +dnl lorrshift +m4_not_for_expansion(`OPERATION_lshift') +m4_not_for_expansion(`OPERATION_rshift') + +dnl aorslsh1_n +m4_not_for_expansion(`OPERATION_addlsh1_n') +m4_not_for_expansion(`OPERATION_sublsh1_n') +m4_not_for_expansion(`OPERATION_rsblsh1_n') + +dnl aorslsh2_n +m4_not_for_expansion(`OPERATION_addlsh2_n') +m4_not_for_expansion(`OPERATION_sublsh2_n') +m4_not_for_expansion(`OPERATION_rsblsh2_n') + +dnl rsh1aors_n +m4_not_for_expansion(`OPERATION_rsh1add_n') +m4_not_for_expansion(`OPERATION_rsh1sub_n') + + +dnl Usage: m4_config_gmp_mparam(`symbol') +dnl +dnl Check that `symbol' is defined. If it isn't, issue an error and +dnl terminate immediately. The error message explains that the symbol +dnl should be in config.m4, copied from gmp-mparam.h. +dnl +dnl Termination is immediate since missing say SQR_TOOM2_THRESHOLD can +dnl lead to infinite loops and endless error messages. + +define(m4_config_gmp_mparam, +m4_assert_numargs(1) +`ifdef(`$1',, +`m4_error(`$1 is not defined. + "configure" should have extracted this from gmp-mparam.h and put it + in config.m4 (or in _.asm for a fat binary), but somehow + this has failed. +')m4exit(1)')') + + +dnl Usage: defreg(name,reg) +dnl +dnl Give a name to a $ style register. For example, +dnl +dnl defreg(foo,$12) +dnl +dnl defreg() inserts an extra pair of quotes after the $ so that it's not +dnl interpreted as an m4 macro parameter, ie. foo is actually $`'12. m4 +dnl strips those quotes when foo is expanded. +dnl +dnl deflit() is used to make the new definition, so it will expand +dnl literally even if followed by parentheses ie. foo(99) will become +dnl $12(99). (But there's nowhere that would be used is there?) +dnl +dnl When making further definitions from existing defreg() macros, remember +dnl to use defreg() again to protect the $ in the new definitions too. For +dnl example, +dnl +dnl defreg(a0,$4) +dnl defreg(a1,$5) +dnl ... +dnl +dnl defreg(PARAM_DST,a0) +dnl +dnl This is only because a0 is expanding at the time the PARAM_DST +dnl definition is made, leaving a literal $4 that must be re-quoted. On +dnl the other hand in something like the following ra is only expanded when +dnl ret is used and its $`'31 protection will have its desired effect at +dnl that time. +dnl +dnl defreg(ra,$31) +dnl ... +dnl define(ret,`j ra') +dnl +dnl Note that only $n forms are meant to be used here, and something like +dnl 128($30) doesn't get protected and will come out wrong. 
+ +define(defreg, +m4_assert_numargs(2) +`deflit(`$1', +substr(`$2',0,1)``''substr(`$2',1))') + + +dnl Usage: m4_instruction_wrapper() +dnl +dnl Put this, unquoted, on a line on its own, at the start of a macro +dnl that's a wrapper around an assembler instruction. It adds code to give +dnl a descriptive error message if the macro is invoked without arguments. +dnl +dnl For example, suppose jmp needs to be wrapped, +dnl +dnl define(jmp, +dnl m4_instruction_wrapper() +dnl m4_assert_numargs(1) +dnl `.byte 0x42 +dnl .long $1 +dnl nop') +dnl +dnl The point of m4_instruction_wrapper is to get a better error message +dnl than m4_assert_numargs would give if jmp is accidentally used as plain +dnl "jmp foo" instead of the intended "jmp( foo)". "jmp()" with no +dnl argument also provokes the error message. +dnl +dnl m4_instruction_wrapper should only be used with wrapped instructions +dnl that take arguments, since obviously something meant to be used as say +dnl plain "ret" doesn't want to give an error when used that way. + +define(m4_instruction_wrapper, +m4_assert_numargs(0) +``m4_instruction_wrapper_internal'(m4_doublequote($`'0),dnl +ifdef(`__file__',`m4_doublequote(__file__)',``the m4 sources''),dnl +$`#',m4_doublequote($`'1))`dnl'') + +dnl Called: m4_instruction_wrapper_internal($0,`filename',$#,$1) +define(m4_instruction_wrapper_internal, +`ifelse(eval($3<=1 && m4_length(`$4')==0),1, +`m4_error(`$1 is a macro replacing that instruction and needs arguments, see $2 for details +')')') + + +dnl Usage: m4_cpu_hex_constant(string) +dnl +dnl Expand to the string prefixed by a suitable `0x' hex marker. This +dnl should be redefined as necessary for CPUs with different conventions. + +define(m4_cpu_hex_constant, +m4_assert_numargs(1) +`0x`$1'') + + +dnl Usage: UNROLL_LOG2, UNROLL_MASK, UNROLL_BYTES +dnl CHUNK_LOG2, CHUNK_MASK, CHUNK_BYTES +dnl +dnl When code supports a variable amount of loop unrolling, the convention +dnl is to define UNROLL_COUNT to the number of limbs processed per loop. +dnl When testing code this can be varied to see how much the loop overhead +dnl is costing. For example, +dnl +dnl deflit(UNROLL_COUNT, 32) +dnl +dnl If the forloop() generating the unrolled loop has a pattern processing +dnl more than one limb, the convention is to express this with CHUNK_COUNT. +dnl For example, +dnl +dnl deflit(CHUNK_COUNT, 2) +dnl +dnl The LOG2, MASK and BYTES definitions below are derived from these COUNT +dnl definitions. If COUNT is redefined, the LOG2, MASK and BYTES follow +dnl the new definition automatically. +dnl +dnl LOG2 is the log base 2 of COUNT. MASK is COUNT-1, which can be used as +dnl a bit mask. BYTES is GMP_LIMB_BYTES*COUNT, the number of bytes +dnl processed in each unrolled loop. +dnl +dnl GMP_LIMB_BYTES is defined in a CPU specific m4 include file. It +dnl exists only so the BYTES definitions here can be common to all CPUs. +dnl In the actual code for a given CPU, an explicit 4 or 8 may as well be +dnl used because the code is only for a particular CPU, it doesn't need to +dnl be general. +dnl +dnl Note that none of these macros do anything except give conventional +dnl names to commonly used things. You still have to write your own +dnl expressions for a forloop() and the resulting address displacements. +dnl Something like the following would be typical for 4 bytes per limb. +dnl +dnl forloop(`i',0,UNROLL_COUNT-1,` +dnl deflit(`disp',eval(i*4)) +dnl ... 
+dnl ') +dnl +dnl Or when using CHUNK_COUNT, +dnl +dnl forloop(`i',0,UNROLL_COUNT/CHUNK_COUNT-1,` +dnl deflit(`disp0',eval(i*CHUNK_COUNT*4)) +dnl deflit(`disp1',eval(disp0+4)) +dnl ... +dnl ') +dnl +dnl Clearly `i' can be run starting from 1, or from high to low or whatever +dnl best suits. + +deflit(UNROLL_LOG2, +m4_assert_defined(`UNROLL_COUNT') +`m4_log2(UNROLL_COUNT)') + +deflit(UNROLL_MASK, +m4_assert_defined(`UNROLL_COUNT') +`eval(UNROLL_COUNT-1)') + +deflit(UNROLL_BYTES, +m4_assert_defined(`UNROLL_COUNT') +m4_assert_defined(`GMP_LIMB_BYTES') +`eval(UNROLL_COUNT * GMP_LIMB_BYTES)') + +deflit(CHUNK_LOG2, +m4_assert_defined(`CHUNK_COUNT') +`m4_log2(CHUNK_COUNT)') + +deflit(CHUNK_MASK, +m4_assert_defined(`CHUNK_COUNT') +`eval(CHUNK_COUNT-1)') + +deflit(CHUNK_BYTES, +m4_assert_defined(`CHUNK_COUNT') +m4_assert_defined(`GMP_LIMB_BYTES') +`eval(CHUNK_COUNT * GMP_LIMB_BYTES)') + + +dnl Usage: MPN(name) +dnl +dnl Add MPN_PREFIX to a name. +dnl MPN_PREFIX defaults to "__gmpn_" if not defined. +dnl +dnl m4_unquote is used in MPN so that when it expands to say __gmpn_foo, +dnl that identifier will be subject to further macro expansion. This is +dnl used by some of the fat binary support for renaming symbols. + +ifdef(`MPN_PREFIX',, +`define(`MPN_PREFIX',`__gmpn_')') + +define(MPN, +m4_assert_numargs(1) +`m4_unquote(MPN_PREFIX`'$1)') + + +dnl Usage: mpn_add_n, etc +dnl +dnl Convenience definitions using MPN(), like the #defines in gmp.h. Each +dnl function that might be implemented in assembler is here. + +define(define_mpn, +m4_assert_numargs(1) +`deflit(`mpn_$1',`MPN(`$1')')') + +define_mpn(add) +define_mpn(add_1) +define_mpn(add_err1_n) +define_mpn(add_err2_n) +define_mpn(add_err3_n) +define_mpn(add_n) +define_mpn(add_nc) +define_mpn(addlsh1_n) +define_mpn(addlsh1_nc) +define_mpn(addlsh2_n) +define_mpn(addlsh2_nc) +define_mpn(addlsh_n) +define_mpn(addlsh_nc) +define_mpn(addlsh1_n_ip1) +define_mpn(addlsh1_nc_ip1) +define_mpn(addlsh2_n_ip1) +define_mpn(addlsh2_nc_ip1) +define_mpn(addlsh_n_ip1) +define_mpn(addlsh_nc_ip1) +define_mpn(addlsh1_n_ip2) +define_mpn(addlsh1_nc_ip2) +define_mpn(addlsh2_n_ip2) +define_mpn(addlsh2_nc_ip2) +define_mpn(addlsh_n_ip2) +define_mpn(addlsh_nc_ip2) +define_mpn(addmul_1) +define_mpn(addmul_1c) +define_mpn(addmul_2) +define_mpn(addmul_3) +define_mpn(addmul_4) +define_mpn(addmul_5) +define_mpn(addmul_6) +define_mpn(addmul_7) +define_mpn(addmul_8) +define_mpn(addmul_2s) +define_mpn(add_n_sub_n) +define_mpn(add_n_sub_nc) +define_mpn(addaddmul_1msb0) +define_mpn(and_n) +define_mpn(andn_n) +define_mpn(bdiv_q_1) +define_mpn(pi1_bdiv_q_1) +define_mpn(bdiv_dbm1c) +define_mpn(cmp) +define_mpn(cnd_add_n) +define_mpn(cnd_sub_n) +define_mpn(com) +define_mpn(copyd) +define_mpn(copyi) +define_mpn(count_leading_zeros) +define_mpn(count_trailing_zeros) +define_mpn(div_qr_1n_pi1) +define_mpn(div_qr_2) +define_mpn(div_qr_2n_pi1) +define_mpn(div_qr_2u_pi1) +define_mpn(div_qr_2n_pi2) +define_mpn(div_qr_2u_pi2) +define_mpn(divexact_1) +define_mpn(divexact_by3c) +define_mpn(divrem) +define_mpn(divrem_1) +define_mpn(divrem_1c) +define_mpn(divrem_2) +define_mpn(divrem_classic) +define_mpn(divrem_newton) +define_mpn(dump) +define_mpn(gcd) +define_mpn(gcd_1) +define_mpn(gcd_11) +define_mpn(gcd_22) +define_mpn(gcdext) +define_mpn(get_str) +define_mpn(hamdist) +define_mpn(invert_limb) +define_mpn(invert_limb_table) +define_mpn(ior_n) +define_mpn(iorn_n) +define_mpn(lshift) +define_mpn(lshiftc) +define_mpn(mod_1_1p) +define_mpn(mod_1_1p_cps) +define_mpn(mod_1s_2p) +define_mpn(mod_1s_2p_cps) 
+define_mpn(mod_1s_3p) +define_mpn(mod_1s_3p_cps) +define_mpn(mod_1s_4p) +define_mpn(mod_1s_4p_cps) +define_mpn(mod_1) +define_mpn(mod_1c) +define_mpn(mod_34lsub1) +define_mpn(modexact_1_odd) +define_mpn(modexact_1c_odd) +define_mpn(mul) +define_mpn(mul_1) +define_mpn(mul_1c) +define_mpn(mul_2) +define_mpn(mul_3) +define_mpn(mul_4) +define_mpn(mul_5) +define_mpn(mul_6) +define_mpn(mul_basecase) +define_mpn(mul_n) +define_mpn(mullo_basecase) +define_mpn(mulmid_basecase) +define_mpn(perfect_square_p) +define_mpn(popcount) +define_mpn(preinv_divrem_1) +define_mpn(preinv_mod_1) +define_mpn(nand_n) +define_mpn(neg) +define_mpn(nior_n) +define_mpn(powm) +define_mpn(powlo) +define_mpn(random) +define_mpn(random2) +define_mpn(redc_1) +define_mpn(redc_2) +define_mpn(rsblsh1_n) +define_mpn(rsblsh1_nc) +define_mpn(rsblsh2_n) +define_mpn(rsblsh2_nc) +define_mpn(rsblsh_n) +define_mpn(rsblsh_nc) +define_mpn(rsh1add_n) +define_mpn(rsh1add_nc) +define_mpn(rsh1sub_n) +define_mpn(rsh1sub_nc) +define_mpn(rshift) +define_mpn(rshiftc) +define_mpn(sbpi1_bdiv_q) +define_mpn(sbpi1_bdiv_qr) +define_mpn(sbpi1_bdiv_r) +define_mpn(scan0) +define_mpn(scan1) +define_mpn(set_str) +define_mpn(sqr_basecase) +define_mpn(sqr_diagonal) +define_mpn(sqr_diag_addlsh1) +define_mpn(sub_n) +define_mpn(sublsh1_n) +define_mpn(sublsh1_nc) +define_mpn(sublsh1_n_ip1) +define_mpn(sublsh1_nc_ip1) +define_mpn(sublsh2_n) +define_mpn(sublsh2_nc) +define_mpn(sublsh2_n_ip1) +define_mpn(sublsh2_nc_ip1) +define_mpn(sublsh_n) +define_mpn(sublsh_nc) +define_mpn(sublsh_n_ip1) +define_mpn(sublsh_nc_ip1) +define_mpn(sqrtrem) +define_mpn(sub) +define_mpn(sub_1) +define_mpn(sub_err1_n) +define_mpn(sub_err2_n) +define_mpn(sub_err3_n) +define_mpn(sub_n) +define_mpn(sub_nc) +define_mpn(submul_1) +define_mpn(submul_1c) +define_mpn(sec_tabselect) +define_mpn(umul_ppmm) +define_mpn(umul_ppmm_r) +define_mpn(udiv_qrnnd) +define_mpn(udiv_qrnnd_r) +define_mpn(xnor_n) +define_mpn(xor_n) + + +dnl Defines for C global arrays and variables, with names matching what's +dnl used in the C code. +dnl +dnl Notice that GSYM_PREFIX is included, unlike with the function defines +dnl above. Also, "deflit" is used so that something like __clz_tab(%ebx) +dnl comes out as __gmpn_clz_tab(%ebx), for the benefit of CPUs with that +dnl style assembler syntax. + +deflit(__clz_tab, +m4_assert_defined(`GSYM_PREFIX') +`GSYM_PREFIX`'MPN(`clz_tab')') + +deflit(binvert_limb_table, +m4_assert_defined(`GSYM_PREFIX') +`GSYM_PREFIX`'__gmp_binvert_limb_table') + + +dnl Usage: ASM_START() +dnl +dnl Emit any directives needed once at the start of an assembler file, like +dnl ".set noreorder" or whatever. The default for this is nothing, but +dnl it's redefined by CPU specific m4 files. + +define(ASM_START) + + +dnl Usage: ASM_END() +dnl +dnl Emit any directives needed once at the end of an assembler file. The +dnl default for this is nothing, but it's redefined by CPU specific m4 files. + +define(ASM_END) + + +dnl Usage: PROLOGUE(foo[,param]) +dnl EPILOGUE(foo) +dnl +dnl Emit directives to start or end a function. GSYM_PREFIX is added by +dnl these macros if necessary, so the given "foo" is what the function will +dnl be called in C. +dnl +dnl The second parameter to PROLOGUE is used only for some CPUs and should +dnl be omitted if not required. +dnl +dnl Nested or overlapping PROLOGUE/EPILOGUE pairs are allowed, if that +dnl makes sense for the system. The name given to EPILOGUE must be a +dnl currently open PROLOGUE. 
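+dnl
+dnl As a minimal sketch of typical use (mpn_add_n standing in here for
+dnl whichever function the file implements):
+dnl
+dnl    PROLOGUE(mpn_add_n)
+dnl            ...
+dnl    EPILOGUE(mpn_add_n)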
+dnl +dnl If only one PROLOGUE is open then the name can be omitted from +dnl EPILOGUE. This is encouraged, since it means the name only has to +dnl appear in one place, not two. +dnl +dnl The given name "foo" is not fully quoted here, it will be macro +dnl expanded more than once. This is the way the m4_list macros work, and +dnl it also helps the tune/many.pl program do a renaming like +dnl -D__gmpn_add_n=mpn_add_n_foo when GSYM_PREFIX is not empty. + +define(PROLOGUE, +m4_assert_numargs_range(1,2) +`m4_file_seen()dnl +define(`PROLOGUE_list',m4_list_quote($1,PROLOGUE_list))dnl +ifelse(`$2',, +`PROLOGUE_cpu(GSYM_PREFIX`'$1)', +`PROLOGUE_cpu(GSYM_PREFIX`'$1,`$2')')') + +define(EPILOGUE, +m4_assert_numargs_range(0,1) +`ifelse(`$1',, +`ifelse(m4_list_count(PROLOGUE_list),0, +`m4_error(`no open functions for EPILOGUE +')', +`ifelse(m4_list_count(PROLOGUE_list),1, +`EPILOGUE_internal(PROLOGUE_current_function)', +`m4_error(`more than one open function for EPILOGUE +')')')', +`EPILOGUE_internal(`$1')')') + +define(EPILOGUE_internal, +m4_assert_numargs(1) +m4_assert_defined(`EPILOGUE_cpu') +`ifelse(m4_list_find($1,PROLOGUE_list),0, +`m4_error(`EPILOGUE without PROLOGUE: $1 +')')dnl +define(`PROLOGUE_list',m4_list_quote(m4_list_remove($1,PROLOGUE_list)))dnl +EPILOGUE_cpu(GSYM_PREFIX`$1')') + +dnl Currently open PROLOGUEs, as a comma-separated list. +define(PROLOGUE_list) + + +dnl Called: PROLOGUE_check(list,...) +dnl Check there's no remaining open PROLOGUEs at the end of input. +define(PROLOGUE_check, +`ifelse($1,,, +`m4_error(`no EPILOGUE for: $1 +')dnl +PROLOGUE_check(shift($@))')') + +m4wrap_prepend(`PROLOGUE_check(PROLOGUE_list)') + + +dnl Usage: PROLOGUE_current_function +dnl +dnl This macro expands to the current PROLOGUE/EPILOGUE function, or the +dnl most recent PROLOGUE if such pairs are nested or overlapped. + +define(PROLOGUE_current_function, +m4_assert_numargs(-1) +`m4_list_first(PROLOGUE_list)') + + +dnl Usage: PROLOGUE_cpu(GSYM_PREFIX`'foo[,param]) +dnl EPILOGUE_cpu(GSYM_PREFIX`'foo) +dnl +dnl These macros hold the CPU-specific parts of PROLOGUE and EPILOGUE. +dnl Both are called with the function name, with GSYM_PREFIX already +dnl prepended. +dnl +dnl The definitions here are something typical and sensible, but CPU or +dnl system specific m4 files should redefine them as necessary. The +dnl optional extra parameter to PROLOGUE_cpu is not expected and not +dnl accepted here. + +define(PROLOGUE_cpu, +m4_assert_numargs(1) +` TEXT + ALIGN(8) + GLOBL `$1' GLOBL_ATTR + TYPE(`$1',`function') +`$1'LABEL_SUFFIX') + +define(EPILOGUE_cpu, +` SIZE(`$1',.-`$1')') + + +dnl Usage: L(name) +dnl +dnl Generate a local label with the given name. This is simply a +dnl convenient way to add LSYM_PREFIX. +dnl +dnl LSYM_PREFIX might be L$, so defn() must be used to quote it or the L +dnl will expand again as the L macro, making an infinite recursion. + +define(`L', +m4_assert_numargs(1) +`defn(`LSYM_PREFIX')$1') + + +dnl Usage: LDEF(name) +dnl +dnl Generate a directive to define a local label. +dnl +dnl On systems with a fixed syntax for defining labels there's no need to +dnl use this macro, it's only meant for systems where the syntax varies, +dnl like hppa which is "L(foo):" with gas, but just "L(foo)" in column 0 +dnl with the system `as'. +dnl +dnl The extra `' after LABEL_SUFFIX avoids any chance of a following +dnl "(...)" being interpreted as an argument list. Not that it'd be +dnl sensible to write anything like that after an LDEF(), but just in case. 
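+dnl
+dnl As a concrete illustration: with LSYM_PREFIX `L$' (the value
+dnl mentioned above) and a LABEL_SUFFIX of `:', LDEF(`top') would emit
+dnl "L$top:".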
+ +define(LDEF, +m4_assert_numargs(1) +m4_assert_defined(`LABEL_SUFFIX') +`L(`$1')`'LABEL_SUFFIX`'') + + +dnl Usage: INT32(label,value) +dnl INT64(label,first,second) + +define(`INT32', +m4_assert_defined(`W32') +` ALIGN(4) +LDEF(`$1') + W32 $2') + +define(`INT64', +m4_assert_defined(`W32') +` ALIGN(8) +LDEF(`$1') + W32 $2 + W32 $3') + + +dnl Usage: ALIGN(bytes) +dnl +dnl Emit a ".align" directive. The alignment is specified in bytes, and +dnl will normally need to be a power of 2. The actual ".align" generated +dnl is either bytes or logarithmic according to what ./configure finds the +dnl assembler needs. +dnl +dnl If ALIGN_FILL_0x90 is defined and equal to "yes", then ", 0x90" is +dnl appended. This is for x86, see mpn/x86/README. + +define(ALIGN, +m4_assert_numargs(1) +m4_assert_defined(`ALIGN_LOGARITHMIC') +`.align ifelse(ALIGN_LOGARITHMIC,yes,`m4_log2($1)',`eval($1)')dnl +ifelse(ALIGN_FILL_0x90,yes,`, 0x90')') + + +dnl Usage: MULFUNC_PROLOGUE(function function...) +dnl +dnl A dummy macro which is grepped for by ./configure to know what +dnl functions a multi-function file is providing. Use this if there aren't +dnl explicit PROLOGUE()s for each possible function. +dnl +dnl Multiple MULFUNC_PROLOGUEs can be used, or just one with the function +dnl names separated by spaces. + +define(`MULFUNC_PROLOGUE', +m4_assert_numargs(1) +) + + +dnl Usage: NAILS_SUPPORT(spec spec ...) +dnl +dnl A dummy macro which is grepped for by ./configure to know what nails +dnl are supported in an asm file. +dnl +dnl Ranges can be given, or just individual values. Multiple values or +dnl ranges can be given, separated by spaces. Multiple NAILS_SUPPORT +dnl declarations work too. Some examples, +dnl +dnl NAILS_SUPPORT(1-20) +dnl NAILS_SUPPORT(1 6 9-12) +dnl NAILS_SUPPORT(1-10 16-20) + +define(NAILS_SUPPORT, +m4_assert_numargs(1) +) + + +dnl Usage: ABI_SUPPORT(abi) +dnl +dnl A dummy macro which is grepped for by ./configure to know what ABIs +dnl are supported in an asm file. +dnl +dnl If multiple non-standard ABIs are supported, several ABI_SUPPORT +dnl declarations should be used: +dnl +dnl ABI_SUPPORT(FOOABI) +dnl ABI_SUPPORT(BARABI) + +define(ABI_SUPPORT, +m4_assert_numargs(1) +) + + +dnl Usage: GMP_NUMB_MASK +dnl +dnl A bit mask for the number part of a limb. Eg. with 6 bit nails in a +dnl 32 bit limb, GMP_NUMB_MASK would be 0x3ffffff. 
+ +define(GMP_NUMB_MASK, +m4_assert_numargs(-1) +m4_assert_defined(`GMP_NUMB_BITS') +`m4_hex_lowmask(GMP_NUMB_BITS)') + + +dnl Usage: m4append(`variable',`value-to-append') + +define(`m4append', +`define(`$1', defn(`$1')`$2') +' +) + +divert`'dnl diff --git a/gmp-6.3.0/mpn/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/bdiv_dbm1c.asm new file mode 120000 index 0000000..1159b34 --- /dev/null +++ b/gmp-6.3.0/mpn/bdiv_dbm1c.asm @@ -0,0 +1 @@ +../mpn/x86/bdiv_dbm1c.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/bdiv_q.c b/gmp-6.3.0/mpn/bdiv_q.c new file mode 120000 index 0000000..0bd49cf --- /dev/null +++ b/gmp-6.3.0/mpn/bdiv_q.c @@ -0,0 +1 @@ +../mpn/generic/bdiv_q.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/bdiv_q_1.asm b/gmp-6.3.0/mpn/bdiv_q_1.asm new file mode 120000 index 0000000..3bf59cc --- /dev/null +++ b/gmp-6.3.0/mpn/bdiv_q_1.asm @@ -0,0 +1 @@ +../mpn/x86/p6/bdiv_q_1.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/bdiv_qr.c b/gmp-6.3.0/mpn/bdiv_qr.c new file mode 120000 index 0000000..85bb9a2 --- /dev/null +++ b/gmp-6.3.0/mpn/bdiv_qr.c @@ -0,0 +1 @@ +../mpn/generic/bdiv_qr.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/binvert.c b/gmp-6.3.0/mpn/binvert.c new file mode 120000 index 0000000..b4db943 --- /dev/null +++ b/gmp-6.3.0/mpn/binvert.c @@ -0,0 +1 @@ +../mpn/generic/binvert.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/broot.c b/gmp-6.3.0/mpn/broot.c new file mode 120000 index 0000000..ff09fa7 --- /dev/null +++ b/gmp-6.3.0/mpn/broot.c @@ -0,0 +1 @@ +../mpn/generic/broot.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/brootinv.c b/gmp-6.3.0/mpn/brootinv.c new file mode 120000 index 0000000..99a6ccf --- /dev/null +++ b/gmp-6.3.0/mpn/brootinv.c @@ -0,0 +1 @@ +../mpn/generic/brootinv.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/bsqrt.c b/gmp-6.3.0/mpn/bsqrt.c new file mode 120000 index 0000000..922ffc9 --- /dev/null +++ b/gmp-6.3.0/mpn/bsqrt.c @@ -0,0 +1 @@ +../mpn/generic/bsqrt.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/bsqrtinv.c b/gmp-6.3.0/mpn/bsqrtinv.c new file mode 120000 index 0000000..264bc22 --- /dev/null +++ b/gmp-6.3.0/mpn/bsqrtinv.c @@ -0,0 +1 @@ +../mpn/generic/bsqrtinv.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/cmp.c b/gmp-6.3.0/mpn/cmp.c new file mode 120000 index 0000000..6711a47 --- /dev/null +++ b/gmp-6.3.0/mpn/cmp.c @@ -0,0 +1 @@ +../mpn/generic/cmp.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/cnd_add_n.asm b/gmp-6.3.0/mpn/cnd_add_n.asm new file mode 120000 index 0000000..5f9d6fa --- /dev/null +++ b/gmp-6.3.0/mpn/cnd_add_n.asm @@ -0,0 +1 @@ +../mpn/x86/cnd_aors_n.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/cnd_sub_n.asm b/gmp-6.3.0/mpn/cnd_sub_n.asm new file mode 120000 index 0000000..5f9d6fa --- /dev/null +++ b/gmp-6.3.0/mpn/cnd_sub_n.asm @@ -0,0 +1 @@ +../mpn/x86/cnd_aors_n.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/cnd_swap.c b/gmp-6.3.0/mpn/cnd_swap.c new file mode 120000 index 0000000..bb1ccc1 --- /dev/null +++ b/gmp-6.3.0/mpn/cnd_swap.c @@ -0,0 +1 @@ +../mpn/generic/cnd_swap.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/com.c b/gmp-6.3.0/mpn/com.c new file mode 120000 index 0000000..7b89ebe --- /dev/null +++ b/gmp-6.3.0/mpn/com.c @@ -0,0 +1 @@ +../mpn/generic/com.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/comb_tables.c b/gmp-6.3.0/mpn/comb_tables.c new file mode 120000 index 0000000..48fcd1c --- /dev/null +++ b/gmp-6.3.0/mpn/comb_tables.c @@ -0,0 +1 @@ +../mpn/generic/comb_tables.c \ No newline at end 
of file
diff --git a/gmp-6.3.0/mpn/compute_powtab.c b/gmp-6.3.0/mpn/compute_powtab.c
new file mode 120000
index 0000000..64fabab
--- /dev/null
+++ b/gmp-6.3.0/mpn/compute_powtab.c
@@ -0,0 +1 @@
+../mpn/generic/compute_powtab.c
\ No newline at end of file
diff --git a/gmp-6.3.0/mpn/copyd.asm b/gmp-6.3.0/mpn/copyd.asm
new file mode 120000
index 0000000..07ab302
--- /dev/null
+++ b/gmp-6.3.0/mpn/copyd.asm
@@ -0,0 +1 @@
+../mpn/x86/p6/copyd.asm
\ No newline at end of file
diff --git a/gmp-6.3.0/mpn/copyi.asm b/gmp-6.3.0/mpn/copyi.asm
new file mode 120000
index 0000000..e061846
--- /dev/null
+++ b/gmp-6.3.0/mpn/copyi.asm
@@ -0,0 +1 @@
+../mpn/x86/copyi.asm
\ No newline at end of file
diff --git a/gmp-6.3.0/mpn/cpp-ccas b/gmp-6.3.0/mpn/cpp-ccas
new file mode 100755
index 0000000..25f7cdc
--- /dev/null
+++ b/gmp-6.3.0/mpn/cpp-ccas
@@ -0,0 +1,118 @@
+#!/bin/sh
+#
+# A helper script for Makeasm.am .S.lo rule.

+# Copyright 2001 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of either:
+#
+#   * the GNU Lesser General Public License as published by the Free
+#     Software Foundation; either version 3 of the License, or (at your
+#     option) any later version.
+#
+# or
+#
+#   * the GNU General Public License as published by the Free Software
+#     Foundation; either version 2 of the License, or (at your option) any
+#     later version.
+#
+# or both in parallel, as here.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+#
+# You should have received copies of the GNU General Public License and the
+# GNU Lesser General Public License along with the GNU MP Library.  If not,
+# see https://www.gnu.org/licenses/.
+
+
+# Usage: cpp-ccas --cpp=CPP CC ... file.S ...
+#
+# Process file.S with the given CPP command plus any -D options in the
+# rest of the arguments, then assemble with the given CC plus all
+# arguments.
+#
+# The CPP command must be in a single --cpp= argument, and will be
+# split on whitespace.  It should include any -I options required.
+#
+# When CC is invoked, file.S is replaced with a temporary .s file
+# which is the CPP output.
+#
+# Any lines starting with "#" are removed from the CPP output, usually
+# these will be #line and #file markers from CPP, but they might also
+# be comments from the .S.
+#
+# To allow parallel builds, the temp file name is based on the .S file
+# name, which will be the output object filename for all uses we put
+# this script to.
+
+CPP=
+CPPDEFS=
+CC=
+S=
+SEEN_O=no
+
+for i in "$@"; do
+  case $i in
+    --cpp=*)
+      CPP=`echo "$i" | sed 's/^--cpp=//'`
+      ;;
+    -D*)
+      CPPDEFS="$CPPDEFS $i"
+      CC="$CC $i"
+      ;;
+    *.S)
+      if test -n "$S"; then
+        echo "Only one .S file permitted"
+        exit 1
+      fi
+      BASENAME=`echo "$i" | sed -e 's/\.S$//' -e 's/^.*[\\/:]//'`
+      S=$i
+      TMP_I=tmp-$BASENAME.i
+      TMP_S=tmp-$BASENAME.s
+      CC="$CC $TMP_S"
+      ;;
+    -o)
+      SEEN_O=yes
+      CC="$CC $i"
+      ;;
+    *)
+      CC="$CC $i"
+      ;;
+  esac
+done
+
+if test -z "$CPP"; then
+  echo "No --cpp specified"
+  exit 1
+fi
+
+if test -z "$S"; then
+  echo "No .S specified"
+  exit 1
+fi
+
+# Libtool adds its own -o when sending output to .libs/foo.o, but not
+# when just wanting foo.o in the current directory.  We need an
+# explicit -o in both cases since we're assembling tmp-foo.s.
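+#
+# An illustrative invocation (hypothetical command names, not taken
+# from Makeasm.am) would be something like
+#
+#   ./cpp-ccas --cpp="cc -E -I.." cc -c -o foo.o foo.S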
+#
+if test $SEEN_O = no; then
+  CC="$CC -o $BASENAME.o"
+fi
+
+echo "$CPP $CPPDEFS $S >$TMP_I"
+$CPP $CPPDEFS $S >$TMP_I || exit
+
+echo "grep -v '^#' $TMP_I >$TMP_S"
+grep -v '^#' $TMP_I >$TMP_S
+
+echo "$CC"
+$CC || exit
+
+# Comment this out to preserve .s intermediates
+rm -f $TMP_I $TMP_S
diff --git a/gmp-6.3.0/mpn/cray/README b/gmp-6.3.0/mpn/cray/README
new file mode 100644
index 0000000..3a347d2
--- /dev/null
+++ b/gmp-6.3.0/mpn/cray/README
@@ -0,0 +1,121 @@
+Copyright 2000-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+
+The code in this directory works for Cray vector systems such as C90,
+J90, T90 (both the CFP variant and the IEEE variant) and SV1.  (For
+the T3E and T3D systems, see the `alpha' subdirectory at the same
+level as the directory containing this file.)
+
+The cfp subdirectory is for systems utilizing the traditional Cray
+floating-point format, and the ieee subdirectory is for the newer
+systems that use the IEEE floating-point format.
+
+There are several issues that reduce speed on Cray systems.  For
+systems with cfp floating point, the main obstacle is the forming of
+128-bit products.  For IEEE systems, adding, and in particular
+computing carry, is the main issue.  There are no vectorizing
+unsigned-less-than instructions, and the sequence that implements that
+operation is very long.
+
+Shifting is the only operation that is simple to make fast.  All Cray
+systems have a bitblt instruction (Vi Vj,Vj<Ak) that should be really
+useful.
+
+For best speed for cfp systems, we need a mul_basecase, since that
+reduces the need for carry propagation to a minimum.  Depending on the
+size (vn) of the smaller of the two operands (V), we should split U and
+V in different chunk sizes:
+
+U split in 2 32-bit parts
+V split according to the table:
+parts                     4    5    6    7    8
+bits/part                16   13   11   10    8
+max allowed vn            1    8   32   64  256
+number of multiplies      8   10   12   14   16
+peak cycles/limb          4    5    6    7    8
+
+U split in 3 22-bit parts
+V split according to the table:
+parts                     3     4     5
+bits/part                22    16    13
+max allowed vn           16  1024  8192
+number of multiplies      9    12    15
+peak cycles/limb        4.5     6   7.5
+
+U split in 4 16-bit parts
+V split according to the table:
+parts                     4
+bits/part                16
+max allowed vn        65536
+number of multiplies     16
+peak cycles/limb          8
+
+(A T90 CPU can accumulate two products per cycle.)
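+
+(A note on the carry computation used throughout this directory: the
+carry out of a 64-bit add can be formed branch-free, and therefore
+vectorized, as the majority function of the two operands and the
+complemented sum.  A one-limb sketch in C:
+
+  s = a + b;                                /* raw sum, may wrap */
+  cy = ((a & b) | ((a | b) & ~s)) >> 63;    /* carry out of a + b */
+
+This is the trick the add_n, addmul_1, mul_1 and submul_1 code below
+relies on.)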
+
+IDEA:
+* Rewrite mpn_add_n:
+    short cy[n + 1];
+  #pragma _CRI ivdep
+    for (i = 0; i < n; i++)
+      { s = up[i] + vp[i];
+        rp[i] = s;
+        cy[i + 1] = s < up[i]; }
+    more_carries = 0;
+  #pragma _CRI ivdep
+    for (i = 1; i < n; i++)
+      { s = rp[i] + cy[i];
+        rp[i] = s;
+        more_carries += s < cy[i]; }
+    cys = 0;
+    if (more_carries)
+      {
+        cys = rp[1] < cy[1];
+        for (i = 2; i < n; i++)
+          { rp[i] += cys;
+            cys = rp[i] < cys; }
+      }
+    return cys + cy[n];
+
+* Write mpn_add3_n for adding three operands.  First add operands 1
+  and 2, and generate cy[].  Then add operand 3 to the partial result,
+  and accumulate carry into cy[].  Finally propagate carry just like
+  in the new mpn_add_n.
+
+IDEA:
+
+Store fewer bits, perhaps 62, per limb.  That brings mpn_add_n time
+down to 2.5 cycles/limb and mpn_addmul_1 time to 4 cycles/limb.  By
+storing even fewer bits per limb, perhaps 56, it would be possible to
+write an mpn_mul_basecase that would run at effectively 1 cycle/limb.
+(Use VM here to better handle the rhomb-shaped multiply area, perhaps
+rounding operand sizes up to the next power of 2.)
diff --git a/gmp-6.3.0/mpn/cray/add_n.c b/gmp-6.3.0/mpn/cray/add_n.c
new file mode 100644
index 0000000..af49159
--- /dev/null
+++ b/gmp-6.3.0/mpn/cray/add_n.c
@@ -0,0 +1,90 @@
+/* Cray PVP mpn_add_n -- add two limb vectors and store their sum in a third
+   limb vector.
+
+Copyright 1996, 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* This code runs at 4 cycles/limb.  It may be possible to bring it down
+   to 3 cycles/limb.  */
+
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  mp_limb_t cy[n];
+  mp_limb_t a, b, r, s0, c0, c1;
+  mp_size_t i;
+  int more_carries;
+
+  /* Main add loop.  Generate a raw output sum in rp[] and a carry vector
+     in cy[].  */
+#pragma _CRI ivdep
+  for (i = 0; i < n; i++)
+    {
+      a = up[i];
+      b = vp[i];
+      s0 = a + b;
+      rp[i] = s0;
+      /* Carry out of a + b: the majority of a, b and ~s0, bit 63.  */
+      c0 = ((a & b) | ((a | b) & ~s0)) >> 63;
+      cy[i] = c0;
+    }
+  /* Carry add loop.  Add the carry vector cy[] to the raw sum rp[] and
+     store the new sum back to rp[].  If this generates further carry, set
+     more_carries.  */
+  more_carries = 0;
+#pragma _CRI ivdep
+  for (i = 1; i < n; i++)
+    {
+      r = rp[i];
+      c0 = cy[i - 1];
+      s0 = r + c0;
+      rp[i] = s0;
+      c0 = (r & ~s0) >> 63;
+      more_carries += c0;
+    }
+  /* If that second loop generated carry, handle that in scalar loop.  */
+  if (more_carries)
+    {
+      mp_limb_t cyrec = 0;
+      /* Look for places where rp[k] is zero and cy[k-1] is non-zero.
+	 These are where we got a recurrency carry.
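+	 (Illustrative case: if rp[i] held the all-ones limb and cy[i-1]
+	 was 1, the second loop left rp[i] == 0, and the carry must keep
+	 propagating into limb i+1 via cyrec.)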
*/ + for (i = 1; i < n; i++) + { + r = rp[i]; + c0 = (r == 0 && cy[i - 1] != 0); + s0 = r + cyrec; + rp[i] = s0; + c1 = (r & ~s0) >> 63; + cyrec = c0 | c1; + } + return cyrec | cy[n - 1]; + } + + return cy[n - 1]; +} diff --git a/gmp-6.3.0/mpn/cray/cfp/addmul_1.c b/gmp-6.3.0/mpn/cray/cfp/addmul_1.c new file mode 100644 index 0000000..9c7f383 --- /dev/null +++ b/gmp-6.3.0/mpn/cray/cfp/addmul_1.c @@ -0,0 +1,48 @@ +/* mpn_addmul_1 for Cray PVP. + +Copyright 1996, 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +mp_limb_t +mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb) +{ + mp_limb_t p0[n], p1[n], tp[n]; + mp_limb_t cy_limb; + + GMPN_MULWW (p1, p0, up, &n, &limb); + cy_limb = mpn_add_n (tp, rp, p0, n); + rp[0] = tp[0]; + if (n != 1) + cy_limb += mpn_add_n (rp + 1, tp + 1, p1, n - 1); + cy_limb += p1[n - 1]; + + return cy_limb; +} diff --git a/gmp-6.3.0/mpn/cray/cfp/mul_1.c b/gmp-6.3.0/mpn/cray/cfp/mul_1.c new file mode 100644 index 0000000..33a6a05 --- /dev/null +++ b/gmp-6.3.0/mpn/cray/cfp/mul_1.c @@ -0,0 +1,47 @@ +/* mpn_mul_1 for Cray PVP. + +Copyright 1996, 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#include "gmp-impl.h" + +mp_limb_t +mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb) +{ + mp_limb_t p0[n], p1[n]; + mp_limb_t cy_limb; + + GMPN_MULWW (p1, p0, up, &n, &limb); + rp[0] = p0[0]; + cy_limb = p1[n - 1]; + if (n != 1) + cy_limb += mpn_add_n (rp + 1, p0 + 1, p1, n - 1); + + return cy_limb; +} diff --git a/gmp-6.3.0/mpn/cray/cfp/mulwwc90.s b/gmp-6.3.0/mpn/cray/cfp/mulwwc90.s new file mode 100644 index 0000000..71d2285 --- /dev/null +++ b/gmp-6.3.0/mpn/cray/cfp/mulwwc90.s @@ -0,0 +1,254 @@ +* Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP. + +* Copyright 1996, 2000 Free Software Foundation, Inc. +* This file is generated from mulww.f in this same directory. + +* This file is part of the GNU MP Library. +* +* The GNU MP Library is free software; you can redistribute it and/or modify +* it under the terms of either: +* +* * the GNU Lesser General Public License as published by the Free +* Software Foundation; either version 3 of the License, or (at your +* option) any later version. +* +* or +* +* * the GNU General Public License as published by the Free Software +* Foundation; either version 2 of the License, or (at your option) any +* later version. +* +* or both in parallel, as here. +* +* The GNU MP Library is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +* for more details. +* +* You should have received copies of the GNU General Public License and the +* GNU Lesser General Public License along with the GNU MP Library. If not, +* see https://www.gnu.org/licenses/. + + IDENT GMPN_MULWW +********************************************** +* Assemble with Cal Version 2.0 * +* * +* Generated by CFT77 6.0.4.19 * +* on 06/27/00 at 04:34:13 * +* * +********************************************** +* ALLOW UNDERSCORES IN IDENTIFIERS + EDIT OFF + FORMAT NEW +@DATA SECTION DATA,CM +@DATA = W.* + CON O'0000000000040000000000 + CON O'0435152404713723252514 + CON O'0535270000000000000000 + CON O'0000000000000001200012 + VWD 32/0,32/P.GMPN_MULWW + CON O'0014003000000000001416 + CON O'0000000000000000000011 + CON O'0000000000000000000215 + BSSZ 1 +@CODE SECTION CODE +@CODE = P.* +L3 = P.* + A0 A6 + A5 6 + B03,A5 0,A0 + A0 A1+A2 + A5 1 + 0,A0 T00,A5 + B02 A2 + B66 A3 + B01 A6 + A7 P.L4 + B00 A7 + A6 @DATA + J $STKOFEN +GMPN_MULWW = P.* + A0 @DATA+3 + B77 A0 + A1 13 + A0 B66 + A2 B66 + A4 B67 + 0,A0 B77,A1 + A7 782 + A3 A2+A7 + A0 A4-A3 + JAM L3 + A0 A6 + A5 6 + B03,A5 0,A0 + A0 A1+A2 + A5 1 + 0,A0 T00,A5 + B02 A2 + B66 A3 + B01 A6 +L4 = P.* + A7 B07 + S7 0,A7 + A6 B10 + S6 0,A6 + S5 1 + S4 <22 + S7 S7-S5 + S5 #S7 + T00 S6 + S6 S6>22 + S7 T00 + S7 S7>44 + S3 T00 + S3 S3&S4 + S6 S6&S4 + S7 S7&S4 + S3 S3<24 + S6 S6<24 + S7 S7<24 + S0 S5 + S4 S5 + S1 S6 + S2 S3 + S3 S7 + JSP L5 +L6 = P.* + S7 -S4 + A2 S7 + VL A2 + A3 B06 + A5 B05 + A4 B04 + A1 VL + A2 S4 +L7 = P.* + A0 A3 + VL A1 + V7 ,A0,1 + B11 A5 + A7 22 + B12 A4 + V6 V7>A7 + B13 A3 + S7 <22 + A3 B02 + V5 S7&V6 + A6 24 + V4 V5A5 + V2 S1*FV1 + V3 S7&V5 + A0 14 + B77 A0 + A4 B77 + A0 A4+A3 + ,A0,1 V2 + V0 V3A7 + V2 S2*FV0 + V3 V6+V2 + S7 <20 + V1 S7&V3 + A4 270 + A0 A4+A3 + ,A0,1 V0 + A4 14 + A0 A4+A3 + V7 ,A0,1 + V6 V1A5 + V0 S1*FV4 + A5 654 + A0 A5+A3 + ,A0,1 V1 + V6 V7+V0 + A5 2 + V2 V6A6 + A5 654 + CPW + A0 A5+A3 + V1 ,A0,1 + A5 398 + A0 A5+A3 + V3 ,A0,1 + V6 V4+V1 + V2 V3>A6 + V5 V6+V2 + A6 B12 + V4 V322 + S7 T00 + S7 S7>44 + S3 T00 + S3 S3&S4 + S6 
diff --git a/gmp-6.3.0/mpn/cray/hamdist.c b/gmp-6.3.0/mpn/cray/hamdist.c
new file mode 100644
--- /dev/null
+++ b/gmp-6.3.0/mpn/cray/hamdist.c
+#include "gmp-impl.h"
+
+unsigned long int
+mpn_hamdist (mp_srcptr p1, mp_srcptr p2, mp_size_t n)
+{
+  unsigned long int result = 0;
+  mp_size_t i;
+  for (i = 0; i < n; i++)
+    result += _popcnt (p1[i] ^ p2[i]);
+  return result;
+}
diff --git a/gmp-6.3.0/mpn/cray/ieee/addmul_1.c b/gmp-6.3.0/mpn/cray/ieee/addmul_1.c
new file mode 100644
index 0000000..ce7dfbb
--- /dev/null
+++ b/gmp-6.3.0/mpn/cray/ieee/addmul_1.c
@@ -0,0 +1,111 @@
+/* Cray PVP/IEEE mpn_addmul_1 -- multiply a limb vector with a limb and add the
+   result to a second limb vector.
+
+Copyright 2000-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* This code runs at just under 9 cycles/limb on a T90.  That is not perfect,
+   mainly due to vector register shortage in the main loop.  Assembly code
+   should bring it down to perhaps 7 cycles/limb.  */
+
+#include <intrinsics.h>
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
+{
+  mp_limb_t cy[n];
+  mp_limb_t a, b, r, s0, s1, c0, c1;
+  mp_size_t i;
+  int more_carries;
+
+  if (up == rp)
+    {
+      /* The algorithm used below cannot handle overlap.  Handle it here by
+	 making a temporary copy of the source vector, then call ourselves.  */
+      mp_limb_t xp[n];
+      MPN_COPY (xp, up, n);
+      return mpn_addmul_1 (rp, xp, n, vl);
+    }
+
+  a = up[0] * vl;
+  r = rp[0];
+  s0 = a + r;
+  rp[0] = s0;
+  c0 = ((a & r) | ((a | r) & ~s0)) >> 63;
+  cy[0] = c0;
+
+  /* Main multiply loop.  Generate a raw accumulated output product in rp[]
+     and a carry vector in cy[].  */
+#pragma _CRI ivdep
+  for (i = 1; i < n; i++)
+    {
+      a = up[i] * vl;
+      b = _int_mult_upper (up[i - 1], vl);
+      s0 = a + b;
+      c0 = ((a & b) | ((a | b) & ~s0)) >> 63;
+      r = rp[i];
+      s1 = s0 + r;
+      rp[i] = s1;
+      c1 = ((s0 & r) | ((s0 | r) & ~s1)) >> 63;
+      cy[i] = c0 + c1;
+    }
+  /* Carry add loop.  Add the carry vector cy[] to the raw result rp[] and
+     store the new result back to rp[].
*/ + more_carries = 0; +#pragma _CRI ivdep + for (i = 1; i < n; i++) + { + r = rp[i]; + c0 = cy[i - 1]; + s0 = r + c0; + rp[i] = s0; + c0 = (r & ~s0) >> 63; + more_carries += c0; + } + /* If that second loop generated carry, handle that in scalar loop. */ + if (more_carries) + { + mp_limb_t cyrec = 0; + /* Look for places where rp[k] == 0 and cy[k-1] == 1 or + rp[k] == 1 and cy[k-1] == 2. + These are where we got a recurrency carry. */ + for (i = 1; i < n; i++) + { + r = rp[i]; + c0 = r < cy[i - 1]; + s0 = r + cyrec; + rp[i] = s0; + c1 = (r & ~s0) >> 63; + cyrec = c0 | c1; + } + return _int_mult_upper (up[n - 1], vl) + cyrec + cy[n - 1]; + } + + return _int_mult_upper (up[n - 1], vl) + cy[n - 1]; +} diff --git a/gmp-6.3.0/mpn/cray/ieee/gmp-mparam.h b/gmp-6.3.0/mpn/cray/ieee/gmp-mparam.h new file mode 100644 index 0000000..1fdc286 --- /dev/null +++ b/gmp-6.3.0/mpn/cray/ieee/gmp-mparam.h @@ -0,0 +1,73 @@ +/* Cray T90 IEEE gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1996, 2000-2002, 2004 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Generated by tuneup.c, 2004-02-07, system compiler */ + +#define MUL_TOOM22_THRESHOLD 130 +#define MUL_TOOM33_THRESHOLD 260 + +#define SQR_BASECASE_THRESHOLD 9 /* karatsuba */ +#define SQR_TOOM2_THRESHOLD 0 /* never sqr_basecase */ +#define SQR_TOOM3_THRESHOLD 34 + +#define DIV_SB_PREINV_THRESHOLD 0 /* preinv always */ +#define DIV_DC_THRESHOLD 390 +#define POWM_THRESHOLD 656 + +#define HGCD_THRESHOLD 964 +#define GCD_ACCEL_THRESHOLD 3 +#define GCD_DC_THRESHOLD 964 +#define JACOBI_BASE_METHOD 2 + +#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define USE_PREINV_DIVREM_1 1 /* preinv always */ +#define USE_PREINV_MOD_1 1 /* preinv always */ +#define DIVREM_2_THRESHOLD 0 /* preinv always */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define MODEXACT_1_ODD_THRESHOLD 0 /* always */ + +#define GET_STR_DC_THRESHOLD 45 +#define GET_STR_PRECOMPUTE_THRESHOLD 77 +#define SET_STR_THRESHOLD 145756 + +#define MUL_FFT_TABLE { 1104, 2208, 4416, 8960, 19456, 45056, 0 } +#define MUL_FFT_MODF_THRESHOLD 1168 +#define MUL_FFT_THRESHOLD 6528 + +#define SQR_FFT_TABLE { 368, 736, 1600, 2816, 7168, 12288, 0 } +#define SQR_FFT_MODF_THRESHOLD 296 +#define SQR_FFT_THRESHOLD 1312 diff --git a/gmp-6.3.0/mpn/cray/ieee/invert_limb.c b/gmp-6.3.0/mpn/cray/ieee/invert_limb.c new file mode 100644 index 0000000..774a27b --- /dev/null +++ b/gmp-6.3.0/mpn/cray/ieee/invert_limb.c @@ -0,0 +1,127 @@ +/* mpn_invert_limb -- Invert a normalized limb. + +Copyright 1991, 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" +#include "longlong.h" + +/* + This is needed to make configure define HAVE_NATIVE_mpn_invert_limb: + PROLOGUE(mpn_invert_limb) +*/ + +static const unsigned short int approx_tab[0x100] = +{ + /* 0x400, */ + 0x3ff, + 0x3fc, 0x3f8, 0x3f4, 0x3f0, 0x3ec, 0x3e8, 0x3e4, + 0x3e0, 0x3dd, 0x3d9, 0x3d5, 0x3d2, 0x3ce, 0x3ca, 0x3c7, + 0x3c3, 0x3c0, 0x3bc, 0x3b9, 0x3b5, 0x3b2, 0x3ae, 0x3ab, + 0x3a8, 0x3a4, 0x3a1, 0x39e, 0x39b, 0x397, 0x394, 0x391, + 0x38e, 0x38b, 0x387, 0x384, 0x381, 0x37e, 0x37b, 0x378, + 0x375, 0x372, 0x36f, 0x36c, 0x369, 0x366, 0x364, 0x361, + 0x35e, 0x35b, 0x358, 0x355, 0x353, 0x350, 0x34d, 0x34a, + 0x348, 0x345, 0x342, 0x340, 0x33d, 0x33a, 0x338, 0x335, + 0x333, 0x330, 0x32e, 0x32b, 0x329, 0x326, 0x324, 0x321, + 0x31f, 0x31c, 0x31a, 0x317, 0x315, 0x313, 0x310, 0x30e, + 0x30c, 0x309, 0x307, 0x305, 0x303, 0x300, 0x2fe, 0x2fc, + 0x2fa, 0x2f7, 0x2f5, 0x2f3, 0x2f1, 0x2ef, 0x2ec, 0x2ea, + 0x2e8, 0x2e6, 0x2e4, 0x2e2, 0x2e0, 0x2de, 0x2dc, 0x2da, + 0x2d8, 0x2d6, 0x2d4, 0x2d2, 0x2d0, 0x2ce, 0x2cc, 0x2ca, + 0x2c8, 0x2c6, 0x2c4, 0x2c2, 0x2c0, 0x2be, 0x2bc, 0x2bb, + 0x2b9, 0x2b7, 0x2b5, 0x2b3, 0x2b1, 0x2b0, 0x2ae, 0x2ac, + 0x2aa, 0x2a8, 0x2a7, 0x2a5, 0x2a3, 0x2a1, 0x2a0, 0x29e, + 0x29c, 0x29b, 0x299, 0x297, 0x295, 0x294, 0x292, 0x291, + 0x28f, 0x28d, 0x28c, 0x28a, 0x288, 0x287, 0x285, 0x284, + 0x282, 0x280, 0x27f, 0x27d, 0x27c, 0x27a, 0x279, 0x277, + 0x276, 0x274, 0x273, 0x271, 0x270, 0x26e, 0x26d, 0x26b, + 0x26a, 0x268, 0x267, 0x265, 0x264, 0x263, 0x261, 0x260, + 0x25e, 0x25d, 0x25c, 0x25a, 0x259, 0x257, 0x256, 0x255, + 0x253, 0x252, 0x251, 0x24f, 0x24e, 0x24d, 0x24b, 0x24a, + 0x249, 0x247, 0x246, 0x245, 0x243, 0x242, 0x241, 0x240, + 0x23e, 0x23d, 0x23c, 0x23b, 0x239, 0x238, 0x237, 0x236, + 0x234, 0x233, 0x232, 0x231, 0x230, 0x22e, 0x22d, 0x22c, + 0x22b, 0x22a, 0x229, 0x227, 0x226, 0x225, 0x224, 0x223, + 0x222, 0x220, 0x21f, 0x21e, 0x21d, 0x21c, 0x21b, 0x21a, + 0x219, 0x218, 0x216, 0x215, 0x214, 0x213, 0x212, 0x211, + 0x210, 0x20f, 0x20e, 0x20d, 0x20c, 0x20b, 0x20a, 0x209, + 0x208, 0x207, 0x206, 0x205, 0x204, 0x203, 0x202, 0x201, +}; + +/* iteration: z = 2z-(z**2)d */ + +mp_limb_t +mpn_invert_limb (mp_limb_t d) +{ + mp_limb_t z, z2l, z2h, tl, th; + mp_limb_t xh, xl; + mp_limb_t zh, zl; + +#if GMP_LIMB_BITS == 32 + z = approx_tab[(d >> 23) - 0x100] << 6; /* z < 2^16 */ + + z2l = z * z; /* z2l < 2^32 */ + umul_ppmm (th, tl, z2l, d); + z = (z << 17) - (th << 1); +#endif +#if GMP_LIMB_BITS == 64 + z = approx_tab[(d >> 55) - 0x100] << 6; /* z < 2^16 */ + + z2l = z * z; /* z2l < 2^32 */ + th = z2l * (d >> 32); /* th < 2^64 */ + z = (z << 17) - (th >> 31); /* z < 2^32 */ + + z2l = z * z; + umul_ppmm (th, tl, z2l, d); + z = (z << 33) - (th << 1); +#endif + + umul_ppmm (z2h, z2l, z, z); + umul_ppmm (th, tl, z2h, d); + umul_ppmm (xh, xl, z2l, d); + tl += xh; + th += tl < xh; + th = (th << 2) | (tl >> GMP_LIMB_BITS - 2); + tl = tl << 2; + sub_ddmmss (zh, zl, z << 2, 0, th, tl); + + umul_ppmm (xh, xl, d, zh); + xh += d; /* add_ssaaaa (xh, xl, xh, xl, d, 0); */ + if (~xh != 0) + { + add_ssaaaa (xh, xl, xh, xl, 0, d); + zh++; + } + + add_ssaaaa (xh, xl, xh, xl, 0, d); + if (xh != 0) + zh++; + + return zh; +} diff --git a/gmp-6.3.0/mpn/cray/ieee/mul_1.c b/gmp-6.3.0/mpn/cray/ieee/mul_1.c new file mode 100644 index 0000000..40139fb --- /dev/null +++ b/gmp-6.3.0/mpn/cray/ieee/mul_1.c @@ -0,0 +1,103 @@ +/* Cray PVP/IEEE mpn_mul_1 -- multiply a limb vector with a limb and store the + result in a second limb vector. + +Copyright 2000, 2001 Free Software Foundation, Inc. 
+ +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* This code runs at 5 cycles/limb on a T90. That would probably + be hard to improve upon, even with assembly code. */ + +#include +#include "gmp-impl.h" + +mp_limb_t +mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl) +{ + mp_limb_t cy[n]; + mp_limb_t a, b, r, s0, s1, c0, c1; + mp_size_t i; + int more_carries; + + if (up == rp) + { + /* The algorithm used below cannot handle overlap. Handle it here by + making a temporary copy of the source vector, then call ourselves. */ + mp_limb_t xp[n]; + MPN_COPY (xp, up, n); + return mpn_mul_1 (rp, xp, n, vl); + } + + a = up[0] * vl; + rp[0] = a; + cy[0] = 0; + + /* Main multiply loop. Generate a raw accumulated output product in rp[] + and a carry vector in cy[]. */ +#pragma _CRI ivdep + for (i = 1; i < n; i++) + { + a = up[i] * vl; + b = _int_mult_upper (up[i - 1], vl); + s0 = a + b; + c0 = ((a & b) | ((a | b) & ~s0)) >> 63; + rp[i] = s0; + cy[i] = c0; + } + /* Carry add loop. Add the carry vector cy[] to the raw sum rp[] and + store the new sum back to rp[0]. */ + more_carries = 0; +#pragma _CRI ivdep + for (i = 2; i < n; i++) + { + r = rp[i]; + c0 = cy[i - 1]; + s0 = r + c0; + rp[i] = s0; + c0 = (r & ~s0) >> 63; + more_carries += c0; + } + /* If that second loop generated carry, handle that in scalar loop. */ + if (more_carries) + { + mp_limb_t cyrec = 0; + /* Look for places where rp[k] is zero and cy[k-1] is non-zero. + These are where we got a recurrency carry. */ + for (i = 2; i < n; i++) + { + r = rp[i]; + c0 = (r == 0 && cy[i - 1] != 0); + s0 = r + cyrec; + rp[i] = s0; + c1 = (r & ~s0) >> 63; + cyrec = c0 | c1; + } + return _int_mult_upper (up[n - 1], vl) + cyrec + cy[n - 1]; + } + + return _int_mult_upper (up[n - 1], vl) + cy[n - 1]; +} diff --git a/gmp-6.3.0/mpn/cray/ieee/mul_basecase.c b/gmp-6.3.0/mpn/cray/ieee/mul_basecase.c new file mode 100644 index 0000000..72628f7 --- /dev/null +++ b/gmp-6.3.0/mpn/cray/ieee/mul_basecase.c @@ -0,0 +1,107 @@ +/* Cray PVP/IEEE mpn_mul_basecase. + +Copyright 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* The most critical loop of this code runs at about 5 cycles/limb on a T90. + That is not perfect, mainly due to vector register shortage. */ + +#include +#include "gmp-impl.h" + +void +mpn_mul_basecase (mp_ptr rp, + mp_srcptr up, mp_size_t un, + mp_srcptr vp, mp_size_t vn) +{ + mp_limb_t cy[un + vn]; + mp_limb_t vl; + mp_limb_t a, b, r, s0, s1, c0, c1; + mp_size_t i, j; + int more_carries; + + for (i = 0; i < un + vn; i++) + { + rp[i] = 0; + cy[i] = 0; + } + +#pragma _CRI novector + for (j = 0; j < vn; j++) + { + vl = vp[j]; + + a = up[0] * vl; + r = rp[j]; + s0 = a + r; + rp[j] = s0; + c0 = ((a & r) | ((a | r) & ~s0)) >> 63; + cy[j] += c0; + +#pragma _CRI ivdep + for (i = 1; i < un; i++) + { + a = up[i] * vl; + b = _int_mult_upper (up[i - 1], vl); + s0 = a + b; + c0 = ((a & b) | ((a | b) & ~s0)) >> 63; + r = rp[j + i]; + s1 = s0 + r; + rp[j + i] = s1; + c1 = ((s0 & r) | ((s0 | r) & ~s1)) >> 63; + cy[j + i] += c0 + c1; + } + rp[j + un] = _int_mult_upper (up[un - 1], vl); + } + + more_carries = 0; +#pragma _CRI ivdep + for (i = 1; i < un + vn; i++) + { + r = rp[i]; + c0 = cy[i - 1]; + s0 = r + c0; + rp[i] = s0; + c0 = (r & ~s0) >> 63; + more_carries += c0; + } + /* If that second loop generated carry, handle that in scalar loop. */ + if (more_carries) + { + mp_limb_t cyrec = 0; + for (i = 1; i < un + vn; i++) + { + r = rp[i]; + c0 = (r < cy[i - 1]); + s0 = r + cyrec; + rp[i] = s0; + c1 = (r & ~s0) >> 63; + cyrec = c0 | c1; + } + } +} diff --git a/gmp-6.3.0/mpn/cray/ieee/sqr_basecase.c b/gmp-6.3.0/mpn/cray/ieee/sqr_basecase.c new file mode 100644 index 0000000..5bd4e56 --- /dev/null +++ b/gmp-6.3.0/mpn/cray/ieee/sqr_basecase.c @@ -0,0 +1,105 @@ +/* Cray PVP/IEEE mpn_sqr_basecase. + +Copyright 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* This is just mpn_mul_basecase with trivial modifications. 
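+   (A dedicated squaring routine could presumably save work by forming
+   each cross product up[i]*up[j], i != j, only once, but that is not
+   attempted here.)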
*/ + +#include +#include "gmp-impl.h" + +void +mpn_sqr_basecase (mp_ptr rp, + mp_srcptr up, mp_size_t un) +{ + mp_limb_t cy[un + un]; + mp_limb_t ul; + mp_limb_t a, b, r, s0, s1, c0, c1; + mp_size_t i, j; + int more_carries; + + for (i = 0; i < un + un; i++) + { + rp[i] = 0; + cy[i] = 0; + } + +#pragma _CRI novector + for (j = 0; j < un; j++) + { + ul = up[j]; + + a = up[0] * ul; + r = rp[j]; + s0 = a + r; + rp[j] = s0; + c0 = ((a & r) | ((a | r) & ~s0)) >> 63; + cy[j] += c0; + +#pragma _CRI ivdep + for (i = 1; i < un; i++) + { + a = up[i] * ul; + b = _int_mult_upper (up[i - 1], ul); + s0 = a + b; + c0 = ((a & b) | ((a | b) & ~s0)) >> 63; + r = rp[j + i]; + s1 = s0 + r; + rp[j + i] = s1; + c1 = ((s0 & r) | ((s0 | r) & ~s1)) >> 63; + cy[j + i] += c0 + c1; + } + rp[j + un] = _int_mult_upper (up[un - 1], ul); + } + + more_carries = 0; +#pragma _CRI ivdep + for (i = 1; i < un + un; i++) + { + r = rp[i]; + c0 = cy[i - 1]; + s0 = r + c0; + rp[i] = s0; + c0 = (r & ~s0) >> 63; + more_carries += c0; + } + /* If that second loop generated carry, handle that in scalar loop. */ + if (more_carries) + { + mp_limb_t cyrec = 0; + for (i = 1; i < un + un; i++) + { + r = rp[i]; + c0 = (r < cy[i - 1]); + s0 = r + cyrec; + rp[i] = s0; + c1 = (r & ~s0) >> 63; + cyrec = c0 | c1; + } + } +} diff --git a/gmp-6.3.0/mpn/cray/ieee/submul_1.c b/gmp-6.3.0/mpn/cray/ieee/submul_1.c new file mode 100644 index 0000000..2b3ca21 --- /dev/null +++ b/gmp-6.3.0/mpn/cray/ieee/submul_1.c @@ -0,0 +1,111 @@ +/* Cray PVP/IEEE mpn_submul_1 -- multiply a limb vector with a limb and + subtract the result from a second limb vector. + +Copyright 2000-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* This code runs at just under 9 cycles/limb on a T90. That is not perfect, + mainly due to vector register shortage in the main loop. Assembly code + should bring it down to perhaps 7 cycles/limb. */ + +#include +#include "gmp-impl.h" + +mp_limb_t +mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl) +{ + mp_limb_t cy[n]; + mp_limb_t a, b, r, s0, s1, c0, c1; + mp_size_t i; + int more_carries; + + if (up == rp) + { + /* The algorithm used below cannot handle overlap. Handle it here by + making a temporary copy of the source vector, then call ourselves. */ + mp_limb_t xp[n]; + MPN_COPY (xp, up, n); + return mpn_submul_1 (rp, xp, n, vl); + } + + a = up[0] * vl; + r = rp[0]; + s0 = r - a; + rp[0] = s0; + c1 = ((s0 & a) | ((s0 | a) & ~r)) >> 63; + cy[0] = c1; + + /* Main multiply loop. Generate a raw accumulated output product in rp[] + and a carry vector in cy[]. 
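+   (Both carry flags use the same majority trick as elsewhere in this
+   directory: c0 is the carry out of a + b, the majority of a, b and
+   ~s0, and c1 is the borrow out of r - s0, the majority of s1, s0 and
+   ~r, each taken from bit 63.)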
*/ +#pragma _CRI ivdep + for (i = 1; i < n; i++) + { + a = up[i] * vl; + b = _int_mult_upper (up[i - 1], vl); + s0 = a + b; + c0 = ((a & b) | ((a | b) & ~s0)) >> 63; + r = rp[i]; + s1 = r - s0; + rp[i] = s1; + c1 = ((s1 & s0) | ((s1 | s0) & ~r)) >> 63; + cy[i] = c0 + c1; + } + /* Carry subtract loop. Subtract the carry vector cy[] from the raw result + rp[] and store the new result back to rp[]. */ + more_carries = 0; +#pragma _CRI ivdep + for (i = 1; i < n; i++) + { + r = rp[i]; + c0 = cy[i - 1]; + s0 = r - c0; + rp[i] = s0; + c0 = (s0 & ~r) >> 63; + more_carries += c0; + } + /* If that second loop generated carry, handle that in scalar loop. */ + if (more_carries) + { + mp_limb_t cyrec = 0; + /* Look for places where rp[k] == ~0 and cy[k-1] == 1 or + rp[k] == ~1 and cy[k-1] == 2. + These are where we got a recurrency carry. */ + for (i = 1; i < n; i++) + { + r = rp[i]; + c0 = ~r < cy[i - 1]; + s0 = r - cyrec; + rp[i] = s0; + c1 = (s0 & ~r) >> 63; + cyrec = c0 | c1; + } + return _int_mult_upper (up[n - 1], vl) + cyrec + cy[n - 1]; + } + + return _int_mult_upper (up[n - 1], vl) + cy[n - 1]; +} diff --git a/gmp-6.3.0/mpn/cray/lshift.c b/gmp-6.3.0/mpn/cray/lshift.c new file mode 100644 index 0000000..8534e93 --- /dev/null +++ b/gmp-6.3.0/mpn/cray/lshift.c @@ -0,0 +1,58 @@ +/* mpn_lshift -- Shift left low level for Cray vector processors. + +Copyright (C) 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include +#include "gmp-impl.h" + +mp_limb_t +mpn_lshift (mp_ptr wp, mp_srcptr up, mp_size_t n, unsigned int cnt) +{ + unsigned sh_1, sh_2; + mp_size_t i; + mp_limb_t retval; + + sh_1 = cnt; + sh_2 = GMP_LIMB_BITS - sh_1; + retval = up[n - 1] >> sh_2; + +#pragma _CRI ivdep + for (i = n - 1; i > 0; i--) + { +#if 1 + wp[i] = (up[i] << sh_1) | (up[i - 1] >> sh_2); +#else + /* This is the recommended way, but at least on SV1 it is slower. */ + wp[i] = _dshiftl (up[i], up[i - 1], sh_1); +#endif + } + + wp[0] = up[0] << sh_1; + return retval; +} diff --git a/gmp-6.3.0/mpn/cray/mulww.f b/gmp-6.3.0/mpn/cray/mulww.f new file mode 100644 index 0000000..6885dfc --- /dev/null +++ b/gmp-6.3.0/mpn/cray/mulww.f @@ -0,0 +1,63 @@ +c Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP. + +c Copyright 1996, 2000 Free Software Foundation, Inc. + +c This file is part of the GNU MP Library. +c +c The GNU MP Library is free software; you can redistribute it and/or modify +c it under the terms of either: +c +c * the GNU Lesser General Public License as published by the Free +c Software Foundation; either version 3 of the License, or (at your +c option) any later version. 
+c +c or +c +c * the GNU General Public License as published by the Free Software +c Foundation; either version 2 of the License, or (at your option) any +c later version. +c +c or both in parallel, as here. +c +c The GNU MP Library is distributed in the hope that it will be useful, but +c WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +c or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +c for more details. +c +c You should have received copies of the GNU General Public License and the +c GNU Lesser General Public License along with the GNU MP Library. If not, +c see https://www.gnu.org/licenses/. + +c p1[] = hi(a[]*s); the upper limbs of each product +c p0[] = low(a[]*s); the corresponding lower limbs +c n is number of limbs in the vectors + + subroutine gmpn_mulww(p1,p0,a,n,s) + integer*8 p1(0:*),p0(0:*),a(0:*),s + integer n + + integer*8 a0,a1,a2,s0,s1,s2,c + integer*8 ai,t0,t1,t2,t3,t4 + + s0 = shiftl(and(s,4194303),24) + s1 = shiftl(and(shiftr(s,22),4194303),24) + s2 = shiftl(and(shiftr(s,44),4194303),24) + + do i = 0,n-1 + ai = a(i) + a0 = shiftl(and(ai,4194303),24) + a1 = shiftl(and(shiftr(ai,22),4194303),24) + a2 = shiftl(and(shiftr(ai,44),4194303),24) + + t0 = i24mult(a0,s0) + t1 = i24mult(a0,s1)+i24mult(a1,s0) + t2 = i24mult(a0,s2)+i24mult(a1,s1)+i24mult(a2,s0) + t3 = i24mult(a1,s2)+i24mult(a2,s1) + t4 = i24mult(a2,s2) + + p0(i)=shiftl(t2,44)+shiftl(t1,22)+t0 + c=shiftr(shiftr(t0,22)+and(t1,4398046511103)+ + $ shiftl(and(t2,1048575),22),42) + p1(i)=shiftl(t4,24)+shiftl(t3,2)+shiftr(t2,20)+shiftr(t1,42)+c + end do + end diff --git a/gmp-6.3.0/mpn/cray/popcount.c b/gmp-6.3.0/mpn/cray/popcount.c new file mode 100644 index 0000000..a79211f --- /dev/null +++ b/gmp-6.3.0/mpn/cray/popcount.c @@ -0,0 +1,42 @@ +/* Cray mpn_popcount -- population count. + +Copyright 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include +#include "gmp-impl.h" + +unsigned long int +mpn_popcount (mp_srcptr p, mp_size_t n) +{ + unsigned long int result = 0; + mp_size_t i; + for (i = 0; i < n; i++) + result += _popcnt (p[i]); + return result; +} diff --git a/gmp-6.3.0/mpn/cray/rshift.c b/gmp-6.3.0/mpn/cray/rshift.c new file mode 100644 index 0000000..9c4aa22 --- /dev/null +++ b/gmp-6.3.0/mpn/cray/rshift.c @@ -0,0 +1,58 @@ +/* mpn_rshift -- Shift right low level for Cray vector processors. + +Copyright (C) 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include +#include "gmp-impl.h" + +mp_limb_t +mpn_rshift (mp_ptr wp, mp_srcptr up, mp_size_t n, unsigned int cnt) +{ + unsigned sh_1, sh_2; + mp_size_t i; + mp_limb_t retval; + + sh_1 = cnt; + sh_2 = GMP_LIMB_BITS - sh_1; + retval = up[0] << sh_2; + +#pragma _CRI ivdep + for (i = 0; i < n - 1; i++) + { +#if 1 + wp[i] = (up[i] >> sh_1) | (up[i + 1] << sh_2); +#else + /* This is the recommended way, but at least on SV1 it is slower. */ + wp[i] = _dshiftr (up[i + 1], up[i], sh_1); +#endif + } + + wp[n - 1] = up[n - 1] >> sh_1; + return retval; +} diff --git a/gmp-6.3.0/mpn/cray/sub_n.c b/gmp-6.3.0/mpn/cray/sub_n.c new file mode 100644 index 0000000..f518764 --- /dev/null +++ b/gmp-6.3.0/mpn/cray/sub_n.c @@ -0,0 +1,90 @@ +/* Cray PVP mpn_sub_n -- subtract two limb vectors and store their difference + in a third limb vector. + +Copyright 1996, 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* This code runs at 4 cycles/limb. It may be possible to bring it down + to 3 cycles/limb. */ + +#include "gmp-impl.h" + +mp_limb_t +mpn_sub_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + mp_limb_t cy[n]; + mp_limb_t a, b, r, s0, c0, c1; + mp_size_t i; + int more_carries; + + /* Main subtract loop. Generate a raw output difference in rp[] and a + borrow vector in cy[]. */ +#pragma _CRI ivdep + for (i = 0; i < n; i++) + { + a = up[i]; + b = vp[i]; + s0 = a - b; /* a = s0 + b */ + rp[i] = s0; + c0 = ((s0 & b) | ((s0 | b) & ~a)) >> 63; + cy[i] = c0; + } + /* Borrow subtract loop. Subtract the borrow vector cy[] from the raw + difference rp[] and store the new difference back to rp[0]. If this + generates further borrow, set more_carries. 
*/ + more_carries = 0; +#pragma _CRI ivdep + for (i = 1; i < n; i++) + { + r = rp[i]; + c0 = cy[i - 1]; + s0 = r - c0; /* r = s0 + c0 */ + rp[i] = s0; + c0 = (s0 & ~r) >> 63; + more_carries += c0; + } + /* If that second loop generated borrow, handle that in scalar loop. */ + if (more_carries) + { + mp_limb_t cyrec = 0; + /* Look for places where rp[k] contains just ones and cy[k-1] is + non-zero. These are where we got a recurrency borrow. */ + for (i = 1; i < n; i++) + { + r = rp[i]; + c0 = (~r == 0 && cy[i - 1] != 0); + s0 = r - cyrec; + rp[i] = s0; + c1 = (s0 & ~r) >> 63; + cyrec = c0 | c1; + } + return cyrec | cy[n - 1]; + } + + return cy[n - 1]; +} diff --git a/gmp-6.3.0/mpn/dcpi1_bdiv_q.c b/gmp-6.3.0/mpn/dcpi1_bdiv_q.c new file mode 120000 index 0000000..8bbde35 --- /dev/null +++ b/gmp-6.3.0/mpn/dcpi1_bdiv_q.c @@ -0,0 +1 @@ +../mpn/generic/dcpi1_bdiv_q.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/dcpi1_bdiv_qr.c b/gmp-6.3.0/mpn/dcpi1_bdiv_qr.c new file mode 120000 index 0000000..d8aa9f1 --- /dev/null +++ b/gmp-6.3.0/mpn/dcpi1_bdiv_qr.c @@ -0,0 +1 @@ +../mpn/generic/dcpi1_bdiv_qr.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/dcpi1_div_q.c b/gmp-6.3.0/mpn/dcpi1_div_q.c new file mode 120000 index 0000000..e477a59 --- /dev/null +++ b/gmp-6.3.0/mpn/dcpi1_div_q.c @@ -0,0 +1 @@ +../mpn/generic/dcpi1_div_q.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/dcpi1_div_qr.c b/gmp-6.3.0/mpn/dcpi1_div_qr.c new file mode 120000 index 0000000..6510ada --- /dev/null +++ b/gmp-6.3.0/mpn/dcpi1_div_qr.c @@ -0,0 +1 @@ +../mpn/generic/dcpi1_div_qr.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/dcpi1_divappr_q.c b/gmp-6.3.0/mpn/dcpi1_divappr_q.c new file mode 120000 index 0000000..632a409 --- /dev/null +++ b/gmp-6.3.0/mpn/dcpi1_divappr_q.c @@ -0,0 +1 @@ +../mpn/generic/dcpi1_divappr_q.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/div_q.c b/gmp-6.3.0/mpn/div_q.c new file mode 120000 index 0000000..008b6ee --- /dev/null +++ b/gmp-6.3.0/mpn/div_q.c @@ -0,0 +1 @@ +../mpn/generic/div_q.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/div_qr_1.c b/gmp-6.3.0/mpn/div_qr_1.c new file mode 120000 index 0000000..2694c97 --- /dev/null +++ b/gmp-6.3.0/mpn/div_qr_1.c @@ -0,0 +1 @@ +../mpn/generic/div_qr_1.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/div_qr_1n_pi1.c b/gmp-6.3.0/mpn/div_qr_1n_pi1.c new file mode 120000 index 0000000..6e202fc --- /dev/null +++ b/gmp-6.3.0/mpn/div_qr_1n_pi1.c @@ -0,0 +1 @@ +../mpn/generic/div_qr_1n_pi1.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/div_qr_2.c b/gmp-6.3.0/mpn/div_qr_2.c new file mode 120000 index 0000000..0ae57a0 --- /dev/null +++ b/gmp-6.3.0/mpn/div_qr_2.c @@ -0,0 +1 @@ +../mpn/generic/div_qr_2.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/div_qr_2n_pi1.c b/gmp-6.3.0/mpn/div_qr_2n_pi1.c new file mode 120000 index 0000000..4b8033d --- /dev/null +++ b/gmp-6.3.0/mpn/div_qr_2n_pi1.c @@ -0,0 +1 @@ +../mpn/generic/div_qr_2n_pi1.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/div_qr_2u_pi1.c b/gmp-6.3.0/mpn/div_qr_2u_pi1.c new file mode 120000 index 0000000..ec90217 --- /dev/null +++ b/gmp-6.3.0/mpn/div_qr_2u_pi1.c @@ -0,0 +1 @@ +../mpn/generic/div_qr_2u_pi1.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/dive_1.asm b/gmp-6.3.0/mpn/dive_1.asm new file mode 120000 index 0000000..8dd0782 --- /dev/null +++ b/gmp-6.3.0/mpn/dive_1.asm @@ -0,0 +1 @@ +../mpn/x86/p6/dive_1.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/diveby3.c b/gmp-6.3.0/mpn/diveby3.c new file mode 
120000 index 0000000..16f7d3a --- /dev/null +++ b/gmp-6.3.0/mpn/diveby3.c @@ -0,0 +1 @@ +../mpn/generic/diveby3.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/divexact.c b/gmp-6.3.0/mpn/divexact.c new file mode 120000 index 0000000..dc1c15b --- /dev/null +++ b/gmp-6.3.0/mpn/divexact.c @@ -0,0 +1 @@ +../mpn/generic/divexact.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/divis.c b/gmp-6.3.0/mpn/divis.c new file mode 120000 index 0000000..c76adc0 --- /dev/null +++ b/gmp-6.3.0/mpn/divis.c @@ -0,0 +1 @@ +../mpn/generic/divis.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/divrem.c b/gmp-6.3.0/mpn/divrem.c new file mode 120000 index 0000000..b877ad3 --- /dev/null +++ b/gmp-6.3.0/mpn/divrem.c @@ -0,0 +1 @@ +../mpn/generic/divrem.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/divrem_1.asm b/gmp-6.3.0/mpn/divrem_1.asm new file mode 120000 index 0000000..dce2548 --- /dev/null +++ b/gmp-6.3.0/mpn/divrem_1.asm @@ -0,0 +1 @@ +../mpn/x86/p6/mmx/divrem_1.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/divrem_2.asm b/gmp-6.3.0/mpn/divrem_2.asm new file mode 120000 index 0000000..1811aac --- /dev/null +++ b/gmp-6.3.0/mpn/divrem_2.asm @@ -0,0 +1 @@ +../mpn/x86/divrem_2.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/dump.c b/gmp-6.3.0/mpn/dump.c new file mode 120000 index 0000000..3670bf4 --- /dev/null +++ b/gmp-6.3.0/mpn/dump.c @@ -0,0 +1 @@ +../mpn/generic/dump.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/fib2_ui.c b/gmp-6.3.0/mpn/fib2_ui.c new file mode 120000 index 0000000..67df9e1 --- /dev/null +++ b/gmp-6.3.0/mpn/fib2_ui.c @@ -0,0 +1 @@ +../mpn/generic/fib2_ui.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/fib2m.c b/gmp-6.3.0/mpn/fib2m.c new file mode 120000 index 0000000..726ff78 --- /dev/null +++ b/gmp-6.3.0/mpn/fib2m.c @@ -0,0 +1 @@ +../mpn/generic/fib2m.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/fib_table.c b/gmp-6.3.0/mpn/fib_table.c new file mode 100644 index 0000000..5d72e10 --- /dev/null +++ b/gmp-6.3.0/mpn/fib_table.c @@ -0,0 +1,61 @@ +/* This file generated by gen-fib.c - DO NOT EDIT. 
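The table that follows stores F(-1) = 1, F(0) = 0, and so on up to F(47) = 0xb11924e1, the largest Fibonacci number that fits in a 32-bit limb. A quick standalone check, not GMP code, that regenerates the entries:

#include <inttypes.h>
#include <stdio.h>

int
main (void)
{
  uint64_t a = 1, b = 0;        /* F(-1), F(0) */
  printf ("0x%" PRIx64 "  /* -1 */\n", a);
  for (int i = 0; i <= 47; i++)
    {
      printf ("0x%" PRIx64 "  /* %d */\n", b, i);
      uint64_t t = a + b;       /* F(i+1) = F(i) + F(i-1) */
      a = b;
      b = t;
    }
  return 0;
}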
*/ + +#include "gmp.h" +#include "gmp-impl.h" + +#if GMP_NUMB_BITS != 32 +Error, error, this data is for 32 bits +#endif + +const mp_limb_t +__gmp_fib_table[FIB_TABLE_LIMIT+2] = { + CNST_LIMB (0x1), /* -1 */ + CNST_LIMB (0x0), /* 0 */ + CNST_LIMB (0x1), /* 1 */ + CNST_LIMB (0x1), /* 2 */ + CNST_LIMB (0x2), /* 3 */ + CNST_LIMB (0x3), /* 4 */ + CNST_LIMB (0x5), /* 5 */ + CNST_LIMB (0x8), /* 6 */ + CNST_LIMB (0xd), /* 7 */ + CNST_LIMB (0x15), /* 8 */ + CNST_LIMB (0x22), /* 9 */ + CNST_LIMB (0x37), /* 10 */ + CNST_LIMB (0x59), /* 11 */ + CNST_LIMB (0x90), /* 12 */ + CNST_LIMB (0xe9), /* 13 */ + CNST_LIMB (0x179), /* 14 */ + CNST_LIMB (0x262), /* 15 */ + CNST_LIMB (0x3db), /* 16 */ + CNST_LIMB (0x63d), /* 17 */ + CNST_LIMB (0xa18), /* 18 */ + CNST_LIMB (0x1055), /* 19 */ + CNST_LIMB (0x1a6d), /* 20 */ + CNST_LIMB (0x2ac2), /* 21 */ + CNST_LIMB (0x452f), /* 22 */ + CNST_LIMB (0x6ff1), /* 23 */ + CNST_LIMB (0xb520), /* 24 */ + CNST_LIMB (0x12511), /* 25 */ + CNST_LIMB (0x1da31), /* 26 */ + CNST_LIMB (0x2ff42), /* 27 */ + CNST_LIMB (0x4d973), /* 28 */ + CNST_LIMB (0x7d8b5), /* 29 */ + CNST_LIMB (0xcb228), /* 30 */ + CNST_LIMB (0x148add), /* 31 */ + CNST_LIMB (0x213d05), /* 32 */ + CNST_LIMB (0x35c7e2), /* 33 */ + CNST_LIMB (0x5704e7), /* 34 */ + CNST_LIMB (0x8cccc9), /* 35 */ + CNST_LIMB (0xe3d1b0), /* 36 */ + CNST_LIMB (0x1709e79), /* 37 */ + CNST_LIMB (0x2547029), /* 38 */ + CNST_LIMB (0x3c50ea2), /* 39 */ + CNST_LIMB (0x6197ecb), /* 40 */ + CNST_LIMB (0x9de8d6d), /* 41 */ + CNST_LIMB (0xff80c38), /* 42 */ + CNST_LIMB (0x19d699a5), /* 43 */ + CNST_LIMB (0x29cea5dd), /* 44 */ + CNST_LIMB (0x43a53f82), /* 45 */ + CNST_LIMB (0x6d73e55f), /* 46 */ + CNST_LIMB (0xb11924e1), /* 47 */ +}; diff --git a/gmp-6.3.0/mpn/gcd.c b/gmp-6.3.0/mpn/gcd.c new file mode 120000 index 0000000..5d4771f --- /dev/null +++ b/gmp-6.3.0/mpn/gcd.c @@ -0,0 +1 @@ +../mpn/generic/gcd.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/gcd_1.c b/gmp-6.3.0/mpn/gcd_1.c new file mode 120000 index 0000000..8808854 --- /dev/null +++ b/gmp-6.3.0/mpn/gcd_1.c @@ -0,0 +1 @@ +../mpn/generic/gcd_1.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/gcd_11.asm b/gmp-6.3.0/mpn/gcd_11.asm new file mode 120000 index 0000000..3c7963e --- /dev/null +++ b/gmp-6.3.0/mpn/gcd_11.asm @@ -0,0 +1 @@ +../mpn/x86/p6/gcd_11.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/gcd_22.c b/gmp-6.3.0/mpn/gcd_22.c new file mode 120000 index 0000000..1b1dc02 --- /dev/null +++ b/gmp-6.3.0/mpn/gcd_22.c @@ -0,0 +1 @@ +../mpn/generic/gcd_22.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/gcd_subdiv_step.c b/gmp-6.3.0/mpn/gcd_subdiv_step.c new file mode 120000 index 0000000..13e22f7 --- /dev/null +++ b/gmp-6.3.0/mpn/gcd_subdiv_step.c @@ -0,0 +1 @@ +../mpn/generic/gcd_subdiv_step.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/gcdext.c b/gmp-6.3.0/mpn/gcdext.c new file mode 120000 index 0000000..a6a558c --- /dev/null +++ b/gmp-6.3.0/mpn/gcdext.c @@ -0,0 +1 @@ +../mpn/generic/gcdext.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/gcdext_1.c b/gmp-6.3.0/mpn/gcdext_1.c new file mode 120000 index 0000000..d713bc9 --- /dev/null +++ b/gmp-6.3.0/mpn/gcdext_1.c @@ -0,0 +1 @@ +../mpn/generic/gcdext_1.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/gcdext_lehmer.c b/gmp-6.3.0/mpn/gcdext_lehmer.c new file mode 120000 index 0000000..fcc8530 --- /dev/null +++ b/gmp-6.3.0/mpn/gcdext_lehmer.c @@ -0,0 +1 @@ +../mpn/generic/gcdext_lehmer.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/generic/add.c 
b/gmp-6.3.0/mpn/generic/add.c new file mode 100644 index 0000000..4a6e3ba --- /dev/null +++ b/gmp-6.3.0/mpn/generic/add.c @@ -0,0 +1,33 @@ +/* mpn_add - add mpn to mpn. + +Copyright 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define __GMP_FORCE_mpn_add 1 + +#include "gmp-impl.h" diff --git a/gmp-6.3.0/mpn/generic/add_1.c b/gmp-6.3.0/mpn/generic/add_1.c new file mode 100644 index 0000000..1745aed --- /dev/null +++ b/gmp-6.3.0/mpn/generic/add_1.c @@ -0,0 +1,33 @@ +/* mpn_add_1 - add limb to mpn. + +Copyright 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define __GMP_FORCE_mpn_add_1 1 + +#include "gmp-impl.h" diff --git a/gmp-6.3.0/mpn/generic/add_err1_n.c b/gmp-6.3.0/mpn/generic/add_err1_n.c new file mode 100644 index 0000000..b247f19 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/add_err1_n.c @@ -0,0 +1,100 @@ +/* mpn_add_err1_n -- add_n with one error term + + Contributed by David Harvey. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. 
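add.c and add_1.c just above are two lines each on purpose: gmp.h carries inline definitions of mpn_add and mpn_add_1, and defining __GMP_FORCE_mpn_add (resp. __GMP_FORCE_mpn_add_1) before including gmp-impl.h makes the header emit an ordinary exported function body instead, so the symbol also exists out of line. Either way the documented interface behaves the same; a small usage sketch, assuming the default nail-free build:

#include <assert.h>
#include <gmp.h>

int
main (void)
{
  /* {up,2} holds B^2 - 1, where B is the limb base; adding 1 ripples
     a carry through both limbs and out the top.  */
  mp_limb_t up[2] = { ~(mp_limb_t) 0, ~(mp_limb_t) 0 };
  mp_limb_t vp[1] = { 1 };
  mp_limb_t rp[2];

  /* mpn_add: the first operand must be at least as long as the second;
     the return value is the carry out of the most significant limb.  */
  mp_limb_t cy = mpn_add (rp, up, 2, vp, 1);
  assert (cy == 1 && rp[0] == 0 && rp[1] == 0);
  return 0;
}

Compile and link with -lgmp.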
+ +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* + Computes: + + (1) {rp,n} := {up,n} + {vp,n} (just like mpn_add_n) with incoming carry cy, + return value is carry out. + + (2) Let c[i+1] = carry from i-th limb addition (c[0] = cy). + Computes c[1]*yp[n-1] + ... + c[n]*yp[0], stores two-limb result at ep. + + Requires n >= 1. + + None of the outputs may overlap each other or any of the inputs, except + that {rp,n} may be equal to {up,n} or {vp,n}. +*/ +mp_limb_t +mpn_add_err1_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, + mp_ptr ep, mp_srcptr yp, + mp_size_t n, mp_limb_t cy) +{ + mp_limb_t el, eh, ul, vl, yl, zl, rl, sl, cy1, cy2; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp, n)); + ASSERT (! MPN_OVERLAP_P (ep, 2, up, n)); + ASSERT (! MPN_OVERLAP_P (ep, 2, vp, n)); + ASSERT (! MPN_OVERLAP_P (ep, 2, yp, n)); + ASSERT (! MPN_OVERLAP_P (ep, 2, rp, n)); + + yp += n - 1; + el = eh = 0; + + do + { + yl = *yp--; + ul = *up++; + vl = *vp++; + + /* ordinary add_n */ + ADDC_LIMB (cy1, sl, ul, vl); + ADDC_LIMB (cy2, rl, sl, cy); + cy = cy1 | cy2; + *rp++ = rl; + + /* update (eh:el) */ + zl = (-cy) & yl; + el += zl; + eh += el < zl; + } + while (--n); + +#if GMP_NAIL_BITS != 0 + eh = (eh << GMP_NAIL_BITS) + (el >> GMP_NUMB_BITS); + el &= GMP_NUMB_MASK; +#endif + + ep[0] = el; + ep[1] = eh; + + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/add_err2_n.c b/gmp-6.3.0/mpn/generic/add_err2_n.c new file mode 100644 index 0000000..d584d6d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/add_err2_n.c @@ -0,0 +1,116 @@ +/* mpn_add_err2_n -- add_n with two error terms + + Contributed by David Harvey. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* + Computes: + + (1) {rp,n} := {up,n} + {vp,n} (just like mpn_add_n) with incoming carry cy, + return value is carry out. 
+ + (2) Let c[i+1] = carry from i-th limb addition (c[0] = cy). + Computes c[1]*yp1[n-1] + ... + c[n]*yp1[0], + c[1]*yp2[n-1] + ... + c[n]*yp2[0], + stores two-limb results at {ep,2} and {ep+2,2} respectively. + + Requires n >= 1. + + None of the outputs may overlap each other or any of the inputs, except + that {rp,n} may be equal to {up,n} or {vp,n}. +*/ +mp_limb_t +mpn_add_err2_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, + mp_ptr ep, mp_srcptr yp1, mp_srcptr yp2, + mp_size_t n, mp_limb_t cy) +{ + mp_limb_t el1, eh1, el2, eh2, ul, vl, yl1, yl2, zl1, zl2, rl, sl, cy1, cy2; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp1, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp2, n)); + ASSERT (! MPN_OVERLAP_P (ep, 4, up, n)); + ASSERT (! MPN_OVERLAP_P (ep, 4, vp, n)); + ASSERT (! MPN_OVERLAP_P (ep, 4, yp1, n)); + ASSERT (! MPN_OVERLAP_P (ep, 4, yp2, n)); + ASSERT (! MPN_OVERLAP_P (ep, 4, rp, n)); + + yp1 += n - 1; + yp2 += n - 1; + el1 = eh1 = 0; + el2 = eh2 = 0; + + do + { + yl1 = *yp1--; + yl2 = *yp2--; + ul = *up++; + vl = *vp++; + + /* ordinary add_n */ + ADDC_LIMB (cy1, sl, ul, vl); + ADDC_LIMB (cy2, rl, sl, cy); + cy = cy1 | cy2; + *rp++ = rl; + + /* update (eh1:el1) */ + zl1 = (-cy) & yl1; + el1 += zl1; + eh1 += el1 < zl1; + + /* update (eh2:el2) */ + zl2 = (-cy) & yl2; + el2 += zl2; + eh2 += el2 < zl2; + } + while (--n); + +#if GMP_NAIL_BITS != 0 + eh1 = (eh1 << GMP_NAIL_BITS) + (el1 >> GMP_NUMB_BITS); + el1 &= GMP_NUMB_MASK; + eh2 = (eh2 << GMP_NAIL_BITS) + (el2 >> GMP_NUMB_BITS); + el2 &= GMP_NUMB_MASK; +#endif + + ep[0] = el1; + ep[1] = eh1; + ep[2] = el2; + ep[3] = eh2; + + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/add_err3_n.c b/gmp-6.3.0/mpn/generic/add_err3_n.c new file mode 100644 index 0000000..a6ed4dc --- /dev/null +++ b/gmp-6.3.0/mpn/generic/add_err3_n.c @@ -0,0 +1,131 @@ +/* mpn_add_err3_n -- add_n with three error terms + + Contributed by David Harvey. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* + Computes: + + (1) {rp,n} := {up,n} + {vp,n} (just like mpn_add_n) with incoming carry cy, + return value is carry out. + + (2) Let c[i+1] = carry from i-th limb addition (c[0] = cy). + Computes c[1]*yp1[n-1] + ... + c[n]*yp1[0], + c[1]*yp2[n-1] + ... + c[n]*yp2[0], + c[1]*yp3[n-1] + ... 
+ c[n]*yp3[0], + stores two-limb results at {ep,2}, {ep+2,2} and {ep+4,2} respectively. + + Requires n >= 1. + + None of the outputs may overlap each other or any of the inputs, except + that {rp,n} may be equal to {up,n} or {vp,n}. +*/ +mp_limb_t +mpn_add_err3_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, + mp_ptr ep, mp_srcptr yp1, mp_srcptr yp2, mp_srcptr yp3, + mp_size_t n, mp_limb_t cy) +{ + mp_limb_t el1, eh1, el2, eh2, el3, eh3, ul, vl, yl1, yl2, yl3, zl1, zl2, zl3, rl, sl, cy1, cy2; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp1, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp2, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp3, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, up, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, vp, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, yp1, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, yp2, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, yp3, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, rp, n)); + + yp1 += n - 1; + yp2 += n - 1; + yp3 += n - 1; + el1 = eh1 = 0; + el2 = eh2 = 0; + el3 = eh3 = 0; + + do + { + yl1 = *yp1--; + yl2 = *yp2--; + yl3 = *yp3--; + ul = *up++; + vl = *vp++; + + /* ordinary add_n */ + ADDC_LIMB (cy1, sl, ul, vl); + ADDC_LIMB (cy2, rl, sl, cy); + cy = cy1 | cy2; + *rp++ = rl; + + /* update (eh1:el1) */ + zl1 = (-cy) & yl1; + el1 += zl1; + eh1 += el1 < zl1; + + /* update (eh2:el2) */ + zl2 = (-cy) & yl2; + el2 += zl2; + eh2 += el2 < zl2; + + /* update (eh3:el3) */ + zl3 = (-cy) & yl3; + el3 += zl3; + eh3 += el3 < zl3; + } + while (--n); + +#if GMP_NAIL_BITS != 0 + eh1 = (eh1 << GMP_NAIL_BITS) + (el1 >> GMP_NUMB_BITS); + el1 &= GMP_NUMB_MASK; + eh2 = (eh2 << GMP_NAIL_BITS) + (el2 >> GMP_NUMB_BITS); + el2 &= GMP_NUMB_MASK; + eh3 = (eh3 << GMP_NAIL_BITS) + (el3 >> GMP_NUMB_BITS); + el3 &= GMP_NUMB_MASK; +#endif + + ep[0] = el1; + ep[1] = eh1; + ep[2] = el2; + ep[3] = eh2; + ep[4] = el3; + ep[5] = eh3; + + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/add_n.c b/gmp-6.3.0/mpn/generic/add_n.c new file mode 100644 index 0000000..f62ac87 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/add_n.c @@ -0,0 +1,89 @@ +/* mpn_add_n -- Add equal length limb vectors. + +Copyright 1992-1994, 1996, 2000, 2002, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
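All three add_err variants above implement one identity: the error limbs are the dot product of the addition's carry bits with a reversed y vector, accumulated into a two-limb sum. A word-sized model of the one-term case, plain C rather than GMP code, nails assumed absent:

#include <assert.h>
#include <stdint.h>

/* Model of mpn_add_err1_n on 64-bit words: r = u + v with carry-in cy;
   e[1]:e[0] accumulates y[n-1-i] for each position i that carries.  */
static uint64_t
add_err1 (uint64_t *r, const uint64_t *u, const uint64_t *v,
          uint64_t e[2], const uint64_t *y, int n, uint64_t cy)
{
  uint64_t el = 0, eh = 0;
  for (int i = 0; i < n; i++)
    {
      uint64_t s = u[i] + v[i];
      uint64_t c1 = s < u[i];
      uint64_t rl = s + cy;
      uint64_t c2 = rl < s;
      cy = c1 | c2;
      r[i] = rl;
      uint64_t z = -cy & y[n - 1 - i]; /* y limb iff this step carried */
      el += z;
      eh += el < z;                    /* two-limb accumulate */
    }
  e[0] = el;
  e[1] = eh;
  return cy;
}

int
main (void)
{
  uint64_t u[2] = { ~0ull, 5 }, v[2] = { 1, 0 }, y[2] = { 7, 9 };
  uint64_t r[2], e[2];
  uint64_t cy = add_err1 (r, u, v, e, y, 2, 0);
  /* limb 0 carries, so y[1] = 9 is counted; limb 1 does not carry.  */
  assert (cy == 0 && r[0] == 0 && r[1] == 6 && e[0] == 9 && e[1] == 0);
  return 0;
}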
*/
+
+#include "gmp-impl.h"
+
+
+#if GMP_NAIL_BITS == 0
+
+mp_limb_t
+mpn_add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  mp_limb_t ul, vl, sl, rl, cy, cy1, cy2;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_INCR_P (rp, up, n));
+  ASSERT (MPN_SAME_OR_INCR_P (rp, vp, n));
+
+  cy = 0;
+  do
+    {
+      ul = *up++;
+      vl = *vp++;
+      sl = ul + vl;
+      cy1 = sl < ul;
+      rl = sl + cy;
+      cy2 = rl < sl;
+      cy = cy1 | cy2;
+      *rp++ = rl;
+    }
+  while (--n != 0);
+
+  return cy;
+}
+
+#endif
+
+#if GMP_NAIL_BITS >= 1
+
+mp_limb_t
+mpn_add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  mp_limb_t ul, vl, rl, cy;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_INCR_P (rp, up, n));
+  ASSERT (MPN_SAME_OR_INCR_P (rp, vp, n));
+
+  cy = 0;
+  do
+    {
+      ul = *up++;
+      vl = *vp++;
+      rl = ul + vl + cy;
+      cy = rl >> GMP_NUMB_BITS;
+      *rp++ = rl & GMP_NUMB_MASK;
+    }
+  while (--n != 0);
+
+  return cy;
+}
+
+#endif
diff --git a/gmp-6.3.0/mpn/generic/add_n_sub_n.c b/gmp-6.3.0/mpn/generic/add_n_sub_n.c
new file mode 100644
index 0000000..1e72b5d
--- /dev/null
+++ b/gmp-6.3.0/mpn/generic/add_n_sub_n.c
@@ -0,0 +1,172 @@
+/* mpn_add_n_sub_n -- Add and subtract two limb vectors of equal, non-zero length.
+
+ THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
+ SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
+ GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 1999-2001, 2006 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#include "gmp-impl.h"
+
+#ifndef L1_CACHE_SIZE
+#define L1_CACHE_SIZE 8192 /* only 68040 has less than this */
+#endif
+
+#define PART_SIZE (L1_CACHE_SIZE / GMP_LIMB_BYTES / 6)
+
+
+/* mpn_add_n_sub_n.
+ r1[] = s1[] + s2[]
+ r2[] = s1[] - s2[]
+ All operands have n limbs.
+ In-place operations allowed. */
+mp_limb_t
+mpn_add_n_sub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n)
+{
+  mp_limb_t acyn, acyo; /* carry for add */
+  mp_limb_t scyn, scyo; /* carry for subtract */
+  mp_size_t off;        /* offset in operands */
+  mp_size_t this_n;     /* size of current chunk */
+
+  /* We alternately add and subtract in chunks that fit into the (L1)
+     cache. Since the chunks are several hundred limbs, the function call
+     overhead is insignificant, but we get much better locality. */
+
+  /* We have three variants of the inner loop; the proper one is chosen
+     depending on whether r1 or r2 is the same operand as s1 or s2. */
+
+  if (r1p != s1p && r1p != s2p)
+    {
+      /* r1 is not identical to either input operand. We can therefore write
+         to r1 directly, without using temporary storage.
*/ + acyo = 0; + scyo = 0; + for (off = 0; off < n; off += PART_SIZE) + { + this_n = MIN (n - off, PART_SIZE); +#if HAVE_NATIVE_mpn_add_nc + acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo); +#else + acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n); + acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo); +#endif +#if HAVE_NATIVE_mpn_sub_nc + scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo); +#else + scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n); + scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo); +#endif + } + } + else if (r2p != s1p && r2p != s2p) + { + /* r2 is not identical to either input operand. We can therefore write + to r2 directly, without using temporary storage. */ + acyo = 0; + scyo = 0; + for (off = 0; off < n; off += PART_SIZE) + { + this_n = MIN (n - off, PART_SIZE); +#if HAVE_NATIVE_mpn_sub_nc + scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo); +#else + scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n); + scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo); +#endif +#if HAVE_NATIVE_mpn_add_nc + acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo); +#else + acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n); + acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo); +#endif + } + } + else + { + /* r1 and r2 are identical to s1 and s2 (r1==s1 and r2==s2 or vice versa) + Need temporary storage. */ + mp_limb_t tp[PART_SIZE]; + acyo = 0; + scyo = 0; + for (off = 0; off < n; off += PART_SIZE) + { + this_n = MIN (n - off, PART_SIZE); +#if HAVE_NATIVE_mpn_add_nc + acyo = mpn_add_nc (tp, s1p + off, s2p + off, this_n, acyo); +#else + acyn = mpn_add_n (tp, s1p + off, s2p + off, this_n); + acyo = acyn + mpn_add_1 (tp, tp, this_n, acyo); +#endif +#if HAVE_NATIVE_mpn_sub_nc + scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo); +#else + scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n); + scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo); +#endif + MPN_COPY (r1p + off, tp, this_n); + } + } + + return 2 * acyo + scyo; +} + +#ifdef MAIN +#include +#include +#include "timing.h" + +long cputime (); + +int +main (int argc, char **argv) +{ + mp_ptr r1p, r2p, s1p, s2p; + double t; + mp_size_t n; + + n = strtol (argv[1], 0, 0); + + r1p = malloc (n * GMP_LIMB_BYTES); + r2p = malloc (n * GMP_LIMB_BYTES); + s1p = malloc (n * GMP_LIMB_BYTES); + s2p = malloc (n * GMP_LIMB_BYTES); + TIME (t,(mpn_add_n(r1p,s1p,s2p,n),mpn_sub_n(r1p,s1p,s2p,n))); + printf (" separate add and sub: %.3f\n", t); + TIME (t,mpn_add_n_sub_n(r1p,r2p,s1p,s2p,n)); + printf ("combined addsub separate variables: %.3f\n", t); + TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,s2p,n)); + printf (" combined addsub r1 overlap: %.3f\n", t); + TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,s2p,n)); + printf (" combined addsub r2 overlap: %.3f\n", t); + TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,r2p,n)); + printf (" combined addsub in-place: %.3f\n", t); + + return 0; +} +#endif diff --git a/gmp-6.3.0/mpn/generic/addmul_1.c b/gmp-6.3.0/mpn/generic/addmul_1.c new file mode 100644 index 0000000..6140e8e --- /dev/null +++ b/gmp-6.3.0/mpn/generic/addmul_1.c @@ -0,0 +1,145 @@ +/* mpn_addmul_1 -- multiply the N long limb vector pointed to by UP by VL, + add the N least significant limbs of the product to the limb vector + pointed to by RP. Return the most significant limb of the product, + adjusted for carry-out from the addition. 
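mpn_add_n_sub_n packs both outgoing carries into its return value, 2 * add_carry + sub_borrow, as the final return statement above shows. The function itself is internal, but its contract is easy to restate with the public mpn_add_n and mpn_sub_n, as in this sketch (default nail-free build assumed):

#include <assert.h>
#include <gmp.h>

int
main (void)
{
  mp_limb_t s1[2] = { 7, 1 };   /* the number B + 7, B the limb base */
  mp_limb_t s2[2] = { 9, 0 };   /* the number 9 */
  mp_limb_t sum[2], dif[2];

  mp_limb_t acy = mpn_add_n (sum, s1, s2, 2);   /* carry out: 0 or 1 */
  mp_limb_t scy = mpn_sub_n (dif, s1, s2, 2);   /* borrow out: 0 or 1 */
  mp_limb_t ret = 2 * acy + scy;                /* the packed result */

  assert (sum[0] == 16 && sum[1] == 1 && acy == 0);
  assert (dif[0] == (mp_limb_t) -2 && dif[1] == 0 && scy == 0);
  assert (ret == 0);
  return 0;
}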
+ +Copyright 1992-1994, 1996, 2000, 2002, 2004, 2016 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +#if GMP_NAIL_BITS == 0 + +mp_limb_t +mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0) +{ + mp_limb_t u0, crec, c, p1, p0, r0; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + + crec = 0; + do + { + u0 = *up++; + umul_ppmm (p1, p0, u0, v0); + + r0 = *rp; + + p0 = r0 + p0; + c = r0 > p0; + + p1 = p1 + c; + + r0 = p0 + crec; /* cycle 0, 3, ... */ + c = p0 > r0; /* cycle 1, 4, ... */ + + crec = p1 + c; /* cycle 2, 5, ... */ + + *rp++ = r0; + } + while (--n != 0); + + return crec; +} + +#endif + +#if GMP_NAIL_BITS == 1 + +mp_limb_t +mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0) +{ + mp_limb_t shifted_v0, u0, r0, p0, p1, prev_p1, crec, xl, c1, c2, c3; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT_MPN (rp, n); + ASSERT_MPN (up, n); + ASSERT_LIMB (v0); + + shifted_v0 = v0 << GMP_NAIL_BITS; + crec = 0; + prev_p1 = 0; + do + { + u0 = *up++; + r0 = *rp; + umul_ppmm (p1, p0, u0, shifted_v0); + p0 >>= GMP_NAIL_BITS; + ADDC_LIMB (c1, xl, prev_p1, p0); + ADDC_LIMB (c2, xl, xl, r0); + ADDC_LIMB (c3, xl, xl, crec); + crec = c1 + c2 + c3; + *rp++ = xl; + prev_p1 = p1; + } + while (--n != 0); + + return prev_p1 + crec; +} + +#endif + +#if GMP_NAIL_BITS >= 2 + +mp_limb_t +mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0) +{ + mp_limb_t shifted_v0, u0, r0, p0, p1, prev_p1, xw, crec, xl; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT_MPN (rp, n); + ASSERT_MPN (up, n); + ASSERT_LIMB (v0); + + shifted_v0 = v0 << GMP_NAIL_BITS; + crec = 0; + prev_p1 = 0; + do + { + u0 = *up++; + r0 = *rp; + umul_ppmm (p1, p0, u0, shifted_v0); + p0 >>= GMP_NAIL_BITS; + xw = prev_p1 + p0 + r0 + crec; + crec = xw >> GMP_NUMB_BITS; + xl = xw & GMP_NUMB_MASK; + *rp++ = xl; + prev_p1 = p1; + } + while (--n != 0); + + return prev_p1 + crec; +} + +#endif diff --git a/gmp-6.3.0/mpn/generic/bdiv_dbm1c.c b/gmp-6.3.0/mpn/generic/bdiv_dbm1c.c new file mode 100644 index 0000000..543bb6e --- /dev/null +++ b/gmp-6.3.0/mpn/generic/bdiv_dbm1c.c @@ -0,0 +1,58 @@ +/* mpn_bdiv_dbm1c -- divide an mpn number by a divisor of B-1, where B is the + limb base. The dbm1c moniker means "Divisor of B Minus 1 with Carry". + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 
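Each variant above is the same multiply-accumulate recurrence, differing only in how the double-limb product is split. Where the compiler provides unsigned __int128, the nail-free case collapses to a few lines; a sketch, not the GMP implementation (which sticks to umul_ppmm for portability):

#include <assert.h>
#include <stdint.h>

/* Model of mpn_addmul_1 for 64-bit words: rp[] += up[] * v, returning
   the limb that falls out of the top.  */
static uint64_t
addmul_1 (uint64_t *rp, const uint64_t *up, int n, uint64_t v)
{
  uint64_t cy = 0;
  for (int i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * v + rp[i] + cy;
      rp[i] = (uint64_t) t;
      cy = (uint64_t) (t >> 64);
    }
  return cy;
}

int
main (void)
{
  uint64_t r[2] = { 5, 0 };
  uint64_t u[2] = { ~0ull, 0 };          /* the number B - 1 */
  uint64_t hi = addmul_1 (r, u, 2, 2);   /* r += (B - 1) * 2 */
  assert (hi == 0 && r[0] == 3 && r[1] == 2);  /* 5 + 2B - 2 = 2B + 3 */
  return 0;
}

The 128-bit accumulation cannot overflow: (B-1)*(B-1) + (B-1) + (B-1) = B^2 - 1.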
+ +Copyright 2008, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +mp_limb_t +mpn_bdiv_dbm1c (mp_ptr qp, mp_srcptr ap, mp_size_t n, mp_limb_t bd, mp_limb_t h) +{ + mp_limb_t a, p0, p1, cy; + mp_size_t i; + + for (i = 0; i < n; i++) + { + a = ap[i]; + umul_ppmm (p1, p0, a, bd << GMP_NAIL_BITS); + p0 >>= GMP_NAIL_BITS; + cy = h < p0; + h = (h - p0) & GMP_NUMB_MASK; + qp[i] = h; + h = h - p1 - cy; + } + + return h; +} diff --git a/gmp-6.3.0/mpn/generic/bdiv_q.c b/gmp-6.3.0/mpn/generic/bdiv_q.c new file mode 100644 index 0000000..52aa473 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/bdiv_q.c @@ -0,0 +1,76 @@ +/* mpn_bdiv_q -- Hensel division with precomputed inverse, returning quotient. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2007, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +/* Computes Q = N / D mod B^n. 
*/ + +void +mpn_bdiv_q (mp_ptr qp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_ptr tp) +{ + mp_limb_t di; + + if (BELOW_THRESHOLD (dn, DC_BDIV_Q_THRESHOLD)) + { + MPN_COPY (tp, np, nn); + binvert_limb (di, dp[0]); di = -di; + mpn_sbpi1_bdiv_q (qp, tp, nn, dp, dn, di); + } + else if (BELOW_THRESHOLD (dn, MU_BDIV_Q_THRESHOLD)) + { + MPN_COPY (tp, np, nn); + binvert_limb (di, dp[0]); di = -di; + mpn_dcpi1_bdiv_q (qp, tp, nn, dp, dn, di); + } + else + { + mpn_mu_bdiv_q (qp, np, nn, dp, dn, tp); + } + return; +} + +mp_size_t +mpn_bdiv_q_itch (mp_size_t nn, mp_size_t dn) +{ + if (BELOW_THRESHOLD (dn, MU_BDIV_Q_THRESHOLD)) + return nn; + else + return mpn_mu_bdiv_q_itch (nn, dn); +} diff --git a/gmp-6.3.0/mpn/generic/bdiv_q_1.c b/gmp-6.3.0/mpn/generic/bdiv_q_1.c new file mode 100644 index 0000000..6beb9a0 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/bdiv_q_1.c @@ -0,0 +1,121 @@ +/* mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel division by 1-limb + divisor, returning quotient only. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2000-2003, 2005, 2009, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
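The reason Hensel division needs no trial division: Q = N * D^{-1} mod B^n, and when D divides N exactly this agrees with the ordinary quotient, so an inverse of D's (odd) low limb is all the machinery required. At single-limb scale, with the classic Newton inverse in the style of binvert_limb, a standalone sketch:

#include <assert.h>
#include <stdint.h>

/* Inverse of odd d modulo 2^64; the (3*d)^2 seed is correct to 5 bits
   and each Newton step doubles that.  */
static uint64_t
binv64 (uint64_t d)
{
  uint64_t x = (3 * d) ^ 2;   /*  5 bits */
  x *= 2 - d * x;             /* 10 bits */
  x *= 2 - d * x;             /* 20 bits */
  x *= 2 - d * x;             /* 40 bits */
  x *= 2 - d * x;             /* 80 >= 64 bits */
  return x;
}

int
main (void)
{
  uint64_t d = 0x1234567890abcdefull;   /* any odd divisor */
  uint64_t q = 998244353;
  uint64_t n = d * q;                   /* exact product, mod 2^64 */
  assert (n * binv64 (d) == q);         /* Hensel quotient recovers q */
  return 0;
}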
*/ + +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_pi1_bdiv_q_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t d, + mp_limb_t di, int shift) +{ + mp_size_t i; + mp_limb_t c, h, l, u, u_next, dummy; + + ASSERT (n >= 1); + ASSERT (d != 0); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT_MPN (up, n); + ASSERT_LIMB (d); + + d <<= GMP_NAIL_BITS; + + if (shift != 0) + { + c = 0; + + u = up[0]; + rp--; + for (i = 1; i < n; i++) + { + u_next = up[i]; + u = ((u >> shift) | (u_next << (GMP_NUMB_BITS-shift))) & GMP_NUMB_MASK; + + SUBC_LIMB (c, l, u, c); + + l = (l * di) & GMP_NUMB_MASK; + rp[i] = l; + + umul_ppmm (h, dummy, l, d); + c += h; + u = u_next; + } + + u = u >> shift; + SUBC_LIMB (c, l, u, c); + + l = (l * di) & GMP_NUMB_MASK; + rp[n] = l; + } + else + { + u = up[0]; + l = (u * di) & GMP_NUMB_MASK; + rp[0] = l; + c = 0; + + for (i = 1; i < n; i++) + { + umul_ppmm (h, dummy, l, d); + c += h; + + u = up[i]; + SUBC_LIMB (c, l, u, c); + + l = (l * di) & GMP_NUMB_MASK; + rp[i] = l; + } + } + + return c; +} + +mp_limb_t +mpn_bdiv_q_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t d) +{ + mp_limb_t di; + int shift; + + ASSERT (n >= 1); + ASSERT (d != 0); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT_MPN (up, n); + ASSERT_LIMB (d); + + count_trailing_zeros (shift, d); + d >>= shift; + + binvert_limb (di, d); + return mpn_pi1_bdiv_q_1 (rp, up, n, d, di, shift); +} diff --git a/gmp-6.3.0/mpn/generic/bdiv_qr.c b/gmp-6.3.0/mpn/generic/bdiv_qr.c new file mode 100644 index 0000000..a4f0f39 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/bdiv_qr.c @@ -0,0 +1,84 @@ +/* mpn_bdiv_qr -- Hensel division with precomputed inverse, returning quotient + and remainder. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2007, 2009, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +/* Computes Q = N / D mod B^n, + R = N - QD. 
*/ + +mp_limb_t +mpn_bdiv_qr (mp_ptr qp, mp_ptr rp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_ptr tp) +{ + mp_limb_t di; + mp_limb_t rh; + + ASSERT (nn > dn); + if (BELOW_THRESHOLD (dn, DC_BDIV_QR_THRESHOLD) || + BELOW_THRESHOLD (nn - dn, DC_BDIV_QR_THRESHOLD)) + { + MPN_COPY (tp, np, nn); + binvert_limb (di, dp[0]); di = -di; + rh = mpn_sbpi1_bdiv_qr (qp, tp, nn, dp, dn, di); + MPN_COPY (rp, tp + nn - dn, dn); + } + else if (BELOW_THRESHOLD (dn, MU_BDIV_QR_THRESHOLD)) + { + MPN_COPY (tp, np, nn); + binvert_limb (di, dp[0]); di = -di; + rh = mpn_dcpi1_bdiv_qr (qp, tp, nn, dp, dn, di); + MPN_COPY (rp, tp + nn - dn, dn); + } + else + { + rh = mpn_mu_bdiv_qr (qp, rp, np, nn, dp, dn, tp); + } + + return rh; +} + +mp_size_t +mpn_bdiv_qr_itch (mp_size_t nn, mp_size_t dn) +{ + if (BELOW_THRESHOLD (dn, MU_BDIV_QR_THRESHOLD)) + return nn; + else + return mpn_mu_bdiv_qr_itch (nn, dn); +} diff --git a/gmp-6.3.0/mpn/generic/binvert.c b/gmp-6.3.0/mpn/generic/binvert.c new file mode 100644 index 0000000..a170e66 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/binvert.c @@ -0,0 +1,106 @@ +/* Compute {up,n}^(-1) mod B^n. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright (C) 2004-2007, 2009, 2012, 2017, 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +/* + r[k+1] = r[k] - r[k] * (u*r[k] - 1) + r[k+1] = r[k] + r[k] - r[k]*(u*r[k]) +*/ + +#if TUNE_PROGRAM_BUILD +#define NPOWS \ + ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t))) +#else +#define NPOWS \ + ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)) - LOG2C (BINV_NEWTON_THRESHOLD)) +#endif + +mp_size_t +mpn_binvert_itch (mp_size_t n) +{ + mp_size_t itch_local = mpn_mulmod_bnm1_next_size (n); + mp_size_t itch_out = mpn_mulmod_bnm1_itch (itch_local, n, (n + 1) >> 1); + return itch_local + itch_out; +} + +void +mpn_binvert (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr scratch) +{ + mp_ptr xp; + mp_size_t rn, newrn; + mp_size_t sizes[NPOWS], *sizp; + mp_limb_t di; + + /* Compute the computation precisions from highest to lowest, leaving the + base case size in 'rn'. */ + sizp = sizes; + for (rn = n; ABOVE_THRESHOLD (rn, BINV_NEWTON_THRESHOLD); rn = (rn + 1) >> 1) + *sizp++ = rn; + + xp = scratch; + + /* Compute a base value of rn limbs. 
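The doubling claimed for the iteration r[k+1] = r[k] - r[k] * (u*r[k] - 1) is easy to watch at single-limb scale; this standalone check, not GMP code, is the scalar shadow of what the mpn loop below does with mulmod_bnm1 wraparound:

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint64_t u = 0x1234567890abcdefull;  /* odd, as required */
  uint64_t r = 1;                      /* u * 1 = 1 (mod 2): 1 good bit */

  for (int bits = 1; bits < 64; bits *= 2)
    {
      uint64_t mask = ((uint64_t) 1 << bits) - 1;
      assert ((u * r & mask) == 1);    /* low `bits' bits are correct */
      r *= 2 - u * r;                  /* Newton step: now 2*bits */
    }
  assert (u * r == 1);                 /* full 64-bit inverse */
  return 0;
}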
*/
+  MPN_ZERO (xp, rn);
+  xp[0] = 1;
+  binvert_limb (di, up[0]);
+  if (BELOW_THRESHOLD (rn, DC_BDIV_Q_THRESHOLD))
+    mpn_sbpi1_bdiv_q (rp, xp, rn, up, rn, -di);
+  else
+    mpn_dcpi1_bdiv_q (rp, xp, rn, up, rn, -di);
+
+  mpn_neg (rp, rp, rn);
+
+  /* Use Newton iterations to get the desired precision. */
+  for (; rn < n; rn = newrn)
+    {
+      mp_size_t m;
+      newrn = *--sizp;
+
+      /* X <- UR. */
+      m = mpn_mulmod_bnm1_next_size (newrn);
+      mpn_mulmod_bnm1 (xp, m, up, newrn, rp, rn, xp + m);
+      /* Only the values in the range xp + rn .. xp + newrn - 1 are
+         used by the _mullo_n below.
+         Since m >= newrn, we do not need the following. */
+      /* mpn_sub_1 (xp + m, xp, rn - (m - newrn), 1); */
+
+      /* R = R(X/B^rn) */
+      mpn_mullo_n (rp + rn, rp, xp + rn, newrn - rn);
+      mpn_neg (rp + rn, rp + rn, newrn - rn);
+    }
+}
diff --git a/gmp-6.3.0/mpn/generic/broot.c b/gmp-6.3.0/mpn/generic/broot.c
new file mode 100644
index 0000000..02fe75a
--- /dev/null
+++ b/gmp-6.3.0/mpn/generic/broot.c
@@ -0,0 +1,195 @@
+/* mpn_broot -- Compute Hensel k-th root
+
+ Contributed to the GNU project by Niels Möller
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
+ SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
+ GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#include "gmp-impl.h"
+
+/* Computes a^e (mod B). Uses right-to-left binary algorithm, since
+   typical use will have e small. */
+static mp_limb_t
+powlimb (mp_limb_t a, mp_limb_t e)
+{
+  mp_limb_t r = 1;
+  mp_limb_t s = a;
+
+  for (; e > 0; e >>= 1, s *= s)
+    if (e & 1)
+      r *= s;
+
+  return r;
+}
+
+/* Computes a^{1/k - 1} (mod B^n). Both a and k must be odd.
+
+   Iterates
+
+     r' <-- r - r * (a^{k-1} r^k - 1) / k
+
+   If
+
+     a^{k-1} r^k = 1 (mod 2^m),
+
+   then
+
+     a^{k-1} r'^k = 1 (mod 2^{2m}).
+
+   Compute the update term as
+
+     r' = r - (a^{k-1} r^{k+1} - r) / k
+
+   where we still have cancellation of low limbs.
+
+ */
+void
+mpn_broot_invm1 (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t k)
+{
+  mp_size_t sizes[GMP_LIMB_BITS * 2];
+  mp_ptr akm1, tp, rnp, ep;
+  mp_limb_t a0, r0, km1, kp1h, kinv;
+  mp_size_t rn;
+  unsigned i;
+
+  TMP_DECL;
+
+  ASSERT (n > 0);
+  ASSERT (ap[0] & 1);
+  ASSERT (k & 1);
+  ASSERT (k >= 3);
+
+  TMP_MARK;
+
+  akm1 = TMP_ALLOC_LIMBS (4*n);
+  tp = akm1 + n;
+
+  km1 = k-1;
+  /* FIXME: Could arrange the iteration so we don't need to compute
+     this up front, computing a^{k-1} * r^k as (a r)^{k-1} * r.
Note + that we can use wraparound also for a*r, since the low half is + unchanged from the previous iteration. Or possibly mulmid. Also, + a r = a^{1/k}, so we get that value too, for free? */ + mpn_powlo (akm1, ap, &km1, 1, n, tp); /* 3 n scratch space */ + + a0 = ap[0]; + binvert_limb (kinv, k); + + /* 4 bits: a^{1/k - 1} (mod 16): + + a % 8 + 1 3 5 7 + k%4 +------- + 1 |1 1 1 1 + 3 |1 9 9 1 + */ + r0 = 1 + (((k << 2) & ((a0 << 1) ^ (a0 << 2))) & 8); + r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7f)); /* 8 bits */ + r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7fff)); /* 16 bits */ + r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k)); /* 32 bits */ +#if GMP_NUMB_BITS > 32 + { + unsigned prec = 32; + do + { + r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k)); + prec *= 2; + } + while (prec < GMP_NUMB_BITS); + } +#endif + + rp[0] = r0; + if (n == 1) + { + TMP_FREE; + return; + } + + /* For odd k, (k+1)/2 = k/2+1, and the latter avoids overflow. */ + kp1h = k/2 + 1; + + /* FIXME: Special case for two limb iteration. */ + rnp = TMP_ALLOC_LIMBS (2*n + 1); + ep = rnp + n; + + /* FIXME: Possible to this on the fly with some bit fiddling. */ + for (i = 0; n > 1; n = (n + 1)/2) + sizes[i++] = n; + + rn = 1; + + while (i-- > 0) + { + /* Compute x^{k+1}. */ + mpn_sqr (ep, rp, rn); /* For odd n, writes n+1 limbs in the + final iteration. */ + mpn_powlo (rnp, ep, &kp1h, 1, sizes[i], tp); + + /* Multiply by a^{k-1}. Can use wraparound; low part equals r. */ + + mpn_mullo_n (ep, rnp, akm1, sizes[i]); + ASSERT (mpn_cmp (ep, rp, rn) == 0); + + ASSERT (sizes[i] <= 2*rn); + mpn_pi1_bdiv_q_1 (rp + rn, ep + rn, sizes[i] - rn, k, kinv, 0); + mpn_neg (rp + rn, rp + rn, sizes[i] - rn); + rn = sizes[i]; + } + TMP_FREE; +} + +/* Computes a^{1/k} (mod B^n). Both a and k must be odd. */ +void +mpn_broot (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t k) +{ + mp_ptr tp; + TMP_DECL; + + ASSERT (n > 0); + ASSERT (ap[0] & 1); + ASSERT (k & 1); + + if (k == 1) + { + MPN_COPY (rp, ap, n); + return; + } + + TMP_MARK; + tp = TMP_ALLOC_LIMBS (n); + + mpn_broot_invm1 (tp, ap, n, k); + mpn_mullo_n (rp, tp, ap, n); + + TMP_FREE; +} diff --git a/gmp-6.3.0/mpn/generic/brootinv.c b/gmp-6.3.0/mpn/generic/brootinv.c new file mode 100644 index 0000000..e91b597 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/brootinv.c @@ -0,0 +1,159 @@ +/* mpn_brootinv, compute r such that r^k * y = 1 (mod 2^b). + + Contributed to the GNU project by Martin Boij (as part of perfpow.c). + +Copyright 2009, 2010, 2012, 2013, 2018 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
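At single-limb scale the whole of mpn_broot_invm1 collapses to the seed-and-iterate pattern above: with kinv = k^{-1} mod 2^64, each pass of r <- kinv * r * (k+1 - a^{k-1} * r^k) doubles the correct low bits of r = a^{1/k - 1}, and a final multiply by a gives a^{1/k}. A standalone sketch for k = 3, not GMP code:

#include <assert.h>
#include <stdint.h>

/* a^e mod 2^64, right-to-left binary, as in powlimb above.  */
static uint64_t
powlimb (uint64_t a, uint64_t e)
{
  uint64_t r = 1;
  for (; e > 0; e >>= 1, a *= a)
    if (e & 1)
      r *= a;
  return r;
}

int
main (void)
{
  uint64_t a = 0x123456789abcdef1ull;          /* odd */
  uint64_t k = 3;
  uint64_t kinv = 0xaaaaaaaaaaaaaaabull;       /* 3^-1 mod 2^64 */
  uint64_t akm1 = powlimb (a, k - 1);          /* a^2 = 1 (mod 8) */
  uint64_t r = 1;                              /* 3 good bits to start */

  for (int i = 0; i < 6; i++)                  /* 3 * 2^6 > 64 bits */
    r = kinv * r * (k + 1 - akm1 * powlimb (r, k));

  /* r = a^{1/3 - 1}, so r * a is the cube root of a (mod 2^64).  */
  assert (powlimb (r * a, k) == a);
  return 0;
}

Cubing is a bijection on the odd residues mod 2^64, so the root exists for every odd a; for general odd k the same seed works because a^{k-1} is an even power of an odd number and hence 1 mod 8.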
*/ + +#include "gmp-impl.h" + +/* Computes a^2e (mod B). Uses right-to-left binary algorithm, since + typical use will have e small. */ +static mp_limb_t +powsquaredlimb (mp_limb_t a, mp_limb_t e) +{ + mp_limb_t r; + + r = 1; + /* if (LIKELY (e != 0)) */ + do { + a *= a; + if (e & 1) + r *= a; + e >>= 1; + } while (e != 0); + + return r; +} + +/* Compute r such that r^k * y = 1 (mod B^n). + + Iterates + r' <-- k^{-1} ((k+1) r - r^{k+1} y) (mod 2^b) + using Hensel lifting, each time doubling the number of known bits in r. + + Works just for odd k. Else the Hensel lifting degenerates. + + FIXME: + + (1) Make it work for k == GMP_LIMB_MAX (k+1 below overflows). + + (2) Rewrite iteration as + r' <-- r - k^{-1} r (r^k y - 1) + and take advantage of the zero low part of r^k y - 1. + + (3) Use wrap-around trick. + + (4) Use a small table to get starting value. + + Scratch need: bn + (((bn + 1) >> 1) + 1) + scratch for mpn_powlo + Currently mpn_powlo requires 3*bn + so that 5*bn is surely enough, where bn = ceil (bnb / GMP_NUMB_BITS). +*/ + +void +mpn_brootinv (mp_ptr rp, mp_srcptr yp, mp_size_t bn, mp_limb_t k, mp_ptr tp) +{ + mp_ptr tp2, tp3; + mp_limb_t kinv, k2, r0, y0; + mp_size_t order[GMP_LIMB_BITS + 1]; + int d; + + ASSERT (bn > 0); + ASSERT ((k & 1) != 0); + + tp2 = tp + bn; + tp3 = tp + bn + ((bn + 3) >> 1); + k2 = (k >> 1) + 1; /* (k + 1) / 2 , but avoid k+1 overflow */ + + binvert_limb (kinv, k); + + /* 4-bit initial approximation: + + y%16 | 1 3 5 7 9 11 13 15, + k%4 +-------------------------+k2%2 + 1 | 1 11 13 7 9 3 5 15 | 1 + 3 | 1 3 5 7 9 11 13 15 | 0 + + */ + y0 = yp[0]; + + r0 = y0 ^ (((y0 << 1) ^ (y0 << 2)) & (k2 << 3) & 8); /* 4 bits */ + r0 = kinv * (k2 * r0 * 2 - y0 * powsquaredlimb(r0, k2 & 0x3f)); /* 8 bits */ + r0 = kinv * (k2 * r0 * 2 - y0 * powsquaredlimb(r0, k2 & 0x3fff)); /* 16 bits */ +#if GMP_NUMB_BITS > 16 + { + unsigned prec = 16; + do + { + r0 = kinv * (k2 * r0 * 2 - y0 * powsquaredlimb(r0, k2)); + prec *= 2; + } + while (prec < GMP_NUMB_BITS); + } +#endif + + rp[0] = r0; + if (bn == 1) + return; + + d = 0; + for (; bn != 2; bn = (bn + 1) >> 1) + order[d++] = bn; + + order[d] = 2; + bn = 1; + + do + { + mpn_sqr (tp, rp, bn); /* Result may overlap tp2 */ + tp2[bn] = mpn_mul_1 (tp2, rp, bn, k2 << 1); + + bn = order[d]; + + mpn_powlo (rp, tp, &k2, 1, bn, tp3); + mpn_mullo_n (tp, yp, rp, bn); + + /* mpn_sub (tp, tp2, ((bn + 1) >> 1) + 1, tp, bn); */ + /* The function above is not handled, ((bn + 1) >> 1) + 1 <= bn*/ + { + mp_size_t pbn = (bn + 3) >> 1; /* Size of tp2 */ + int borrow; + borrow = mpn_sub_n (tp, tp2, tp, pbn) != 0; + if (bn > pbn) /* 3 < bn */ + { + if (borrow) + mpn_com (tp + pbn, tp + pbn, bn - pbn); + else + mpn_neg (tp + pbn, tp + pbn, bn - pbn); + } + } + mpn_pi1_bdiv_q_1 (rp, tp, bn, k, kinv, 0); + } + while (--d >= 0); +} diff --git a/gmp-6.3.0/mpn/generic/bsqrt.c b/gmp-6.3.0/mpn/generic/bsqrt.c new file mode 100644 index 0000000..27184f0 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/bsqrt.c @@ -0,0 +1,47 @@ +/* mpn_bsqrt, a^{1/2} (mod 2^n). + +Copyright 2009, 2010, 2012, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +void +mpn_bsqrt (mp_ptr rp, mp_srcptr ap, mp_bitcnt_t nb, mp_ptr tp) +{ + mp_ptr sp; + mp_size_t n; + + ASSERT (nb > 0); + + n = nb / GMP_NUMB_BITS; + sp = tp + n; + + mpn_bsqrtinv (tp, ap, nb, sp); + mpn_mullo_n (rp, tp, ap, n); +} diff --git a/gmp-6.3.0/mpn/generic/bsqrtinv.c b/gmp-6.3.0/mpn/generic/bsqrtinv.c new file mode 100644 index 0000000..c286773 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/bsqrtinv.c @@ -0,0 +1,103 @@ +/* mpn_bsqrtinv, compute r such that r^2 * y = 1 (mod 2^{b+1}). + + Contributed to the GNU project by Martin Boij (as part of perfpow.c). + +Copyright 2009, 2010, 2012, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* Compute r such that r^2 * y = 1 (mod 2^{b+1}). + Return non-zero if such an integer r exists. + + Iterates + r' <-- (3r - r^3 y) / 2 + using Hensel lifting. Since we divide by two, the Hensel lifting is + somewhat degenerate. Therefore, each step lifts the precision only from + 2^b to 2^{2b-1}. + + FIXME: + (1) Simplify to do precision book-keeping in limbs rather than bits. + + (2) Rewrite iteration as + r' <-- r - r (r^2 y - 1) / 2 + and take advantage of zero low part of r^2 y - 1. + + (3) Use wrap-around trick. + + (4) Use a small table to get starting value.
+*/ +int +mpn_bsqrtinv (mp_ptr rp, mp_srcptr yp, mp_bitcnt_t bnb, mp_ptr tp) +{ + mp_ptr tp2; + mp_size_t bn, order[GMP_LIMB_BITS + 1]; + int i, d; + + ASSERT (bnb > 0); + + bn = 1 + bnb / GMP_LIMB_BITS; + + tp2 = tp + bn; + + rp[0] = 1; + if (bnb == 1) + { + if ((yp[0] & 3) != 1) + return 0; + } + else + { + if ((yp[0] & 7) != 1) + return 0; + + d = 0; + for (; bnb != 2; bnb = (bnb + 2) >> 1) + order[d++] = bnb; + + for (i = d - 1; i >= 0; i--) + { + bnb = order[i]; + bn = 1 + bnb / GMP_LIMB_BITS; + + mpn_sqrlo (tp, rp, bn); + mpn_mullo_n (tp2, rp, tp, bn); /* tp2 <- rp ^ 3 */ + + mpn_mul_1 (tp, rp, bn, 3); + + mpn_mullo_n (rp, yp, tp2, bn); + +#if HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (rp, tp, rp, bn); +#else + mpn_sub_n (tp2, tp, rp, bn); + mpn_rshift (rp, tp2, bn, 1); +#endif + } + } + return 1; +} diff --git a/gmp-6.3.0/mpn/generic/cmp.c b/gmp-6.3.0/mpn/generic/cmp.c new file mode 100644 index 0000000..940314b --- /dev/null +++ b/gmp-6.3.0/mpn/generic/cmp.c @@ -0,0 +1,33 @@ +/* mpn_cmp -- Compare two low-level natural-number integers. + +Copyright 1991, 1993, 1994, 1996, 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define __GMP_FORCE_mpn_cmp 1 + +#include "gmp-impl.h" diff --git a/gmp-6.3.0/mpn/generic/cnd_add_n.c b/gmp-6.3.0/mpn/generic/cnd_add_n.c new file mode 100644 index 0000000..e6b1373 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/cnd_add_n.c @@ -0,0 +1,69 @@ +/* mpn_cnd_add_n -- Compute R = U + V if CND != 0 or R = U if CND == 0. + Both cases should take the same time and perform the exact same memory + accesses, since this function is intended to be used where side-channel + attack resilience is relevant. + +Copyright 1992-1994, 1996, 2000, 2002, 2008, 2009, 2011, 2013 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
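At single-limb scale the mpn_bsqrtinv iteration above is easy to watch converge. In the model below (standalone, 64-bit limbs assumed, not GMP code), r correct to b bits means r^2 * y == 1 (mod 2^{b+1}), and one step r <- (3r - r^3 y)/2 lifts that to 2b-1 bits, so six steps from the trivial start r = 1 (valid because y == 1 mod 8) cover a full limb:

// Standalone illustration; not part of GMP.
#include <stdint.h>
#include <stdio.h>

int main (void)
{
  uint64_t y = 0xdeadbeef00000001ULL;   // any y with y % 8 == 1
  uint64_t r = 1;                       // r*r*y == 1 (mod 8) holds trivially

  for (int i = 0; i < 6; i++)           // precision b: 2,3,5,9,17,33,65
    r = (3 * r - r * r * r * y) >> 1;   // numerator is even for odd r, y

  // The shift discards one bit per step, but an error in bit 63 of r
  // cannot affect r*r (mod 2^64), so the check below is still exact.
  printf ("r^2 * y == 1 (mod 2^64): %s\n", r * r * y == 1 ? "yes" : "no");
  return 0;
}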
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +mp_limb_t +mpn_cnd_add_n (mp_limb_t cnd, mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + mp_limb_t ul, vl, sl, rl, cy, cy1, cy2, mask; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n)); + + mask = -(mp_limb_t) (cnd != 0); + cy = 0; + do + { + ul = *up++; + vl = *vp++ & mask; +#if GMP_NAIL_BITS == 0 + sl = ul + vl; + cy1 = sl < ul; + rl = sl + cy; + cy2 = rl < sl; + cy = cy1 | cy2; + *rp++ = rl; +#else + rl = ul + vl; + rl += cy; + cy = rl >> GMP_NUMB_BITS; + *rp++ = rl & GMP_NUMB_MASK; +#endif + } + while (--n != 0); + + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/cnd_sub_n.c b/gmp-6.3.0/mpn/generic/cnd_sub_n.c new file mode 100644 index 0000000..d04ad8a --- /dev/null +++ b/gmp-6.3.0/mpn/generic/cnd_sub_n.c @@ -0,0 +1,69 @@ +/* mpn_cnd_sub_n -- Compute R = U - V if CND != 0 or R = U if CND == 0. + Both cases should take the same time and perform the exact same memory + accesses, since this function is intended to be used where side-channel + attack resilience is relevant. + +Copyright 1992-1994, 1996, 2000, 2002, 2008, 2009, 2011, 2013 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +mp_limb_t +mpn_cnd_sub_n (mp_limb_t cnd, mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + mp_limb_t ul, vl, sl, rl, cy, cy1, cy2, mask; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n)); + + mask = -(mp_limb_t) (cnd != 0); + cy = 0; + do + { + ul = *up++; + vl = *vp++ & mask; +#if GMP_NAIL_BITS == 0 + sl = ul - vl; + cy1 = sl > ul; + rl = sl - cy; + cy2 = rl > sl; + cy = cy1 | cy2; + *rp++ = rl; +#else + rl = ul - vl; + rl -= cy; + cy = rl >> (GMP_LIMB_BITS - 1); + *rp++ = rl & GMP_NUMB_MASK; +#endif + } + while (--n != 0); + + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/cnd_swap.c b/gmp-6.3.0/mpn/generic/cnd_swap.c new file mode 100644 index 0000000..83d856d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/cnd_swap.c @@ -0,0 +1,50 @@ +/* mpn_cnd_swap + + Contributed to the GNU project by Niels Möller + +Copyright 2013, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
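The mask idiom above is the heart of mpn_cnd_add_n and of the mpn_cnd_sub_n that follows: -(mp_limb_t) (cnd != 0) is all ones when cnd is nonzero and all zeros otherwise, so V is either added in full or replaced by zero, with the same instruction trace either way. A minimal standalone model of the nail-free branch (plain 64-bit limbs, not GMP code):

// Standalone illustration; not part of GMP.
#include <stdint.h>
#include <stdio.h>

// R = U + (cnd ? V : 0), branch-free in the data path; returns carry.
static uint64_t cnd_add_n64 (uint64_t cnd, uint64_t *rp,
                             const uint64_t *up, const uint64_t *vp, int n)
{
  uint64_t mask = -(uint64_t) (cnd != 0);   // all-ones or all-zeros
  uint64_t cy = 0;
  for (int i = 0; i < n; i++)
    {
      uint64_t ul = up[i];
      uint64_t vl = vp[i] & mask;           // v or 0, no branch on cnd
      uint64_t sl = ul + vl;
      uint64_t rl = sl + cy;
      cy = (sl < ul) | (rl < sl);           // carry out of either addition
      rp[i] = rl;
    }
  return cy;
}

int main (void)
{
  uint64_t u[2] = { ~0ULL, 7 }, v[2] = { 1, 0 }, r[2];
  uint64_t cy = cnd_add_n64 (1, r, u, v, 2);    // adds: r = {0, 8}
  printf ("%llx %llx cy=%llu\n", (unsigned long long) r[1],
          (unsigned long long) r[0], (unsigned long long) cy);
  cy = cnd_add_n64 (0, r, u, v, 2);             // copies u unchanged
  printf ("%llx %llx cy=%llu\n", (unsigned long long) r[1],
          (unsigned long long) r[0], (unsigned long long) cy);
  return 0;
}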
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +void +mpn_cnd_swap (mp_limb_t cnd, volatile mp_limb_t *ap, volatile mp_limb_t *bp, + mp_size_t n) +{ + volatile mp_limb_t mask = - (mp_limb_t) (cnd != 0); + mp_size_t i; + for (i = 0; i < n; i++) + { + mp_limb_t a, b, t; + a = ap[i]; + b = bp[i]; + t = (a ^ b) & mask; + ap[i] = a ^ t; + bp[i] = b ^ t; + } +} diff --git a/gmp-6.3.0/mpn/generic/com.c b/gmp-6.3.0/mpn/generic/com.c new file mode 100644 index 0000000..4de5824 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/com.c @@ -0,0 +1,44 @@ +/* mpn_com - complement an mpn. + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef mpn_com +#define mpn_com __MPN(com) + +void +mpn_com (mp_ptr rp, mp_srcptr up, mp_size_t n) +{ + mp_limb_t ul; + do { + ul = *up++; + *rp++ = ~ul & GMP_NUMB_MASK; + } while (--n != 0); +} diff --git a/gmp-6.3.0/mpn/generic/comb_tables.c b/gmp-6.3.0/mpn/generic/comb_tables.c new file mode 100644 index 0000000..dedb77b --- /dev/null +++ b/gmp-6.3.0/mpn/generic/comb_tables.c @@ -0,0 +1,47 @@ +/* Const tables shared among combinatoric functions. + + THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND ARE ALMOST CERTAIN TO + BE SUBJECT TO INCOMPATIBLE CHANGES IN FUTURE GNU MP RELEASES. + +Copyright 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
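mpn_cnd_swap above applies the same masking discipline to swapping: t = (a ^ b) & mask is either a ^ b or 0, and XORing t into both operands either exchanges them or leaves both intact, with identical memory traffic in both cases. A tiny scalar illustration (standalone, not GMP code):

// Standalone illustration; not part of GMP.
#include <stdint.h>
#include <stdio.h>

int main (void)
{
  uint64_t a = 0x1111, b = 0x2222;
  for (int cnd = 0; cnd <= 1; cnd++)
    {
      uint64_t x = a, y = b;
      uint64_t mask = -(uint64_t) (cnd != 0);
      uint64_t t = (x ^ y) & mask;   // x^y if swapping, else 0
      x ^= t;                        // x becomes b (or stays a)
      y ^= t;                        // y becomes a (or stays b)
      printf ("cnd=%d: %llx %llx\n", cnd,
              (unsigned long long) x, (unsigned long long) y);
    }
  return 0;
}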
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* Entry i contains (i!/2^t) where t is chosen such that the parenthesis + is an odd integer. */ +const mp_limb_t __gmp_oddfac_table[] = { ONE_LIMB_ODD_FACTORIAL_TABLE, ONE_LIMB_ODD_FACTORIAL_EXTTABLE }; + +/* Entry i contains ((2i+1)!!/2^t) where t is chosen such that the parenthesis + is an odd integer. */ +const mp_limb_t __gmp_odd2fac_table[] = { ONE_LIMB_ODD_DOUBLEFACTORIAL_TABLE }; + +/* Entry i contains 2i-popc(2i). */ +const unsigned char __gmp_fac2cnt_table[] = { TABLE_2N_MINUS_POPC_2N }; + +const mp_limb_t __gmp_limbroots_table[] = { NTH_ROOT_NUMB_MASK_TABLE }; diff --git a/gmp-6.3.0/mpn/generic/compute_powtab.c b/gmp-6.3.0/mpn/generic/compute_powtab.c new file mode 100644 index 0000000..f4fbc64 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/compute_powtab.c @@ -0,0 +1,373 @@ +/* mpn_compute_powtab. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 1991-2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* + CAVEATS: + * The exptab and powtab vectors are in opposite orders. Probably OK. + * Consider getting rid of exptab, doing bit ops on the un argument instead. + * Consider rounding greatest power slightly upwards to save adjustments. + * In powtab_decide, consider computing cost from just the 2-3 largest + operands, since smaller operand contribute little. This makes most sense + if exptab is suppressed. 
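The 2i - popc(2i) entries of __gmp_fac2cnt_table earlier are the 2-adic valuations of (2i)!: by Legendre's formula, the exponent of 2 in n! is sum_j floor(n/2^j) = n - popcount(n). A quick standalone verification of that identity (not part of GMP):

// Standalone check of Legendre's formula; not part of GMP.
#include <assert.h>
#include <stdio.h>

int main (void)
{
  for (unsigned n = 0; n <= 1000; n++)
    {
      unsigned v2 = 0;                      // exponent of 2 in n!
      for (unsigned m = n / 2; m != 0; m /= 2)
        v2 += m;                            // sum of floor(n/2^j)
      unsigned pop = 0;                     // popcount(n)
      for (unsigned m = n; m != 0; m >>= 1)
        pop += m & 1;
      assert (v2 == n - pop);
    }
  puts ("v2(n!) == n - popcount(n) verified");
  return 0;
}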
+*/ + +#include "gmp-impl.h" + +#ifndef DIV_1_VS_MUL_1_PERCENT +#define DIV_1_VS_MUL_1_PERCENT 150 +#endif + +#define SET_powers_t(dest, ptr, size, dib, b, sh) \ + do { \ + dest.p = ptr; \ + dest.n = size; \ + dest.digits_in_base = dib; \ + dest.base = b; \ + dest.shift = sh; \ + } while (0) + +#if DIV_1_VS_MUL_1_PERCENT > 120 +#define HAVE_mpn_compute_powtab_mul 1 +static void +mpn_compute_powtab_mul (powers_t *powtab, mp_ptr powtab_mem, mp_size_t un, + int base, const size_t *exptab, size_t n_pows) +{ + mp_size_t n; + mp_ptr p, t; + mp_limb_t cy; + long start_idx; + int c; + + mp_limb_t big_base = mp_bases[base].big_base; + int chars_per_limb = mp_bases[base].chars_per_limb; + + mp_ptr powtab_mem_ptr = powtab_mem; + + size_t digits_in_base = chars_per_limb; + + powers_t *pt = powtab; + + p = powtab_mem_ptr; + powtab_mem_ptr += 1; + p[0] = big_base; + + SET_powers_t (pt[0], p, 1, digits_in_base, base, 0); + pt++; + + t = powtab_mem_ptr; + powtab_mem_ptr += 2; + t[1] = mpn_mul_1 (t, p, 1, big_base); + n = 2; + + digits_in_base *= 2; + + c = t[0] == 0; + t += c; + n -= c; + mp_size_t shift = c; + + SET_powers_t (pt[0], t, n, digits_in_base, base, shift); + p = t; + pt++; + + if (exptab[0] == ((size_t) chars_per_limb << n_pows)) + { + start_idx = n_pows - 2; + } + else + { + if (((digits_in_base + chars_per_limb) << (n_pows-2)) <= exptab[0]) + { + /* 3, sometimes adjusted to 4. */ + t = powtab_mem_ptr; + powtab_mem_ptr += 4; + t[n] = cy = mpn_mul_1 (t, p, n, big_base); + n += cy != 0;; + + digits_in_base += chars_per_limb; + + c = t[0] == 0; + t += c; + n -= c; + shift += c; + } + else + { + /* 2 copy, will always become 3 with back-multiplication. */ + t = powtab_mem_ptr; + powtab_mem_ptr += 3; + t[0] = p[0]; + t[1] = p[1]; + } + + SET_powers_t (pt[0], t, n, digits_in_base, base, shift); + p = t; + pt++; + start_idx = n_pows - 3; + } + + for (long pi = start_idx; pi >= 0; pi--) + { + t = powtab_mem_ptr; + powtab_mem_ptr += 2 * n + 2; + + ASSERT (powtab_mem_ptr < powtab_mem + mpn_str_powtab_alloc (un)); + + mpn_sqr (t, p, n); + + digits_in_base *= 2; + n *= 2; + n -= t[n - 1] == 0; + shift *= 2; + + c = t[0] == 0; + t += c; + n -= c; + shift += c; + + /* Adjust new value if it is too small as input to the next squaring. */ + if (((digits_in_base + chars_per_limb) << pi) <= exptab[0]) + { + t[n] = cy = mpn_mul_1 (t, t, n, big_base); + n += cy != 0; + + digits_in_base += chars_per_limb; + + c = t[0] == 0; + t += c; + n -= c; + shift += c; + } + + SET_powers_t (pt[0], t, n, digits_in_base, base, shift); + + /* Adjust previous value if it is not at its target power. 
*/ + if (pt[-1].digits_in_base < exptab[pi + 1]) + { + mp_size_t n = pt[-1].n; + mp_ptr p = pt[-1].p; + p[n] = cy = mpn_mul_1 (p, p, n, big_base); + n += cy != 0; + + ASSERT (pt[-1].digits_in_base + chars_per_limb == exptab[pi + 1]); + pt[-1].digits_in_base = exptab[pi + 1]; + + c = p[0] == 0; + pt[-1].p = p + c; + pt[-1].n = n - c; + pt[-1].shift += c; + } + + p = t; + pt++; + } +} +#endif + +#if DIV_1_VS_MUL_1_PERCENT < 275 +#define HAVE_mpn_compute_powtab_div 1 +static void +mpn_compute_powtab_div (powers_t *powtab, mp_ptr powtab_mem, mp_size_t un, + int base, const size_t *exptab, size_t n_pows) +{ + mp_ptr p, t; + + mp_limb_t big_base = mp_bases[base].big_base; + int chars_per_limb = mp_bases[base].chars_per_limb; + + mp_ptr powtab_mem_ptr = powtab_mem; + + size_t digits_in_base = chars_per_limb; + + powers_t *pt = powtab; + + p = powtab_mem_ptr; + powtab_mem_ptr += 1; + p[0] = big_base; + + SET_powers_t (pt[0], p, 1, digits_in_base, base, 0); + pt++; + + mp_size_t n = 1; + mp_size_t shift = 0; + for (long pi = n_pows - 1; pi >= 0; pi--) + { + t = powtab_mem_ptr; + powtab_mem_ptr += 2 * n; + + ASSERT (powtab_mem_ptr < powtab_mem + mpn_str_powtab_alloc (un)); + + mpn_sqr (t, p, n); + n = 2 * n - 1; n += t[n] != 0; + digits_in_base *= 2; + + if (digits_in_base != exptab[pi]) /* if ((((un - 1) >> pi) & 2) == 0) */ + { +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1 + if (__GMP_LIKELY (base == 10)) + mpn_pi1_bdiv_q_1 (t, t, n, big_base >> MP_BASES_BIG_BASE_CTZ_10, + MP_BASES_BIG_BASE_BINVERTED_10, + MP_BASES_BIG_BASE_CTZ_10); + else +#endif + /* FIXME: We could use _pi1 here if we add big_base_binverted and + big_base_ctz fields to struct bases. That would add about 2 KiB + to mp_bases.c. + FIXME: Use mpn_bdiv_q_1 here when mpn_divexact_1 is converted to + mpn_bdiv_q_1 for more machines. */ + mpn_divexact_1 (t, t, n, big_base); + + n -= t[n - 1] == 0; + digits_in_base -= chars_per_limb; + } + + shift *= 2; + /* Strip low zero limbs, but be careful to keep the result divisible by + big_base. */ + while (t[0] == 0 && (t[1] & ((big_base & -big_base) - 1)) == 0) + { + t++; + n--; + shift++; + } + p = t; + + SET_powers_t (pt[0], p, n, digits_in_base, base, shift); + pt++; + } + + /* Strip any remaining low zero limbs. 
*/ + pt -= n_pows + 1; + for (long pi = n_pows; pi >= 0; pi--) + { + mp_ptr t = pt[pi].p; + mp_size_t shift = pt[pi].shift; + mp_size_t n = pt[pi].n; + int c; + c = t[0] == 0; + t += c; + n -= c; + shift += c; + pt[pi].p = t; + pt[pi].shift = shift; + pt[pi].n = n; + } +} +#endif + +static long +powtab_decide (size_t *exptab, size_t un, int base) +{ + int chars_per_limb = mp_bases[base].chars_per_limb; + long n_pows = 0; + for (size_t pn = (un + 1) >> 1; pn != 1; pn = (pn + 1) >> 1) + { + exptab[n_pows] = pn * chars_per_limb; + n_pows++; + } + exptab[n_pows] = chars_per_limb; + +#if HAVE_mpn_compute_powtab_mul && HAVE_mpn_compute_powtab_div + size_t pn = un - 1; + size_t xn = (un + 1) >> 1; + unsigned mcost = 1; + unsigned dcost = 1; + for (long i = n_pows - 2; i >= 0; i--) + { + size_t pow = (pn >> (i + 1)) + 1; + + if (pow & 1) + dcost += pow; + + if (xn != (pow << i)) + { + if (pow > 2 && (pow & 1) == 0) + mcost += 2 * pow; + else + mcost += pow; + } + else + { + if (pow & 1) + mcost += pow; + } + } + + dcost = dcost * DIV_1_VS_MUL_1_PERCENT / 100; + + if (mcost <= dcost) + return n_pows; + else + return -n_pows; +#elif HAVE_mpn_compute_powtab_mul + return n_pows; +#elif HAVE_mpn_compute_powtab_div + return -n_pows; +#else +#error "no powtab function available" +#endif +} + +size_t +mpn_compute_powtab (powers_t *powtab, mp_ptr powtab_mem, mp_size_t un, int base) +{ + size_t exptab[GMP_LIMB_BITS]; + + long n_pows = powtab_decide (exptab, un, base); + +#if HAVE_mpn_compute_powtab_mul && HAVE_mpn_compute_powtab_div + if (n_pows >= 0) + { + mpn_compute_powtab_mul (powtab, powtab_mem, un, base, exptab, n_pows); + return n_pows; + } + else + { + mpn_compute_powtab_div (powtab, powtab_mem, un, base, exptab, -n_pows); + return -n_pows; + } +#elif HAVE_mpn_compute_powtab_mul + ASSERT (n_pows > 0); + mpn_compute_powtab_mul (powtab, powtab_mem, un, base, exptab, n_pows); + return n_pows; +#elif HAVE_mpn_compute_powtab_div + ASSERT (n_pows < 0); + mpn_compute_powtab_div (powtab, powtab_mem, un, base, exptab, -n_pows); + return -n_pows; +#else +#error "no powtab function available" +#endif +} diff --git a/gmp-6.3.0/mpn/generic/copyd.c b/gmp-6.3.0/mpn/generic/copyd.c new file mode 100644 index 0000000..7def007 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/copyd.c @@ -0,0 +1,40 @@ +/* mpn_copyd + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
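powtab_decide's exptab above is simply the chain of digit counts obtained by repeatedly ceiling-halving the limb count. For a feel of the numbers, a standalone print-out (not GMP code; for 64-bit limbs and base 10, chars_per_limb is 19, since 10^19 is the largest power of 10 below 2^64):

// Standalone illustration of the exptab construction; not part of GMP.
#include <stdio.h>

int main (void)
{
  unsigned chars_per_limb = 19;   // base 10, 64-bit limbs: big_base = 10^19
  unsigned un = 1000;             // operand size in limbs
  unsigned n_pows = 0;

  for (unsigned pn = (un + 1) >> 1; pn != 1; pn = (pn + 1) >> 1)
    printf ("exptab[%u] = %u digits\n", n_pows++, pn * chars_per_limb);
  printf ("exptab[%u] = %u digits (big_base itself)\n",
          n_pows, chars_per_limb);
  return 0;
}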
*/ + +#include "gmp-impl.h" + +void +mpn_copyd (mp_ptr rp, mp_srcptr up, mp_size_t n) +{ + mp_size_t i; + + for (i = n - 1; i >= 0; i--) + rp[i] = up[i]; +} diff --git a/gmp-6.3.0/mpn/generic/copyi.c b/gmp-6.3.0/mpn/generic/copyi.c new file mode 100644 index 0000000..736e0b5 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/copyi.c @@ -0,0 +1,42 @@ +/* mpn_copyi + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +void +mpn_copyi (mp_ptr rp, mp_srcptr up, mp_size_t n) +{ + mp_size_t i; + + up += n; + rp += n; + for (i = -n; i != 0; i++) + rp[i] = up[i]; +} diff --git a/gmp-6.3.0/mpn/generic/dcpi1_bdiv_q.c b/gmp-6.3.0/mpn/generic/dcpi1_bdiv_q.c new file mode 100644 index 0000000..3c21818 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/dcpi1_bdiv_q.c @@ -0,0 +1,161 @@ +/* mpn_dcpi1_bdiv_q -- divide-and-conquer Hensel division with precomputed + inverse, returning quotient. + + Contributed to the GNU project by Niels Möller and Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2007, 2009-2011, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +#if 0 /* unused, so leave out for now */ +static mp_size_t +mpn_dcpi1_bdiv_q_n_itch (mp_size_t n) +{ + /* NOTE: Depends on mullo_n and mpn_dcpi1_bdiv_qr_n interface */ + return n; +} +#endif + +/* Computes Q = - N / D mod B^n, destroys N. 
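mpn_copyd and mpn_copyi above differ only in traversal order, and that order is what makes each safe for one direction of overlap: copying towards higher addresses must run downwards (copyd), towards lower addresses upwards (copyi). A standalone illustration shifting a block by one slot in place (not GMP code):

// Standalone illustration of overlap-safe copy directions; not GMP code.
#include <stdint.h>
#include <stdio.h>

static void copyd64 (uint64_t *rp, const uint64_t *up, int n)
{
  for (int i = n - 1; i >= 0; i--)   // descending: safe when rp > up
    rp[i] = up[i];
}

static void copyi64 (uint64_t *rp, const uint64_t *up, int n)
{
  for (int i = 0; i < n; i++)        // ascending: safe when rp < up
    rp[i] = up[i];
}

int main (void)
{
  uint64_t a[5] = { 1, 2, 3, 4, 0 };
  copyd64 (a + 1, a, 4);             // shift up: a = {1, 1, 2, 3, 4}
  printf ("%llu %llu %llu %llu %llu\n", (unsigned long long) a[0],
          (unsigned long long) a[1], (unsigned long long) a[2],
          (unsigned long long) a[3], (unsigned long long) a[4]);
  copyi64 (a, a + 1, 4);             // shift back down: a = {1, 2, 3, 4, 4}
  printf ("a[0] = %llu\n", (unsigned long long) a[0]);
  return 0;
}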
+ + N = {np,n} + D = {dp,n} +*/ + +static void +mpn_dcpi1_bdiv_q_n (mp_ptr qp, + mp_ptr np, mp_srcptr dp, mp_size_t n, + mp_limb_t dinv, mp_ptr tp) +{ + while (ABOVE_THRESHOLD (n, DC_BDIV_Q_THRESHOLD)) + { + mp_size_t lo, hi; + mp_limb_t cy; + + lo = n >> 1; /* floor(n/2) */ + hi = n - lo; /* ceil(n/2) */ + + cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, lo, dinv, tp); + + mpn_mullo_n (tp, qp, dp + hi, lo); + mpn_add_n (np + hi, np + hi, tp, lo); + + if (lo < hi) + { + cy += mpn_addmul_1 (np + lo, qp, lo, dp[lo]); + np[n - 1] += cy; + } + qp += lo; + np += lo; + n -= lo; + } + mpn_sbpi1_bdiv_q (qp, np, n, dp, n, dinv); +} + +/* Computes Q = - N / D mod B^nn, destroys N. + + N = {np,nn} + D = {dp,dn} +*/ + +void +mpn_dcpi1_bdiv_q (mp_ptr qp, + mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_limb_t dinv) +{ + mp_size_t qn; + mp_limb_t cy; + mp_ptr tp; + TMP_DECL; + + TMP_MARK; + + ASSERT (dn >= 2); + ASSERT (nn - dn >= 0); + ASSERT (dp[0] & 1); + + tp = TMP_SALLOC_LIMBS (dn); + + qn = nn; + + if (qn > dn) + { + /* Reduce qn mod dn in a super-efficient manner. */ + do + qn -= dn; + while (qn > dn); + + /* Perform the typically smaller block first. */ + if (BELOW_THRESHOLD (qn, DC_BDIV_QR_THRESHOLD)) + cy = mpn_sbpi1_bdiv_qr (qp, np, 2 * qn, dp, qn, dinv); + else + cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, qn, dinv, tp); + + if (qn != dn) + { + if (qn > dn - qn) + mpn_mul (tp, qp, qn, dp + qn, dn - qn); + else + mpn_mul (tp, dp + qn, dn - qn, qp, qn); + mpn_incr_u (tp + qn, cy); + + mpn_add (np + qn, np + qn, nn - qn, tp, dn); + cy = 0; + } + + np += qn; + qp += qn; + + qn = nn - qn; + while (qn > dn) + { + mpn_add_1 (np + dn, np + dn, qn - dn, cy); + cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, dn, dinv, tp); + qp += dn; + np += dn; + qn -= dn; + } + mpn_dcpi1_bdiv_q_n (qp, np, dp, dn, dinv, tp); + } + else + { + if (BELOW_THRESHOLD (qn, DC_BDIV_Q_THRESHOLD)) + mpn_sbpi1_bdiv_q (qp, np, qn, dp, qn, dinv); + else + mpn_dcpi1_bdiv_q_n (qp, np, dp, qn, dinv, tp); + } + + TMP_FREE; +} diff --git a/gmp-6.3.0/mpn/generic/dcpi1_bdiv_qr.c b/gmp-6.3.0/mpn/generic/dcpi1_bdiv_qr.c new file mode 100644 index 0000000..11da44f --- /dev/null +++ b/gmp-6.3.0/mpn/generic/dcpi1_bdiv_qr.c @@ -0,0 +1,176 @@ +/* mpn_dcpi1_bdiv_qr -- divide-and-conquer Hensel division with precomputed + inverse, returning quotient and remainder. + + Contributed to the GNU project by Niels Möller and Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2007, 2009, 2010, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
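The specification Q = -N/D mod B^n above is exactly solvable because D is odd: a short Newton run gives d^{-1} mod B, and the quotient limbs then come out low-to-high with no estimation or adjustment, which is what distinguishes Hensel (bdiv) division from the schoolbook variants later in this series. A one-limb standalone model (not GMP code):

// Standalone illustration; not part of GMP.
#include <stdint.h>
#include <stdio.h>

int main (void)
{
  uint64_t d = 0x10000000000000efULL;  // divisor, must be odd
  uint64_t n = 0x1234567890abcdefULL;

  uint64_t dinv = 1;                   // Newton: v *= 2 - d*v doubles bits
  for (int i = 0; i < 6; i++)
    dinv *= 2 - d * dinv;

  uint64_t q = -(n * dinv);            // q = -n/d (mod 2^64)
  printf ("n + q*d == 0 (mod 2^64): %s\n", n + q * d == 0 ? "yes" : "no");
  return 0;
}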
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +/* Computes Hensel binary division of {np, 2*n} by {dp, n}. + + Output: + + q = -n * d^{-1} mod 2^{qn * GMP_NUMB_BITS}, + + r = (n + q * d) * 2^{-qn * GMP_NUMB_BITS} + + Stores q at qp. Stores the n least significant limbs of r at the high half + of np, and returns the carry from the addition n + q*d. + + d must be odd. dinv is (-d)^-1 mod 2^GMP_NUMB_BITS. */ + +mp_size_t +mpn_dcpi1_bdiv_qr_n_itch (mp_size_t n) +{ + return n; +} + +mp_limb_t +mpn_dcpi1_bdiv_qr_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n, + mp_limb_t dinv, mp_ptr tp) +{ + mp_size_t lo, hi; + mp_limb_t cy; + mp_limb_t rh; + + lo = n >> 1; /* floor(n/2) */ + hi = n - lo; /* ceil(n/2) */ + + if (BELOW_THRESHOLD (lo, DC_BDIV_QR_THRESHOLD)) + cy = mpn_sbpi1_bdiv_qr (qp, np, 2 * lo, dp, lo, dinv); + else + cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, lo, dinv, tp); + + mpn_mul (tp, dp + lo, hi, qp, lo); + + mpn_incr_u (tp + lo, cy); + rh = mpn_add (np + lo, np + lo, n + hi, tp, n); + + if (BELOW_THRESHOLD (hi, DC_BDIV_QR_THRESHOLD)) + cy = mpn_sbpi1_bdiv_qr (qp + lo, np + lo, 2 * hi, dp, hi, dinv); + else + cy = mpn_dcpi1_bdiv_qr_n (qp + lo, np + lo, dp, hi, dinv, tp); + + mpn_mul (tp, qp + lo, hi, dp + hi, lo); + + mpn_incr_u (tp + hi, cy); + rh += mpn_add_n (np + n, np + n, tp, n); + + return rh; +} + +mp_limb_t +mpn_dcpi1_bdiv_qr (mp_ptr qp, mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, mp_limb_t dinv) +{ + mp_size_t qn; + mp_limb_t rr, cy; + mp_ptr tp; + TMP_DECL; + + TMP_MARK; + + ASSERT (dn >= 2); /* to adhere to mpn_sbpi1_div_qr's limits */ + ASSERT (nn - dn >= 1); /* to adhere to mpn_sbpi1_div_qr's limits */ + ASSERT (dp[0] & 1); + + tp = TMP_SALLOC_LIMBS (dn); + + qn = nn - dn; + + if (qn > dn) + { + /* Reduce qn mod dn without division, optimizing small operations. */ + do + qn -= dn; + while (qn > dn); + + /* Perform the typically smaller block first. */ + if (BELOW_THRESHOLD (qn, DC_BDIV_QR_THRESHOLD)) + cy = mpn_sbpi1_bdiv_qr (qp, np, 2 * qn, dp, qn, dinv); + else + cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, qn, dinv, tp); + + rr = 0; + if (qn != dn) + { + if (qn > dn - qn) + mpn_mul (tp, qp, qn, dp + qn, dn - qn); + else + mpn_mul (tp, dp + qn, dn - qn, qp, qn); + mpn_incr_u (tp + qn, cy); + + rr = mpn_add (np + qn, np + qn, nn - qn, tp, dn); + cy = 0; + } + + np += qn; + qp += qn; + + qn = nn - dn - qn; + do + { + rr += mpn_add_1 (np + dn, np + dn, qn, cy); + cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, dn, dinv, tp); + qp += dn; + np += dn; + qn -= dn; + } + while (qn > 0); + TMP_FREE; + return rr + cy; + } + + if (BELOW_THRESHOLD (qn, DC_BDIV_QR_THRESHOLD)) + cy = mpn_sbpi1_bdiv_qr (qp, np, 2 * qn, dp, qn, dinv); + else + cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, qn, dinv, tp); + + rr = 0; + if (qn != dn) + { + if (qn > dn - qn) + mpn_mul (tp, qp, qn, dp + qn, dn - qn); + else + mpn_mul (tp, dp + qn, dn - qn, qp, qn); + mpn_incr_u (tp + qn, cy); + + rr = mpn_add (np + qn, np + qn, nn - qn, tp, dn); + cy = 0; + } + + TMP_FREE; + return rr + cy; +} diff --git a/gmp-6.3.0/mpn/generic/dcpi1_div_q.c b/gmp-6.3.0/mpn/generic/dcpi1_div_q.c new file mode 100644 index 0000000..1905c98 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/dcpi1_div_q.c @@ -0,0 +1,86 @@ +/* mpn_dc_div_q -- divide-and-conquer division, returning exact quotient + only. + + Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. 
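The definition r = (n + q d) 2^{-qn * GMP_NUMB_BITS} above can be checked concretely at the smallest scale, nn = 2 and dn = qn = 1: the low limb of n + q*d cancels exactly, and what remains above it is the Hensel remainder. A standalone model using the unsigned __int128 extension of GCC/Clang (not GMP code):

// Standalone illustration; not part of GMP. Requires GCC/Clang __int128.
#include <stdint.h>
#include <stdio.h>

int main (void)
{
  typedef unsigned __int128 u128;
  uint64_t d = 0xfeedf00dULL;          // odd divisor, kept small so the
                                       // 128-bit sum below cannot wrap
  u128 n = ((u128) 0x0123456789abcdefULL << 64) | 0xfedcba9876543210ULL;

  uint64_t dinv = 1;
  for (int i = 0; i < 6; i++)          // dinv = d^{-1} (mod 2^64)
    dinv *= 2 - d * dinv;

  uint64_t q = -((uint64_t) n * dinv); // q = -n/d (mod B), B = 2^64
  u128 t = n + (u128) q * d;           // low limb must cancel; a carry out
                                       // of this sum would be the limb the
                                       // real function returns
  printf ("low limb zero: %s, r = %llx\n",
          (uint64_t) t == 0 ? "yes" : "no",
          (unsigned long long) (uint64_t) (t >> 64));
  return 0;
}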
+ + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2007, 2009, 2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +mp_limb_t +mpn_dcpi1_div_q (mp_ptr qp, mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, gmp_pi1_t *dinv) +{ + mp_ptr tp, wp; + mp_limb_t qh; + mp_size_t qn; + TMP_DECL; + + TMP_MARK; + + ASSERT (dn >= 6); + ASSERT (nn - dn >= 3); + ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT); + + tp = TMP_ALLOC_LIMBS (nn + 1); + MPN_COPY (tp + 1, np, nn); + tp[0] = 0; + + qn = nn - dn; + wp = TMP_ALLOC_LIMBS (qn + 1); + + qh = mpn_dcpi1_divappr_q (wp, tp, nn + 1, dp, dn, dinv); + + if (wp[0] == 0) + { + mp_limb_t cy; + + if (qn > dn) + mpn_mul (tp, wp + 1, qn, dp, dn); + else + mpn_mul (tp, dp, dn, wp + 1, qn); + + cy = (qh != 0) ? mpn_add_n (tp + qn, tp + qn, dp, dn) : 0; + + if (cy || mpn_cmp (tp, np, nn) > 0) /* At most is wrong by one, no cycle. */ + qh -= mpn_sub_1 (qp, wp + 1, qn, 1); + else /* Same as below */ + MPN_COPY (qp, wp + 1, qn); + } + else + MPN_COPY (qp, wp + 1, qn); + + TMP_FREE; + return qh; +} diff --git a/gmp-6.3.0/mpn/generic/dcpi1_div_qr.c b/gmp-6.3.0/mpn/generic/dcpi1_div_qr.c new file mode 100644 index 0000000..d7a65f8 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/dcpi1_div_qr.c @@ -0,0 +1,248 @@ +/* mpn_dcpi1_div_qr_n -- recursive divide-and-conquer division for arbitrary + size operands. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2007, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +mp_limb_t +mpn_dcpi1_div_qr_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n, + gmp_pi1_t *dinv, mp_ptr tp) +{ + mp_size_t lo, hi; + mp_limb_t cy, qh, ql; + + lo = n >> 1; /* floor(n/2) */ + hi = n - lo; /* ceil(n/2) */ + + if (BELOW_THRESHOLD (hi, DC_DIV_QR_THRESHOLD)) + qh = mpn_sbpi1_div_qr (qp + lo, np + 2 * lo, 2 * hi, dp + lo, hi, dinv->inv32); + else + qh = mpn_dcpi1_div_qr_n (qp + lo, np + 2 * lo, dp + lo, hi, dinv, tp); + + mpn_mul (tp, qp + lo, hi, dp, lo); + + cy = mpn_sub_n (np + lo, np + lo, tp, n); + if (qh != 0) + cy += mpn_sub_n (np + n, np + n, dp, lo); + + while (cy != 0) + { + qh -= mpn_sub_1 (qp + lo, qp + lo, hi, 1); + cy -= mpn_add_n (np + lo, np + lo, dp, n); + } + + if (BELOW_THRESHOLD (lo, DC_DIV_QR_THRESHOLD)) + ql = mpn_sbpi1_div_qr (qp, np + hi, 2 * lo, dp + hi, lo, dinv->inv32); + else + ql = mpn_dcpi1_div_qr_n (qp, np + hi, dp + hi, lo, dinv, tp); + + mpn_mul (tp, dp, hi, qp, lo); + + cy = mpn_sub_n (np, np, tp, n); + if (ql != 0) + cy += mpn_sub_n (np + lo, np + lo, dp, hi); + + while (cy != 0) + { + mpn_sub_1 (qp, qp, lo, 1); + cy -= mpn_add_n (np, np, dp, n); + } + + return qh; +} + +mp_limb_t +mpn_dcpi1_div_qr (mp_ptr qp, + mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + gmp_pi1_t *dinv) +{ + mp_size_t qn; + mp_limb_t qh, cy; + mp_ptr tp; + TMP_DECL; + + TMP_MARK; + + ASSERT (dn >= 6); /* to adhere to mpn_sbpi1_div_qr's limits */ + ASSERT (nn - dn >= 3); /* to adhere to mpn_sbpi1_div_qr's limits */ + ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT); + + tp = TMP_ALLOC_LIMBS (dn); + + qn = nn - dn; + qp += qn; + np += nn; + dp += dn; + + if (qn > dn) + { + /* Reduce qn mod dn without division, optimizing small operations. */ + do + qn -= dn; + while (qn > dn); + + qp -= qn; /* point at low limb of next quotient block */ + np -= qn; /* point in the middle of partial remainder */ + + /* Perform the typically smaller block first. */ + if (qn == 1) + { + mp_limb_t q, n2, n1, n0, d1, d0; + + /* Handle qh up front, for simplicity. */ + qh = mpn_cmp (np - dn + 1, dp - dn, dn) >= 0; + if (qh) + ASSERT_NOCARRY (mpn_sub_n (np - dn + 1, np - dn + 1, dp - dn, dn)); + + /* A single iteration of schoolbook: One 3/2 division, + followed by the bignum update and adjustment. */ + n2 = np[0]; + n1 = np[-1]; + n0 = np[-2]; + d1 = dp[-1]; + d0 = dp[-2]; + + ASSERT (n2 < d1 || (n2 == d1 && n1 <= d0)); + + if (UNLIKELY (n2 == d1) && n1 == d0) + { + q = GMP_NUMB_MASK; + cy = mpn_submul_1 (np - dn, dp - dn, dn, q); + ASSERT (cy == n2); + } + else + { + udiv_qr_3by2 (q, n1, n0, n2, n1, n0, d1, d0, dinv->inv32); + + if (dn > 2) + { + mp_limb_t cy, cy1; + cy = mpn_submul_1 (np - dn, dp - dn, dn - 2, q); + + cy1 = n0 < cy; + n0 = (n0 - cy) & GMP_NUMB_MASK; + cy = n1 < cy1; + n1 = (n1 - cy1) & GMP_NUMB_MASK; + np[-2] = n0; + + if (UNLIKELY (cy != 0)) + { + n1 += d1 + mpn_add_n (np - dn, np - dn, dp - dn, dn - 1); + qh -= (q == 0); + q = (q - 1) & GMP_NUMB_MASK; + } + } + else + np[-2] = n0; + + np[-1] = n1; + } + qp[0] = q; + } + else + { + /* Do a 2qn / qn division */ + if (qn == 2) + qh = mpn_divrem_2 (qp, 0L, np - 2, 4, dp - 2); /* FIXME: obsolete function. Use 5/3 division? 
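A back-of-envelope cost note on the structure above (an editorial estimate, not a claim from the GMP sources): each call of mpn_dcpi1_div_qr_n makes two recursive calls on size n/2 plus two n/2 x n/2 back-multiplications, so the work satisfies roughly

  D(n) = 2 D(n/2) + 2 M(n/2) + O(n).

With Karatsuba-or-better multiplication, M(n) ~ n^a for a > 1, this telescopes to D(n) = O(M(n)); in the quasi-linear FFT range the same recurrence picks up a log n factor, which is one reason the Newton-based MU division functions take over for the largest operands.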
*/ + else if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD)) + qh = mpn_sbpi1_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv->inv32); + else + qh = mpn_dcpi1_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp); + + if (qn != dn) + { + if (qn > dn - qn) + mpn_mul (tp, qp, qn, dp - dn, dn - qn); + else + mpn_mul (tp, dp - dn, dn - qn, qp, qn); + + cy = mpn_sub_n (np - dn, np - dn, tp, dn); + if (qh != 0) + cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn); + + while (cy != 0) + { + qh -= mpn_sub_1 (qp, qp, qn, 1); + cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn); + } + } + } + + qn = nn - dn - qn; + do + { + qp -= dn; + np -= dn; + mpn_dcpi1_div_qr_n (qp, np - dn, dp - dn, dn, dinv, tp); + qn -= dn; + } + while (qn > 0); + } + else + { + qp -= qn; /* point at low limb of next quotient block */ + np -= qn; /* point in the middle of partial remainder */ + + if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD)) + qh = mpn_sbpi1_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv->inv32); + else + qh = mpn_dcpi1_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp); + + if (qn != dn) + { + if (qn > dn - qn) + mpn_mul (tp, qp, qn, dp - dn, dn - qn); + else + mpn_mul (tp, dp - dn, dn - qn, qp, qn); + + cy = mpn_sub_n (np - dn, np - dn, tp, dn); + if (qh != 0) + cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn); + + while (cy != 0) + { + qh -= mpn_sub_1 (qp, qp, qn, 1); + cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn); + } + } + } + + TMP_FREE; + return qh; +} diff --git a/gmp-6.3.0/mpn/generic/dcpi1_divappr_q.c b/gmp-6.3.0/mpn/generic/dcpi1_divappr_q.c new file mode 100644 index 0000000..0abe04e --- /dev/null +++ b/gmp-6.3.0/mpn/generic/dcpi1_divappr_q.c @@ -0,0 +1,256 @@ +/* mpn_dcpi1_divappr_q -- divide-and-conquer division, returning approximate + quotient. The quotient returned is either correct, or one too large. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2007, 2009, 2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" +#include "longlong.h" + + +static mp_limb_t +mpn_dcpi1_divappr_q_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n, + gmp_pi1_t *dinv, mp_ptr tp) +{ + mp_size_t lo, hi; + mp_limb_t cy, qh, ql; + + lo = n >> 1; /* floor(n/2) */ + hi = n - lo; /* ceil(n/2) */ + + if (BELOW_THRESHOLD (hi, DC_DIV_QR_THRESHOLD)) + qh = mpn_sbpi1_div_qr (qp + lo, np + 2 * lo, 2 * hi, dp + lo, hi, dinv->inv32); + else + qh = mpn_dcpi1_div_qr_n (qp + lo, np + 2 * lo, dp + lo, hi, dinv, tp); + + mpn_mul (tp, qp + lo, hi, dp, lo); + + cy = mpn_sub_n (np + lo, np + lo, tp, n); + if (qh != 0) + cy += mpn_sub_n (np + n, np + n, dp, lo); + + while (cy != 0) + { + qh -= mpn_sub_1 (qp + lo, qp + lo, hi, 1); + cy -= mpn_add_n (np + lo, np + lo, dp, n); + } + + if (BELOW_THRESHOLD (lo, DC_DIVAPPR_Q_THRESHOLD)) + ql = mpn_sbpi1_divappr_q (qp, np + hi, 2 * lo, dp + hi, lo, dinv->inv32); + else + ql = mpn_dcpi1_divappr_q_n (qp, np + hi, dp + hi, lo, dinv, tp); + + if (UNLIKELY (ql != 0)) + { + mp_size_t i; + for (i = 0; i < lo; i++) + qp[i] = GMP_NUMB_MASK; + } + + return qh; +} + +mp_limb_t +mpn_dcpi1_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, gmp_pi1_t *dinv) +{ + mp_size_t qn; + mp_limb_t qh, cy, qsave; + mp_ptr tp; + TMP_DECL; + + TMP_MARK; + + ASSERT (dn >= 6); + ASSERT (nn > dn); + ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT); + + qn = nn - dn; + qp += qn; + np += nn; + dp += dn; + + if (qn >= dn) + { + qn++; /* pretend we'll need an extra limb */ + /* Reduce qn mod dn without division, optimizing small operations. */ + do + qn -= dn; + while (qn > dn); + + qp -= qn; /* point at low limb of next quotient block */ + np -= qn; /* point in the middle of partial remainder */ + + tp = TMP_SALLOC_LIMBS (dn); + + /* Perform the typically smaller block first. */ + if (qn == 1) + { + mp_limb_t q, n2, n1, n0, d1, d0; + + /* Handle qh up front, for simplicity. */ + qh = mpn_cmp (np - dn + 1, dp - dn, dn) >= 0; + if (qh) + ASSERT_NOCARRY (mpn_sub_n (np - dn + 1, np - dn + 1, dp - dn, dn)); + + /* A single iteration of schoolbook: One 3/2 division, + followed by the bignum update and adjustment. 
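The reason a schoolbook step needs only a small adjustment afterwards is the classical quotient-estimate bound: with a normalized divisor (high limb at least B/2), the estimate taken from the top limbs is never below the true quotient digit and overshoots by at most 2 (Knuth, TAOCP vol. 2, sec. 4.3.1). A standalone toy check in base 256 (not GMP code, and a simplification of the 3/2 division with precomputed inverse actually used here):

// Standalone check of the classic quotient-estimate bound; not GMP code.
#include <assert.h>
#include <stdio.h>

int main (void)
{
  const unsigned B = 256;                       // toy limb base
  for (unsigned d1 = B / 2; d1 < B; d1++)       // normalized: high bit set
    for (unsigned d0 = 0; d0 < B; d0 += 51)
      for (unsigned n2 = 0; n2 < d1; n2++)      // guarantees q < B
        for (unsigned n1 = 0; n1 < B; n1 += 37)
          for (unsigned n0 = 0; n0 < B; n0 += 85)
            {
              unsigned d = d1 * B + d0;
              unsigned long n = ((unsigned long) n2 * B + n1) * B + n0;
              unsigned long q = n / d;          // true quotient digit
              unsigned long qhat = ((unsigned long) n2 * B + n1) / d1;
              if (qhat > B - 1)
                qhat = B - 1;
              assert (q <= qhat && qhat <= q + 2);
            }
  puts ("qhat - 2 <= q <= qhat holds");
  return 0;
}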
*/ + n2 = np[0]; + n1 = np[-1]; + n0 = np[-2]; + d1 = dp[-1]; + d0 = dp[-2]; + + ASSERT (n2 < d1 || (n2 == d1 && n1 <= d0)); + + if (UNLIKELY (n2 == d1) && n1 == d0) + { + q = GMP_NUMB_MASK; + cy = mpn_submul_1 (np - dn, dp - dn, dn, q); + ASSERT (cy == n2); + } + else + { + udiv_qr_3by2 (q, n1, n0, n2, n1, n0, d1, d0, dinv->inv32); + + if (dn > 2) + { + mp_limb_t cy, cy1; + cy = mpn_submul_1 (np - dn, dp - dn, dn - 2, q); + + cy1 = n0 < cy; + n0 = (n0 - cy) & GMP_NUMB_MASK; + cy = n1 < cy1; + n1 = (n1 - cy1) & GMP_NUMB_MASK; + np[-2] = n0; + + if (UNLIKELY (cy != 0)) + { + n1 += d1 + mpn_add_n (np - dn, np - dn, dp - dn, dn - 1); + qh -= (q == 0); + q = (q - 1) & GMP_NUMB_MASK; + } + } + else + np[-2] = n0; + + np[-1] = n1; + } + qp[0] = q; + } + else + { + if (qn == 2) + qh = mpn_divrem_2 (qp, 0L, np - 2, 4, dp - 2); + else if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD)) + qh = mpn_sbpi1_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv->inv32); + else + qh = mpn_dcpi1_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp); + + if (qn != dn) + { + if (qn > dn - qn) + mpn_mul (tp, qp, qn, dp - dn, dn - qn); + else + mpn_mul (tp, dp - dn, dn - qn, qp, qn); + + cy = mpn_sub_n (np - dn, np - dn, tp, dn); + if (qh != 0) + cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn); + + while (cy != 0) + { + qh -= mpn_sub_1 (qp, qp, qn, 1); + cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn); + } + } + } + qn = nn - dn - qn + 1; + while (qn > dn) + { + qp -= dn; + np -= dn; + mpn_dcpi1_div_qr_n (qp, np - dn, dp - dn, dn, dinv, tp); + qn -= dn; + } + + /* Since we pretended we'd need an extra quotient limb before, we now + have made sure the code above left just dn-1=qn quotient limbs to + develop. Develop that plus a guard limb. */ + qn--; + qp -= qn; + np -= dn; + qsave = qp[qn]; + mpn_dcpi1_divappr_q_n (qp, np - dn, dp - dn, dn, dinv, tp); + MPN_COPY_INCR (qp, qp + 1, qn); + qp[qn] = qsave; + } + else /* (qn < dn) */ + { + mp_ptr q2p; +#if 0 /* not possible since we demand nn > dn */ + if (qn == 0) + { + qh = mpn_cmp (np - dn, dp - dn, dn) >= 0; + if (qh) + mpn_sub_n (np - dn, np - dn, dp - dn, dn); + TMP_FREE; + return qh; + } +#endif + + qp -= qn; /* point at low limb of next quotient block */ + np -= qn; /* point in the middle of partial remainder */ + + q2p = TMP_SALLOC_LIMBS (qn + 1); + /* Should we at all check DC_DIVAPPR_Q_THRESHOLD here, or reply on + callers not to be silly? */ + if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD)) + { + qh = mpn_sbpi1_divappr_q (q2p, np - qn - 2, 2 * (qn + 1), + dp - (qn + 1), qn + 1, dinv->inv32); + } + else + { + /* It is tempting to use qp for recursive scratch and put quotient in + tp, but the recursive scratch needs one limb too many. */ + tp = TMP_SALLOC_LIMBS (qn + 1); + qh = mpn_dcpi1_divappr_q_n (q2p, np - qn - 2, dp - (qn + 1), qn + 1, dinv, tp); + } + MPN_COPY (qp, q2p + 1, qn); + } + + TMP_FREE; + return qh; +} diff --git a/gmp-6.3.0/mpn/generic/div_q.c b/gmp-6.3.0/mpn/generic/div_q.c new file mode 100644 index 0000000..18c4ecf --- /dev/null +++ b/gmp-6.3.0/mpn/generic/div_q.c @@ -0,0 +1,313 @@ +/* mpn_div_q -- division for arbitrary size operands. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2009, 2010, 2015, 2018 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* Compute Q = N/D with truncation. + N = {np,nn} + D = {dp,dn} + Q = {qp,nn-dn+1} + T = {scratch,nn+1} is scratch space + N and D are both untouched by the computation. + N and T may overlap; pass the same space if N is irrelevant after the call, + but note that tp needs an extra limb. + + Operand requirements: + N >= D > 0 + dp[dn-1] != 0 + No overlap between the N, D, and Q areas. + + This division function does not clobber its input operands, since it is + intended to support average-O(qn) division, and for that to be effective, it + cannot put requirements on callers to copy a O(nn) operand. + + If a caller does not care about the value of {np,nn+1} after calling this + function, it should pass np also for the scratch argument. This function + will then save some time and space by avoiding allocation and copying. + (FIXME: Is this a good design? We only really save any copying for + already-normalised divisors, which should be rare. It also prevents us from + reasonably asking for all scratch space we need.) + + We write nn-dn+1 limbs for the quotient, but return void. Why not return + the most significant quotient limb? Look at the 4 main code blocks below + (consisting of an outer if-else where each arm contains an if-else). It is + tricky for the first code block, since the mpn_*_div_q calls will typically + generate all nn-dn+1 and return 0 or 1. I don't see how to fix that unless + we generate the most significant quotient limb here, before calling + mpn_*_div_q, or put the quotient in a temporary area. Since this is a + critical division case (the SB sub-case in particular) copying is not a good + idea. + + It might make sense to split the if-else parts of the (qn + FUDGE + >= dn) blocks into separate functions, since we could promise quite + different things to callers in these two cases. The 'then' case + benefits from np=scratch, and it could perhaps even tolerate qp=np, + saving some headache for many callers. + + FIXME: Scratch allocation leaves a lot to be desired. E.g., for the MU size + operands, we do not reuse the huge scratch for adjustments. This can be a + serious waste of memory for the largest operands. +*/ + +/* FUDGE determines when to try getting an approximate quotient from the upper + parts of the dividend and divisor, then adjust. N.B. FUDGE must be >= 2 + for the code to be correct. 
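As a worked size example (using the FUDGE = 5 defined just below): nn = 1000 and dn = 600 give qn = 401, and since 401 + 5 < 600 the function takes the second branch. It forms a truncated dividend of new_nn = 2*401 + 1 = 803 limbs and a truncated divisor of qn + 1 = 402 limbs, computes an approximate quotient with one guard limb of slack, and relies on the multiply-back test at the end of the function (the tp[0] <= 4 branch) to decrement the quotient in the rare case the approximation came out one too large.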
*/ +#define FUDGE 5 /* FIXME: tune this */ + +#define DC_DIV_Q_THRESHOLD DC_DIVAPPR_Q_THRESHOLD +#define MU_DIV_Q_THRESHOLD MU_DIVAPPR_Q_THRESHOLD +#define MUPI_DIV_Q_THRESHOLD MUPI_DIVAPPR_Q_THRESHOLD +#ifndef MUPI_DIVAPPR_Q_THRESHOLD +#define MUPI_DIVAPPR_Q_THRESHOLD MUPI_DIV_QR_THRESHOLD +#endif + +void +mpn_div_q (mp_ptr qp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, mp_ptr scratch) +{ + mp_ptr new_dp, new_np, tp, rp; + mp_limb_t cy, dh, qh; + mp_size_t new_nn, qn; + gmp_pi1_t dinv; + int cnt; + TMP_DECL; + TMP_MARK; + + ASSERT (nn >= dn); + ASSERT (dn > 0); + ASSERT (dp[dn - 1] != 0); + ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, np, nn)); + ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, dp, dn)); + ASSERT (MPN_SAME_OR_SEPARATE_P (np, scratch, nn)); + + ASSERT_ALWAYS (FUDGE >= 2); + + dh = dp[dn - 1]; + if (dn == 1) + { + mpn_divrem_1 (qp, 0L, np, nn, dh); + return; + } + + qn = nn - dn + 1; /* Quotient size, high limb might be zero */ + + if (qn + FUDGE >= dn) + { + /* |________________________| + |_______| */ + new_np = scratch; + + if (LIKELY ((dh & GMP_NUMB_HIGHBIT) == 0)) + { + count_leading_zeros (cnt, dh); + + cy = mpn_lshift (new_np, np, nn, cnt); + new_np[nn] = cy; + new_nn = nn + (cy != 0); + + new_dp = TMP_ALLOC_LIMBS (dn); + mpn_lshift (new_dp, dp, dn, cnt); + + if (dn == 2) + { + qh = mpn_divrem_2 (qp, 0L, new_np, new_nn, new_dp); + } + else if (BELOW_THRESHOLD (dn, DC_DIV_Q_THRESHOLD) || + BELOW_THRESHOLD (new_nn - dn, DC_DIV_Q_THRESHOLD)) + { + invert_pi1 (dinv, new_dp[dn - 1], new_dp[dn - 2]); + qh = mpn_sbpi1_div_q (qp, new_np, new_nn, new_dp, dn, dinv.inv32); + } + else if (BELOW_THRESHOLD (dn, MUPI_DIV_Q_THRESHOLD) || /* fast condition */ + BELOW_THRESHOLD (nn, 2 * MU_DIV_Q_THRESHOLD) || /* fast condition */ + (double) (2 * (MU_DIV_Q_THRESHOLD - MUPI_DIV_Q_THRESHOLD)) * dn /* slow... */ + + (double) MUPI_DIV_Q_THRESHOLD * nn > (double) dn * nn) /* ...condition */ + { + invert_pi1 (dinv, new_dp[dn - 1], new_dp[dn - 2]); + qh = mpn_dcpi1_div_q (qp, new_np, new_nn, new_dp, dn, &dinv); + } + else + { + mp_size_t itch = mpn_mu_div_q_itch (new_nn, dn, 0); + mp_ptr scratch = TMP_ALLOC_LIMBS (itch); + qh = mpn_mu_div_q (qp, new_np, new_nn, new_dp, dn, scratch); + } + if (cy == 0) + qp[qn - 1] = qh; + else + ASSERT (qh == 0); + } + else /* divisor is already normalised */ + { + if (new_np != np) + MPN_COPY (new_np, np, nn); + + if (dn == 2) + { + qh = mpn_divrem_2 (qp, 0L, new_np, nn, dp); + } + else if (BELOW_THRESHOLD (dn, DC_DIV_Q_THRESHOLD) || + BELOW_THRESHOLD (nn - dn, DC_DIV_Q_THRESHOLD)) + { + invert_pi1 (dinv, dh, dp[dn - 2]); + qh = mpn_sbpi1_div_q (qp, new_np, nn, dp, dn, dinv.inv32); + } + else if (BELOW_THRESHOLD (dn, MUPI_DIV_Q_THRESHOLD) || /* fast condition */ + BELOW_THRESHOLD (nn, 2 * MU_DIV_Q_THRESHOLD) || /* fast condition */ + (double) (2 * (MU_DIV_Q_THRESHOLD - MUPI_DIV_Q_THRESHOLD)) * dn /* slow... */ + + (double) MUPI_DIV_Q_THRESHOLD * nn > (double) dn * nn) /* ...condition */ + { + invert_pi1 (dinv, dh, dp[dn - 2]); + qh = mpn_dcpi1_div_q (qp, new_np, nn, dp, dn, &dinv); + } + else + { + mp_size_t itch = mpn_mu_div_q_itch (nn, dn, 0); + mp_ptr scratch = TMP_ALLOC_LIMBS (itch); + qh = mpn_mu_div_q (qp, np, nn, dp, dn, scratch); + } + qp[nn - dn] = qh; + } + } + else + { + /* |________________________| + |_________________| */ + tp = TMP_ALLOC_LIMBS (qn + 1); + + new_np = scratch; + new_nn = 2 * qn + 1; + if (new_np == np) + /* We need {np,nn} to remain untouched until the final adjustment, so + we need to allocate separate space for new_np. 
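The count_leading_zeros/mpn_lshift combination above relies on quotient invariance under a common shift: floor((n << c) / (d << c)) == floor(n / d), with the remainder scaled by 2^c. A standalone check using the unsigned __int128 extension of GCC/Clang, with __builtin_clzll playing the role of count_leading_zeros (not GMP code):

// Standalone illustration of normalization; not part of GMP.
#include <stdint.h>
#include <stdio.h>

int main (void)
{
  typedef unsigned __int128 u128;
  uint64_t d = 0x00000000deadbeefULL;    // unnormalized: 32 leading zeros
  u128 n = ((u128) 0x12345ULL << 64) | 0x6789abcdef012345ULL;

  unsigned c = __builtin_clzll (d);      // shift count that normalizes d
  u128 nq = (n << c) / ((u128) d << c);  // n chosen small enough that the
  u128 nr = (n << c) % ((u128) d << c);  // shifted dividend does not wrap

  printf ("quotients agree: %s\n", nq == n / d ? "yes" : "no");
  printf ("remainder scaled: %s\n",
          nr == (u128) (n % d) << c ? "yes" : "no");
  return 0;
}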
*/ + new_np = TMP_ALLOC_LIMBS (new_nn + 1); + + + if (LIKELY ((dh & GMP_NUMB_HIGHBIT) == 0)) + { + count_leading_zeros (cnt, dh); + + cy = mpn_lshift (new_np, np + nn - new_nn, new_nn, cnt); + new_np[new_nn] = cy; + + new_nn += (cy != 0); + + new_dp = TMP_ALLOC_LIMBS (qn + 1); + mpn_lshift (new_dp, dp + dn - (qn + 1), qn + 1, cnt); + new_dp[0] |= dp[dn - (qn + 1) - 1] >> (GMP_NUMB_BITS - cnt); + + if (qn + 1 == 2) + { + qh = mpn_divrem_2 (tp, 0L, new_np, new_nn, new_dp); + } + else if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD - 1)) + { + invert_pi1 (dinv, new_dp[qn], new_dp[qn - 1]); + qh = mpn_sbpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, dinv.inv32); + } + else if (BELOW_THRESHOLD (qn, MU_DIVAPPR_Q_THRESHOLD - 1)) + { + invert_pi1 (dinv, new_dp[qn], new_dp[qn - 1]); + qh = mpn_dcpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, &dinv); + } + else + { + mp_size_t itch = mpn_mu_divappr_q_itch (new_nn, qn + 1, 0); + mp_ptr scratch = TMP_ALLOC_LIMBS (itch); + qh = mpn_mu_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, scratch); + } + if (cy == 0) + tp[qn] = qh; + else if (UNLIKELY (qh != 0)) + { + /* This happens only when the quotient is close to B^n and + mpn_*_divappr_q returned B^n. */ + mp_size_t i, n; + n = new_nn - (qn + 1); + for (i = 0; i < n; i++) + tp[i] = GMP_NUMB_MAX; + qh = 0; /* currently ignored */ + } + } + else /* divisor is already normalised */ + { + MPN_COPY (new_np, np + nn - new_nn, new_nn); /* pointless if MU will be used */ + + new_dp = (mp_ptr) dp + dn - (qn + 1); + + if (qn == 2 - 1) + { + qh = mpn_divrem_2 (tp, 0L, new_np, new_nn, new_dp); + } + else if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD - 1)) + { + invert_pi1 (dinv, dh, new_dp[qn - 1]); + qh = mpn_sbpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, dinv.inv32); + } + else if (BELOW_THRESHOLD (qn, MU_DIVAPPR_Q_THRESHOLD - 1)) + { + invert_pi1 (dinv, dh, new_dp[qn - 1]); + qh = mpn_dcpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, &dinv); + } + else + { + mp_size_t itch = mpn_mu_divappr_q_itch (new_nn, qn + 1, 0); + mp_ptr scratch = TMP_ALLOC_LIMBS (itch); + qh = mpn_mu_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, scratch); + } + tp[qn] = qh; + } + + MPN_COPY (qp, tp + 1, qn); + if (tp[0] <= 4) + { + mp_size_t rn; + + rp = TMP_ALLOC_LIMBS (dn + qn); + mpn_mul (rp, dp, dn, tp + 1, qn); + rn = dn + qn; + rn -= rp[rn - 1] == 0; + + if (rn > nn || mpn_cmp (np, rp, nn) < 0) + MPN_DECR_U (qp, qn, 1); + } + } + + TMP_FREE; +} diff --git a/gmp-6.3.0/mpn/generic/div_qr_1.c b/gmp-6.3.0/mpn/generic/div_qr_1.c new file mode 100644 index 0000000..8f80d37 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/div_qr_1.c @@ -0,0 +1,125 @@ +/* mpn_div_qr_1 -- mpn by limb division. + + Contributed to the GNU project by Niels Möller and Torbjörn Granlund + +Copyright 1991, 1993, 1994, 1996, 1998-2000, 2002, 2003, 2013 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef DIV_QR_1_NORM_THRESHOLD +#define DIV_QR_1_NORM_THRESHOLD 3 +#endif +#ifndef DIV_QR_1_UNNORM_THRESHOLD +#define DIV_QR_1_UNNORM_THRESHOLD 3 +#endif + +#if GMP_NAIL_BITS > 0 +#error Nail bits not supported +#endif + +/* Divides {up, n} by d. Writes the n-1 low quotient limbs at {qp, + * n-1}, and the high quotient limb at *qh. Returns remainder. */ +mp_limb_t +mpn_div_qr_1 (mp_ptr qp, mp_limb_t *qh, mp_srcptr up, mp_size_t n, + mp_limb_t d) +{ + unsigned cnt; + mp_limb_t uh; + + ASSERT (n > 0); + ASSERT (d > 0); + + if (d & GMP_NUMB_HIGHBIT) + { + /* Normalized case */ + mp_limb_t dinv, q; + + uh = up[--n]; + + q = (uh >= d); + *qh = q; + uh -= (-q) & d; + + if (BELOW_THRESHOLD (n, DIV_QR_1_NORM_THRESHOLD)) + { + cnt = 0; + plain: + while (n > 0) + { + mp_limb_t ul = up[--n]; + udiv_qrnnd (qp[n], uh, uh, ul, d); + } + return uh >> cnt; + } + invert_limb (dinv, d); + return mpn_div_qr_1n_pi1 (qp, up, n, uh, d, dinv); + } + else + { + /* Unnormalized case */ + mp_limb_t dinv, ul; + + if (! UDIV_NEEDS_NORMALIZATION + && BELOW_THRESHOLD (n, DIV_QR_1_UNNORM_THRESHOLD)) + { + uh = up[--n]; + udiv_qrnnd (*qh, uh, CNST_LIMB(0), uh, d); + cnt = 0; + goto plain; + } + + count_leading_zeros (cnt, d); + d <<= cnt; + +#if HAVE_NATIVE_mpn_div_qr_1u_pi1 + /* FIXME: Call loop doing on-the-fly normalization */ +#endif + + /* Shift up front, use qp area for shifted copy. A bit messy, + since we have only n-1 limbs available, and shift the high + limb manually. */ + uh = up[--n]; + ul = (uh << cnt) | mpn_lshift (qp, up, n, cnt); + uh >>= (GMP_LIMB_BITS - cnt); + + if (UDIV_NEEDS_NORMALIZATION + && BELOW_THRESHOLD (n, DIV_QR_1_UNNORM_THRESHOLD)) + { + udiv_qrnnd (*qh, uh, uh, ul, d); + up = qp; + goto plain; + } + invert_limb (dinv, d); + + udiv_qrnnd_preinv (*qh, uh, uh, ul, d, dinv); + return mpn_div_qr_1n_pi1 (qp, qp, n, uh, d, dinv) >> cnt; + } +} diff --git a/gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c b/gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c new file mode 100644 index 0000000..4977131 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c @@ -0,0 +1,505 @@ +/* mpn_div_qr_1n_pi1 + + Contributed to the GNU project by Niels Möller + + THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#if GMP_NAIL_BITS > 0 +#error Nail bits not supported +#endif + +#ifndef DIV_QR_1N_METHOD +#define DIV_QR_1N_METHOD 2 +#endif + +/* FIXME: Duplicated in mod_1_1.c. Move to gmp-impl.h */ + +#if defined (__GNUC__) && ! defined (NO_ASM) + +#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32 +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "add %6, %k2\n\t" \ + "adc %4, %k1\n\t" \ + "sbb %k0, %k0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "1" ((USItype)(a1)), "g" ((USItype)(b1)), \ + "%2" ((USItype)(a0)), "g" ((USItype)(b0))) +#endif + +#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "add %6, %q2\n\t" \ + "adc %4, %q1\n\t" \ + "sbb %q0, %q0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \ + "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0))) +#endif + +#if defined (__sparc__) && W_TYPE_SIZE == 32 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "addcc %r5, %6, %2\n\t" \ + "addxcc %r3, %4, %1\n\t" \ + "subx %%g0, %%g0, %0" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \ + __CLOBBER_CC) +#endif + +#if defined (__sparc__) && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "addcc %r5, %6, %2\n\t" \ + "addccc %r7, %8, %%g0\n\t" \ + "addccc %r3, %4, %1\n\t" \ + "clr %0\n\t" \ + "movcs %%xcc, -1, %0" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \ + "rJ" ((al) >> 32), "rI" ((bl) >> 32) \ + __CLOBBER_CC) +#if __VIS__ >= 0x300 +#undef add_mssaaaa +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "addcc %r5, %6, %2\n\t" \ + "addxccc %r3, %4, %1\n\t" \ + "clr %0\n\t" \ + "movcs %%xcc, -1, %0" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \ + __CLOBBER_CC) +#endif +#endif + +#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB) +/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a + processor running in 32-bit mode, since the carry flag then gets the 32-bit + carry. 
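+   In every add_mssaaaa variant here, including the generic C fallback
+   further down, the macro computes <s1, s0> = <a1, a0> + <b1, b0> and
+   sets m to all ones when the two-limb addition carries out, and to
+   zero otherwise.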
*/ +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "add%I6c %2, %5, %6\n\t" \ + "adde %1, %3, %4\n\t" \ + "subfe %0, %0, %0\n\t" \ + "nor %0, %0, %0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "r" (a1), "r" (b1), "%r" (a0), "rI" (b0) \ + __CLOBBER_CC) +#endif + +#if defined (__s390x__) && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "algr %2, %6\n\t" \ + "alcgr %1, %4\n\t" \ + "lghi %0, 0\n\t" \ + "alcgr %0, %0\n\t" \ + "lcgr %0, %0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "1" ((UDItype)(a1)), "r" ((UDItype)(b1)), \ + "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC) +#endif + +#if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "adds %2, %5, %6\n\t" \ + "adcs %1, %3, %4\n\t" \ + "movcc %0, #0\n\t" \ + "movcs %0, #-1" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC) +#endif + +#if defined (__aarch64__) && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "adds %2, %x5, %6\n\t" \ + "adcs %1, %x3, %x4\n\t" \ + "csinv %0, xzr, xzr, cc\n\t" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rZ" (ah), "rZ" (bh), "%rZ" (al), "rI" (bl) __CLOBBER_CC) +#endif +#endif /* defined (__GNUC__) */ + +#ifndef add_mssaaaa +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + do { \ + UWtype __s0, __s1, __c0, __c1; \ + __s0 = (a0) + (b0); \ + __s1 = (a1) + (b1); \ + __c0 = __s0 < (a0); \ + __c1 = __s1 < (a1); \ + (s0) = __s0; \ + __s1 = __s1 + __c0; \ + (s1) = __s1; \ + (m) = - (__c1 + (__s1 < __c0)); \ + } while (0) +#endif + +#if DIV_QR_1N_METHOD == 1 + +/* Divides (uh B^n + {up, n}) by d, storing the quotient at {qp, n}. + Requires that uh < d. */ +mp_limb_t +mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t uh, + mp_limb_t d, mp_limb_t dinv) +{ + ASSERT (n > 0); + ASSERT (uh < d); + ASSERT (d & GMP_NUMB_HIGHBIT); + ASSERT (MPN_SAME_OR_SEPARATE_P (qp, up, n)); + + do + { + mp_limb_t q, ul; + + ul = up[--n]; + udiv_qrnnd_preinv (q, uh, uh, ul, d, dinv); + qp[n] = q; + } + while (n > 0); + + return uh; +} + +#elif DIV_QR_1N_METHOD == 2 + +/* The main idea of this algorithm is to write B^2 = d (B + dinv) + + B2, where 1 <= B2 < d. Similarly to mpn_mod_1_1p, each iteration + can then replace + + u1 B^2 = u1 B2 (mod d) + + which gives a very short critical path for computing the remainder + (with some tricks to handle the carry when the next two lower limbs + are added in). To also get the quotient, include the corresponding + multiple of d in the expression, + + u1 B^2 = u1 B2 + (u1 dinv + u1 B) d + + We get the quotient by accumulating the (u1 dinv + u1 B) terms. The + two multiplies, u1 * B2 and u1 * dinv, are independent, and can be + executed in parallel. + */ +mp_limb_t +mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1, + mp_limb_t d, mp_limb_t dinv) +{ + mp_limb_t B2; + mp_limb_t u0, u2; + mp_limb_t q0, q1; + mp_limb_t p0, p1; + mp_limb_t t; + mp_size_t j; + + ASSERT (d & GMP_LIMB_HIGHBIT); + ASSERT (n > 0); + ASSERT (u1 < d); + + if (n == 1) + { + udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv); + return u1; + } + + /* FIXME: Could be precomputed */ + B2 = -d*dinv; + + umul_ppmm (q1, q0, dinv, u1); + umul_ppmm (p1, p0, B2, u1); + q1 += u1; + ASSERT (q1 >= u1); + u0 = up[n-1]; /* Early read, to allow qp == up. 
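+     (The store to qp[n-1] just below would otherwise overwrite up[n-1]
+     before it had been read.)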
*/ + qp[n-1] = q1; + + add_mssaaaa (u2, u1, u0, u0, up[n-2], p1, p0); + + /* FIXME: Keep q1 in a variable between iterations, to reduce number + of memory accesses. */ + for (j = n-2; j-- > 0; ) + { + mp_limb_t q2, cy; + + /* Additions for the q update: + * +-------+ + * |u1 * v | + * +---+---+ + * | u1| + * +---+---+ + * | 1 | v | (conditional on u2) + * +---+---+ + * | 1 | (conditional on u0 + u2 B2 carry) + * +---+ + * + | q0| + * -+---+---+---+ + * | q2| q1| q0| + * +---+---+---+ + */ + umul_ppmm (p1, t, u1, dinv); + ADDC_LIMB (cy, u0, u0, u2 & B2); + u0 -= (-cy) & d; + add_ssaaaa (q2, q1, -u2, u2 & dinv, CNST_LIMB(0), u1); + add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), q0); + q0 = t; + + /* Note that p1 + cy cannot overflow */ + add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1 + cy); + + umul_ppmm (p1, p0, u1, B2); + + qp[j+1] = q1; + MPN_INCR_U (qp+j+2, n-j-2, q2); + + add_mssaaaa (u2, u1, u0, u0, up[j], p1, p0); + } + + q1 = (u2 > 0); + u1 -= (-q1) & d; + + t = (u1 >= d); + q1 += t; + u1 -= (-t) & d; + + udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv); + add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t); + + MPN_INCR_U (qp+1, n-1, q1); + + qp[0] = q0; + return u0; +} + +#elif DIV_QR_1N_METHOD == 3 + +/* This variant handles carry from the u update earlier. This gives a + longer critical path, but reduces the work needed for the + quotients. */ +mp_limb_t +mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1, + mp_limb_t d, mp_limb_t dinv) +{ + mp_limb_t B2; + mp_limb_t cy, u0; + mp_limb_t q0, q1; + mp_limb_t p0, p1; + mp_limb_t t; + mp_size_t j; + + ASSERT (d & GMP_LIMB_HIGHBIT); + ASSERT (n > 0); + ASSERT (u1 < d); + + if (n == 1) + { + udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv); + return u1; + } + + /* FIXME: Could be precomputed */ + B2 = -d*dinv; + + umul_ppmm (q1, q0, dinv, u1); + umul_ppmm (p1, p0, B2, u1); + q1 += u1; + ASSERT (q1 >= u1); + u0 = up[n-1]; /* Early read, to allow qp == up. */ + + add_mssaaaa (cy, u1, u0, u0, up[n-2], p1, p0); + u1 -= cy & d; + q1 -= cy; + qp[n-1] = q1; + + /* FIXME: Keep q1 in a variable between iterations, to reduce number + of memory accesses. */ + for (j = n-2; j-- > 0; ) + { + mp_limb_t q2, cy; + mp_limb_t t1, t0; + + /* Additions for the q update: + * +-------+ + * |u1 * v | + * +---+---+ + * | u1| + * +---+ + * | 1 | (conditional on {u1, u0} carry) + * +---+ + * + | q0| + * -+---+---+---+ + * | q2| q1| q0| + * +---+---+---+ + * + * Additions for the u update: + * +-------+ + * |u1 * B2| + * +---+---+ + * + |u0 |u-1| + * +---+---+ + * - | d | (conditional on carry) + * ---+---+---+ + * |u1 | u0| + * +---+---+ + * + */ + umul_ppmm (p1, p0, u1, B2); + ADDC_LIMB (q2, q1, u1, q0); + umul_ppmm (t1, t0, u1, dinv); + add_mssaaaa (cy, u1, u0, u0, up[j], p1, p0); + u1 -= cy & d; + + /* t1 <= B-2, so cy can be added in without overflow. 
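+     (add_mssaaaa leaves cy equal to 0 or -1, so t1 - cy below folds the
+     carry in: at most t1 + 1 <= B-1.)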
*/
+      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - cy);
+      q0 = t0;
+
+      /* Final q update */
+      qp[j+1] = q1;
+      MPN_INCR_U (qp+j+2, n-j-2, q2);
+    }
+
+  q1 = (u1 >= d);
+  u1 -= (-q1) & d;
+
+  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
+  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);
+
+  MPN_INCR_U (qp+1, n-1, q1);
+
+  qp[0] = q0;
+  return u0;
+}
+
+#elif DIV_QR_1N_METHOD == 4
+
+mp_limb_t
+mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
+		   mp_limb_t d, mp_limb_t dinv)
+{
+  mp_limb_t B2;
+  mp_limb_t u2, u0;
+  mp_limb_t q0, q1;
+  mp_limb_t p0, p1;
+  mp_limb_t B2d0, B2d1;
+  mp_limb_t t;
+  mp_size_t j;
+
+  ASSERT (d & GMP_LIMB_HIGHBIT);
+  ASSERT (n > 0);
+  ASSERT (u1 < d);
+
+  if (n == 1)
+    {
+      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
+      return u1;
+    }
+
+  /* FIXME: Could be precomputed */
+  B2 = -d*dinv;
+  /* B2 * (B-d) */
+  umul_ppmm (B2d1, B2d0, B2, -d);
+
+  umul_ppmm (q1, q0, dinv, u1);
+  umul_ppmm (p1, p0, B2, u1);
+  q1 += u1;
+  ASSERT (q1 >= u1);
+
+  add_mssaaaa (u2, u1, u0, up[n-1], up[n-2], p1, p0);
+
+  /* After read of up[n-1], to allow qp == up. */
+  qp[n-1] = q1 - u2;
+
+  /* FIXME: Keep q1 in a variable between iterations, to reduce number
+     of memory accesses. */
+  for (j = n-2; j-- > 0; )
+    {
+      mp_limb_t q2, cy;
+      mp_limb_t t1, t0;
+
+      /* Additions for the q update. *After* u1 -= u2 & d adjustment.
+       * +-------+
+       * |u1 * v |
+       * +---+---+
+       *     | u1|
+       *     +---+
+       *     | 1 |  (conditional on {u1, u0} carry)
+       *     +---+
+       * +   | q0|
+       * -+---+---+---+
+       * | q2| q1| q0|
+       * +---+---+---+
+       *
+       * Additions for the u update. *Before* u1 -= u2 & d adjustment.
+       * +-------+
+       * |u1 * B2|
+       * +---+---+
+       * |u0 |u-1|
+       * +---+---+
+       * + |B2(B-d)|  (conditional on u2)
+       * -+---+---+---+
+       * |u2 |u1 | u0|
+       * +---+---+---+
+       *
+       */
+      /* Multiply with unadjusted u1, to shorten critical path. */
+      umul_ppmm (p1, p0, u1, B2);
+      u1 -= (d & u2);
+      ADDC_LIMB (q2, q1, u1, q0);
+      umul_ppmm (t1, t0, u1, dinv);
+
+      add_mssaaaa (cy, u1, u0, u0, up[j], u2 & B2d1, u2 & B2d0);
+      add_mssaaaa (u2, u1, u0, u1, u0, p1, p0);
+      u2 += cy;
+      ASSERT(-u2 <= 1);
+
+      /* t1 <= B-2, so u2 can be added in without overflow. */
+      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - u2);
+      q0 = t0;
+
+      /* Final q update */
+      qp[j+1] = q1;
+      MPN_INCR_U (qp+j+2, n-j-2, q2);
+    }
+  u1 -= u2 & d;
+
+  q1 = (u1 >= d);
+  u1 -= (-q1) & d;
+
+  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
+  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);
+
+  MPN_INCR_U (qp+1, n-1, q1);
+
+  qp[0] = q0;
+  return u0;
+}
+#else
+#error Unknown DIV_QR_1N_METHOD
+#endif
diff --git a/gmp-6.3.0/mpn/generic/div_qr_1n_pi2.c b/gmp-6.3.0/mpn/generic/div_qr_1n_pi2.c new file mode 100644 index 0000000..daae68f --- /dev/null +++ b/gmp-6.3.0/mpn/generic/div_qr_1n_pi2.c @@ -0,0 +1,203 @@ +/* mpn_div_qr_1n_pi2.
+
+   THIS FILE CONTAINS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS
+   ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2013, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* ISSUES: + + * Can we really use the high pi2 inverse limb for udiv_qrnnd_preinv? + + * Are there any problems with generating n quotient limbs in the q area? It + surely simplifies things. + + * Not yet adequately tested. +*/ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Define some longlong.h-style macros, but for wider operations. + * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into + an additional sum operand. +*/ +#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM) + +#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32 +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0" \ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "0" ((USItype)(s2)), \ + "1" ((USItype)(a1)), "g" ((USItype)(b1)), \ + "%2" ((USItype)(a0)), "g" ((USItype)(b0))) +#endif + +#if defined (__amd64__) && W_TYPE_SIZE == 64 +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0" \ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "0" ((UDItype)(s2)), \ + "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \ + "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0))) +#endif + +#if defined (__aarch64__) && W_TYPE_SIZE == 64 +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %x3, xzr"\ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "rZ" (s2), "%rZ" (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0) \ + __CLOBBER_CC) +#endif + +#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB) +/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a + processor running in 32-bit mode, since the carry flag then gets the 32-bit + carry. */ +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3" \ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0) \ + __CLOBBER_CC) +#endif + +#endif /* __GNUC__ */ + +#ifndef add_sssaaaa +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + do { \ + UWtype __s0, __s1, __c0, __c1; \ + __s0 = (a0) + (b0); \ + __s1 = (a1) + (b1); \ + __c0 = __s0 < (a0); \ + __c1 = __s1 < (a1); \ + (s0) = __s0; \ + __s1 = __s1 + __c0; \ + (s1) = __s1; \ + (s2) += __c1 + (__s1 < __c0); \ + } while (0) +#endif + +struct precomp_div_1_pi2 +{ + mp_limb_t dip[2]; + mp_limb_t d; + int norm_cnt; +}; + +mp_limb_t +mpn_div_qr_1n_pi2 (mp_ptr qp, + mp_srcptr up, mp_size_t un, + struct precomp_div_1_pi2 *pd) +{ + mp_limb_t most_significant_q_limb; + mp_size_t i; + mp_limb_t r, u2, u1, u0; + mp_limb_t d0, di1, di0; + mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d; + mp_limb_t cnd; + + ASSERT (un >= 2); + ASSERT ((pd->d & GMP_NUMB_HIGHBIT) != 0); + ASSERT (! 
MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up); + ASSERT_MPN (up, un); + +#define q3 q3a +#define q2 q2b +#define q1 q1b + + up += un - 3; + r = up[2]; + d0 = pd->d; + + most_significant_q_limb = (r >= d0); + r -= d0 & -most_significant_q_limb; + + qp += un - 3; + qp[2] = most_significant_q_limb; + + di1 = pd->dip[1]; + di0 = pd->dip[0]; + + for (i = un - 3; i >= 0; i -= 2) + { + u2 = r; + u1 = up[1]; + u0 = up[0]; + + /* Dividend in {r,u1,u0} */ + + umul_ppmm (q1d,q0d, u1, di0); + umul_ppmm (q2b,q1b, u1, di1); + q2b++; /* cannot spill */ + add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0); + + umul_ppmm (q2c,q1c, u2, di0); + add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c); + umul_ppmm (q3a,q2a, u2, di1); + + add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d); + + q3 += r; + + r = u0 - q2 * d0; + + cnd = (r >= q1); + r += d0 & -cnd; + sub_ddmmss (q3,q2, q3,q2, 0,cnd); + + if (UNLIKELY (r >= d0)) + { + r -= d0; + add_ssaaaa (q3,q2, q3,q2, 0,1); + } + + qp[0] = q2; + qp[1] = q3; + + up -= 2; + qp -= 2; + } + + if ((un & 1) == 0) + { + u2 = r; + u1 = up[1]; + + udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1); + qp[1] = q3; + } + + return r; + +#undef q3 +#undef q2 +#undef q1 +} diff --git a/gmp-6.3.0/mpn/generic/div_qr_1u_pi2.c b/gmp-6.3.0/mpn/generic/div_qr_1u_pi2.c new file mode 100644 index 0000000..ea38e3c --- /dev/null +++ b/gmp-6.3.0/mpn/generic/div_qr_1u_pi2.c @@ -0,0 +1,236 @@ +/* mpn_div_qr_1u_pi2. + + THIS FILE CONTAINS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS + ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2013, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* ISSUES: + + * Can we really use the high pi2 inverse limb for udiv_qrnnd_preinv? + + * Are there any problems with generating n quotient limbs in the q area? It + surely simplifies things. + + * Not yet adequately tested. +*/ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Define some longlong.h-style macros, but for wider operations. + * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into + an additional sum operand. +*/ +#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! 
defined (NO_ASM) + +#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32 +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0" \ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "0" ((USItype)(s2)), \ + "1" ((USItype)(a1)), "g" ((USItype)(b1)), \ + "%2" ((USItype)(a0)), "g" ((USItype)(b0))) +#endif + +#if defined (__amd64__) && W_TYPE_SIZE == 64 +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0" \ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "0" ((UDItype)(s2)), \ + "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \ + "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0))) +#endif + +#if defined (__aarch64__) && W_TYPE_SIZE == 64 +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %x3, xzr"\ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "rZ" (s2), "%rZ" (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0) \ + __CLOBBER_CC) +#endif + +#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB) +/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a + processor running in 32-bit mode, since the carry flag then gets the 32-bit + carry. */ +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3" \ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0) \ + __CLOBBER_CC) +#endif + +#endif /* __GNUC__ */ + +#ifndef add_sssaaaa +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + do { \ + UWtype __s0, __s1, __c0, __c1; \ + __s0 = (a0) + (b0); \ + __s1 = (a1) + (b1); \ + __c0 = __s0 < (a0); \ + __c1 = __s1 < (a1); \ + (s0) = __s0; \ + __s1 = __s1 + __c0; \ + (s1) = __s1; \ + (s2) += __c1 + (__s1 < __c0); \ + } while (0) +#endif + +struct precomp_div_1_pi2 +{ + mp_limb_t dip[2]; + mp_limb_t d; + int norm_cnt; +}; + +mp_limb_t +mpn_div_qr_1u_pi2 (mp_ptr qp, + mp_srcptr up, mp_size_t un, + struct precomp_div_1_pi2 *pd) +{ + mp_size_t i; + mp_limb_t r, u2, u1, u0; + mp_limb_t d0, di1, di0; + mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d; + mp_limb_t cnd; + int cnt; + + ASSERT (un >= 2); + ASSERT ((pd->d & GMP_NUMB_HIGHBIT) == 0); + ASSERT (! 
MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up); + ASSERT_MPN (up, un); + +#define q3 q3a +#define q2 q2b +#define q1 q1b + + up += un - 3; + cnt = pd->norm_cnt; + r = up[2] >> (GMP_NUMB_BITS - cnt); + d0 = pd->d << cnt; + + qp += un - 2; + + di1 = pd->dip[1]; + di0 = pd->dip[0]; + + for (i = un - 3; i >= 0; i -= 2) + { + u2 = r; + u1 = (up[2] << cnt) | (up[1] >> (GMP_NUMB_BITS - cnt)); + u0 = (up[1] << cnt) | (up[0] >> (GMP_NUMB_BITS - cnt)); + + /* Dividend in {r,u1,u0} */ + + umul_ppmm (q1d,q0d, u1, di0); + umul_ppmm (q2b,q1b, u1, di1); + q2b++; /* cannot spill */ + add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0); + + umul_ppmm (q2c,q1c, u2, di0); + add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c); + umul_ppmm (q3a,q2a, u2, di1); + + add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d); + + q3 += r; + + r = u0 - q2 * d0; + + cnd = (r >= q1); + r += d0 & -cnd; + sub_ddmmss (q3,q2, q3,q2, 0,cnd); + + if (UNLIKELY (r >= d0)) + { + r -= d0; + add_ssaaaa (q3,q2, q3,q2, 0,1); + } + + qp[0] = q2; + qp[1] = q3; + + up -= 2; + qp -= 2; + } + + if ((un & 1) != 0) + { + u2 = r; + u1 = (up[2] << cnt); + + udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1); + qp[1] = q3; + } + else + { + u2 = r; + u1 = (up[2] << cnt) | (up[1] >> (GMP_NUMB_BITS - cnt)); + u0 = (up[1] << cnt); + + /* Dividend in {r,u1,u0} */ + + umul_ppmm (q1d,q0d, u1, di0); + umul_ppmm (q2b,q1b, u1, di1); + q2b++; /* cannot spill */ + add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0); + + umul_ppmm (q2c,q1c, u2, di0); + add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c); + umul_ppmm (q3a,q2a, u2, di1); + + add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d); + + q3 += r; + + r = u0 - q2 * d0; + + cnd = (r >= q1); + r += d0 & -cnd; + sub_ddmmss (q3,q2, q3,q2, 0,cnd); + + if (UNLIKELY (r >= d0)) + { + r -= d0; + add_ssaaaa (q3,q2, q3,q2, 0,1); + } + + qp[0] = q2; + qp[1] = q3; + } + + return r >> cnt; + +#undef q3 +#undef q2 +#undef q1 +} diff --git a/gmp-6.3.0/mpn/generic/div_qr_2.c b/gmp-6.3.0/mpn/generic/div_qr_2.c new file mode 100644 index 0000000..c3c8f57 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/div_qr_2.c @@ -0,0 +1,314 @@ +/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and + quotient. The divisor is two limbs. + + Contributed to the GNU project by Torbjorn Granlund and Niels Möller + + THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright 1993-1996, 1999-2002, 2011, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef DIV_QR_2_PI2_THRESHOLD +/* Disabled unless explicitly tuned. */ +#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX +#endif + +#ifndef SANITY_CHECK +#define SANITY_CHECK 0 +#endif + +/* Define some longlong.h-style macros, but for wider operations. + * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into + an additional sum operand. + * add_csaac accepts two addends and a carry in, and generates a sum and a + carry out. A little like a "full adder". +*/ +#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM) + +#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32 +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0" \ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "0" ((USItype)(s2)), \ + "1" ((USItype)(a1)), "g" ((USItype)(b1)), \ + "%2" ((USItype)(a0)), "g" ((USItype)(b0))) +#endif + +#if defined (__amd64__) && W_TYPE_SIZE == 64 +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0" \ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "0" ((UDItype)(s2)), \ + "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \ + "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0))) +#endif + +#if defined (__aarch64__) && W_TYPE_SIZE == 64 +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %x3, xzr"\ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "rZ" (s2), "%rZ" (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0) \ + __CLOBBER_CC) +#endif + +#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB) +/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a + processor running in 32-bit mode, since the carry flag then gets the 32-bit + carry. */ +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3" \ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0) \ + __CLOBBER_CC) +#endif + +#endif /* __GNUC__ */ + +#ifndef add_sssaaaa +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + do { \ + UWtype __s0, __s1, __c0, __c1; \ + __s0 = (a0) + (b0); \ + __s1 = (a1) + (b1); \ + __c0 = __s0 < (a0); \ + __c1 = __s1 < (a1); \ + (s0) = __s0; \ + __s1 = __s1 + __c0; \ + (s1) = __s1; \ + (s2) += __c1 + (__s1 < __c0); \ + } while (0) +#endif + +/* Typically used with r1, r0 same as n3, n2. Other types of overlap + between inputs and outputs are not supported. 
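+   It divides the four-limb dividend <n3, n2, n1, n0> by the normalized
+   two-limb divisor <d1, d0>, using the precomputed inverse <di1, di0>
+   from invert_4by2 below, and leaves the two-limb quotient in (q1, q0)
+   and the two-limb remainder in (r1, r0).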
*/
+#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0)	\
+  do {									\
+    mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0;		\
+    mp_limb_t _t1, _t0;							\
+    mp_limb_t _mask;							\
+									\
+    /* [q3,q2,q1,q0] = [n3,n2]*[di1,di0] + [n3,n2,n1,n0] + [0,1,0,0] */	\
+    umul_ppmm (_q2,_q1, n2, di1);					\
+    umul_ppmm (_q3,_q2a, n3, di1);					\
+    ++_q2;		/* _q2 cannot overflow */			\
+    add_ssaaaa (_q3,_q2, _q3,_q2, n3,_q2a);				\
+    umul_ppmm (_q2c,_q1c, n3, di0);					\
+    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, n2,_q1c);			\
+    umul_ppmm (_q1d,_q0, n2, di0);					\
+    add_sssaaaa (_q2c,_q1,_q0, _q1,_q0, n1,n0); /* _q2c cannot overflow */ \
+    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1d);			\
+									\
+    umul_ppmm (_t1,_t0, _q2, d0);					\
+    _t1 += _q2 * d1 + _q3 * d0;						\
+									\
+    sub_ddmmss (r1, r0, n1, n0, _t1, _t0);				\
+									\
+    _mask = -(mp_limb_t) ((r1 >= _q1) & ((r1 > _q1) | (r0 >= _q0))); /* (r1,r0) >= (q1,q0) */ \
+    add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask);		\
+    sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask);		\
+									\
+    if (UNLIKELY (r1 >= d1))						\
+      {									\
+	if (r1 > d1 || r0 >= d0)					\
+	  {								\
+	    sub_ddmmss (r1, r0, r1, r0, d1, d0);			\
+	    add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1));\
+	  }								\
+      }									\
+    (q1) = _q3;								\
+    (q0) = _q2;								\
+  } while (0)
+
+static void
+invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
+{
+  mp_limb_t v1, v0, p1, t1, t0, p0, mask;
+  invert_limb (v1, d1);
+  p1 = d1 * v1;
+  /* <1, v1> * d1 = <B-1, p1> */
+  p1 += d0;
+  if (p1 < d0)
+    {
+      v1--;
+      mask = -(mp_limb_t) (p1 >= d1);
+      p1 -= d1;
+      v1 += mask;
+      p1 -= mask & d1;
+    }
+  /* <1, v1> * d1 + d0 = <B-1, p1> */
+  umul_ppmm (t1, p0, d0, v1);
+  p1 += t1;
+  if (p1 < t1)
+    {
+      if (UNLIKELY (p1 >= d1))
+	{
+	  if (p1 > d1 || p0 >= d0)
+	    {
+	      sub_ddmmss (p1, p0, p1, p0, d1, d0);
+	      v1--;
+	    }
+	}
+      sub_ddmmss (p1, p0, p1, p0, d1, d0);
+      v1--;
+    }
+  /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
+   * with <p1, p0> + <d1, d0> >= B^2.
+   *
+   * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>. The
+   * partial remainder after <1, v1> is
+   *
+   * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
+   *   = <~p1, ~p0, B-1>
+   */
+  udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
+  di[0] = v0;
+  di[1] = v1;
+
+#if SANITY_CHECK
+  {
+    mp_limb_t tp[4];
+    mp_limb_t dp[2];
+    dp[0] = d0;
+    dp[1] = d1;
+    mpn_mul_n (tp, dp, di, 2);
+    ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
+    ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
+    ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
+    ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
+  }
+#endif
+}
+
+static mp_limb_t
+mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
+		   mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
+{
+  mp_limb_t qh;
+  mp_size_t i;
+  mp_limb_t r1, r0;
+
+  ASSERT (nn >= 2);
+  ASSERT (d1 & GMP_NUMB_HIGHBIT);
+
+  r1 = np[nn-1];
+  r0 = np[nn-2];
+
+  qh = 0;
+  if (r1 >= d1 && (r1 > d1 || r0 >= d0))
+    {
+#if GMP_NAIL_BITS == 0
+      sub_ddmmss (r1, r0, r1, r0, d1, d0);
+#else
+      r0 = r0 - d0;
+      r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1);
+      r0 &= GMP_NUMB_MASK;
+#endif
+      qh = 1;
+    }
+
+  for (i = nn - 2; i >= 2; i -= 2)
+    {
+      mp_limb_t n1, n0, q1, q0;
+      n1 = np[i-1];
+      n0 = np[i-2];
+      udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
+      qp[i-1] = q1;
+      qp[i-2] = q0;
+    }
+
+  if (i > 0)
+    {
+      mp_limb_t q;
+      udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
+      qp[0] = q;
+    }
+  rp[1] = r1;
+  rp[0] = r0;
+
+  return qh;
+}
+
+
+/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
+   significant quotient limbs at qp and the 2 long remainder at np.
+   Return the most significant limb of the quotient.
+
+   Preconditions:
+   1.
qp must either not overlap with the other operands at all, or + qp >= np + 2 must hold true. (This means that it's possible to put + the quotient in the high part of {np,nn}, right above the remainder.) + 2. nn >= 2. */ + +mp_limb_t +mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn, + mp_srcptr dp) +{ + mp_limb_t d1; + mp_limb_t d0; + gmp_pi1_t dinv; + + ASSERT (nn >= 2); + ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2); + ASSERT_MPN (np, nn); + ASSERT_MPN (dp, 2); + + d1 = dp[1]; d0 = dp[0]; + + ASSERT (d1 > 0); + + if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT)) + { + if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD)) + { + gmp_pi1_t dinv; + invert_pi1 (dinv, d1, d0); + return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32); + } + else + { + mp_limb_t di[2]; + invert_4by2 (di, d1, d0); + return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]); + } + } + else + { + int shift; + count_leading_zeros (shift, d1); + d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift)); + d0 <<= shift; + invert_pi1 (dinv, d1, d0); + return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32); + } +} diff --git a/gmp-6.3.0/mpn/generic/div_qr_2n_pi1.c b/gmp-6.3.0/mpn/generic/div_qr_2n_pi1.c new file mode 100644 index 0000000..131a811 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/div_qr_2n_pi1.c @@ -0,0 +1,84 @@ +/* mpn_div_qr_2n_pi1 + + Contributed to the GNU project by Torbjorn Granlund and Niels Möller + + THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright 1993-1996, 1999-2002, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* 3/2 loop, for normalized divisor */ +mp_limb_t +mpn_div_qr_2n_pi1 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn, + mp_limb_t d1, mp_limb_t d0, mp_limb_t di) +{ + mp_limb_t qh; + mp_size_t i; + mp_limb_t r1, r0; + + ASSERT (nn >= 2); + ASSERT (d1 & GMP_NUMB_HIGHBIT); + + np += nn - 2; + r1 = np[1]; + r0 = np[0]; + + qh = 0; + if (r1 >= d1 && (r1 > d1 || r0 >= d0)) + { +#if GMP_NAIL_BITS == 0 + sub_ddmmss (r1, r0, r1, r0, d1, d0); +#else + r0 = r0 - d0; + r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1); + r0 &= GMP_NUMB_MASK; +#endif + qh = 1; + } + + for (i = nn - 2 - 1; i >= 0; i--) + { + mp_limb_t n0, q; + n0 = np[-1]; + udiv_qr_3by2 (q, r1, r0, r1, r0, n0, d1, d0, di); + np--; + qp[i] = q; + } + + rp[1] = r1; + rp[0] = r0; + + return qh; +} diff --git a/gmp-6.3.0/mpn/generic/div_qr_2u_pi1.c b/gmp-6.3.0/mpn/generic/div_qr_2u_pi1.c new file mode 100644 index 0000000..70e617b --- /dev/null +++ b/gmp-6.3.0/mpn/generic/div_qr_2u_pi1.c @@ -0,0 +1,76 @@ +/* mpn_div_qr_2u_pi1 + + Contributed to the GNU project by Niels Möller + + THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* 3/2 loop, for unnormalized divisor. Caller must pass shifted d1 and + d0, while {np,nn} is shifted on the fly. */ +mp_limb_t +mpn_div_qr_2u_pi1 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn, + mp_limb_t d1, mp_limb_t d0, int shift, mp_limb_t di) +{ + mp_limb_t qh; + mp_limb_t r2, r1, r0; + mp_size_t i; + + ASSERT (nn >= 2); + ASSERT (d1 & GMP_NUMB_HIGHBIT); + ASSERT (shift > 0); + + r2 = np[nn-1] >> (GMP_LIMB_BITS - shift); + r1 = (np[nn-1] << shift) | (np[nn-2] >> (GMP_LIMB_BITS - shift)); + r0 = np[nn-2] << shift; + + udiv_qr_3by2 (qh, r2, r1, r2, r1, r0, d1, d0, di); + + for (i = nn - 2 - 1; i >= 0; i--) + { + mp_limb_t q; + r0 = np[i]; + r1 |= r0 >> (GMP_LIMB_BITS - shift); + r0 <<= shift; + udiv_qr_3by2 (q, r2, r1, r2, r1, r0, d1, d0, di); + qp[i] = q; + } + + rp[0] = (r1 >> shift) | (r2 << (GMP_LIMB_BITS - shift)); + rp[1] = r2 >> shift; + + return qh; +} diff --git a/gmp-6.3.0/mpn/generic/dive_1.c b/gmp-6.3.0/mpn/generic/dive_1.c new file mode 100644 index 0000000..056f5b9 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/dive_1.c @@ -0,0 +1,146 @@ +/* mpn_divexact_1 -- mpn by limb exact division. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. 
THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2000-2003, 2005, 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+
+/* Divide a={src,size} by d=divisor and store the quotient in q={dst,size}.
+   q will only be correct if d divides a exactly.
+
+   A separate loop is used for shift==0 because n<<GMP_LIMB_BITS doesn't
+   give zero on all CPUs.
+
+   A variant ("mpn_divexact_1c", say) could accept and return the carry c,
+   letting a long calculation be done piece by piece, and let the caller
+   do a final umul if interested.
+
+   When the divisor is even, the factors of two could be handled with a
+   separate mpn_rshift, instead of shifting on the fly.  That might be
+   faster on some CPUs and would mean just the shift==0 style loop would be
+   needed.
+
+   If n<<GMP_LIMB_BITS gives zero on a particular CPU then the separate
+   shift==0 loop is unnecessary, and could be eliminated if there's no
+   great speed difference.  */
+
+void
+mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor)
+{
+  mp_size_t i;
+  mp_limb_t c, h, l, ls, s, s_next, inverse, dummy;
+  unsigned shift;
+
+  ASSERT (size >= 1);
+  ASSERT (divisor != 0);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size));
+  ASSERT_MPN (src, size);
+  ASSERT_LIMB (divisor);
+
+  if ((divisor & 1) == 0)
+    {
+      count_trailing_zeros (shift, divisor);
+      divisor >>= shift;
+    }
+  else
+    shift = 0;
+
+  binvert_limb (inverse, divisor);
+  divisor <<= GMP_NAIL_BITS;
+
+  if (shift != 0)
+    {
+      c = 0;
+
+      s = src[0];
+
+      for (i = 1; i < size; i++)
+	{
+	  s_next = src[i];
+	  ls = ((s >> shift) | (s_next << (GMP_NUMB_BITS-shift))) & GMP_NUMB_MASK;
+	  s = s_next;
+
+	  SUBC_LIMB (c, l, ls, c);
+
+	  l = (l * inverse) & GMP_NUMB_MASK;
+	  dst[i - 1] = l;
+
+	  umul_ppmm (h, dummy, l, divisor);
+	  c += h;
+	}
+
+      ls = s >> shift;
+      l = ls - c;
+      l = (l * inverse) & GMP_NUMB_MASK;
+      dst[size - 1] = l;
+    }
+  else
+    {
+      s = src[0];
+
+      l = (s * inverse) & GMP_NUMB_MASK;
+      dst[0] = l;
+      c = 0;
+
+      for (i = 1; i < size; i++)
+	{
+	  umul_ppmm (h, dummy, l, divisor);
+	  c += h;
+
+	  s = src[i];
+	  SUBC_LIMB (c, l, s, c);
+
+	  l = (l * inverse) & GMP_NUMB_MASK;
+	  dst[i] = l;
+	}
+    }
+}
diff --git a/gmp-6.3.0/mpn/generic/diveby3.c b/gmp-6.3.0/mpn/generic/diveby3.c new file mode 100644 index 0000000..7dee0bc --- /dev/null +++ b/gmp-6.3.0/mpn/generic/diveby3.c @@ -0,0 +1,173 @@ +/* mpn_divexact_by3c -- mpn exact division by 3.
+
+Copyright 2000-2003, 2008 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#if DIVEXACT_BY3_METHOD == 0
+
+mp_limb_t
+mpn_divexact_by3c (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t c)
+{
+  mp_limb_t r;
+  r = mpn_bdiv_dbm1c (rp, up, un, GMP_NUMB_MASK / 3, GMP_NUMB_MASK / 3 * c);
+
+  /* Possible bdiv_dbm1 return values are C * (GMP_NUMB_MASK / 3), 0 <= C < 3.
+     We want to return C.  We compute the remainder mod 4 and notice that the
+     inverse of (2^(2k)-1)/3 mod 4 is 1.  */
+  return r & 3;
+}
+
+#endif
+
+#if DIVEXACT_BY3_METHOD == 1
+
+/* The algorithm here is basically the same as mpn_divexact_1, as described
+   in the manual.  Namely at each step q = (src[i]-c)*inverse, and new c =
+   borrow(src[i]-c) + high(divisor*q).  But because the divisor is just 3,
+   high(divisor*q) can be determined with two comparisons instead of a
+   multiply.
+
+   The "c += ..."s add the high limb of 3*l to c.  That high limb will be 0,
+   1 or 2.  Doing two separate "+="s seems to give better code on gcc (as of
+   2.95.2 at least).
+
+   It will be noted that the new c is formed by adding three values each 0
+   or 1.  But the total is only 0, 1 or 2.  When the subtraction src[i]-c
+   causes a borrow, that leaves a limb value of either 0xFF...FF or
+   0xFF...FE.  The multiply by MODLIMB_INVERSE_3 gives 0x55...55 or
+   0xAA...AA respectively, and in those cases high(3*q) is only 0 or 1
+   respectively, hence a total of no more than 2.
+
+   Alternatives:
+
+   This implementation has each multiply on the dependent chain, due to
+   "l=s-c".  See below for alternative code which avoids that.  */
+
+mp_limb_t
+mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t c)
+{
+  mp_limb_t l, q, s;
+  mp_size_t i;
+
+  ASSERT (un >= 1);
+  ASSERT (c == 0 || c == 1 || c == 2);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
+
+  i = 0;
+  do
+    {
+      s = up[i];
+      SUBC_LIMB (c, l, s, c);
+
+      q = (l * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
+      rp[i] = q;
+
+      c += (q >= GMP_NUMB_CEIL_MAX_DIV3);
+      c += (q >= GMP_NUMB_CEIL_2MAX_DIV3);
+    }
+  while (++i < un);
+
+  ASSERT (c == 0 || c == 1 || c == 2);
+  return c;
+}
+
+
+#endif
+
+#if DIVEXACT_BY3_METHOD == 2
+
+/* The following alternative code re-arranges the quotient calculation from
+   (src[i]-c)*inverse to instead
+
+       q = src[i]*inverse - c*inverse
+
+   thereby allowing src[i]*inverse to be scheduled back as far as desired,
+   making full use of multiplier throughput and leaving just some carry
+   handling on the dependent chain.
+
+   The carry handling consists of determining the c for the next iteration.
+   This is the same as described above, namely look for any borrow from
+   src[i]-c, and at the high of 3*q.
+
+   high(3*q) is done with two comparisons as above (in c2 and c3).  The
+   borrow from src[i]-c is incorporated into those by noting that if there's
+   a borrow then we have src[i]-c == 0xFF..FF or 0xFF..FE, in turn
+   giving q = 0x55..55 or 0xAA..AA.  Adding 1 to either of those q values is
+   enough to make high(3*q) come out 1 bigger, as required.
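+
+   (As a concrete example, with 32-bit limbs MODLIMB_INVERSE_3 is
+   0xAAAAAAAB, since 3 * 0xAAAAAAAB = 0x200000001 == 1 mod 2^32.  The
+   0x55..55 and 0xAA..AA values above are then 0x55555555 and 0xAAAAAAAA:
+   3 * 0x55555555 = 0xFFFFFFFF and 3 * 0xAAAAAAAA = 0x1FFFFFFFE.)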
+ + l = -c*inverse is calculated at the same time as c, since for most chips + it can be more conveniently derived from separate c1/c2/c3 values than + from a combined c equal to 0, 1 or 2. + + The net effect is that with good pipelining this loop should be able to + run at perhaps 4 cycles/limb, depending on available execute resources + etc. + + Usage: + + This code is not used by default, since we really can't rely on the + compiler generating a good software pipeline, nor on such an approach + even being worthwhile on all CPUs. + + Itanium is one chip where this algorithm helps though, see + mpn/ia64/diveby3.asm. */ + +mp_limb_t +mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t cy) +{ + mp_limb_t s, sm, cl, q, qx, c2, c3; + mp_size_t i; + + ASSERT (un >= 1); + ASSERT (cy == 0 || cy == 1 || cy == 2); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un)); + + cl = cy == 0 ? 0 : cy == 1 ? -MODLIMB_INVERSE_3 : -2*MODLIMB_INVERSE_3; + + for (i = 0; i < un; i++) + { + s = up[i]; + sm = (s * MODLIMB_INVERSE_3) & GMP_NUMB_MASK; + + q = (cl + sm) & GMP_NUMB_MASK; + rp[i] = q; + qx = q + (s < cy); + + c2 = qx >= GMP_NUMB_CEIL_MAX_DIV3; + c3 = qx >= GMP_NUMB_CEIL_2MAX_DIV3 ; + + cy = c2 + c3; + cl = (-c2 & -MODLIMB_INVERSE_3) + (-c3 & -MODLIMB_INVERSE_3); + } + + return cy; +} + +#endif diff --git a/gmp-6.3.0/mpn/generic/divexact.c b/gmp-6.3.0/mpn/generic/divexact.c new file mode 100644 index 0000000..ec417df --- /dev/null +++ b/gmp-6.3.0/mpn/generic/divexact.c @@ -0,0 +1,296 @@ +/* mpn_divexact(qp,np,nn,dp,dn,tp) -- Divide N = {np,nn} by D = {dp,dn} storing + the result in Q = {qp,nn-dn+1} expecting no remainder. Overlap allowed + between Q and N; all other overlap disallowed. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2007, 2009, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#include "gmp-impl.h" +#include "longlong.h" + +#if 1 +void +mpn_divexact (mp_ptr qp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn) +{ + unsigned shift; + mp_size_t qn; + mp_ptr tp; + TMP_DECL; + + ASSERT (dn > 0); + ASSERT (nn >= dn); + ASSERT (dp[dn-1] > 0); + + while (dp[0] == 0) + { + ASSERT (np[0] == 0); + dp++; + np++; + dn--; + nn--; + } + + if (dn == 1) + { + MPN_DIVREM_OR_DIVEXACT_1 (qp, np, nn, dp[0]); + return; + } + + TMP_MARK; + + qn = nn + 1 - dn; + count_trailing_zeros (shift, dp[0]); + + if (shift > 0) + { + mp_ptr wp; + mp_size_t ss; + ss = (dn > qn) ? qn + 1 : dn; + + tp = TMP_ALLOC_LIMBS (ss); + mpn_rshift (tp, dp, ss, shift); + dp = tp; + + /* Since we have excluded dn == 1, we have nn > qn, and we need + to shift one limb beyond qn. */ + wp = TMP_ALLOC_LIMBS (qn + 1); + mpn_rshift (wp, np, qn + 1, shift); + np = wp; + } + + if (dn > qn) + dn = qn; + + tp = TMP_ALLOC_LIMBS (mpn_bdiv_q_itch (qn, dn)); + mpn_bdiv_q (qp, np, qn, dp, dn, tp); + TMP_FREE; + + /* Since bdiv_q computes -N/D (mod B^{qn}), we must negate now. */ + mpn_neg (qp, qp, qn); +} + +#else + +/* We use the Jebelean's bidirectional exact division algorithm. This is + somewhat naively implemented, with equal quotient parts done by 2-adic + division and truncating division. Since 2-adic division is faster, it + should be used for a larger chunk. + + This code is horrendously ugly, in all sorts of ways. + + * It was hacked without much care or thought, but with a testing program. + * It handles scratch space frivolously, and furthermore the itch function + is broken. + * Doesn't provide any measures to deal with mu_divappr_q's +3 error. We + have yet to provoke an error due to this, though. + * Algorithm selection leaves a lot to be desired. In particular, the choice + between DC and MU isn't a point, but we treat it like one. + * It makes the msb part 1 or 2 limbs larger than the lsb part, in spite of + that the latter is faster. We should at least reverse this, but perhaps + we should make the lsb part considerably larger. (How do we tune this?) +*/ + +mp_size_t +mpn_divexact_itch (mp_size_t nn, mp_size_t dn) +{ + return nn + dn; /* FIXME this is not right */ +} + +void +mpn_divexact (mp_ptr qp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_ptr scratch) +{ + mp_size_t qn; + mp_size_t nn0, qn0; + mp_size_t nn1, qn1; + mp_ptr tp; + mp_limb_t qml; + mp_limb_t qh; + int cnt; + mp_ptr xdp; + mp_limb_t di; + mp_limb_t cy; + gmp_pi1_t dinv; + TMP_DECL; + + TMP_MARK; + + qn = nn - dn + 1; + + /* For small divisors, and small quotients, don't use Jebelean's algorithm. */ + if (dn < DIVEXACT_JEB_THRESHOLD || qn < DIVEXACT_JEB_THRESHOLD) + { + tp = scratch; + MPN_COPY (tp, np, qn); + binvert_limb (di, dp[0]); di = -di; + dn = MIN (dn, qn); + mpn_sbpi1_bdiv_q (qp, tp, qn, dp, dn, di); + TMP_FREE; + return; + } + + qn0 = ((nn - dn) >> 1) + 1; /* low quotient size */ + + /* If quotient is much larger than the divisor, the bidirectional algorithm + does not work as currently implemented. Fall back to plain bdiv. 
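+   (The msb part is divided by the high qn1 limbs of the divisor, sliced
+   off as dp + dn - qn1; that slicing needs qn1 <= dn, which no longer
+   holds here.)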
*/ + if (qn0 > dn) + { + if (BELOW_THRESHOLD (dn, DC_BDIV_Q_THRESHOLD)) + { + tp = scratch; + MPN_COPY (tp, np, qn); + binvert_limb (di, dp[0]); di = -di; + dn = MIN (dn, qn); + mpn_sbpi1_bdiv_q (qp, tp, qn, dp, dn, di); + } + else if (BELOW_THRESHOLD (dn, MU_BDIV_Q_THRESHOLD)) + { + tp = scratch; + MPN_COPY (tp, np, qn); + binvert_limb (di, dp[0]); di = -di; + mpn_dcpi1_bdiv_q (qp, tp, qn, dp, dn, di); + } + else + { + mpn_mu_bdiv_q (qp, np, qn, dp, dn, scratch); + } + TMP_FREE; + return; + } + + nn0 = qn0 + qn0; + + nn1 = nn0 - 1 + ((nn-dn) & 1); + qn1 = qn0; + if (LIKELY (qn0 != dn)) + { + nn1 = nn1 + 1; + qn1 = qn1 + 1; + if (UNLIKELY (dp[dn - 1] == 1 && qn1 != dn)) + { + /* If the leading divisor limb == 1, i.e. has just one bit, we have + to include an extra limb in order to get the needed overlap. */ + /* FIXME: Now with the mu_divappr_q function, we should really need + more overlap. That indicates one of two things: (1) The test code + is not good. (2) We actually overlap too much by default. */ + nn1 = nn1 + 1; + qn1 = qn1 + 1; + } + } + + tp = TMP_ALLOC_LIMBS (nn1 + 1); + + count_leading_zeros (cnt, dp[dn - 1]); + + /* Normalize divisor, store into tmp area. */ + if (cnt != 0) + { + xdp = TMP_ALLOC_LIMBS (qn1); + mpn_lshift (xdp, dp + dn - qn1, qn1, cnt); + } + else + { + xdp = (mp_ptr) dp + dn - qn1; + } + + /* Shift dividend according to the divisor normalization. */ + /* FIXME: We compute too much here for XX_divappr_q, but these functions' + interfaces want a pointer to the imaginative least significant limb, not + to the least significant *used* limb. Of course, we could leave nn1-qn1 + rubbish limbs in the low part, to save some time. */ + if (cnt != 0) + { + cy = mpn_lshift (tp, np + nn - nn1, nn1, cnt); + if (cy != 0) + { + tp[nn1] = cy; + nn1++; + } + } + else + { + /* FIXME: This copy is not needed for mpn_mu_divappr_q, except when the + mpn_sub_n right before is executed. */ + MPN_COPY (tp, np + nn - nn1, nn1); + } + + invert_pi1 (dinv, xdp[qn1 - 1], xdp[qn1 - 2]); + if (BELOW_THRESHOLD (qn1, DC_DIVAPPR_Q_THRESHOLD)) + { + qp[qn0 - 1 + nn1 - qn1] = mpn_sbpi1_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, dinv.inv32); + } + else if (BELOW_THRESHOLD (qn1, MU_DIVAPPR_Q_THRESHOLD)) + { + qp[qn0 - 1 + nn1 - qn1] = mpn_dcpi1_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, &dinv); + } + else + { + /* FIXME: mpn_mu_divappr_q doesn't handle qh != 0. Work around it with a + conditional subtraction here. */ + qh = mpn_cmp (tp + nn1 - qn1, xdp, qn1) >= 0; + if (qh) + mpn_sub_n (tp + nn1 - qn1, tp + nn1 - qn1, xdp, qn1); + mpn_mu_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, scratch); + qp[qn0 - 1 + nn1 - qn1] = qh; + } + qml = qp[qn0 - 1]; + + binvert_limb (di, dp[0]); di = -di; + + if (BELOW_THRESHOLD (qn0, DC_BDIV_Q_THRESHOLD)) + { + MPN_COPY (tp, np, qn0); + mpn_sbpi1_bdiv_q (qp, tp, qn0, dp, qn0, di); + } + else if (BELOW_THRESHOLD (qn0, MU_BDIV_Q_THRESHOLD)) + { + MPN_COPY (tp, np, qn0); + mpn_dcpi1_bdiv_q (qp, tp, qn0, dp, qn0, di); + } + else + { + mpn_mu_bdiv_q (qp, np, qn0, dp, qn0, scratch); + } + + if (qml < qp[qn0 - 1]) + mpn_decr_u (qp + qn0, 1); + + TMP_FREE; +} +#endif diff --git a/gmp-6.3.0/mpn/generic/divis.c b/gmp-6.3.0/mpn/generic/divis.c new file mode 100644 index 0000000..f989ddb --- /dev/null +++ b/gmp-6.3.0/mpn/generic/divis.c @@ -0,0 +1,194 @@ +/* mpn_divisible_p -- mpn by mpn divisibility test + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. 
+ +Copyright 2001, 2002, 2005, 2009, 2014, 2017, 2018 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* Determine whether A={ap,an} is divisible by D={dp,dn}. Must have both + operands normalized, meaning high limbs non-zero, except that an==0 is + allowed. + + There usually won't be many low zero bits on D, but the checks for this + are fast and might pick up a few operand combinations, in particular they + might reduce D to fit the single-limb mod_1/modexact_1 code. + + Future: + + Getting the remainder limb by limb would make an early exit possible on + finding a non-zero. This would probably have to be bdivmod style so + there's no addback, but it would need a multi-precision inverse and so + might be slower than the plain method (on small sizes at least). + + When D must be normalized (shifted to low bit set), it's possible to + suppress the bit-shifting of A down, as long as it's already been checked + that A has at least as many trailing zero bits as D. */ + +int +mpn_divisible_p (mp_srcptr ap, mp_size_t an, + mp_srcptr dp, mp_size_t dn) +{ + mp_limb_t alow, dlow, dmask; + mp_ptr qp, rp, tp; + mp_limb_t di; + unsigned twos; + int c; + TMP_DECL; + + ASSERT (an >= 0); + ASSERT (an == 0 || ap[an-1] != 0); + ASSERT (dn >= 1); + ASSERT (dp[dn-1] != 0); + ASSERT_MPN (ap, an); + ASSERT_MPN (dp, dn); + + /* When a<d, only a==0 is divisible. + Notice this test covers all cases of an==0. */ + if (an < dn) + return (an == 0); + + /* Strip low zero limbs from d, requiring a==0 on those. */ + for (;;) + { + alow = *ap; + dlow = *dp; + + if (dlow != 0) + break; + + if (alow != 0) + return 0; /* a has fewer low zero limbs than d, so not divisible */ + + /* a!=0 and d!=0 so won't get to n==0 */ + an--; ASSERT (an >= 1); + dn--; ASSERT (dn >= 1); + ap++; + dp++; + } + + /* a must have at least as many low zero bits as d */ + dmask = LOW_ZEROS_MASK (dlow); + if ((alow & dmask) != 0) + return 0; + + if (dn == 1) + { + if (ABOVE_THRESHOLD (an, BMOD_1_TO_MOD_1_THRESHOLD)) + return mpn_mod_1 (ap, an, dlow) == 0; + + count_trailing_zeros (twos, dlow); + dlow >>= twos; + return mpn_modexact_1_odd (ap, an, dlow) == 0; + } + + count_trailing_zeros (twos, dlow); + if (dn == 2) + { + mp_limb_t dsecond = dp[1]; + if (dsecond <= dmask) + { + dlow = (dlow >> twos) | (dsecond << (GMP_NUMB_BITS-twos)); + ASSERT_LIMB (dlow); + return MPN_MOD_OR_MODEXACT_1_ODD (ap, an, dlow) == 0; + } + } + + /* Should we compute Q = A * D^(-1) mod B^k, + R = A - Q * D mod B^k + here, for some small values of k? Then check if R = 0 (mod B^k). */ + + /* We could also compute A' = A mod T and D' = D mod P, for some + P = 3 * 5 * 7 * 11 ..., and then check if any prime factor from P + dividing D' also divides A'. */ + + TMP_MARK; + + TMP_ALLOC_LIMBS_2 (rp, an + 1, + qp, an - dn + 1); /* FIXME: Could we avoid this?
*/ + + if (twos != 0) + { + tp = TMP_ALLOC_LIMBS (dn); + ASSERT_NOCARRY (mpn_rshift (tp, dp, dn, twos)); + dp = tp; + + ASSERT_NOCARRY (mpn_rshift (rp, ap, an, twos)); + } + else + { + MPN_COPY (rp, ap, an); + } + if (rp[an - 1] >= dp[dn - 1]) + { + rp[an] = 0; + an++; + } + else if (an == dn) + { + TMP_FREE; + return 0; + } + + ASSERT (an > dn); /* requirement of functions below */ + + if (BELOW_THRESHOLD (dn, DC_BDIV_QR_THRESHOLD) || + BELOW_THRESHOLD (an - dn, DC_BDIV_QR_THRESHOLD)) + { + binvert_limb (di, dp[0]); + mpn_sbpi1_bdiv_qr (qp, rp, an, dp, dn, -di); + rp += an - dn; + } + else if (BELOW_THRESHOLD (dn, MU_BDIV_QR_THRESHOLD)) + { + binvert_limb (di, dp[0]); + mpn_dcpi1_bdiv_qr (qp, rp, an, dp, dn, -di); + rp += an - dn; + } + else + { + tp = TMP_ALLOC_LIMBS (mpn_mu_bdiv_qr_itch (an, dn)); + mpn_mu_bdiv_qr (qp, rp, rp, an, dp, dn, tp); + } + + /* In general, bdiv may return either R = 0 or R = D when D divides + A. But R = 0 can happen only when A = 0, which we already have + excluded. Furthermore, R == D (mod B^{dn}) implies no carry, so + we don't need to check the carry returned from bdiv. */ + + MPN_CMP (c, rp, dp, dn); + + TMP_FREE; + return c == 0; +} diff --git a/gmp-6.3.0/mpn/generic/divrem.c b/gmp-6.3.0/mpn/generic/divrem.c new file mode 100644 index 0000000..1da84a8 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/divrem.c @@ -0,0 +1,103 @@ +/* mpn_divrem -- Divide natural numbers, producing both remainder and + quotient. This is now just a middle layer calling mpn_tdiv_qr. + +Copyright 1993-1997, 1999-2002, 2005, 2016 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_divrem (mp_ptr qp, mp_size_t qxn, + mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn) +{ + ASSERT (qxn >= 0); + ASSERT (nn >= dn); + ASSERT (dn >= 1); + ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT); + ASSERT (! MPN_OVERLAP_P (np, nn, dp, dn)); + ASSERT (! MPN_OVERLAP_P (qp, nn-dn+qxn, np, nn) || qp==np+dn+qxn); + ASSERT (! 
MPN_OVERLAP_P (qp, nn-dn+qxn, dp, dn)); + ASSERT_MPN (np, nn); + ASSERT_MPN (dp, dn); + + if (dn == 1) + { + mp_limb_t ret; + mp_ptr q2p; + mp_size_t qn; + TMP_DECL; + + TMP_MARK; + q2p = TMP_ALLOC_LIMBS (nn + qxn); + + np[0] = mpn_divrem_1 (q2p, qxn, np, nn, dp[0]); + qn = nn + qxn - 1; + MPN_COPY (qp, q2p, qn); + ret = q2p[qn]; + + TMP_FREE; + return ret; + } + else if (dn == 2) + { + return mpn_divrem_2 (qp, qxn, np, nn, dp); + } + else + { + mp_ptr q2p; + mp_limb_t qhl; + mp_size_t qn; + TMP_DECL; + + TMP_MARK; + if (UNLIKELY (qxn != 0)) + { + mp_ptr n2p; + TMP_ALLOC_LIMBS_2 (n2p, nn + qxn, + q2p, nn - dn + qxn + 1); + MPN_ZERO (n2p, qxn); + MPN_COPY (n2p + qxn, np, nn); + mpn_tdiv_qr (q2p, np, 0L, n2p, nn + qxn, dp, dn); + qn = nn - dn + qxn; + MPN_COPY (qp, q2p, qn); + qhl = q2p[qn]; + } + else + { + q2p = TMP_ALLOC_LIMBS (nn - dn + 1); + mpn_tdiv_qr (q2p, np, 0L, np, nn, dp, dn); + qn = nn - dn; + MPN_COPY (qp, q2p, qn); + qhl = q2p[qn]; + } + TMP_FREE; + return qhl; + } +} diff --git a/gmp-6.3.0/mpn/generic/divrem_1.c b/gmp-6.3.0/mpn/generic/divrem_1.c new file mode 100644 index 0000000..c13aa79 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/divrem_1.c @@ -0,0 +1,254 @@ +/* mpn_divrem_1 -- mpn by limb division. + +Copyright 1991, 1993, 1994, 1996, 1998-2000, 2002, 2003 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* The size where udiv_qrnnd_preinv should be used rather than udiv_qrnnd, + meaning the quotient size where that should happen, the quotient size + being how many udiv divisions will be done. + + The default is to use preinv always, CPUs where this doesn't suit have + tuned thresholds. Note in particular that preinv should certainly be + used if that's the only division available (USE_PREINV_ALWAYS). */ + +#ifndef DIVREM_1_NORM_THRESHOLD +#define DIVREM_1_NORM_THRESHOLD 0 +#endif +#ifndef DIVREM_1_UNNORM_THRESHOLD +#define DIVREM_1_UNNORM_THRESHOLD 0 +#endif + + + +/* If the cpu only has multiply-by-inverse division (eg. alpha), then NORM + and UNNORM thresholds are 0 and only the inversion code is included. + + If multiply-by-inverse is never viable, then NORM and UNNORM thresholds + will be MP_SIZE_T_MAX and only the plain division code is included. + + Otherwise mul-by-inverse is better than plain division above some + threshold, and best results are obtained by having code for both present. + + The main reason for separating the norm and unnorm cases is that not all + CPUs give zero for "n0 >> GMP_LIMB_BITS" which would arise in the unnorm + code used on an already normalized divisor. 
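+ + To make the trade concrete: writing B = 2^GMP_LIMB_BITS, each step of the loops below turns the current remainder r (0 <= r < d after normalization) and the next dividend limb n0 into a quotient limb q and a new remainder r' with r*B + n0 = q*d + r' and 0 <= r' < d. udiv_qrnnd performs this with a hardware divide, while udiv_qrnnd_preinv uses the precomputed dinv = floor((B^2 - 1)/d) - B from invert_limb, replacing the divide by a wide multiply plus a few adjustment steps; the thresholds above record where that replacement pays off.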
+ + If UDIV_NEEDS_NORMALIZATION is false then plain division uses the same + non-shifting code for both the norm and unnorm cases, though with + different criteria for skipping a division, and with different thresholds + of course. And in fact if inversion is never viable, then that simple + non-shifting division would be all that's left. + + The NORM and UNNORM thresholds might not differ much, but if there's + going to be separate code for norm and unnorm then it makes sense to have + separate thresholds. One thing that's possible is that the + mul-by-inverse might be better only for normalized divisors, due to that + case not needing variable bit shifts. + + Notice that the thresholds are tested after the decision to possibly skip + one divide step, so they're based on the actual number of divisions done. + + For the unnorm case, it would be possible to call mpn_lshift to adjust + the dividend all in one go (into the quotient space say), rather than + limb-by-limb in the loop. This might help if mpn_lshift is a lot faster + than what the compiler can generate for EXTRACT. But this is left to CPU + specific implementations to consider, especially since EXTRACT isn't on + the dependent chain. */ + +mp_limb_t +mpn_divrem_1 (mp_ptr qp, mp_size_t qxn, + mp_srcptr up, mp_size_t un, mp_limb_t d) +{ + mp_size_t n; + mp_size_t i; + mp_limb_t n1, n0; + mp_limb_t r = 0; + + ASSERT (qxn >= 0); + ASSERT (un >= 0); + ASSERT (d != 0); + /* FIXME: What's the correct overlap rule when qxn!=0? */ + ASSERT (MPN_SAME_OR_SEPARATE_P (qp+qxn, up, un)); + + n = un + qxn; + if (n == 0) + return 0; + + d <<= GMP_NAIL_BITS; + + qp += (n - 1); /* Make qp point at most significant quotient limb */ + + if ((d & GMP_LIMB_HIGHBIT) != 0) + { + if (un != 0) + { + /* High quotient limb is 0 or 1, skip a divide step. */ + mp_limb_t q; + r = up[un - 1] << GMP_NAIL_BITS; + q = (r >= d); + *qp-- = q; + r -= (d & -q); + r >>= GMP_NAIL_BITS; + n--; + un--; + } + + if (BELOW_THRESHOLD (n, DIVREM_1_NORM_THRESHOLD)) + { + plain: + for (i = un - 1; i >= 0; i--) + { + n0 = up[i] << GMP_NAIL_BITS; + udiv_qrnnd (*qp, r, r, n0, d); + r >>= GMP_NAIL_BITS; + qp--; + } + for (i = qxn - 1; i >= 0; i--) + { + udiv_qrnnd (*qp, r, r, CNST_LIMB(0), d); + r >>= GMP_NAIL_BITS; + qp--; + } + return r; + } + else + { + /* Multiply-by-inverse, divisor already normalized. */ + mp_limb_t dinv; + invert_limb (dinv, d); + + for (i = un - 1; i >= 0; i--) + { + n0 = up[i] << GMP_NAIL_BITS; + udiv_qrnnd_preinv (*qp, r, r, n0, d, dinv); + r >>= GMP_NAIL_BITS; + qp--; + } + for (i = qxn - 1; i >= 0; i--) + { + udiv_qrnnd_preinv (*qp, r, r, CNST_LIMB(0), d, dinv); + r >>= GMP_NAIL_BITS; + qp--; + } + return r; + } + } + else + { + /* Most significant bit of divisor == 0. */ + int cnt; + + /* Skip a division if high < divisor (high quotient 0). Testing here + before normalizing will still skip as often as possible. */ + if (un != 0) + { + n1 = up[un - 1] << GMP_NAIL_BITS; + if (n1 < d) + { + r = n1 >> GMP_NAIL_BITS; + *qp-- = 0; + n--; + if (n == 0) + return r; + un--; + } + } + + if (! 
UDIV_NEEDS_NORMALIZATION + && BELOW_THRESHOLD (n, DIVREM_1_UNNORM_THRESHOLD)) + goto plain; + + count_leading_zeros (cnt, d); + d <<= cnt; + r <<= cnt; + + if (UDIV_NEEDS_NORMALIZATION + && BELOW_THRESHOLD (n, DIVREM_1_UNNORM_THRESHOLD)) + { + mp_limb_t nshift; + if (un != 0) + { + n1 = up[un - 1] << GMP_NAIL_BITS; + r |= (n1 >> (GMP_LIMB_BITS - cnt)); + for (i = un - 2; i >= 0; i--) + { + n0 = up[i] << GMP_NAIL_BITS; + nshift = (n1 << cnt) | (n0 >> (GMP_NUMB_BITS - cnt)); + udiv_qrnnd (*qp, r, r, nshift, d); + r >>= GMP_NAIL_BITS; + qp--; + n1 = n0; + } + udiv_qrnnd (*qp, r, r, n1 << cnt, d); + r >>= GMP_NAIL_BITS; + qp--; + } + for (i = qxn - 1; i >= 0; i--) + { + udiv_qrnnd (*qp, r, r, CNST_LIMB(0), d); + r >>= GMP_NAIL_BITS; + qp--; + } + return r >> cnt; + } + else + { + mp_limb_t dinv, nshift; + invert_limb (dinv, d); + if (un != 0) + { + n1 = up[un - 1] << GMP_NAIL_BITS; + r |= (n1 >> (GMP_LIMB_BITS - cnt)); + for (i = un - 2; i >= 0; i--) + { + n0 = up[i] << GMP_NAIL_BITS; + nshift = (n1 << cnt) | (n0 >> (GMP_NUMB_BITS - cnt)); + udiv_qrnnd_preinv (*qp, r, r, nshift, d, dinv); + r >>= GMP_NAIL_BITS; + qp--; + n1 = n0; + } + udiv_qrnnd_preinv (*qp, r, r, n1 << cnt, d, dinv); + r >>= GMP_NAIL_BITS; + qp--; + } + for (i = qxn - 1; i >= 0; i--) + { + udiv_qrnnd_preinv (*qp, r, r, CNST_LIMB(0), d, dinv); + r >>= GMP_NAIL_BITS; + qp--; + } + return r >> cnt; + } + } +} diff --git a/gmp-6.3.0/mpn/generic/divrem_2.c b/gmp-6.3.0/mpn/generic/divrem_2.c new file mode 100644 index 0000000..217f2f6 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/divrem_2.c @@ -0,0 +1,118 @@ +/* mpn_divrem_2 -- Divide natural numbers, producing both remainder and + quotient. The divisor is two limbs. + + THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright 1993-1996, 1999-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* Divide num {np,nn} by den {dp,2} and write the nn-2 least significant + quotient limbs at qp and the 2 long remainder at np. If qxn is non-zero, + generate that many fraction bits and append them after the other quotient + limbs. Return the most significant limb of the quotient, this is always 0 + or 1. + + Preconditions: + 1. The most significant bit of the divisor must be set. + 2. qp must either not overlap with the input operands at all, or + qp >= np + 2 must hold true. 
(This means that it's possible to put + the quotient in the high part of {np,nn}, right above the remainder. + 3. nn >= 2, even if qxn is non-zero. */ + +mp_limb_t +mpn_divrem_2 (mp_ptr qp, mp_size_t qxn, + mp_ptr np, mp_size_t nn, + mp_srcptr dp) +{ + mp_limb_t most_significant_q_limb; + mp_size_t i; + mp_limb_t r1, r0, d1, d0; + gmp_pi1_t di; + + ASSERT (nn >= 2); + ASSERT (qxn >= 0); + ASSERT (dp[1] & GMP_NUMB_HIGHBIT); + ASSERT (! MPN_OVERLAP_P (qp, nn-2+qxn, np, nn) || qp >= np+2); + ASSERT_MPN (np, nn); + ASSERT_MPN (dp, 2); + + np += nn - 2; + d1 = dp[1]; + d0 = dp[0]; + r1 = np[1]; + r0 = np[0]; + + most_significant_q_limb = 0; + if (r1 >= d1 && (r1 > d1 || r0 >= d0)) + { +#if GMP_NAIL_BITS == 0 + sub_ddmmss (r1, r0, r1, r0, d1, d0); +#else + r0 = r0 - d0; + r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1); + r0 &= GMP_NUMB_MASK; +#endif + most_significant_q_limb = 1; + } + + invert_pi1 (di, d1, d0); + + qp += qxn; + + for (i = nn - 2 - 1; i >= 0; i--) + { + mp_limb_t n0, q; + n0 = np[-1]; + udiv_qr_3by2 (q, r1, r0, r1, r0, n0, d1, d0, di.inv32); + np--; + qp[i] = q; + } + + if (UNLIKELY (qxn != 0)) + { + qp -= qxn; + for (i = qxn - 1; i >= 0; i--) + { + mp_limb_t q; + udiv_qr_3by2 (q, r1, r0, r1, r0, CNST_LIMB(0), d1, d0, di.inv32); + qp[i] = q; + } + } + + np[1] = r1; + np[0] = r0; + + return most_significant_q_limb; +} diff --git a/gmp-6.3.0/mpn/generic/dump.c b/gmp-6.3.0/mpn/generic/dump.c new file mode 100644 index 0000000..9a4ddf4 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/dump.c @@ -0,0 +1,99 @@ +/* THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS NOT SAFE TO + CALL THIS FUNCTION DIRECTLY. IN FACT, IT IS ALMOST GUARANTEED THAT THIS + FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright 1996, 2000-2002, 2005 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include <stdio.h> +#include "gmp-impl.h" + +#if GMP_NUMB_BITS % 4 == 0 +void +mpn_dump (mp_srcptr ptr, mp_size_t n) +{ + MPN_NORMALIZE (ptr, n); + + if (n == 0) + printf ("0\n"); + else + { + n--; +#if _LONG_LONG_LIMB + if ((ptr[n] >> GMP_LIMB_BITS / 2) != 0) + { + printf ("%lX", (unsigned long) (ptr[n] >> GMP_LIMB_BITS / 2)); + printf ("%0*lX", (GMP_LIMB_BITS / 2 / 4), (unsigned long) ptr[n]); + } + else +#endif + printf ("%lX", (unsigned long) ptr[n]); + + while (n) + { + n--; +#if _LONG_LONG_LIMB + printf ("%0*lX", (GMP_NUMB_BITS - GMP_LIMB_BITS / 2) / 4, + (unsigned long) (ptr[n] >> GMP_LIMB_BITS / 2)); + printf ("%0*lX", GMP_LIMB_BITS / 2 / 4, (unsigned long) ptr[n]); +#else + printf ("%0*lX", GMP_NUMB_BITS / 4, (unsigned long) ptr[n]); +#endif + } + printf ("\n"); + } +} + +#else + +static void +mpn_recdump (mp_ptr p, mp_size_t n) +{ + mp_limb_t lo; + if (n != 0) + { + lo = p[0] & 0xf; + mpn_rshift (p, p, n, 4); + mpn_recdump (p, n); + printf ("%lX", lo); + } +} + +void +mpn_dump (mp_srcptr p, mp_size_t n) +{ + mp_ptr tp; + TMP_DECL; + TMP_MARK; + tp = TMP_ALLOC_LIMBS (n); + MPN_COPY (tp, p, n); + mpn_recdump (tp, n); + TMP_FREE; +} + +#endif diff --git a/gmp-6.3.0/mpn/generic/fib2_ui.c b/gmp-6.3.0/mpn/generic/fib2_ui.c new file mode 100644 index 0000000..0b81571 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/fib2_ui.c @@ -0,0 +1,174 @@ +/* mpn_fib2_ui -- calculate Fibonacci numbers. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2001, 2002, 2005, 2009, 2018 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include <stdio.h> +#include "gmp-impl.h" + +/* change this to "#define TRACE(x) x" for diagnostics */ +#define TRACE(x) + + +/* Store F[n] at fp and F[n-1] at f1p. fp and f1p should have room for + MPN_FIB2_SIZE(n) limbs. + + The return value is the actual number of limbs stored, this will be at + least 1. fp[size-1] will be non-zero, except when n==0, in which case + fp[0] is 0 and f1p[0] is 1. f1p[size-1] can be zero, since F[n-1]<F[n] (for n>0). + + Notes: F[2k+1] = 4*F[k]^2 - F[k-1]^2 + 2*(-1)^k. + + In F[2k+1] with k even, +2 is applied to 4*F[k]^2 just by ORing into the + low limb. + + In F[2k+1] with k odd, -2 is applied to F[k-1]^2 just by ORing into the + low limb. +*/ + +mp_size_t +mpn_fib2_ui (mp_ptr fp, mp_ptr f1p, unsigned long int n) +{ + mp_size_t size; + unsigned long nfirst, mask; + + TRACE (printf ("mpn_fib2_ui n=%lu\n", n)); + + ASSERT (! MPN_OVERLAP_P (fp, MPN_FIB2_SIZE(n), f1p, MPN_FIB2_SIZE(n))); + + /* Take a starting pair from the table.
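+ + Each halving here corresponds to one doubling step afterwards; for instance, in the 64-bit case where FIB_TABLE_LIMIT is 93, n = 1000 halves through 500, 250 and 125 down to nfirst = 62 with mask = 0x10, and the loop below then doubles back up through the remaining low bits of n.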
*/ + mask = 1; + for (nfirst = n; nfirst > FIB_TABLE_LIMIT; nfirst /= 2) + mask <<= 1; + TRACE (printf ("nfirst=%lu mask=0x%lX\n", nfirst, mask)); + + f1p[0] = FIB_TABLE ((int) nfirst - 1); + fp[0] = FIB_TABLE (nfirst); + size = 1; + + /* Skip to the end if the table lookup gives the final answer. */ + if (mask != 1) + { + mp_size_t alloc; + mp_ptr xp; + TMP_DECL; + + TMP_MARK; + alloc = MPN_FIB2_SIZE (n); + xp = TMP_ALLOC_LIMBS (alloc); + + do + { + /* Here fp==F[k] and f1p==F[k-1], with k being the bits of n from + n&mask upwards. + + The next bit of n is n&(mask>>1) and we'll double to the pair + fp==F[2k],f1p==F[2k-1] or fp==F[2k+1],f1p==F[2k], according as + that bit is 0 or 1 respectively. */ + + TRACE (printf ("k=%lu mask=0x%lX size=%ld alloc=%ld\n", + n >> refmpn_count_trailing_zeros(mask), + mask, size, alloc); + mpn_trace ("fp ", fp, size); + mpn_trace ("f1p", f1p, size)); + + /* fp normalized, f1p at most one high zero */ + ASSERT (fp[size-1] != 0); + ASSERT (f1p[size-1] != 0 || f1p[size-2] != 0); + + /* f1p[size-1] might be zero, but this occurs rarely, so it's not + worth bothering checking for it */ + ASSERT (alloc >= 2*size); + mpn_sqr (xp, fp, size); + mpn_sqr (fp, f1p, size); + size *= 2; + + /* Shrink if possible. Since fp was normalized there'll be at + most one high zero on xp (and if there is then there's one on + yp too). */ + ASSERT (xp[size-1] != 0 || fp[size-1] == 0); + size -= (xp[size-1] == 0); + ASSERT (xp[size-1] != 0); /* only one xp high zero */ + + /* Calculate F[2k-1] = F[k]^2 + F[k-1]^2. */ + f1p[size] = mpn_add_n (f1p, xp, fp, size); + + /* Calculate F[2k+1] = 4*F[k]^2 - F[k-1]^2 + 2*(-1)^k. + n&mask is the low bit of our implied k. */ + + ASSERT ((fp[0] & 2) == 0); + /* fp is F[k-1]^2 == 0 or 1 mod 4, like all squares. */ + fp[0] |= (n & mask ? 2 : 0); /* possible -2 */ +#if HAVE_NATIVE_mpn_rsblsh2_n + fp[size] = mpn_rsblsh2_n (fp, fp, xp, size); + MPN_INCR_U(fp, size + 1, (n & mask ? 0 : 2)); /* possible +2 */ +#else + { + mp_limb_t c; + + c = mpn_lshift (xp, xp, size, 2); + xp[0] |= (n & mask ? 0 : 2); /* possible +2 */ + c -= mpn_sub_n (fp, xp, fp, size); + fp[size] = c; + } +#endif + ASSERT (alloc >= size+1); + size += (fp[size] != 0); + + /* now n&mask is the new bit of n being considered */ + mask >>= 1; + + /* Calculate F[2k] = F[2k+1] - F[2k-1], replacing the unwanted one of + F[2k+1] and F[2k-1]. */ + if (n & mask) + ASSERT_NOCARRY (mpn_sub_n (f1p, fp, f1p, size)); + else { + ASSERT_NOCARRY (mpn_sub_n ( fp, fp, f1p, size)); + + /* Can have a high zero after replacing F[2k+1] with F[2k]. + f1p will have a high zero if fp does. */ + ASSERT (fp[size-1] != 0 || f1p[size-1] == 0); + size -= (fp[size-1] == 0); + } + } + while (mask != 1); + + TMP_FREE; + } + + TRACE (printf ("done size=%ld\n", size); + mpn_trace ("fp ", fp, size); + mpn_trace ("f1p", f1p, size)); + + return size; +} diff --git a/gmp-6.3.0/mpn/generic/fib2m.c b/gmp-6.3.0/mpn/generic/fib2m.c new file mode 100644 index 0000000..89d2b86 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/fib2m.c @@ -0,0 +1,252 @@ +/* mpn_fib2m -- calculate Fibonacci numbers, modulo m. + +Contributed to the GNU project by Marco Bodrato, based on the previous +fib2_ui.c file. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2001, 2002, 2005, 2009, 2018 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include <stdio.h> +#include "gmp-impl.h" +#include "longlong.h" + + +/* Stores |{ap,n}-{bp,n}| in {rp,n}, + returns the sign of {ap,n}-{bp,n}. */ +static int +abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n) +{ + mp_limb_t x, y; + while (--n >= 0) + { + x = ap[n]; + y = bp[n]; + if (x != y) + { + ++n; + if (x > y) + { + ASSERT_NOCARRY (mpn_sub_n (rp, ap, bp, n)); + return 1; + } + else + { + ASSERT_NOCARRY (mpn_sub_n (rp, bp, ap, n)); + return -1; + } + } + rp[n] = 0; + } + return 0; +} + +/* Store F[n] at fp and F[n-1] at f1p. Both are computed modulo m. + fp and f1p should have room for mn*2+1 limbs. + + The sign of one or both the values may be flipped (n-F, instead of F), + the return value is 0 (zero) if the signs are coherent (both positive + or both negative) and 1 (one) otherwise. + + Notes: + + In F[2k+1] with k even, +2 is applied to 4*F[k]^2 just by ORing into the + low limb. + + In F[2k+1] with k odd, -2 is applied to F[k-1]^2 just by ORing into the + low limb. + + TODO: Should {tp, 2 * mn} be passed as a scratch pointer? + Should the call to mpn_fib2_ui() obtain (up to) 2*mn limbs? +*/ + +int +mpn_fib2m (mp_ptr fp, mp_ptr f1p, mp_srcptr np, mp_size_t nn, mp_srcptr mp, mp_size_t mn) +{ + unsigned long nfirst; + mp_limb_t nh; + mp_bitcnt_t nbi; + mp_size_t sn, fn; + int fcnt, ncnt; + + ASSERT (! MPN_OVERLAP_P (fp, MAX(2*mn+1,5), f1p, MAX(2*mn+1,5))); + ASSERT (nn > 0 && np[nn - 1] != 0); + + /* Estimate the maximal n such that fibonacci(n) fits in mn limbs. */ +#if GMP_NUMB_BITS % 16 == 0 + if (UNLIKELY (ULONG_MAX / (23 * (GMP_NUMB_BITS / 16)) <= mn)) + nfirst = ULONG_MAX; + else + nfirst = mn * (23 * (GMP_NUMB_BITS / 16)); +#else + { + mp_bitcnt_t mbi; + mbi = (mp_bitcnt_t) mn * GMP_NUMB_BITS; + + if (UNLIKELY (ULONG_MAX / 23 < mbi)) + { + if (UNLIKELY (ULONG_MAX / 23 * 16 <= mbi)) + nfirst = ULONG_MAX; + else + nfirst = mbi / 16 * 23; + } + else + nfirst = mbi * 23 / 16; + } +#endif + + sn = nn - 1; + nh = np[sn]; + count_leading_zeros (ncnt, nh); + count_leading_zeros (fcnt, nfirst); + + if (fcnt >= ncnt) + { + ncnt = fcnt - ncnt; + nh >>= ncnt; + } + else if (sn > 0) + { + ncnt -= fcnt; + nh <<= ncnt; + ncnt = GMP_NUMB_BITS - ncnt; + --sn; + nh |= np[sn] >> ncnt; + } + else + ncnt = 0; + + nbi = sn * GMP_NUMB_BITS + ncnt; + if (nh > nfirst) + { + nh >>= 1; + ++nbi; + } + + ASSERT (nh <= nfirst); + /* Take a starting pair from mpn_fib2_ui.
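+ + (The factor 23/16 = 1.4375 used in the estimate above slightly underestimates 1/log2(phi) = 1.4404..., phi being the golden ratio: F[n] has roughly 0.6942*n bits while 16/23 = 0.6956, so F[nfirst] is guaranteed to fit in the available mn limbs.)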
*/ + fn = mpn_fib2_ui (fp, f1p, nh); + MPN_ZERO (fp + fn, mn - fn); + MPN_ZERO (f1p + fn, mn - fn); + + if (nbi == 0) + { + if (fn == mn) + { + mp_limb_t qp[2]; + mpn_tdiv_qr (qp, fp, 0, fp, fn, mp, mn); + mpn_tdiv_qr (qp, f1p, 0, f1p, fn, mp, mn); + } + + return 0; + } + else + { + mp_ptr tp; + unsigned pb = nh & 1; + int neg; + TMP_DECL; + + TMP_MARK; + + tp = TMP_ALLOC_LIMBS (2 * mn + (mn < 2)); + + do + { + mp_ptr rp; + /* Here fp==F[k] and f1p==F[k-1], with k being the bits of n from + nbi upwards. + + Based on the next bit of n, we'll double to the pair + fp==F[2k],f1p==F[2k-1] or fp==F[2k+1],f1p==F[2k], according as + that bit is 0 or 1 respectively. */ + + mpn_sqr (tp, fp, mn); + mpn_sqr (fp, f1p, mn); + + /* Calculate F[2k-1] = F[k]^2 + F[k-1]^2. */ + f1p[2 * mn] = mpn_add_n (f1p, tp, fp, 2 * mn); + + /* Calculate F[2k+1] = 4*F[k]^2 - F[k-1]^2 + 2*(-1)^k. + pb is the low bit of our implied k. */ + + /* fp is F[k-1]^2 == 0 or 1 mod 4, like all squares. */ + ASSERT ((fp[0] & 2) == 0); + ASSERT (pb == (pb & 1)); + ASSERT ((fp[0] + (pb ? 2 : 0)) == (fp[0] | (pb << 1))); + fp[0] |= pb << 1; /* possible -2 */ +#if HAVE_NATIVE_mpn_rsblsh2_n + fp[2 * mn] = 1 + mpn_rsblsh2_n (fp, fp, tp, 2 * mn); + MPN_INCR_U(fp, 2 * mn + 1, (1 ^ pb) << 1); /* possible +2 */ + fp[2 * mn] = (fp[2 * mn] - 1) & GMP_NUMB_MAX; +#else + { + mp_limb_t c; + + c = mpn_lshift (tp, tp, 2 * mn, 2); + tp[0] |= (1 ^ pb) << 1; /* possible +2 */ + c -= mpn_sub_n (fp, tp, fp, 2 * mn); + fp[2 * mn] = c & GMP_NUMB_MAX; + } +#endif + neg = fp[2 * mn] == GMP_NUMB_MAX; + + /* Calculate F[2k-1] = F[k]^2 + F[k-1]^2 */ + /* Calculate F[2k+1] = 4*F[k]^2 - F[k-1]^2 + 2*(-1)^k */ + + /* Calculate F[2k] = F[2k+1] - F[2k-1], replacing the unwanted one of + F[2k+1] and F[2k-1]. */ + --nbi; + pb = (np [nbi / GMP_NUMB_BITS] >> (nbi % GMP_NUMB_BITS)) & 1; + rp = pb ? f1p : fp; + if (neg) + { + /* Calculate -(F[2k+1] - F[2k-1]) */ + rp[2 * mn] = f1p[2 * mn] + 1 - mpn_sub_n (rp, f1p, fp, 2 * mn); + neg = ! pb; + if (pb) /* fp not overwritten, negate it. */ + fp [2 * mn] = 1 ^ mpn_neg (fp, fp, 2 * mn); + } + else + { + neg = abs_sub_n (rp, fp, f1p, 2 * mn + 1) < 0; + } + + mpn_tdiv_qr (tp, fp, 0, fp, 2 * mn + 1, mp, mn); + mpn_tdiv_qr (tp, f1p, 0, f1p, 2 * mn + 1, mp, mn); + } + while (nbi != 0); + + TMP_FREE; + + return neg; + } +} diff --git a/gmp-6.3.0/mpn/generic/gcd.c b/gmp-6.3.0/mpn/generic/gcd.c new file mode 100644 index 0000000..3f92cbf --- /dev/null +++ b/gmp-6.3.0/mpn/generic/gcd.c @@ -0,0 +1,266 @@ +/* mpn/gcd.c: mpn_gcd for gcd of two odd integers. + +Copyright 1991, 1993-1998, 2000-2005, 2008, 2010, 2012, 2019 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. 
If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Uses the HGCD operation described in + + N. Möller, On Schönhage's algorithm and subquadratic integer gcd + computation, Math. Comp. 77 (2008), 589-607. + + to reduce inputs until they are of size below GCD_DC_THRESHOLD, and + then uses Lehmer's algorithm. +*/ + +/* Some reasonable choices are n / 2 (same as in hgcd), and p = (n + + * 2)/3, which gives a balanced multiplication in + * mpn_hgcd_matrix_adjust. However, p = 2 n/3 gives slightly better + * performance. The matrix-vector multiplication is then + * 4:1-unbalanced, with matrix elements of size n/6, and vector + * elements of size p = 2n/3. */ + +/* From analysis of the theoretical running time, it appears that when + * multiplication takes time O(n^alpha), p should be chosen so that + * the ratio of the time for the mpn_hgcd call, and the time for the + * multiplication in mpn_hgcd_matrix_adjust, is roughly 1/(alpha - + * 1). */ +#ifdef TUNE_GCD_P +#define P_TABLE_SIZE 10000 +mp_size_t p_table[P_TABLE_SIZE]; +#define CHOOSE_P(n) ( (n) < P_TABLE_SIZE ? p_table[n] : 2*(n)/3) +#else +#define CHOOSE_P(n) (2*(n) / 3) +#endif + +struct gcd_ctx +{ + mp_ptr gp; + mp_size_t gn; +}; + +static void +gcd_hook (void *p, mp_srcptr gp, mp_size_t gn, + mp_srcptr qp, mp_size_t qn, int d) +{ + struct gcd_ctx *ctx = (struct gcd_ctx *) p; + MPN_COPY (ctx->gp, gp, gn); + ctx->gn = gn; +} + +mp_size_t +mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t n) +{ + mp_size_t talloc; + mp_size_t scratch; + mp_size_t matrix_scratch; + + struct gcd_ctx ctx; + mp_ptr tp; + TMP_DECL; + + ASSERT (usize >= n); + ASSERT (n > 0); + ASSERT (vp[n-1] > 0); + + /* FIXME: Check for small sizes first, before setting up temporary + storage etc. */ + talloc = MPN_GCD_SUBDIV_STEP_ITCH(n); + + /* For initial division */ + scratch = usize - n + 1; + if (scratch > talloc) + talloc = scratch; + +#if TUNE_GCD_P + if (CHOOSE_P (n) > 0) +#else + if (ABOVE_THRESHOLD (n, GCD_DC_THRESHOLD)) +#endif + { + mp_size_t hgcd_scratch; + mp_size_t update_scratch; + mp_size_t p = CHOOSE_P (n); + mp_size_t scratch; +#if TUNE_GCD_P + /* Worst case, since we don't guarantee that n - CHOOSE_P(n) + is increasing */ + matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n); + hgcd_scratch = mpn_hgcd_itch (n); + update_scratch = 2*(n - 1); +#else + matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p); + hgcd_scratch = mpn_hgcd_itch (n - p); + update_scratch = p + n - 1; +#endif + scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch); + if (scratch > talloc) + talloc = scratch; + } + + TMP_MARK; + tp = TMP_ALLOC_LIMBS(talloc); + + if (usize > n) + { + mpn_tdiv_qr (tp, up, 0, up, usize, vp, n); + + if (mpn_zero_p (up, n)) + { + MPN_COPY (gp, vp, n); + ctx.gn = n; + goto done; + } + } + + ctx.gp = gp; + +#if TUNE_GCD_P + while (CHOOSE_P (n) > 0) +#else + while (ABOVE_THRESHOLD (n, GCD_DC_THRESHOLD)) +#endif + { + struct hgcd_matrix M; + mp_size_t p = CHOOSE_P (n); + mp_size_t matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p); + mp_size_t nn; + mpn_hgcd_matrix_init (&M, n - p, tp); + nn = mpn_hgcd (up + p, vp + p, n - p, &M, tp + matrix_scratch); + if (nn > 0) + { + ASSERT (M.n <= (n - p - 1)/2); + ASSERT (M.n + p <= (p + n - 1) / 2); + /* Temporary storage 2 (p + M->n) <= p + n - 1. 
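+ + The bound follows from the assertions just above: M.n <= (n - p - 1)/2 gives 2*(p + M.n) <= 2*p + (n - p - 1) = p + n - 1.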
*/ + n = mpn_hgcd_matrix_adjust (&M, p + nn, up, vp, p, tp + matrix_scratch); + } + else + { + /* Temporary storage n */ + n = mpn_gcd_subdiv_step (up, vp, n, 0, gcd_hook, &ctx, tp); + if (n == 0) + goto done; + } + } + + while (n > 2) + { + struct hgcd_matrix1 M; + mp_limb_t uh, ul, vh, vl; + mp_limb_t mask; + + mask = up[n-1] | vp[n-1]; + ASSERT (mask > 0); + + if (mask & GMP_NUMB_HIGHBIT) + { + uh = up[n-1]; ul = up[n-2]; + vh = vp[n-1]; vl = vp[n-2]; + } + else + { + int shift; + + count_leading_zeros (shift, mask); + uh = MPN_EXTRACT_NUMB (shift, up[n-1], up[n-2]); + ul = MPN_EXTRACT_NUMB (shift, up[n-2], up[n-3]); + vh = MPN_EXTRACT_NUMB (shift, vp[n-1], vp[n-2]); + vl = MPN_EXTRACT_NUMB (shift, vp[n-2], vp[n-3]); + } + + /* Try an mpn_hgcd2 step */ + if (mpn_hgcd2 (uh, ul, vh, vl, &M)) + { + n = mpn_matrix22_mul1_inverse_vector (&M, tp, up, vp, n); + MP_PTR_SWAP (up, tp); + } + else + { + /* mpn_hgcd2 has failed. Then either one of a or b is very + small, or the difference is very small. Perform one + subtraction followed by one division. */ + + /* Temporary storage n */ + n = mpn_gcd_subdiv_step (up, vp, n, 0, &gcd_hook, &ctx, tp); + if (n == 0) + goto done; + } + } + + ASSERT(up[n-1] | vp[n-1]); + + /* Due to the calling convention for mpn_gcd, at most one can be even. */ + if ((up[0] & 1) == 0) + MP_PTR_SWAP (up, vp); + ASSERT ((up[0] & 1) != 0); + + { + mp_limb_t u0, u1, v0, v1; + mp_double_limb_t g; + + u0 = up[0]; + v0 = vp[0]; + + if (n == 1) + { + int cnt; + count_trailing_zeros (cnt, v0); + *gp = mpn_gcd_11 (u0, v0 >> cnt); + ctx.gn = 1; + goto done; + } + + v1 = vp[1]; + if (UNLIKELY (v0 == 0)) + { + v0 = v1; + v1 = 0; + /* FIXME: We could invoke a mpn_gcd_21 here, just like mpn_gcd_22 could + when this situation occurs internally. */ + } + if ((v0 & 1) == 0) + { + int cnt; + count_trailing_zeros (cnt, v0); + v0 = ((v1 << (GMP_NUMB_BITS - cnt)) & GMP_NUMB_MASK) | (v0 >> cnt); + v1 >>= cnt; + } + + u1 = up[1]; + g = mpn_gcd_22 (u1, u0, v1, v0); + gp[0] = g.d0; + gp[1] = g.d1; + ctx.gn = 1 + (g.d1 > 0); + } +done: + TMP_FREE; + return ctx.gn; +} diff --git a/gmp-6.3.0/mpn/generic/gcd_1.c b/gmp-6.3.0/mpn/generic/gcd_1.c new file mode 100644 index 0000000..22b1422 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/gcd_1.c @@ -0,0 +1,103 @@ +/* mpn_gcd_1 -- mpn and limb greatest common divisor. + +Copyright 1994, 1996, 2000, 2001, 2009, 2012, 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Does not work for U == 0 or V == 0. 
It would be tough to make it work for + V == 0 since gcd(x,0) = x, and U does not generally fit in an mp_limb_t. + + The threshold for doing u%v when size==1 will vary by CPU according to + the speed of a division and the code generated for the main loop. Any + tuning for this is left to a CPU specific implementation. */ + +mp_limb_t +mpn_gcd_1 (mp_srcptr up, mp_size_t size, mp_limb_t vlimb) +{ + mp_limb_t ulimb; + unsigned long zero_bits, u_low_zero_bits; + int c; + + ASSERT (size >= 1); + ASSERT (vlimb != 0); + ASSERT_MPN_NONZERO_P (up, size); + + ulimb = up[0]; + + /* Need vlimb odd for modexact, want it odd to get common zeros. */ + count_trailing_zeros (zero_bits, vlimb); + vlimb >>= zero_bits; + + if (size > 1) + { + /* Must get common zeros before the mod reduction. If ulimb==0 then + vlimb already gives the common zeros. */ + if (ulimb != 0) + { + count_trailing_zeros (u_low_zero_bits, ulimb); + zero_bits = MIN (zero_bits, u_low_zero_bits); + } + + ulimb = MPN_MOD_OR_MODEXACT_1_ODD (up, size, vlimb); + if (ulimb == 0) + goto done; + + count_trailing_zeros (c, ulimb); + ulimb >>= c; + } + else + { + /* size==1, so up[0]!=0 */ + count_trailing_zeros (u_low_zero_bits, ulimb); + ulimb >>= u_low_zero_bits; + zero_bits = MIN (zero_bits, u_low_zero_bits); + + /* make u bigger */ + if (vlimb > ulimb) + MP_LIMB_T_SWAP (ulimb, vlimb); + + /* if u is much bigger than v, reduce using a division rather than + chipping away at it bit-by-bit */ + if ((ulimb >> 16) > vlimb) + { + ulimb %= vlimb; + if (ulimb == 0) + goto done; + + count_trailing_zeros (c, ulimb); + ulimb >>= c; + } + } + + vlimb = mpn_gcd_11 (ulimb, vlimb); + + done: + return vlimb << zero_bits; +} diff --git a/gmp-6.3.0/mpn/generic/gcd_11.c b/gmp-6.3.0/mpn/generic/gcd_11.c new file mode 100644 index 0000000..214e45c --- /dev/null +++ b/gmp-6.3.0/mpn/generic/gcd_11.c @@ -0,0 +1,74 @@ +/* mpn_gcd_11 -- limb greatest common divisor. + +Copyright 1994, 1996, 2000, 2001, 2009, 2012, 2019 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_gcd_11 (mp_limb_t u, mp_limb_t v) +{ + ASSERT (u & v & 1); + + /* In this loop, we represent the odd numbers ulimb and vlimb + without the redundant least significant one bit. This reduction + in size by one bit ensures that the high bit of t, below, is set + if and only if vlimb > ulimb. 
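+ + A small worked example: for gcd(45,27) the halved pair is (u,v) = (22,13); the first pass gives t = 9, keeps v = 13 as the minimum and reduces u = |u-v| = 9 to 4 via the shift, giving (4,13); in the second pass t wraps negative, so v becomes min = 4 and u becomes 9, again shifted down to 4; u == v ends the loop and (4 << 1) + 1 = 9 = gcd(45,27).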
*/ + + u >>= 1; + v >>= 1; + + while (u != v) + { + mp_limb_t t; + mp_limb_t vgtu; + int c; + + t = u - v; + vgtu = LIMB_HIGHBIT_TO_MASK (t); + + /* v <-- min (u, v) */ + v += (vgtu & t); + + /* u <-- |u - v| */ + u = (t ^ vgtu) - vgtu; + + count_trailing_zeros (c, t); + /* We have c <= GMP_LIMB_BITS - 2 here, so that + + ulimb >>= (c + 1); + + would be safe. But unlike the addition c + 1, a separate + shift by 1 is independent of c, and can be executed in + parallel with count_trailing_zeros. */ + u = (u >> 1) >> c; + } + return (u << 1) + 1; +} diff --git a/gmp-6.3.0/mpn/generic/gcd_22.c b/gmp-6.3.0/mpn/generic/gcd_22.c new file mode 100644 index 0000000..d97f096 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/gcd_22.c @@ -0,0 +1,131 @@ +/* mpn_gcd_22 -- double limb greatest common divisor. + +Copyright 1994, 1996, 2000, 2001, 2009, 2012, 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#if GMP_NAIL_BITS > 0 +#error Nails not supported. +#endif + +mp_double_limb_t +mpn_gcd_22 (mp_limb_t u1, mp_limb_t u0, mp_limb_t v1, mp_limb_t v0) +{ + mp_double_limb_t g; + ASSERT (u0 & v0 & 1); + + /* Implicit least significant bit */ + u0 = (u0 >> 1) | (u1 << (GMP_LIMB_BITS - 1)); + u1 >>= 1; + + v0 = (v0 >> 1) | (v1 << (GMP_LIMB_BITS - 1)); + v1 >>= 1; + + while (u1 || v1) /* u1 == 0 can happen at most twice per call */ + { + mp_limb_t vgtu, t1, t0; + sub_ddmmss (t1, t0, u1, u0, v1, v0); + vgtu = LIMB_HIGHBIT_TO_MASK(t1); + + if (UNLIKELY (t0 == 0)) + { + if (t1 == 0) + { + g.d1 = (u1 << 1) | (u0 >> (GMP_LIMB_BITS - 1)); + g.d0 = (u0 << 1) | 1; + return g; + } + int c; + count_trailing_zeros (c, t1); + + /* v1 = min (u1, v1) */ + v1 += (vgtu & t1); + /* u0 = |u1 - v1| */ + u0 = (t1 ^ vgtu) - vgtu; + ASSERT (c < GMP_LIMB_BITS - 1); + u0 >>= c + 1; + u1 = 0; + } + else + { + int c; + count_trailing_zeros (c, t0); + c++; + /* V <-- min (U, V). + + Assembly version should use cmov. Another alternative, + avoiding carry propagation, would be + + v0 += vgtu & t0; v1 += vtgu & (u1 - v1); + */ + add_ssaaaa (v1, v0, v1, v0, vgtu & t1, vgtu & t0); + /* U <-- |U - V| + No carry handling needed in this conditional negation, + since t0 != 0. 
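+ + (The conditional negation uses the identity (x ^ m) - m, which is -x when the mask m is all ones and x when m is zero; because t0 != 0 the low-limb negation cannot carry into the high limb, so t1 only needs the complement t1 ^ vgtu.)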
*/ + u0 = (t0 ^ vgtu) - vgtu; + u1 = t1 ^ vgtu; + if (UNLIKELY (c == GMP_LIMB_BITS)) + { + u0 = u1; + u1 = 0; + } + else + { + u0 = (u0 >> c) | (u1 << (GMP_LIMB_BITS - c)); + u1 >>= c; + } + } + } + while ((v0 | u0) & GMP_LIMB_HIGHBIT) + { /* At most two iterations */ + mp_limb_t vgtu, t0; + int c; + sub_ddmmss (vgtu, t0, 0, u0, 0, v0); + if (UNLIKELY (t0 == 0)) + { + g.d1 = u0 >> (GMP_LIMB_BITS - 1); + g.d0 = (u0 << 1) | 1; + return g; + } + + /* v <-- min (u, v) */ + v0 += (vgtu & t0); + + /* u <-- |u - v| */ + u0 = (t0 ^ vgtu) - vgtu; + + count_trailing_zeros (c, t0); + u0 = (u0 >> 1) >> c; + } + + g.d0 = mpn_gcd_11 ((u0 << 1) + 1, (v0 << 1) + 1); + g.d1 = 0; + return g; +} diff --git a/gmp-6.3.0/mpn/generic/gcd_subdiv_step.c b/gmp-6.3.0/mpn/generic/gcd_subdiv_step.c new file mode 100644 index 0000000..9c3b88d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/gcd_subdiv_step.c @@ -0,0 +1,204 @@ +/* gcd_subdiv_step.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003-2005, 2008, 2010, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include <stdlib.h> /* for NULL */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Used when mpn_hgcd or mpn_hgcd2 has failed. Then either one of a or + b is small, or the difference is small. Perform one subtraction + followed by one division. The normal case is to compute the reduced + a and b, and return the new size. + + If s == 0 (used for gcd and gcdext), returns zero if the gcd is + found. + + If s > 0, don't reduce to size <= s, and return zero if no + reduction is possible (if either a, b or |a-b| is of size <= s). */ + +/* The hook function is called as + + hook(ctx, gp, gn, qp, qn, d) + + in the following cases: + + + If A = B at the start, G is the gcd, Q is NULL, d = -1. + + + If one input is zero at the start, G is the gcd, Q is NULL, + d = 0 if A = G and d = 1 if B = G. + + Otherwise, if d = 0 we have just subtracted a multiple of A from B, + and if d = 1 we have subtracted a multiple of B from A. + + + If A = B after subtraction, G is the gcd, Q is NULL. + + + If we get a zero remainder after division, G is the gcd, Q is the + quotient. + + + Otherwise, G is NULL, Q is the quotient (often 1).
+ + */ + +mp_size_t +mpn_gcd_subdiv_step (mp_ptr ap, mp_ptr bp, mp_size_t n, mp_size_t s, + gcd_subdiv_step_hook *hook, void *ctx, + mp_ptr tp) +{ + static const mp_limb_t one = CNST_LIMB(1); + mp_size_t an, bn, qn; + + int swapped; + + ASSERT (n > 0); + ASSERT (ap[n-1] > 0 || bp[n-1] > 0); + + an = bn = n; + MPN_NORMALIZE (ap, an); + MPN_NORMALIZE (bp, bn); + + swapped = 0; + + /* Arrange so that a < b, subtract b -= a, and maintain + normalization. */ + if (an == bn) + { + int c; + MPN_CMP (c, ap, bp, an); + if (UNLIKELY (c == 0)) + { + /* For gcdext, return the smallest of the two cofactors, so + pass d = -1. */ + if (s == 0) + hook (ctx, ap, an, NULL, 0, -1); + return 0; + } + else if (c > 0) + { + MP_PTR_SWAP (ap, bp); + swapped ^= 1; + } + } + else + { + if (an > bn) + { + MPN_PTR_SWAP (ap, an, bp, bn); + swapped ^= 1; + } + } + if (an <= s) + { + if (s == 0) + hook (ctx, bp, bn, NULL, 0, swapped ^ 1); + return 0; + } + + ASSERT_NOCARRY (mpn_sub (bp, bp, bn, ap, an)); + MPN_NORMALIZE (bp, bn); + ASSERT (bn > 0); + + if (bn <= s) + { + /* Undo subtraction. */ + mp_limb_t cy = mpn_add (bp, ap, an, bp, bn); + if (cy > 0) + bp[an] = cy; + return 0; + } + + /* Arrange so that a < b */ + if (an == bn) + { + int c; + MPN_CMP (c, ap, bp, an); + if (UNLIKELY (c == 0)) + { + if (s > 0) + /* Just record subtraction and return */ + hook (ctx, NULL, 0, &one, 1, swapped); + else + /* Found gcd. */ + hook (ctx, bp, bn, NULL, 0, swapped); + return 0; + } + + hook (ctx, NULL, 0, &one, 1, swapped); + + if (c > 0) + { + MP_PTR_SWAP (ap, bp); + swapped ^= 1; + } + } + else + { + hook (ctx, NULL, 0, &one, 1, swapped); + + if (an > bn) + { + MPN_PTR_SWAP (ap, an, bp, bn); + swapped ^= 1; + } + } + + mpn_tdiv_qr (tp, bp, 0, bp, bn, ap, an); + qn = bn - an + 1; + bn = an; + MPN_NORMALIZE (bp, bn); + + if (UNLIKELY (bn <= s)) + { + if (s == 0) + { + hook (ctx, ap, an, tp, qn, swapped); + return 0; + } + + /* Quotient is one too large, so decrement it and add back A. */ + if (bn > 0) + { + mp_limb_t cy = mpn_add (bp, ap, an, bp, bn); + if (cy) + bp[an++] = cy; + } + else + MPN_COPY (bp, ap, an); + + MPN_DECR_U (tp, qn, 1); + } + + hook (ctx, NULL, 0, tp, qn, swapped); + return an; +} diff --git a/gmp-6.3.0/mpn/generic/gcdext.c b/gmp-6.3.0/mpn/generic/gcdext.c new file mode 100644 index 0000000..5501480 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/gcdext.c @@ -0,0 +1,557 @@ +/* mpn_gcdext -- Extended Greatest Common Divisor. + +Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and + the size is returned (if inputs are non-normalized, result may be + non-normalized too). Temporary space needed is M->n + n. + */ +static size_t +hgcd_mul_matrix_vector (struct hgcd_matrix *M, + mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp) +{ + mp_limb_t ah, bh; + + /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as + + t = u00 * a + r = u10 * b + r += t; + + t = u11 * b + b = u01 * a + b += t; + */ + + if (M->n >= n) + { + mpn_mul (tp, M->p[0][0], M->n, ap, n); + mpn_mul (rp, M->p[1][0], M->n, bp, n); + } + else + { + mpn_mul (tp, ap, n, M->p[0][0], M->n); + mpn_mul (rp, bp, n, M->p[1][0], M->n); + } + + ah = mpn_add_n (rp, rp, tp, n + M->n); + + if (M->n >= n) + { + mpn_mul (tp, M->p[1][1], M->n, bp, n); + mpn_mul (bp, M->p[0][1], M->n, ap, n); + } + else + { + mpn_mul (tp, bp, n, M->p[1][1], M->n); + mpn_mul (bp, ap, n, M->p[0][1], M->n); + } + bh = mpn_add_n (bp, bp, tp, n + M->n); + + n += M->n; + if ( (ah | bh) > 0) + { + rp[n] = ah; + bp[n] = bh; + n++; + } + else + { + /* Normalize */ + while ( (rp[n-1] | bp[n-1]) == 0) + n--; + } + + return n; +} + +#define COMPUTE_V_ITCH(n) (2*(n)) + +/* Computes |v| = |(g - u a)| / b, where u may be positive or + negative, and v is of the opposite sign. max(a, b) is of size n, u and + v at most size n, and v must have space for n+1 limbs. */ +static mp_size_t +compute_v (mp_ptr vp, + mp_srcptr ap, mp_srcptr bp, mp_size_t n, + mp_srcptr gp, mp_size_t gn, + mp_srcptr up, mp_size_t usize, + mp_ptr tp) +{ + mp_size_t size; + mp_size_t an; + mp_size_t bn; + mp_size_t vn; + + ASSERT (n > 0); + ASSERT (gn > 0); + ASSERT (usize != 0); + + size = ABS (usize); + ASSERT (size <= n); + ASSERT (up[size-1] > 0); + + an = n; + MPN_NORMALIZE (ap, an); + ASSERT (gn <= an); + + if (an >= size) + mpn_mul (tp, ap, an, up, size); + else + mpn_mul (tp, up, size, ap, an); + + size += an; + + if (usize > 0) + { + /* |v| = -v = (u a - g) / b */ + + ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn)); + MPN_NORMALIZE (tp, size); + if (size == 0) + return 0; + } + else + { /* |v| = v = (g - u a) / b = (g + |u| a) / b. Since g <= a, + (g + |u| a) always fits in (|usize| + an) limbs. */ + + ASSERT_NOCARRY (mpn_add (tp, tp, size, gp, gn)); + size -= (tp[size - 1] == 0); + } + + /* Now divide t / b. There must be no remainder */ + bn = n; + MPN_NORMALIZE (bp, bn); + ASSERT (size >= bn); + + vn = size + 1 - bn; + ASSERT (vn <= n + 1); + + mpn_divexact (vp, tp, size, bp, bn); + vn -= (vp[vn-1] == 0); + + return vn; +} + +/* Temporary storage: + + Initial division: Quotient of at most an - n + 1 <= an limbs. + + Storage for u0 and u1: 2(n+1). + + Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4) + + Storage for hgcd, input (n + 1)/2: 9 n/4 plus some. + + When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors. + + When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less. + + For the lehmer call after the loop, Let T denote + GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for + u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T + for u, T+1 for v and 2T scratch space. In all, 7T + 3 is + sufficient for both operations. + +*/ + +/* Optimal choice of p seems difficult. In each iteration the division + * of work between hgcd and the updates of u0 and u1 depends on the + * current size of the u. 
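Concretely, CHOOSE_P_1 = n/2 below is used only for the first hgcd call, + * before any u updates have accumulated, while CHOOSE_P_2 = n/3 is used in + * the main loop: hgcd then works on the top n/2 limbs first and on the top + * 2n/3 limbs thereafter, where each returned matrix must also be multiplied + * into u0 and u1.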
It may be desirable to use a different + * choice of p in each iteration. Also the input size seems to matter; + * choosing p = n / 3 in the first iteration seems to improve + * performance slightly for input size just above the threshold, but + * degrade performance for larger inputs. */ +#define CHOOSE_P_1(n) ((n) / 2) +#define CHOOSE_P_2(n) ((n) / 3) + +mp_size_t +mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep, + mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n) +{ + mp_size_t talloc; + mp_size_t scratch; + mp_size_t matrix_scratch; + mp_size_t ualloc = n + 1; + + struct gcdext_ctx ctx; + mp_size_t un; + mp_ptr u0; + mp_ptr u1; + + mp_ptr tp; + + TMP_DECL; + + ASSERT (an >= n); + ASSERT (n > 0); + ASSERT (bp[n-1] > 0); + + TMP_MARK; + + /* FIXME: Check for small sizes first, before setting up temporary + storage etc. */ + talloc = MPN_GCDEXT_LEHMER_N_ITCH(n); + + /* For initial division */ + scratch = an - n + 1; + if (scratch > talloc) + talloc = scratch; + + if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) + { + /* For hgcd loop. */ + mp_size_t hgcd_scratch; + mp_size_t update_scratch; + mp_size_t p1 = CHOOSE_P_1 (n); + mp_size_t p2 = CHOOSE_P_2 (n); + mp_size_t min_p = MIN(p1, p2); + mp_size_t max_p = MAX(p1, p2); + matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p); + hgcd_scratch = mpn_hgcd_itch (n - min_p); + update_scratch = max_p + n - 1; + + scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch); + if (scratch > talloc) + talloc = scratch; + + /* Final mpn_gcdext_lehmer_n call. Need space for u and for + copies of a and b. */ + scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD) + + 3*GCDEXT_DC_THRESHOLD; + + if (scratch > talloc) + talloc = scratch; + + /* Cofactors u0 and u1 */ + talloc += 2*(n+1); + } + + tp = TMP_ALLOC_LIMBS(talloc); + + if (an > n) + { + mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n); + + if (mpn_zero_p (ap, n)) + { + MPN_COPY (gp, bp, n); + *usizep = 0; + TMP_FREE; + return n; + } + } + + if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) + { + mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp); + + TMP_FREE; + return gn; + } + + MPN_ZERO (tp, 2*ualloc); + u0 = tp; tp += ualloc; + u1 = tp; tp += ualloc; + + ctx.gp = gp; + ctx.up = up; + ctx.usize = usizep; + + { + /* For the first hgcd call, there are no u updates, and it makes + some sense to use a different choice for p. */ + + /* FIXME: We could trim use of temporary storage, since u0 and u1 + are not used yet. For the hgcd call, we could swap in the u0 + and u1 pointers for the relevant matrix elements. */ + + struct hgcd_matrix M; + mp_size_t p = CHOOSE_P_1 (n); + mp_size_t nn; + + mpn_hgcd_matrix_init (&M, n - p, tp); + nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch); + if (nn > 0) + { + ASSERT (M.n <= (n - p - 1)/2); + ASSERT (M.n + p <= (p + n - 1) / 2); + + /* Temporary storage 2 (p + M->n) <= p + n - 1 */ + n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch); + + MPN_COPY (u0, M.p[1][0], M.n); + MPN_COPY (u1, M.p[1][1], M.n); + un = M.n; + while ( (u0[un-1] | u1[un-1] ) == 0) + un--; + } + else + { + /* mpn_hgcd has failed. Then either one of a or b is very + small, or the difference is very small. Perform one + subtraction followed by one division. 
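+ + At this point u0 and u1 still hold the zeros written by MPN_ZERO above, so the single assignment u1[0] = 1 below encodes the initial cofactor pair (u0, u1) = (0, 1), which mpn_gcdext_hook then keeps updated through ctx.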
*/ + u1[0] = 1; + + ctx.u0 = u0; + ctx.u1 = u1; + ctx.tp = tp + n; /* ualloc */ + ctx.un = 1; + + /* Temporary storage n */ + n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp); + if (n == 0) + { + TMP_FREE; + return ctx.gn; + } + + un = ctx.un; + ASSERT (un < ualloc); + } + } + + while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) + { + struct hgcd_matrix M; + mp_size_t p = CHOOSE_P_2 (n); + mp_size_t nn; + + mpn_hgcd_matrix_init (&M, n - p, tp); + nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch); + if (nn > 0) + { + mp_ptr t0; + + t0 = tp + matrix_scratch; + ASSERT (M.n <= (n - p - 1)/2); + ASSERT (M.n + p <= (p + n - 1) / 2); + + /* Temporary storage 2 (p + M->n) <= p + n - 1 */ + n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0); + + /* By the same analysis as for mpn_hgcd_matrix_mul */ + ASSERT (M.n + un <= ualloc); + + /* FIXME: This copying could be avoided by some swapping of + * pointers. May need more temporary storage, though. */ + MPN_COPY (t0, u0, un); + + /* Temporary storage ualloc */ + un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un); + + ASSERT (un < ualloc); + ASSERT ( (u0[un-1] | u1[un-1]) > 0); + } + else + { + /* mpn_hgcd has failed. Then either one of a or b is very + small, or the difference is very small. Perform one + subtraction followed by one division. */ + ctx.u0 = u0; + ctx.u1 = u1; + ctx.tp = tp + n; /* ualloc */ + ctx.un = un; + + /* Temporary storage n */ + n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp); + if (n == 0) + { + TMP_FREE; + return ctx.gn; + } + + un = ctx.un; + ASSERT (un < ualloc); + } + } + /* We have A = ... a + ... b + B = u0 a + u1 b + + a = u1 A + ... B + b = -u0 A + ... B + + with bounds + + |u0|, |u1| <= B / min(a, b) + + We always have u1 > 0, and u0 == 0 is possible only if u1 == 1, + in which case the only reduction done so far is a = A - k B for + some k. + + Compute g = u a + v b = (u u1 - v u0) A + (...) B + Here, u, v are bounded by + + |u| <= b, + |v| <= a + */ + + ASSERT ( (ap[n-1] | bp[n-1]) > 0); + + if (UNLIKELY (mpn_cmp (ap, bp, n) == 0)) + { + /* Must return the smallest cofactor, +u1 or -u0 */ + int c; + + MPN_COPY (gp, ap, n); + + MPN_CMP (c, u0, u1, un); + /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in + this case we choose the cofactor + 1, corresponding to G = A + - k B, rather than -1, corresponding to G = - A + (k+1) B. */ + ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1)); + if (c < 0) + { + MPN_NORMALIZE (u0, un); + MPN_COPY (up, u0, un); + *usizep = -un; + } + else + { + MPN_NORMALIZE_NOT_ZERO (u1, un); + MPN_COPY (up, u1, un); + *usizep = un; + } + + TMP_FREE; + return n; + } + else if (UNLIKELY (u0[0] == 0) && un == 1) + { + mp_size_t gn; + ASSERT (u1[0] == 1); + + /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */ + gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp); + + TMP_FREE; + return gn; + } + else + { + mp_size_t u0n; + mp_size_t u1n; + mp_size_t lehmer_un; + mp_size_t lehmer_vn; + mp_size_t gn; + + mp_ptr lehmer_up; + mp_ptr lehmer_vp; + int negate; + + lehmer_up = tp; tp += n; + + /* Call mpn_gcdext_lehmer_n with copies of a and b. */ + MPN_COPY (tp, ap, n); + MPN_COPY (tp + n, bp, n); + gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n); + + u0n = un; + MPN_NORMALIZE (u0, u0n); + ASSERT (u0n > 0); + + if (lehmer_un == 0) + { + /* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) 
B */ + MPN_COPY (up, u0, u0n); + *usizep = -u0n; + + TMP_FREE; + return gn; + } + + lehmer_vp = tp; + /* Compute v = (g - u a) / b */ + lehmer_vn = compute_v (lehmer_vp, + ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1); + + if (lehmer_un > 0) + negate = 0; + else + { + lehmer_un = -lehmer_un; + negate = 1; + } + + u1n = un; + MPN_NORMALIZE (u1, u1n); + ASSERT (u1n > 0); + + ASSERT (lehmer_un + u1n <= ualloc); + ASSERT (lehmer_vn + u0n <= ualloc); + + /* We may still have v == 0 */ + + /* Compute u u0 */ + if (lehmer_un <= u1n) + /* Should be the common case */ + mpn_mul (up, u1, u1n, lehmer_up, lehmer_un); + else + mpn_mul (up, lehmer_up, lehmer_un, u1, u1n); + + un = u1n + lehmer_un; + un -= (up[un - 1] == 0); + + if (lehmer_vn > 0) + { + mp_limb_t cy; + + /* Overwrites old u1 value */ + if (lehmer_vn <= u0n) + /* Should be the common case */ + mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn); + else + mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n); + + u1n = u0n + lehmer_vn; + u1n -= (u1[u1n - 1] == 0); + + if (u1n <= un) + { + cy = mpn_add (up, up, un, u1, u1n); + } + else + { + cy = mpn_add (up, u1, u1n, up, un); + un = u1n; + } + up[un] = cy; + un += (cy != 0); + + ASSERT (un < ualloc); + } + *usizep = negate ? -un : un; + + TMP_FREE; + return gn; + } +} diff --git a/gmp-6.3.0/mpn/generic/gcdext_1.c b/gmp-6.3.0/mpn/generic/gcdext_1.c new file mode 100644 index 0000000..b221a92 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/gcdext_1.c @@ -0,0 +1,275 @@ +/* mpn_gcdext -- Extended Greatest Common Divisor. + +Copyright 1996, 1998, 2000-2005, 2008, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef GCDEXT_1_USE_BINARY +#define GCDEXT_1_USE_BINARY 0 +#endif + +#ifndef GCDEXT_1_BINARY_METHOD +#define GCDEXT_1_BINARY_METHOD 2 +#endif + +#if GCDEXT_1_USE_BINARY + +mp_limb_t +mpn_gcdext_1 (mp_limb_signed_t *sp, mp_limb_signed_t *tp, + mp_limb_t u, mp_limb_t v) +{ + /* Maintain + + U = t1 u + t0 v + V = s1 u + s0 v + + where U, V are the inputs (without any shared power of two), + and the matrix has determinant ± 2^{shift}. 
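+     On termination u == v == g, the gcd of the reduced inputs, and the
+     loop near the end removes the 2^{shift} factor from the cofactors
+     via s0 U - t0 V = (s0 + V/g) U - (t0 + U/g) V.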
+ */ + mp_limb_t s0 = 1; + mp_limb_t t0 = 0; + mp_limb_t s1 = 0; + mp_limb_t t1 = 1; + mp_limb_t ug; + mp_limb_t vg; + mp_limb_t ugh; + mp_limb_t vgh; + unsigned zero_bits; + unsigned shift; + unsigned i; +#if GCDEXT_1_BINARY_METHOD == 2 + mp_limb_t det_sign; +#endif + + ASSERT (u > 0); + ASSERT (v > 0); + + count_trailing_zeros (zero_bits, u | v); + u >>= zero_bits; + v >>= zero_bits; + + if ((u & 1) == 0) + { + count_trailing_zeros (shift, u); + u >>= shift; + t1 <<= shift; + } + else if ((v & 1) == 0) + { + count_trailing_zeros (shift, v); + v >>= shift; + s0 <<= shift; + } + else + shift = 0; + +#if GCDEXT_1_BINARY_METHOD == 1 + while (u != v) + { + unsigned count; + if (u > v) + { + u -= v; + + count_trailing_zeros (count, u); + u >>= count; + + t0 += t1; t1 <<= count; + s0 += s1; s1 <<= count; + } + else + { + v -= u; + + count_trailing_zeros (count, v); + v >>= count; + + t1 += t0; t0 <<= count; + s1 += s0; s0 <<= count; + } + shift += count; + } +#else +# if GCDEXT_1_BINARY_METHOD == 2 + u >>= 1; + v >>= 1; + + det_sign = 0; + + while (u != v) + { + unsigned count; + mp_limb_t d = u - v; + mp_limb_t vgtu = LIMB_HIGHBIT_TO_MASK (d); + mp_limb_t sx; + mp_limb_t tx; + + /* When v <= u (vgtu == 0), the updates are: + + (u; v) <-- ( (u - v) >> count; v) (det = +(1< 0, the updates are + + (u; v) <-- ( (v - u) >> count; u) (det = -(1<>= count; + t1 <<= count; + s1 <<= count; + shift += count; + } + u = (u << 1) + 1; +# else /* GCDEXT_1_BINARY_METHOD == 2 */ +# error Unknown GCDEXT_1_BINARY_METHOD +# endif +#endif + + /* Now u = v = g = gcd (u,v). Compute U/g and V/g */ + ug = t0 + t1; + vg = s0 + s1; + + ugh = ug/2 + (ug & 1); + vgh = vg/2 + (vg & 1); + + /* Now 2^{shift} g = s0 U - t0 V. Get rid of the power of two, using + s0 U - t0 V = (s0 + V/g) U - (t0 + U/g) V. */ + for (i = 0; i < shift; i++) + { + mp_limb_t mask = - ( (s0 | t0) & 1); + + s0 /= 2; + t0 /= 2; + s0 += mask & vgh; + t0 += mask & ugh; + } + + ASSERT_ALWAYS (s0 <= vg); + ASSERT_ALWAYS (t0 <= ug); + + if (s0 > vg - s0) + { + s0 -= vg; + t0 -= ug; + } +#if GCDEXT_1_BINARY_METHOD == 2 + /* Conditional negation. */ + s0 = (s0 ^ det_sign) - det_sign; + t0 = (t0 ^ det_sign) - det_sign; +#endif + *sp = s0; + *tp = -t0; + + return u << zero_bits; +} + +#else /* !GCDEXT_1_USE_BINARY */ + + +/* FIXME: Takes two single-word limbs. It could be extended to a + * function that accepts a bignum for the first input, and only + * returns the first co-factor. */ + +mp_limb_t +mpn_gcdext_1 (mp_limb_signed_t *up, mp_limb_signed_t *vp, + mp_limb_t a, mp_limb_t b) +{ + /* Maintain + + a = u0 A + v0 B + b = u1 A + v1 B + + where A, B are the original inputs. + */ + mp_limb_signed_t u0 = 1; + mp_limb_signed_t v0 = 0; + mp_limb_signed_t u1 = 0; + mp_limb_signed_t v1 = 1; + + ASSERT (a > 0); + ASSERT (b > 0); + + if (a < b) + goto divide_by_b; + + for (;;) + { + mp_limb_t q; + + q = a / b; + a -= q * b; + + if (a == 0) + { + *up = u1; + *vp = v1; + return b; + } + u0 -= q * u1; + v0 -= q * v1; + + divide_by_b: + q = b / a; + b -= q * a; + + if (b == 0) + { + *up = u0; + *vp = v0; + return a; + } + u1 -= q * u0; + v1 -= q * v0; + } +} +#endif /* !GCDEXT_1_USE_BINARY */ diff --git a/gmp-6.3.0/mpn/generic/gcdext_lehmer.c b/gmp-6.3.0/mpn/generic/gcdext_lehmer.c new file mode 100644 index 0000000..ea4e86d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/gcdext_lehmer.c @@ -0,0 +1,336 @@ +/* mpn_gcdext -- Extended Greatest Common Divisor. + +Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation, +Inc. 
+ +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Here, d is the index of the cofactor to update. FIXME: Could use qn + = 0 for the common case q = 1. */ +void +mpn_gcdext_hook (void *p, mp_srcptr gp, mp_size_t gn, + mp_srcptr qp, mp_size_t qn, int d) +{ + struct gcdext_ctx *ctx = (struct gcdext_ctx *) p; + mp_size_t un = ctx->un; + + if (gp) + { + mp_srcptr up; + + ASSERT (gn > 0); + ASSERT (gp[gn-1] > 0); + + MPN_COPY (ctx->gp, gp, gn); + ctx->gn = gn; + + if (d < 0) + { + int c; + + /* Must return the smallest cofactor, +u1 or -u0 */ + MPN_CMP (c, ctx->u0, ctx->u1, un); + ASSERT (c != 0 || (un == 1 && ctx->u0[0] == 1 && ctx->u1[0] == 1)); + + d = c < 0; + } + + up = d ? ctx->u0 : ctx->u1; + + MPN_NORMALIZE (up, un); + MPN_COPY (ctx->up, up, un); + + *ctx->usize = d ? -un : un; + } + else + { + mp_limb_t cy; + mp_ptr u0 = ctx->u0; + mp_ptr u1 = ctx->u1; + + ASSERT (d >= 0); + + if (d) + MP_PTR_SWAP (u0, u1); + + qn -= (qp[qn-1] == 0); + + /* Update u0 += q * u1 */ + if (qn == 1) + { + mp_limb_t q = qp[0]; + + if (q == 1) + /* A common case. */ + cy = mpn_add_n (u0, u0, u1, un); + else + cy = mpn_addmul_1 (u0, u1, un, q); + } + else + { + mp_size_t u1n; + mp_ptr tp; + + u1n = un; + MPN_NORMALIZE (u1, u1n); + + if (u1n == 0) + return; + + /* Should always have u1n == un here, and u1 >= u0. The + reason is that we alternate adding u0 to u1 and u1 to u0 + (corresponding to subtractions a - b and b - a), and we + can get a large quotient only just after a switch, which + means that we'll add (a multiple of) the larger u to the + smaller. */ + + tp = ctx->tp; + + if (qn > u1n) + mpn_mul (tp, qp, qn, u1, u1n); + else + mpn_mul (tp, u1, u1n, qp, qn); + + u1n += qn; + u1n -= tp[u1n-1] == 0; + + if (u1n >= un) + { + cy = mpn_add (u0, tp, u1n, u0, un); + un = u1n; + } + else + /* Note: Unlikely case, maybe never happens? */ + cy = mpn_add (u0, u0, un, tp, u1n); + + } + u0[un] = cy; + ctx->un = un + (cy > 0); + } +} + +/* Temporary storage: 3*(n+1) for u. If hgcd2 succeeds, we need n for + the matrix-vector multiplication adjusting a, b. If hgcd fails, we + need at most n for the quotient and n+1 for the u update (reusing + the extra u). In all, 4n + 3. 
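+   As a non-normative cross-check of that accounting, the bound can be
+   written out directly; the helper below is an illustrative sketch
+   only, kept inert, and is not part of GMP's interface.  */
+
+#if 0
+/* 3*(n+1) limbs for the cofactors u0, u1, u2, plus at most n limbs for
+   a quotient; the n+1-limb u update reuses the spare cofactor.  */
+static mp_size_t
+gcdext_lehmer_n_scratch_bound (mp_size_t n)
+{
+  return 3 * (n + 1) + n;	/* = 4n + 3 */
+}
+#endif
+
+/*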
*/ + +mp_size_t +mpn_gcdext_lehmer_n (mp_ptr gp, mp_ptr up, mp_size_t *usize, + mp_ptr ap, mp_ptr bp, mp_size_t n, + mp_ptr tp) +{ + mp_size_t ualloc = n + 1; + + /* Keeps track of the second row of the reduction matrix + * + * M = (v0, v1 ; u0, u1) + * + * which correspond to the first column of the inverse + * + * M^{-1} = (u1, -v1; -u0, v0) + * + * This implies that + * + * a = u1 A (mod B) + * b = -u0 A (mod B) + * + * where A, B denotes the input values. + */ + + struct gcdext_ctx ctx; + mp_size_t un; + mp_ptr u0; + mp_ptr u1; + mp_ptr u2; + + MPN_ZERO (tp, 3*ualloc); + u0 = tp; tp += ualloc; + u1 = tp; tp += ualloc; + u2 = tp; tp += ualloc; + + u1[0] = 1; un = 1; + + ctx.gp = gp; + ctx.up = up; + ctx.usize = usize; + + /* FIXME: Handle n == 2 differently, after the loop? */ + while (n >= 2) + { + struct hgcd_matrix1 M; + mp_limb_t ah, al, bh, bl; + mp_limb_t mask; + + mask = ap[n-1] | bp[n-1]; + ASSERT (mask > 0); + + if (mask & GMP_NUMB_HIGHBIT) + { + ah = ap[n-1]; al = ap[n-2]; + bh = bp[n-1]; bl = bp[n-2]; + } + else if (n == 2) + { + /* We use the full inputs without truncation, so we can + safely shift left. */ + int shift; + + count_leading_zeros (shift, mask); + ah = MPN_EXTRACT_NUMB (shift, ap[1], ap[0]); + al = ap[0] << shift; + bh = MPN_EXTRACT_NUMB (shift, bp[1], bp[0]); + bl = bp[0] << shift; + } + else + { + int shift; + + count_leading_zeros (shift, mask); + ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]); + al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]); + bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]); + bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]); + } + + /* Try an mpn_nhgcd2 step */ + if (mpn_hgcd2 (ah, al, bh, bl, &M)) + { + n = mpn_matrix22_mul1_inverse_vector (&M, tp, ap, bp, n); + MP_PTR_SWAP (ap, tp); + un = mpn_hgcd_mul_matrix1_vector(&M, u2, u0, u1, un); + MP_PTR_SWAP (u0, u2); + } + else + { + /* mpn_hgcd2 has failed. Then either one of a or b is very + small, or the difference is very small. Perform one + subtraction followed by one division. */ + ctx.u0 = u0; + ctx.u1 = u1; + ctx.tp = u2; + ctx.un = un; + + /* Temporary storage n for the quotient and ualloc for the + new cofactor. */ + n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp); + if (n == 0) + return ctx.gn; + + un = ctx.un; + } + } + ASSERT_ALWAYS (ap[0] > 0); + ASSERT_ALWAYS (bp[0] > 0); + + if (ap[0] == bp[0]) + { + int c; + + /* Which cofactor to return now? Candidates are +u1 and -u0, + depending on which of a and b was most recently reduced, + which we don't keep track of. So compare and get the smallest + one. */ + + gp[0] = ap[0]; + + MPN_CMP (c, u0, u1, un); + ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1)); + if (c < 0) + { + MPN_NORMALIZE (u0, un); + MPN_COPY (up, u0, un); + *usize = -un; + } + else + { + MPN_NORMALIZE_NOT_ZERO (u1, un); + MPN_COPY (up, u1, un); + *usize = un; + } + return 1; + } + else + { + mp_limb_t uh, vh; + mp_limb_signed_t u; + mp_limb_signed_t v; + int negate; + + gp[0] = mpn_gcdext_1 (&u, &v, ap[0], bp[0]); + + /* Set up = u u1 - v u0. Keep track of size, un grows by one or + two limbs. 
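+	 (one limb from the single-limb factors u and v, and possibly a
+	 second from the carry out of their addition; the uh/vh test
+	 below appends whichever parts are nonzero)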
*/ + + if (u == 0) + { + ASSERT (v == 1); + MPN_NORMALIZE (u0, un); + MPN_COPY (up, u0, un); + *usize = -un; + return 1; + } + else if (v == 0) + { + ASSERT (u == 1); + MPN_NORMALIZE (u1, un); + MPN_COPY (up, u1, un); + *usize = un; + return 1; + } + else if (u > 0) + { + negate = 0; + ASSERT (v < 0); + v = -v; + } + else + { + negate = 1; + ASSERT (v > 0); + u = -u; + } + + uh = mpn_mul_1 (up, u1, un, u); + vh = mpn_addmul_1 (up, u0, un, v); + + if ( (uh | vh) > 0) + { + uh += vh; + up[un++] = uh; + if (uh < vh) + up[un++] = 1; + } + + MPN_NORMALIZE_NOT_ZERO (up, un); + + *usize = negate ? -un : un; + return 1; + } +} diff --git a/gmp-6.3.0/mpn/generic/get_d.c b/gmp-6.3.0/mpn/generic/get_d.c new file mode 100644 index 0000000..8bef128 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/get_d.c @@ -0,0 +1,438 @@ +/* mpn_get_d -- limbs to double conversion. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2003, 2004, 2007, 2009, 2010, 2012, 2018 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "config.h" + +#if HAVE_FLOAT_H +#include /* for DBL_MANT_DIG and FLT_RADIX */ +#endif + +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef _GMP_IEEE_FLOATS +#define _GMP_IEEE_FLOATS 0 +#endif + +/* To force use of the generic C code for testing, put + "#define _GMP_IEEE_FLOATS 0" at this point. */ + + +/* In alpha gcc prior to 3.4, signed DI comparisons involving constants are + rearranged from "x < n" to "x+(-n) < 0", which is of course hopelessly + wrong if that addition overflows. + + The workaround here avoids this bug by ensuring n is not a literal constant. + Note that this is alpha specific. The offending transformation is/was in + alpha.c alpha_emit_conditional_branch() under "We want to use cmpcc/bcc". + + Bizarrely, this happens also with Cray cc on alphaev5-cray-unicosmk2.0.6.X, + and has the same solution. Don't know why or how. */ + +#if HAVE_HOST_CPU_FAMILY_alpha \ + && ((defined (__GNUC__) && ! __GMP_GNUC_PREREQ(3,4)) \ + || defined (_CRAY)) +static volatile const long CONST_1024 = 1024; +static volatile const long CONST_NEG_1023 = -1023; +static volatile const long CONST_NEG_1022_SUB_53 = -1022 - 53; +#else +#define CONST_1024 (1024) +#define CONST_NEG_1023 (-1023) +#define CONST_NEG_1022_SUB_53 (-1022 - 53) +#endif + + +/* Return the value {ptr,size}*2^exp, and negative if sign<0. Must have + size>=1, and a non-zero high limb ptr[size-1]. + + When we know the fp format, the result is truncated towards zero. 
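+   A minimal sketch of that truncation, assuming 64-bit limbs, IEEE
+   double and GCC builtins (illustrative only, kept inert; this is not
+   GMP code):  */
+
+#if 0
+#include <stdint.h>
+#include <math.h>
+/* Truncate hi*2^64 + lo towards zero by clearing every bit below the
+   top 53 before converting; both terms are then exact as doubles, and
+   so is their sum.  */
+static double
+u128_to_double_trunc (uint64_t hi, uint64_t lo)
+{
+  int msb, drop;
+  if (hi == 0 && lo == 0)
+    return 0.0;
+  msb = (hi != 0) ? 127 - __builtin_clzll (hi) : 63 - __builtin_clzll (lo);
+  drop = msb - 52;		/* bits that cannot fit in the mantissa */
+  if (drop > 0)
+    {
+      if (drop >= 64)
+	{
+	  lo = 0;
+	  hi &= ~0ULL << (drop - 64);
+	}
+      else
+	lo &= ~0ULL << drop;
+    }
+  return ldexp ((double) hi, 64) + (double) lo;
+}
+#endif
+
+/*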
This is + consistent with other gmp conversions, like mpz_set_f or mpz_set_q, and is + easy to implement and test. + + When we do not know the format, such truncation seems much harder. One + would need to defeat any rounding mode, including round-up. + + It's felt that GMP is not primarily concerned with hardware floats, and + really isn't enhanced by getting involved with hardware rounding modes + (which could even be some weird unknown style), so something unambiguous and + straightforward is best. + + + The IEEE code below is the usual case, it knows either a 32-bit or 64-bit + limb and is done with shifts and masks. The 64-bit case in particular + should come out nice and compact. + + The generic code used to work one bit at a time, which was not only slow, + but implicitly relied upon denorms for intermediates, since the lowest bits' + weight of a perfectly valid fp number underflows in non-denorm. Therefore, + the generic code now works limb-per-limb, initially creating a number x such + that 1 <= x <= BASE. (BASE is reached only as result of rounding.) Then + x's exponent is scaled with explicit code (not ldexp to avoid libm + dependency). It is a tap-dance to avoid underflow or overflow, beware! + + + Traps: + + Hardware traps for overflow to infinity, underflow to zero, or unsupported + denorms may or may not be taken. The IEEE code works bitwise and so + probably won't trigger them, the generic code works by float operations and + so probably will. This difference might be thought less than ideal, but + again its felt straightforward code is better than trying to get intimate + with hardware exceptions (of perhaps unknown nature). + + + Not done: + + mpz_get_d in the past handled size==1 with a cast limb->double. This might + still be worthwhile there (for up to the mantissa many bits), but for + mpn_get_d here, the cost of applying "exp" to the resulting exponent would + probably use up any benefit a cast may have over bit twiddling. Also, if + the exponent is pushed into denorm range then bit twiddling is the only + option, to ensure the desired truncation is obtained. + + + Other: + + For reference, note that HPPA 8000, 8200, 8500 and 8600 trap FCNV,UDW,DBL + to the kernel for values >= 2^63. This makes it slow, and worse the kernel + Linux (what versions?) apparently uses untested code in its trap handling + routines, and gets the sign wrong. We don't use such a limb-to-double + cast, neither in the IEEE or generic code. */ + + + +#undef FORMAT_RECOGNIZED + +double +mpn_get_d (mp_srcptr up, mp_size_t size, mp_size_t sign, long exp) +{ + int lshift, nbits; + mp_limb_t x, mhi, mlo; + + ASSERT (size >= 0); + ASSERT_MPN (up, size); + ASSERT (size == 0 || up[size-1] != 0); + + if (size == 0) + return 0.0; + + /* Adjust exp to a radix point just above {up,size}, guarding against + overflow. After this exp can of course be reduced to anywhere within + the {up,size} region without underflow. 
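+     (The test below compares GMP_NUMB_BITS * size against
+     LONG_MAX - exp instead of forming exp + GMP_NUMB_BITS * size, so
+     the check itself cannot overflow.)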
*/ + if (UNLIKELY ((unsigned long) (GMP_NUMB_BITS * size) + > ((unsigned long) LONG_MAX - exp))) + { +#if _GMP_IEEE_FLOATS + goto ieee_infinity; +#endif + + /* generic */ + exp = LONG_MAX; + } + else + { + exp += GMP_NUMB_BITS * size; + } + +#if _GMP_IEEE_FLOATS + { + union ieee_double_extract u; + + up += size; + +#if GMP_LIMB_BITS == 64 + mlo = up[-1]; + count_leading_zeros (lshift, mlo); + + exp -= (lshift - GMP_NAIL_BITS) + 1; + mlo <<= lshift; + + nbits = GMP_LIMB_BITS - lshift; + + if (nbits < 53 && size > 1) + { + x = up[-2]; + x <<= GMP_NAIL_BITS; + x >>= nbits; + mlo |= x; + nbits += GMP_NUMB_BITS; + + if (LIMBS_PER_DOUBLE >= 3 && nbits < 53 && size > 2) + { + x = up[-3]; + x <<= GMP_NAIL_BITS; + x >>= nbits; + mlo |= x; + nbits += GMP_NUMB_BITS; + } + } + mhi = mlo >> (32 + 11); + mlo = mlo >> 11; /* later implicitly truncated to 32 bits */ +#endif +#if GMP_LIMB_BITS == 32 + x = *--up; + count_leading_zeros (lshift, x); + + exp -= (lshift - GMP_NAIL_BITS) + 1; + x <<= lshift; + mhi = x >> 11; + + if (lshift < 11) /* FIXME: never true if NUMB < 20 bits */ + { + /* All 20 bits in mhi */ + mlo = x << 21; + /* >= 1 bit in mlo */ + nbits = GMP_LIMB_BITS - lshift - 21; + } + else + { + if (size > 1) + { + nbits = GMP_LIMB_BITS - lshift; + + x = *--up, size--; + x <<= GMP_NAIL_BITS; + mhi |= x >> nbits >> 11; + + mlo = x << (GMP_LIMB_BITS - nbits - 11); + nbits = nbits + 11 - GMP_NAIL_BITS; + } + else + { + mlo = 0; + goto done; + } + } + + /* Now all needed bits in mhi have been accumulated. Add bits to mlo. */ + + if (LIMBS_PER_DOUBLE >= 2 && nbits < 32 && size > 1) + { + x = up[-1]; + x <<= GMP_NAIL_BITS; + x >>= nbits; + mlo |= x; + nbits += GMP_NUMB_BITS; + + if (LIMBS_PER_DOUBLE >= 3 && nbits < 32 && size > 2) + { + x = up[-2]; + x <<= GMP_NAIL_BITS; + x >>= nbits; + mlo |= x; + nbits += GMP_NUMB_BITS; + + if (LIMBS_PER_DOUBLE >= 4 && nbits < 32 && size > 3) + { + x = up[-3]; + x <<= GMP_NAIL_BITS; + x >>= nbits; + mlo |= x; + nbits += GMP_NUMB_BITS; + } + } + } + + done:; + +#endif + if (UNLIKELY (exp >= CONST_1024)) + { + /* overflow, return infinity */ + ieee_infinity: + mhi = 0; + mlo = 0; + exp = 1024; + } + else if (UNLIKELY (exp <= CONST_NEG_1023)) + { + int rshift; + + if (LIKELY (exp <= CONST_NEG_1022_SUB_53)) + return 0.0; /* denorm underflows to zero */ + + rshift = -1022 - exp; + ASSERT (rshift > 0 && rshift < 53); +#if GMP_LIMB_BITS > 53 + mlo >>= rshift; + mhi = mlo >> 32; +#else + if (rshift >= 32) + { + mlo = mhi; + mhi = 0; + rshift -= 32; + } + lshift = GMP_LIMB_BITS - rshift; + mlo = (mlo >> rshift) | (rshift == 0 ? 
0 : mhi << lshift); + mhi >>= rshift; +#endif + exp = -1023; + } + u.s.manh = mhi; + u.s.manl = mlo; + u.s.exp = exp + 1023; + u.s.sig = (sign < 0); + return u.d; + } +#define FORMAT_RECOGNIZED 1 +#endif + +#if HAVE_DOUBLE_VAX_D + { + union double_extract u; + + up += size; + + mhi = up[-1]; + + count_leading_zeros (lshift, mhi); + exp -= lshift; + mhi <<= lshift; + + mlo = 0; + if (size > 1) + { + mlo = up[-2]; + if (lshift != 0) + mhi += mlo >> (GMP_LIMB_BITS - lshift); + mlo <<= lshift; + + if (size > 2 && lshift > 8) + { + x = up[-3]; + mlo += x >> (GMP_LIMB_BITS - lshift); + } + } + + if (UNLIKELY (exp >= 128)) + { + /* overflow, return maximum number */ + mhi = 0xffffffff; + mlo = 0xffffffff; + exp = 127; + } + else if (UNLIKELY (exp < -128)) + { + return 0.0; /* underflows to zero */ + } + + u.s.man3 = mhi >> 24; /* drop msb, since implicit */ + u.s.man2 = mhi >> 8; + u.s.man1 = (mhi << 8) + (mlo >> 24); + u.s.man0 = mlo >> 8; + u.s.exp = exp + 128; + u.s.sig = sign < 0; + return u.d; + } +#define FORMAT_RECOGNIZED 1 +#endif + +#if ! FORMAT_RECOGNIZED + +#if !defined(GMP_DBL_MANT_BITS) +#if defined(DBL_MANT_DIG) && FLT_RADIX == 2 +#define GMP_DBL_MANT_BITS DBL_MANT_DIG +#else +/* FIXME: Chose a smarter default value. */ +#define GMP_DBL_MANT_BITS (16 * sizeof (double)) +#endif +#endif + + { /* Non-IEEE or strange limb size, generically convert + GMP_DBL_MANT_BITS bits. */ + mp_limb_t l; + int m; + mp_size_t i; + double d, weight; + unsigned long uexp; + + /* First generate an fp number disregarding exp, instead keeping things + within the numb base factor from 1, which should prevent overflow and + underflow even for the most exponent limited fp formats. */ + i = size - 1; + l = up[i]; + count_leading_zeros (m, l); + m = m + GMP_DBL_MANT_BITS - GMP_LIMB_BITS; + if (m < 0) + l &= GMP_NUMB_MAX << -m; + d = l; + for (weight = 1/MP_BASE_AS_DOUBLE; m > 0 && --i >= 0;) + { + l = up[i]; + m -= GMP_NUMB_BITS; + if (m < 0) + l &= GMP_NUMB_MAX << -m; + d += l * weight; + weight /= MP_BASE_AS_DOUBLE; + if (weight == 0) + break; + } + + /* Now apply exp. */ + exp -= GMP_NUMB_BITS; + if (exp > 0) + { + weight = 2.0; + uexp = exp; + } + else + { + weight = 0.5; + uexp = NEG_CAST (unsigned long, exp); + } +#if 1 + /* Square-and-multiply exponentiation. */ + if (uexp & 1) + d *= weight; + while (uexp >>= 1) + { + weight *= weight; + if (uexp & 1) + d *= weight; + } +#else + /* Plain exponentiation. */ + while (uexp > 0) + { + d *= weight; + uexp--; + } +#endif + + return sign >= 0 ? d : -d; + } +#endif +} diff --git a/gmp-6.3.0/mpn/generic/get_str.c b/gmp-6.3.0/mpn/generic/get_str.c new file mode 100644 index 0000000..19cc581 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/get_str.c @@ -0,0 +1,451 @@ +/* mpn_get_str -- Convert {UP,USIZE} to a base BASE string in STR. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE, EXCEPT mpn_get_str, ARE INTERNAL WITH MUTABLE + INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. + IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A + FUTURE GNU MP RELEASE. + +Copyright 1991-2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Conversion of U {up,un} to a string in base b.  Internally, we convert to
+   base B = b^m, the largest power of b that fits a limb.  Basic algorithms:
+
+  A) Divide U repeatedly by B, generating a quotient and remainder, until the
+     quotient becomes zero.  The remainders hold the converted digits.  Digits
+     come out from right to left.  (Used in mpn_bc_get_str.)
+
+  B) Divide U by b^g, for g such that 1/b <= U/b^g < 1, generating a fraction.
+     Then develop digits by multiplying the fraction repeatedly by b.  Digits
+     come out from left to right.  (Currently not used herein, except in code
+     for converting single limbs to individual digits.)
+
+  C) Compute B^1, B^2, B^4, ..., B^s, for s such that B^s is just above
+     sqrt(U).  Then divide U by B^s, generating quotient and remainder.
+     Recursively convert the quotient, then the remainder, using the
+     precomputed powers.  Digits come out from left to right.  (Used in
+     mpn_dc_get_str.)
+
+  When using algorithm C, algorithm B might be suitable for basecase code,
+  since the required b^g power will be readily accessible.
+
+  Optimization ideas:
+  1. The recursive function of (C) could use less temporary memory.  The
+     powtab allocation could be trimmed with some computation, and the tmp
+     area could be reduced, or perhaps eliminated if up is reused for both
+     quotient and remainder (it is currently used just for remainder).
+  2. Store the powers of (C) in normalized form, with the normalization count.
+     Quotients will usually need to be left-shifted before each divide, and
+     remainders will need to be either left-shifted or right-shifted.
+  3. In the code for developing digits from a single limb, we could avoid
+     using a full umul_ppmm except for the first (or first few) digits,
+     provided base is even.  Subsequent digits can be developed using plain
+     multiplication.  (This saves on register-starved machines (read x86) and
+     on all machines that generate the upper product half using a separate
+     instruction (alpha, powerpc, IA-64) or lack such support altogether
+     (sparc64, hppa64).)
+  4. Separate mpn_dc_get_str basecase code from code for small conversions.
+     The former code will have the exact right power readily available in the
+     powtab parameter for dividing the current number into a fraction.
+     Convert that using algorithm B.
+  5. Completely avoid division.  Compute the inverses of the powers now in
+     powtab instead of the actual powers.
+  6. Decrease powtab allocation for even bases.  E.g. for base 10 we could
+     save about 30% (1-log(5)/log(10)).
+
+  Basic structure of (C):
+    mpn_get_str:
+      if POW2_P (n)
+	...
+ else + if (un < GET_STR_PRECOMPUTE_THRESHOLD) + mpn_bx_get_str (str, base, up, un); + else + precompute_power_tables + mpn_dc_get_str + + mpn_dc_get_str: + mpn_tdiv_qr + if (qn < GET_STR_DC_THRESHOLD) + mpn_bc_get_str + else + mpn_dc_get_str + if (rn < GET_STR_DC_THRESHOLD) + mpn_bc_get_str + else + mpn_dc_get_str + + + The reason for the two threshold values is the cost of + precompute_power_tables. GET_STR_PRECOMPUTE_THRESHOLD will be + considerably larger than GET_STR_DC_THRESHOLD. */ + + +/* The x86s and m68020 have a quotient and remainder "div" instruction and + gcc recognises an adjacent "/" and "%" can be combined using that. + Elsewhere "/" and "%" are either separate instructions, or separate + libgcc calls (which unfortunately gcc as of version 3.0 doesn't combine). + A multiply and subtract should be faster than a "%" in those cases. */ +#if HAVE_HOST_CPU_FAMILY_x86 \ + || HAVE_HOST_CPU_m68020 \ + || HAVE_HOST_CPU_m68030 \ + || HAVE_HOST_CPU_m68040 \ + || HAVE_HOST_CPU_m68060 \ + || HAVE_HOST_CPU_m68360 /* CPU32 */ +#define udiv_qrnd_unnorm(q,r,n,d) \ + do { \ + mp_limb_t __q = (n) / (d); \ + mp_limb_t __r = (n) % (d); \ + (q) = __q; \ + (r) = __r; \ + } while (0) +#else +#define udiv_qrnd_unnorm(q,r,n,d) \ + do { \ + mp_limb_t __q = (n) / (d); \ + mp_limb_t __r = (n) - __q*(d); \ + (q) = __q; \ + (r) = __r; \ + } while (0) +#endif + + +/* Convert {up,un} to a string in base base, and put the result in str. + Generate len characters, possibly padding with zeros to the left. If len is + zero, generate as many characters as required. Return a pointer immediately + after the last digit of the result string. Complexity is O(un^2); intended + for small conversions. */ +static unsigned char * +mpn_bc_get_str (unsigned char *str, size_t len, + mp_ptr up, mp_size_t un, int base) +{ + mp_limb_t rl, ul; + unsigned char *s; + size_t l; + /* Allocate memory for largest possible string, given that we only get here + for operands with un < GET_STR_PRECOMPUTE_THRESHOLD and that the smallest + base is 3. 7/11 is an approximation to 1/log2(3). */ +#if TUNE_PROGRAM_BUILD +#define BUF_ALLOC (GET_STR_THRESHOLD_LIMIT * GMP_LIMB_BITS * 7 / 11) +#else +#define BUF_ALLOC (GET_STR_PRECOMPUTE_THRESHOLD * GMP_LIMB_BITS * 7 / 11) +#endif + unsigned char buf[BUF_ALLOC]; +#if TUNE_PROGRAM_BUILD + mp_limb_t rp[GET_STR_THRESHOLD_LIMIT]; +#else + mp_limb_t rp[GET_STR_PRECOMPUTE_THRESHOLD]; +#endif + + if (base == 10) + { + /* Special case code for base==10 so that the compiler has a chance to + optimize things. */ + + MPN_COPY (rp + 1, up, un); + + s = buf + BUF_ALLOC; + while (un > 1) + { + int i; + mp_limb_t frac, digit; + MPN_DIVREM_OR_PREINV_DIVREM_1 (rp, (mp_size_t) 1, rp + 1, un, + MP_BASES_BIG_BASE_10, + MP_BASES_BIG_BASE_INVERTED_10, + MP_BASES_NORMALIZATION_STEPS_10); + un -= rp[un] == 0; + frac = (rp[0] + 1) << GMP_NAIL_BITS; + s -= MP_BASES_CHARS_PER_LIMB_10; +#if HAVE_HOST_CPU_FAMILY_x86 + /* The code below turns out to be a bit slower for x86 using gcc. + Use plain code. */ + i = MP_BASES_CHARS_PER_LIMB_10; + do + { + umul_ppmm (digit, frac, frac, 10); + *s++ = digit; + } + while (--i); +#else + /* Use the fact that 10 in binary is 1010, with the lowest bit 0. + After a few umul_ppmm, we will have accumulated enough low zeros + to use a plain multiply. 
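+	     (Each multiply by 10 = 2*5 contributes one more low zero
+	     bit; once four have accumulated, the fraction fits in
+	     GMP_LIMB_BITS - 4 bits, so the plain frac *= 10 below cannot
+	     overflow and each digit is read from the top four bits.)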
*/ + if (MP_BASES_NORMALIZATION_STEPS_10 == 0) + { + umul_ppmm (digit, frac, frac, 10); + *s++ = digit; + } + if (MP_BASES_NORMALIZATION_STEPS_10 <= 1) + { + umul_ppmm (digit, frac, frac, 10); + *s++ = digit; + } + if (MP_BASES_NORMALIZATION_STEPS_10 <= 2) + { + umul_ppmm (digit, frac, frac, 10); + *s++ = digit; + } + if (MP_BASES_NORMALIZATION_STEPS_10 <= 3) + { + umul_ppmm (digit, frac, frac, 10); + *s++ = digit; + } + i = (MP_BASES_CHARS_PER_LIMB_10 - ((MP_BASES_NORMALIZATION_STEPS_10 < 4) + ? (4-MP_BASES_NORMALIZATION_STEPS_10) + : 0)); + frac = (frac + 0xf) >> 4; + do + { + frac *= 10; + digit = frac >> (GMP_LIMB_BITS - 4); + *s++ = digit; + frac &= (~(mp_limb_t) 0) >> 4; + } + while (--i); +#endif + s -= MP_BASES_CHARS_PER_LIMB_10; + } + + ul = rp[1]; + while (ul != 0) + { + udiv_qrnd_unnorm (ul, rl, ul, 10); + *--s = rl; + } + } + else /* not base 10 */ + { + unsigned chars_per_limb; + mp_limb_t big_base, big_base_inverted; + unsigned normalization_steps; + + chars_per_limb = mp_bases[base].chars_per_limb; + big_base = mp_bases[base].big_base; + big_base_inverted = mp_bases[base].big_base_inverted; + count_leading_zeros (normalization_steps, big_base); + + MPN_COPY (rp + 1, up, un); + + s = buf + BUF_ALLOC; + while (un > 1) + { + int i; + mp_limb_t frac; + MPN_DIVREM_OR_PREINV_DIVREM_1 (rp, (mp_size_t) 1, rp + 1, un, + big_base, big_base_inverted, + normalization_steps); + un -= rp[un] == 0; + frac = (rp[0] + 1) << GMP_NAIL_BITS; + s -= chars_per_limb; + i = chars_per_limb; + do + { + mp_limb_t digit; + umul_ppmm (digit, frac, frac, base); + *s++ = digit; + } + while (--i); + s -= chars_per_limb; + } + + ul = rp[1]; + while (ul != 0) + { + udiv_qrnd_unnorm (ul, rl, ul, base); + *--s = rl; + } + } + + l = buf + BUF_ALLOC - s; + while (l < len) + { + *str++ = 0; + len--; + } + while (l != 0) + { + *str++ = *s++; + l--; + } + return str; +} + + +/* Convert {UP,UN} to a string with a base as represented in POWTAB, and put + the string in STR. Generate LEN characters, possibly padding with zeros to + the left. If LEN is zero, generate as many characters as required. + Return a pointer immediately after the last digit of the result string. + This uses divide-and-conquer and is intended for large conversions. */ +static unsigned char * +mpn_dc_get_str (unsigned char *str, size_t len, + mp_ptr up, mp_size_t un, + const powers_t *powtab, mp_ptr tmp) +{ + if (BELOW_THRESHOLD (un, GET_STR_DC_THRESHOLD)) + { + if (un != 0) + str = mpn_bc_get_str (str, len, up, un, powtab->base); + else + { + while (len != 0) + { + *str++ = 0; + len--; + } + } + } + else + { + mp_ptr pwp, qp, rp; + mp_size_t pwn, qn; + mp_size_t sn; + + pwp = powtab->p; + pwn = powtab->n; + sn = powtab->shift; + + if (un < pwn + sn || (un == pwn + sn && mpn_cmp (up + sn, pwp, un - sn) < 0)) + { + str = mpn_dc_get_str (str, len, up, un, powtab - 1, tmp); + } + else + { + qp = tmp; /* (un - pwn + 1) limbs for qp */ + rp = up; /* pwn limbs for rp; overwrite up area */ + + mpn_tdiv_qr (qp, rp + sn, 0L, up + sn, un - sn, pwp, pwn); + qn = un - sn - pwn; qn += qp[qn] != 0; /* quotient size */ + + ASSERT (qn < pwn + sn || (qn == pwn + sn && mpn_cmp (qp + sn, pwp, pwn) < 0)); + + if (len != 0) + len = len - powtab->digits_in_base; + + str = mpn_dc_get_str (str, len, qp, qn, powtab - 1, tmp + qn); + str = mpn_dc_get_str (str, powtab->digits_in_base, rp, pwn + sn, powtab - 1, tmp); + } + } + return str; +} + +/* There are no leading zeros on the digits generated at str, but that's not + currently a documented feature. 
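+   A hypothetical caller (sketch only, kept inert; the buffer sizes and
+   values here are assumptions, not GMP documentation) would convert
+   the digit values to characters itself:  */
+
+#if 0
+static void
+get_str_usage_sketch (void)
+{
+  mp_limb_t u[2] = { 123456789, 1 };	/* high limb must be nonzero */
+  unsigned char digits[48];		/* ample for two 64-bit limbs, base 10 */
+  size_t len, i;
+
+  len = mpn_get_str (digits, 10, u, 2);	/* clobbers u */
+  for (i = 0; i < len; i++)
+    digits[i] += '0';			/* digit values -> ASCII, base <= 10 */
+}
+#endif
+
+/*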
The current mpz_out_str and mpz_get_str
+   do rely on that absence of leading zeros.  */
+
+size_t
+mpn_get_str (unsigned char *str, int base, mp_ptr up, mp_size_t un)
+{
+  mp_ptr powtab_mem;
+  powers_t powtab[GMP_LIMB_BITS];
+  int pi;
+  size_t out_len;
+  mp_ptr tmp;
+  TMP_DECL;
+
+  /* Special case zero, as the code below doesn't handle it.  */
+  if (un == 0)
+    {
+      str[0] = 0;
+      return 1;
+    }
+
+  if (POW2_P (base))
+    {
+      /* The base is a power of 2.  Convert from most significant end.  */
+      mp_limb_t n1, n0;
+      int bits_per_digit = mp_bases[base].big_base;
+      int cnt;
+      int bit_pos;
+      mp_size_t i;
+      unsigned char *s = str;
+      mp_bitcnt_t bits;
+
+      n1 = up[un - 1];
+      count_leading_zeros (cnt, n1);
+
+      /* BIT_POS should be R when input ends in least significant nibble,
+	 R + bits_per_digit * n when input ends in nth least significant
+	 nibble.  */
+
+      bits = (mp_bitcnt_t) GMP_NUMB_BITS * un - cnt + GMP_NAIL_BITS;
+      cnt = bits % bits_per_digit;
+      if (cnt != 0)
+	bits += bits_per_digit - cnt;
+      bit_pos = bits - (mp_bitcnt_t) (un - 1) * GMP_NUMB_BITS;
+
+      /* Fast loop for bit output.  */
+      i = un - 1;
+      for (;;)
+	{
+	  bit_pos -= bits_per_digit;
+	  while (bit_pos >= 0)
+	    {
+	      *s++ = (n1 >> bit_pos) & ((1 << bits_per_digit) - 1);
+	      bit_pos -= bits_per_digit;
+	    }
+	  i--;
+	  if (i < 0)
+	    break;
+	  n0 = (n1 << -bit_pos) & ((1 << bits_per_digit) - 1);
+	  n1 = up[i];
+	  bit_pos += GMP_NUMB_BITS;
+	  *s++ = n0 | (n1 >> bit_pos);
+	}
+
+      return s - str;
+    }
+
+  /* General case.  The base is not a power of 2.  */
+
+  if (BELOW_THRESHOLD (un, GET_STR_PRECOMPUTE_THRESHOLD))
+    return mpn_bc_get_str (str, (size_t) 0, up, un, base) - str;
+
+  TMP_MARK;
+
+  /* Allocate one large block for the powers of big_base.  */
+  powtab_mem = TMP_BALLOC_LIMBS (mpn_str_powtab_alloc (un));
+
+  /* Compute a table of powers, where the largest power is >= sqrt(U).  */
+  size_t ndig;
+  mp_size_t xn;
+  DIGITS_IN_BASE_PER_LIMB (ndig, un, base);
+  xn = 1 + ndig / mp_bases[base].chars_per_limb; /* FIXME: scalar integer division */
+
+  pi = 1 + mpn_compute_powtab (powtab, powtab_mem, xn, base);
+
+  /* Using our precomputed powers, now in powtab[], convert our number.  */
+  tmp = TMP_BALLOC_LIMBS (mpn_dc_get_str_itch (un));
+  out_len = mpn_dc_get_str (str, 0, up, un, powtab + (pi - 1), tmp) - str;
+  TMP_FREE;
+
+  return out_len;
+}
diff --git a/gmp-6.3.0/mpn/generic/gmp-mparam.h b/gmp-6.3.0/mpn/generic/gmp-mparam.h
new file mode 100644
index 0000000..7dc057a
--- /dev/null
+++ b/gmp-6.3.0/mpn/generic/gmp-mparam.h
@@ -0,0 +1,33 @@
+/* Generic C gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.
If not, +see https://www.gnu.org/licenses/. */ + + +/* Values for GMP_LIMB_BITS etc will be determined by ./configure and put + in config.h. */ diff --git a/gmp-6.3.0/mpn/generic/hgcd.c b/gmp-6.3.0/mpn/generic/hgcd.c new file mode 100644 index 0000000..e3e9c66 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/hgcd.c @@ -0,0 +1,182 @@ +/* hgcd.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003-2005, 2008, 2011, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* Size analysis for hgcd: + + For the recursive calls, we have n1 <= ceil(n / 2). Then the + storage need is determined by the storage for the recursive call + computing M1, and hgcd_matrix_adjust and hgcd_matrix_mul calls that use M1 + (after this, the storage needed for M1 can be recycled). + + Let S(r) denote the required storage. For M1 we need 4 * (ceil(n1/2) + 1) + = 4 * (ceil(n/4) + 1), for the hgcd_matrix_adjust call, we need n + 2, + and for the hgcd_matrix_mul, we may need 3 ceil(n/2) + 8. In total, + 4 * ceil(n/4) + 3 ceil(n/2) + 12 <= 10 ceil(n/4) + 12. + + For the recursive call, we need S(n1) = S(ceil(n/2)). + + S(n) <= 10*ceil(n/4) + 12 + S(ceil(n/2)) + <= 10*(ceil(n/4) + ... + ceil(n/2^(1+k))) + 12k + S(ceil(n/2^k)) + <= 10*(2 ceil(n/4) + k) + 12k + S(ceil(n/2^k)) + <= 20 ceil(n/4) + 22k + S(ceil(n/2^k)) +*/ + +mp_size_t +mpn_hgcd_itch (mp_size_t n) +{ + unsigned k; + int count; + mp_size_t nscaled; + + if (BELOW_THRESHOLD (n, HGCD_THRESHOLD)) + return n; + + /* Get the recursion depth. */ + nscaled = (n - 1) / (HGCD_THRESHOLD - 1); + count_leading_zeros (count, nscaled); + k = GMP_LIMB_BITS - count; + + return 20 * ((n+3) / 4) + 22 * k + HGCD_THRESHOLD; +} + +/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M + with elements of size at most (n+1)/2 - 1. Returns new size of a, + b, or zero if no reduction is possible. */ + +mp_size_t +mpn_hgcd (mp_ptr ap, mp_ptr bp, mp_size_t n, + struct hgcd_matrix *M, mp_ptr tp) +{ + mp_size_t s = n/2 + 1; + + mp_size_t nn; + int success = 0; + + if (n <= s) + /* Happens when n <= 2, a fairly uninteresting case but exercised + by the random inputs of the testsuite. 
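+       (With truncating division, n <= n/2 + 1 holds precisely for
+       n <= 2.)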
*/ + return 0; + + ASSERT ((ap[n-1] | bp[n-1]) > 0); + + ASSERT ((n+1)/2 - 1 < M->alloc); + + if (ABOVE_THRESHOLD (n, HGCD_THRESHOLD)) + { + mp_size_t n2 = (3*n)/4 + 1; + mp_size_t p = n/2; + + nn = mpn_hgcd_reduce (M, ap, bp, n, p, tp); + if (nn) + { + n = nn; + success = 1; + } + + /* NOTE: It appears this loop never runs more than once (at + least when not recursing to hgcd_appr). */ + while (n > n2) + { + /* Needs n + 1 storage */ + nn = mpn_hgcd_step (n, ap, bp, s, M, tp); + if (!nn) + return success ? n : 0; + + n = nn; + success = 1; + } + + if (n > s + 2) + { + struct hgcd_matrix M1; + mp_size_t scratch; + + p = 2*s - n + 1; + scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p); + + mpn_hgcd_matrix_init(&M1, n - p, tp); + + /* FIXME: Should use hgcd_reduce, but that may require more + scratch space, which requires review. */ + + nn = mpn_hgcd (ap + p, bp + p, n - p, &M1, tp + scratch); + if (nn > 0) + { + /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */ + ASSERT (M->n + 2 >= M1.n); + + /* Furthermore, assume M ends with a quotient (1, q; 0, 1), + then either q or q + 1 is a correct quotient, and M1 will + start with either (1, 0; 1, 1) or (2, 1; 1, 1). This + rules out the case that the size of M * M1 is much + smaller than the expected M->n + M1->n. */ + + ASSERT (M->n + M1.n < M->alloc); + + /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1) + = 2*s <= 2*(floor(n/2) + 1) <= n + 2. */ + n = mpn_hgcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch); + + /* We need a bound for of M->n + M1.n. Let n be the original + input size. Then + + ceil(n/2) - 1 >= size of product >= M.n + M1.n - 2 + + and it follows that + + M.n + M1.n <= ceil(n/2) + 1 + + Then 3*(M.n + M1.n) + 5 <= 3 * ceil(n/2) + 8 is the + amount of needed scratch space. */ + mpn_hgcd_matrix_mul (M, &M1, tp + scratch); + success = 1; + } + } + } + + for (;;) + { + /* Needs s+3 < n */ + nn = mpn_hgcd_step (n, ap, bp, s, M, tp); + if (!nn) + return success ? n : 0; + + n = nn; + success = 1; + } +} diff --git a/gmp-6.3.0/mpn/generic/hgcd2-div.h b/gmp-6.3.0/mpn/generic/hgcd2-div.h new file mode 100644 index 0000000..45ba453 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/hgcd2-div.h @@ -0,0 +1,504 @@ +/* hgcd2-div.h + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019, 2020 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef HGCD2_DIV1_METHOD +#define HGCD2_DIV1_METHOD 3 +#endif + +#ifndef HGCD2_DIV2_METHOD +#define HGCD2_DIV2_METHOD 2 +#endif + +#if HAVE_NATIVE_mpn_div_11 + +#define div1 mpn_div_11 +/* Single-limb division optimized for small quotients. + Returned value holds d0 = r, d1 = q. */ +mp_double_limb_t div1 (mp_limb_t, mp_limb_t); + +#elif HGCD2_DIV1_METHOD == 1 + +static inline mp_double_limb_t +div1 (mp_limb_t n0, mp_limb_t d0) +{ + mp_double_limb_t res; + res.d1 = n0 / d0; + res.d0 = n0 - res.d1 * d0; + + return res; +} + +#elif HGCD2_DIV1_METHOD == 2 + +static mp_double_limb_t +div1 (mp_limb_t n0, mp_limb_t d0) +{ + mp_double_limb_t res; + int ncnt, dcnt, cnt; + mp_limb_t q; + mp_limb_t mask; + + ASSERT (n0 >= d0); + + count_leading_zeros (ncnt, n0); + count_leading_zeros (dcnt, d0); + cnt = dcnt - ncnt; + + d0 <<= cnt; + + q = -(mp_limb_t) (n0 >= d0); + n0 -= d0 & q; + d0 >>= 1; + q = -q; + + while (--cnt >= 0) + { + mask = -(mp_limb_t) (n0 >= d0); + n0 -= d0 & mask; + d0 >>= 1; + q = (q << 1) - mask; + } + + res.d0 = n0; + res.d1 = q; + return res; +} + +#elif HGCD2_DIV1_METHOD == 3 + +static inline mp_double_limb_t +div1 (mp_limb_t n0, mp_limb_t d0) +{ + mp_double_limb_t res; + if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0) + || UNLIKELY (n0 >= (d0 << 3))) + { + res.d1 = n0 / d0; + res.d0 = n0 - res.d1 * d0; + } + else + { + mp_limb_t q, mask; + + d0 <<= 2; + + mask = -(mp_limb_t) (n0 >= d0); + n0 -= d0 & mask; + q = 4 & mask; + + d0 >>= 1; + mask = -(mp_limb_t) (n0 >= d0); + n0 -= d0 & mask; + q += 2 & mask; + + d0 >>= 1; + mask = -(mp_limb_t) (n0 >= d0); + n0 -= d0 & mask; + q -= mask; + + res.d0 = n0; + res.d1 = q; + } + return res; +} + +#elif HGCD2_DIV1_METHOD == 4 + +/* Table quotients. We extract the NBITS most significant bits of the + numerator limb, and the corresponding bits from the divisor limb, and use + these to form an index into the table. This method is probably only useful + for short pipelines with slow multiplication. + + Possible improvements: + + * Perhaps extract the highest NBITS of the divisor instead of the same bits + as from the numerator. That would require another count_leading_zeros, + and a post-multiply shift of the quotient. + + * Compress tables? Their values are tiny, and there are lots of zero + entries (which are never used). + + * Round the table entries more cleverly? +*/ + +#ifndef NBITS +#define NBITS 5 +#endif + +#if NBITS == 5 +/* This needs full division about 13.2% of the time. 
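+   The lookup forms its index from the top NBITS bits of each operand,
+   both scaled by the numerator's normalization count, as
+   tab[(nbi << NBITS) + dbi]; the estimate is corrected by at most one,
+   and the remaining r0 >= d0 cases, counted here, fall back to a real
+   division.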
*/ +static const unsigned char tab[512] = { +17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0, +20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0, +21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0, +22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0, +23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0, +24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0, +25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0, +26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, +27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, +28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0, +29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0, +30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0, +31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0, +32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +}; +#elif NBITS == 6 +/* This needs full division about 9.8% of the time. */ +static const unsigned char tab[2048] = { +33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 
1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0, +52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0, +53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0, +54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0, +55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0, +56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0, +57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0, +58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, +59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, +60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0, +61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0, +62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0, +63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0, +64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +}; +#else +#error No table for provided NBITS +#endif + +/* Doing tabp with a #define makes compiler warnings about pointing outside an + object go away. We used to define this as a variable. It is not clear if + e.g. (vector[100] - 10) + 10 is well- defined as per the C standard; + (vector[100] + 10) - 10 surely is and there is no sequence point so the + expressions should be equivalent. To make this safe, we might want to + define tabp as a macro with the index as an argument. Depending on the + platform, relocs might allow for assembly-time or linker-time resolution to + take place. */ +#define tabp (tab - (1 << (NBITS - 1) << NBITS)) + +static inline mp_double_limb_t +div1 (mp_limb_t n0, mp_limb_t d0) +{ + int ncnt; + size_t nbi, dbi; + mp_limb_t q0; + mp_limb_t r0; + mp_limb_t mask; + mp_double_limb_t res; + + ASSERT (n0 >= d0); /* Actually only msb position is critical. 
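+     (Both operands are shifted by the numerator's count ncnt before
+     indexing, so only the relative positions of their most significant
+     bits affect the lookup.)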
*/ + + count_leading_zeros (ncnt, n0); + nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS); + dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS); + + q0 = tabp[(nbi << NBITS) + dbi]; + r0 = n0 - q0 * d0; + mask = -(mp_limb_t) (r0 >= d0); + q0 -= mask; + r0 -= d0 & mask; + + if (UNLIKELY (r0 >= d0)) + { + q0 = n0 / d0; + r0 = n0 - q0 * d0; + } + + res.d1 = q0; + res.d0 = r0; + return res; +} + +#elif HGCD2_DIV1_METHOD == 5 + +/* Table inverses of divisors. We don't bother with suppressing the msb from + the tables. We index with the NBITS most significant divisor bits, + including the always-set highest bit, but use addressing trickery via tabp + to suppress it. + + Possible improvements: + + * Do first multiply using 32-bit operations on 64-bit computers. At least + on most Arm64 cores, that uses 3 times less resources. It also saves on + many x86-64 processors. +*/ + +#ifndef NBITS +#define NBITS 7 +#endif + +#if NBITS == 5 +/* This needs full division about 1.63% of the time. */ +static const unsigned char tab[16] = { + 63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32 +}; +#elif NBITS == 6 +/* This needs full division about 0.93% of the time. */ +static const unsigned char tab[32] = { +127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86, + 84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64 +}; +#elif NBITS == 7 +/* This needs full division about 0.49% of the time. */ +static const unsigned char tab[64] = { +255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206, +203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171, +169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146, +145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128 +}; +#elif NBITS == 8 +/* This needs full division about 0.26% of the time. */ +static const unsigned short tab[128] = { +511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457, +454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411, +408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373, +371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342, +340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315, +314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292, +291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273, +272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256 +}; +#else +#error No table for provided NBITS +#endif + +/* Doing tabp with a #define makes compiler warnings about pointing outside an + object go away. We used to define this as a variable. It is not clear if + e.g. (vector[100] - 10) + 10 is well- defined as per the C standard; + (vector[100] + 10) - 10 surely is and there is no sequence point so the + expressions should be equivalent. To make this safe, we might want to + define tabp as a macro with the index as an argument. Depending on the + platform, relocs might allow for assembly-time or linker-time resolution to + take place. 
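+   As a concrete instance: with the default NBITS == 7 the index dbi
+   (the top 7 bits of the normalized divisor) always has its high bit
+   set and therefore ranges over 64..127; tabp then expands to
+   tab - 64, so tabp[dbi] reads tab[dbi - 64] with no explicit
+   masking of the always-set bit.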
*/ +#define tabp (tab - (1 << (NBITS - 1))) + +static inline mp_double_limb_t +div1 (mp_limb_t n0, mp_limb_t d0) +{ + int ncnt, dcnt; + size_t dbi; + mp_limb_t inv; + mp_limb_t q0; + mp_limb_t r0; + mp_limb_t mask; + mp_double_limb_t res; + + count_leading_zeros (ncnt, n0); + count_leading_zeros (dcnt, d0); + + dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS); + inv = tabp[dbi]; + q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt); + r0 = n0 - q0 * d0; + mask = -(mp_limb_t) (r0 >= d0); + q0 -= mask; + r0 -= d0 & mask; + + if (UNLIKELY (r0 >= d0)) + { + q0 = n0 / d0; + r0 = n0 - q0 * d0; + } + + res.d1 = q0; + res.d0 = r0; + return res; +} + +#else +#error Unknown HGCD2_DIV1_METHOD +#endif + +#if HAVE_NATIVE_mpn_div_22 + +#define div2 mpn_div_22 +/* Two-limb division optimized for small quotients. */ +mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t); + +#elif HGCD2_DIV2_METHOD == 1 + +static mp_limb_t +div2 (mp_ptr rp, + mp_limb_t n1, mp_limb_t n0, + mp_limb_t d1, mp_limb_t d0) +{ + mp_double_limb_t rq = div1 (n1, d1); + if (UNLIKELY (rq.d1 > d1)) + { + mp_limb_t n2, q, t1, t0; + int c; + + /* Normalize */ + count_leading_zeros (c, d1); + ASSERT (c > 0); + + n2 = n1 >> (GMP_LIMB_BITS - c); + n1 = (n1 << c) | (n0 >> (GMP_LIMB_BITS - c)); + n0 <<= c; + d1 = (d1 << c) | (d0 >> (GMP_LIMB_BITS - c)); + d0 <<= c; + + udiv_qrnnd (q, n1, n2, n1, d1); + umul_ppmm (t1, t0, q, d0); + if (t1 > n1 || (t1 == n1 && t0 > n0)) + { + ASSERT (q > 0); + q--; + sub_ddmmss (t1, t0, t1, t0, d1, d0); + } + sub_ddmmss (n1, n0, n1, n0, t1, t0); + + /* Undo normalization */ + rp[0] = (n0 >> c) | (n1 << (GMP_LIMB_BITS - c)); + rp[1] = n1 >> c; + + return q; + } + else + { + mp_limb_t q, t1, t0; + n1 = rq.d0; + q = rq.d1; + umul_ppmm (t1, t0, q, d0); + if (UNLIKELY (t1 >= n1) && (t1 > n1 || t0 > n0)) + { + ASSERT (q > 0); + q--; + sub_ddmmss (t1, t0, t1, t0, d1, d0); + } + sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0); + return q; + } +} + +#elif HGCD2_DIV2_METHOD == 2 + +/* Bit-wise div2. Relies on fast count_leading_zeros. */ +static mp_limb_t +div2 (mp_ptr rp, + mp_limb_t n1, mp_limb_t n0, + mp_limb_t d1, mp_limb_t d0) +{ + mp_limb_t q = 0; + int ncnt; + int dcnt; + + count_leading_zeros (ncnt, n1); + count_leading_zeros (dcnt, d1); + dcnt -= ncnt; + + d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt)); + d0 <<= dcnt; + + do + { + mp_limb_t mask; + q <<= 1; + if (UNLIKELY (n1 == d1)) + mask = -(n0 >= d0); + else + mask = -(n1 > d1); + + q -= mask; + + sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0); + + d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1); + d1 = d1 >> 1; + } + while (dcnt--); + + rp[0] = n0; + rp[1] = n1; + + return q; +} +#else +#error Unknown HGCD2_DIV2_METHOD +#endif diff --git a/gmp-6.3.0/mpn/generic/hgcd2.c b/gmp-6.3.0/mpn/generic/hgcd2.c new file mode 100644 index 0000000..43d4d48 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/hgcd2.c @@ -0,0 +1,283 @@ +/* hgcd2.c + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#include "mpn/generic/hgcd2-div.h" + +#if GMP_NAIL_BITS != 0 +#error Nails not implemented +#endif + +/* Reduces a,b until |a-b| (almost) fits in one limb + 1 bit. Constructs + matrix M. Returns 1 if we make progress, i.e. can perform at least + one subtraction. Otherwise returns zero. */ + +/* FIXME: Possible optimizations: + + The div2 function starts with checking the most significant bit of + the numerator. We can maintained normalized operands here, call + hgcd with normalized operands only, which should make the code + simpler and possibly faster. + + Experiment with table lookups on the most significant bits. + + This function is also a candidate for assembler implementation. +*/ +int +mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl, + struct hgcd_matrix1 *M) +{ + mp_limb_t u00, u01, u10, u11; + + if (ah < 2 || bh < 2) + return 0; + + if (ah > bh || (ah == bh && al > bl)) + { + sub_ddmmss (ah, al, ah, al, bh, bl); + if (ah < 2) + return 0; + + u00 = u01 = u11 = 1; + u10 = 0; + } + else + { + sub_ddmmss (bh, bl, bh, bl, ah, al); + if (bh < 2) + return 0; + + u00 = u10 = u11 = 1; + u01 = 0; + } + + if (ah < bh) + goto subtract_a; + + for (;;) + { + ASSERT (ah >= bh); + if (ah == bh) + goto done; + + if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2))) + { + ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2)); + bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2)); + + break; + } + + /* Subtract a -= q b, and multiply M from the right by (1 q ; 0 + 1), affecting the second column of M. */ + ASSERT (ah > bh); + sub_ddmmss (ah, al, ah, al, bh, bl); + + if (ah < 2) + goto done; + + if (ah <= bh) + { + /* Use q = 1 */ + u01 += u00; + u11 += u10; + } + else + { + mp_limb_t r[2]; + mp_limb_t q = div2 (r, ah, al, bh, bl); + al = r[0]; ah = r[1]; + if (ah < 2) + { + /* A is too small, but q is correct. */ + u01 += q * u00; + u11 += q * u10; + goto done; + } + q++; + u01 += q * u00; + u11 += q * u10; + } + subtract_a: + ASSERT (bh >= ah); + if (ah == bh) + goto done; + + if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2))) + { + ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2)); + bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2)); + + goto subtract_a1; + } + + /* Subtract b -= q a, and multiply M from the right by (1 0 ; q + 1), affecting the first column of M. 
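+	 (To see why the factor lands on the right: the invariant is
+	 that the original inputs A, B satisfy (A;B) = M (a;b).  With
+	 b' = b - q a we have (a;b) = (1, 0; q, 1) (a;b'), so M picks
+	 up that factor on the right, and its first column becomes
+	 u00 + q u01, u10 + q u11, exactly the updates performed below.)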
*/ + sub_ddmmss (bh, bl, bh, bl, ah, al); + + if (bh < 2) + goto done; + + if (bh <= ah) + { + /* Use q = 1 */ + u00 += u01; + u10 += u11; + } + else + { + mp_limb_t r[2]; + mp_limb_t q = div2 (r, bh, bl, ah, al); + bl = r[0]; bh = r[1]; + if (bh < 2) + { + /* B is too small, but q is correct. */ + u00 += q * u01; + u10 += q * u11; + goto done; + } + q++; + u00 += q * u01; + u10 += q * u11; + } + } + + /* NOTE: Since we discard the least significant half limb, we don't get a + truly maximal M (corresponding to |a - b| < 2^{GMP_LIMB_BITS +1}). */ + /* Single precision loop */ + for (;;) + { + ASSERT (ah >= bh); + + ah -= bh; + if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1))) + break; + + if (ah <= bh) + { + /* Use q = 1 */ + u01 += u00; + u11 += u10; + } + else + { + mp_double_limb_t rq = div1 (ah, bh); + mp_limb_t q = rq.d1; + ah = rq.d0; + + if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1))) + { + /* A is too small, but q is correct. */ + u01 += q * u00; + u11 += q * u10; + break; + } + q++; + u01 += q * u00; + u11 += q * u10; + } + subtract_a1: + ASSERT (bh >= ah); + + bh -= ah; + if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1))) + break; + + if (bh <= ah) + { + /* Use q = 1 */ + u00 += u01; + u10 += u11; + } + else + { + mp_double_limb_t rq = div1 (bh, ah); + mp_limb_t q = rq.d1; + bh = rq.d0; + + if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1))) + { + /* B is too small, but q is correct. */ + u00 += q * u01; + u10 += q * u11; + break; + } + q++; + u00 += q * u01; + u10 += q * u11; + } + } + + done: + M->u[0][0] = u00; M->u[0][1] = u01; + M->u[1][0] = u10; M->u[1][1] = u11; + + return 1; +} + +/* Sets (r;b) = (a;b) M, with M = (u00, u01; u10, u11). Vector must + * have space for n + 1 limbs. Uses three buffers to avoid a copy*/ +mp_size_t +mpn_hgcd_mul_matrix1_vector (const struct hgcd_matrix1 *M, + mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n) +{ + mp_limb_t ah, bh; + + /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as + + r = u00 * a + r += u10 * b + b *= u11 + b += u01 * a + */ + +#if HAVE_NATIVE_mpn_addaddmul_1msb0 + ah = mpn_addaddmul_1msb0 (rp, ap, bp, n, M->u[0][0], M->u[1][0]); + bh = mpn_addaddmul_1msb0 (bp, bp, ap, n, M->u[1][1], M->u[0][1]); +#else + ah = mpn_mul_1 (rp, ap, n, M->u[0][0]); + ah += mpn_addmul_1 (rp, bp, n, M->u[1][0]); + + bh = mpn_mul_1 (bp, bp, n, M->u[1][1]); + bh += mpn_addmul_1 (bp, ap, n, M->u[0][1]); +#endif + rp[n] = ah; + bp[n] = bh; + + n += (ah | bh) > 0; + return n; +} diff --git a/gmp-6.3.0/mpn/generic/hgcd2_jacobi.c b/gmp-6.3.0/mpn/generic/hgcd2_jacobi.c new file mode 100644 index 0000000..95d4af1 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/hgcd2_jacobi.c @@ -0,0 +1,251 @@ +/* hgcd2_jacobi.c + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 1996, 1998, 2000-2004, 2008, 2011, 2020 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#include "mpn/generic/hgcd2-div.h" + +#if GMP_NAIL_BITS != 0 +#error Nails not implemented +#endif + +int +mpn_hgcd2_jacobi (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl, + struct hgcd_matrix1 *M, unsigned *bitsp) +{ + mp_limb_t u00, u01, u10, u11; + unsigned bits = *bitsp; + + if (ah < 2 || bh < 2) + return 0; + + if (ah > bh || (ah == bh && al > bl)) + { + sub_ddmmss (ah, al, ah, al, bh, bl); + if (ah < 2) + return 0; + + u00 = u01 = u11 = 1; + u10 = 0; + bits = mpn_jacobi_update (bits, 1, 1); + } + else + { + sub_ddmmss (bh, bl, bh, bl, ah, al); + if (bh < 2) + return 0; + + u00 = u10 = u11 = 1; + u01 = 0; + bits = mpn_jacobi_update (bits, 0, 1); + } + + if (ah < bh) + goto subtract_a; + + for (;;) + { + ASSERT (ah >= bh); + if (ah == bh) + goto done; + + if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2))) + { + ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2)); + bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2)); + + break; + } + + /* Subtract a -= q b, and multiply M from the right by (1 q ; 0 + 1), affecting the second column of M. */ + ASSERT (ah > bh); + sub_ddmmss (ah, al, ah, al, bh, bl); + + if (ah < 2) + goto done; + + if (ah <= bh) + { + /* Use q = 1 */ + u01 += u00; + u11 += u10; + bits = mpn_jacobi_update (bits, 1, 1); + } + else + { + mp_limb_t r[2]; + mp_limb_t q = div2 (r, ah, al, bh, bl); + al = r[0]; ah = r[1]; + if (ah < 2) + { + /* A is too small, but q is correct. */ + u01 += q * u00; + u11 += q * u10; + bits = mpn_jacobi_update (bits, 1, q & 3); + goto done; + } + q++; + u01 += q * u00; + u11 += q * u10; + bits = mpn_jacobi_update (bits, 1, q & 3); + } + subtract_a: + ASSERT (bh >= ah); + if (ah == bh) + goto done; + + if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2))) + { + ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2)); + bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2)); + + goto subtract_a1; + } + + /* Subtract b -= q a, and multiply M from the right by (1 0 ; q + 1), affecting the first column of M. */ + sub_ddmmss (bh, bl, bh, bl, ah, al); + + if (bh < 2) + goto done; + + if (bh <= ah) + { + /* Use q = 1 */ + u00 += u01; + u10 += u11; + bits = mpn_jacobi_update (bits, 0, 1); + } + else + { + mp_limb_t r[2]; + mp_limb_t q = div2 (r, bh, bl, ah, al); + bl = r[0]; bh = r[1]; + if (bh < 2) + { + /* B is too small, but q is correct. */ + u00 += q * u01; + u10 += q * u11; + bits = mpn_jacobi_update (bits, 0, q & 3); + goto done; + } + q++; + u00 += q * u01; + u10 += q * u11; + bits = mpn_jacobi_update (bits, 0, q & 3); + } + } + + /* NOTE: Since we discard the least significant half limb, we don't get a + truly maximal M (corresponding to |a - b| < 2^{GMP_LIMB_BITS +1}). 
*/ + /* Single precision loop */ + for (;;) + { + ASSERT (ah >= bh); + + ah -= bh; + if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1))) + break; + + if (ah <= bh) + { + /* Use q = 1 */ + u01 += u00; + u11 += u10; + bits = mpn_jacobi_update (bits, 1, 1); + } + else + { + mp_double_limb_t rq = div1 (ah, bh); + mp_limb_t q = rq.d1; + ah = rq.d0; + + if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1))) + { + /* A is too small, but q is correct. */ + u01 += q * u00; + u11 += q * u10; + bits = mpn_jacobi_update (bits, 1, q & 3); + break; + } + q++; + u01 += q * u00; + u11 += q * u10; + bits = mpn_jacobi_update (bits, 1, q & 3); + } + subtract_a1: + ASSERT (bh >= ah); + + bh -= ah; + if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1))) + break; + + if (bh <= ah) + { + /* Use q = 1 */ + u00 += u01; + u10 += u11; + bits = mpn_jacobi_update (bits, 0, 1); + } + else + { + mp_double_limb_t rq = div1 (bh, ah); + mp_limb_t q = rq.d1; + bh = rq.d0; + + if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1))) + { + /* B is too small, but q is correct. */ + u00 += q * u01; + u10 += q * u11; + bits = mpn_jacobi_update (bits, 0, q & 3); + break; + } + q++; + u00 += q * u01; + u10 += q * u11; + bits = mpn_jacobi_update (bits, 0, q & 3); + } + } + + done: + M->u[0][0] = u00; M->u[0][1] = u01; + M->u[1][0] = u10; M->u[1][1] = u11; + *bitsp = bits; + + return 1; +} diff --git a/gmp-6.3.0/mpn/generic/hgcd_appr.c b/gmp-6.3.0/mpn/generic/hgcd_appr.c new file mode 100644 index 0000000..bb01738 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/hgcd_appr.c @@ -0,0 +1,267 @@ +/* hgcd_appr.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Identical to mpn_hgcd_itch. FIXME: Do we really need to add + HGCD_THRESHOLD at the end? */ +mp_size_t +mpn_hgcd_appr_itch (mp_size_t n) +{ + if (BELOW_THRESHOLD (n, HGCD_APPR_THRESHOLD)) + return n; + else + { + unsigned k; + int count; + mp_size_t nscaled; + + /* Get the recursion depth. */ + nscaled = (n - 1) / (HGCD_APPR_THRESHOLD - 1); + count_leading_zeros (count, nscaled); + k = GMP_LIMB_BITS - count; + + return 20 * ((n+3) / 4) + 22 * k + HGCD_THRESHOLD; + } +} + +/* Destroys inputs. 
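+
+   A rough calling sketch (illustrative only, not a verbatim excerpt
+   from this tree; the allocations would sit inside the caller's
+   usual TMP_MARK/TMP_FREE region):
+
+     struct hgcd_matrix M;
+     mp_ptr mp = TMP_ALLOC_LIMBS (MPN_HGCD_MATRIX_INIT_ITCH (n));
+     mp_ptr scratch = TMP_ALLOC_LIMBS (mpn_hgcd_appr_itch (n));
+     int ok;
+
+     mpn_hgcd_matrix_init (&M, n, mp);
+     ok = mpn_hgcd_appr (ap, bp, n, &M, scratch);
+
+   A nonzero ok means at least one reduction step was applied; M then
+   holds the accumulated matrix, and {ap,n} and {bp,n} have been
+   overwritten, hence "destroys inputs".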
*/ +int +mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n, + struct hgcd_matrix *M, mp_ptr tp) +{ + mp_size_t s; + int success = 0; + + ASSERT (n > 0); + + ASSERT ((ap[n-1] | bp[n-1]) != 0); + + if (n <= 2) + /* Implies s = n. A fairly uninteresting case but exercised by the + random inputs of the testsuite. */ + return 0; + + ASSERT ((n+1)/2 - 1 < M->alloc); + + /* We aim for reduction of to GMP_NUMB_BITS * s bits. But each time + we discard some of the least significant limbs, we must keep one + additional bit to account for the truncation error. We maintain + the GMP_NUMB_BITS * s - extra_bits as the current target size. */ + + s = n/2 + 1; + if (BELOW_THRESHOLD (n, HGCD_APPR_THRESHOLD)) + { + unsigned extra_bits = 0; + + while (n > 2) + { + mp_size_t nn; + + ASSERT (n > s); + ASSERT (n <= 2*s); + + nn = mpn_hgcd_step (n, ap, bp, s, M, tp); + if (!nn) + break; + + n = nn; + success = 1; + + /* We can truncate and discard the lower p bits whenever nbits <= + 2*sbits - p. To account for the truncation error, we must + adjust + + sbits <-- sbits + 1 - p, + + rather than just sbits <-- sbits - p. This adjustment makes + the produced matrix slightly smaller than it could be. */ + + if (GMP_NUMB_BITS * (n + 1) + 2 * extra_bits <= 2*GMP_NUMB_BITS * s) + { + mp_size_t p = (GMP_NUMB_BITS * (2*s - n) - 2*extra_bits) / GMP_NUMB_BITS; + + if (extra_bits == 0) + { + /* We cross a limb boundary and bump s. We can't do that + if the result is that it makes makes min(U, V) + smaller than 2^{GMP_NUMB_BITS} s. */ + if (s + 1 == n + || mpn_zero_p (ap + s + 1, n - s - 1) + || mpn_zero_p (bp + s + 1, n - s - 1)) + continue; + + extra_bits = GMP_NUMB_BITS - 1; + s++; + } + else + { + extra_bits--; + } + + /* Drop the p least significant limbs */ + ap += p; bp += p; n -= p; s -= p; + } + } + + ASSERT (s > 0); + + if (extra_bits > 0) + { + /* We can get here only of we have dropped at least one of the least + significant bits, so we can decrement ap and bp. We can then shift + left extra bits using mpn_rshift. */ + /* NOTE: In the unlikely case that n is large, it would be preferable + to do an initial subdiv step to reduce the size before shifting, + but that would mean duplicating mpn_gcd_subdiv_step with a bit + count rather than a limb count. */ + ap--; bp--; + ap[0] = mpn_rshift (ap+1, ap+1, n, GMP_NUMB_BITS - extra_bits); + bp[0] = mpn_rshift (bp+1, bp+1, n, GMP_NUMB_BITS - extra_bits); + n += (ap[n] | bp[n]) > 0; + + ASSERT (success); + + while (n > 2) + { + mp_size_t nn; + + ASSERT (n > s); + ASSERT (n <= 2*s); + + nn = mpn_hgcd_step (n, ap, bp, s, M, tp); + + if (!nn) + return 1; + + n = nn; + } + } + + if (n == 2) + { + struct hgcd_matrix1 M1; + ASSERT (s == 1); + + if (mpn_hgcd2 (ap[1], ap[0], bp[1], bp[0], &M1)) + { + /* Multiply M <- M * M1 */ + mpn_hgcd_matrix_mul_1 (M, &M1, tp); + success = 1; + } + } + return success; + } + else + { + mp_size_t n2 = (3*n)/4 + 1; + mp_size_t p = n/2; + mp_size_t nn; + + nn = mpn_hgcd_reduce (M, ap, bp, n, p, tp); + if (nn) + { + n = nn; + /* FIXME: Discard some of the low limbs immediately? 
*/ + success = 1; + } + + while (n > n2) + { + mp_size_t nn; + + /* Needs n + 1 storage */ + nn = mpn_hgcd_step (n, ap, bp, s, M, tp); + if (!nn) + return success; + + n = nn; + success = 1; + } + if (n > s + 2) + { + struct hgcd_matrix M1; + mp_size_t scratch; + + p = 2*s - n + 1; + scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p); + + mpn_hgcd_matrix_init(&M1, n - p, tp); + if (mpn_hgcd_appr (ap + p, bp + p, n - p, &M1, tp + scratch)) + { + /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */ + ASSERT (M->n + 2 >= M1.n); + + /* Furthermore, assume M ends with a quotient (1, q; 0, 1), + then either q or q + 1 is a correct quotient, and M1 will + start with either (1, 0; 1, 1) or (2, 1; 1, 1). This + rules out the case that the size of M * M1 is much + smaller than the expected M->n + M1->n. */ + + ASSERT (M->n + M1.n < M->alloc); + + /* We need a bound for of M->n + M1.n. Let n be the original + input size. Then + + ceil(n/2) - 1 >= size of product >= M.n + M1.n - 2 + + and it follows that + + M.n + M1.n <= ceil(n/2) + 1 + + Then 3*(M.n + M1.n) + 5 <= 3 * ceil(n/2) + 8 is the + amount of needed scratch space. */ + mpn_hgcd_matrix_mul (M, &M1, tp + scratch); + return 1; + } + } + + for(;;) + { + mp_size_t nn; + + ASSERT (n > s); + ASSERT (n <= 2*s); + + nn = mpn_hgcd_step (n, ap, bp, s, M, tp); + + if (!nn) + return success; + + n = nn; + success = 1; + } + } +} diff --git a/gmp-6.3.0/mpn/generic/hgcd_jacobi.c b/gmp-6.3.0/mpn/generic/hgcd_jacobi.c new file mode 100644 index 0000000..24014ce --- /dev/null +++ b/gmp-6.3.0/mpn/generic/hgcd_jacobi.c @@ -0,0 +1,243 @@ +/* hgcd_jacobi.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003-2005, 2008, 2011, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* This file is almost a copy of hgcd.c, with some added calls to + mpn_jacobi_update */ + +struct hgcd_jacobi_ctx +{ + struct hgcd_matrix *M; + unsigned *bitsp; +}; + +static void +hgcd_jacobi_hook (void *p, mp_srcptr gp, mp_size_t gn, + mp_srcptr qp, mp_size_t qn, int d) +{ + ASSERT (!gp); + ASSERT (d >= 0); + + MPN_NORMALIZE (qp, qn); + if (qn > 0) + { + struct hgcd_jacobi_ctx *ctx = (struct hgcd_jacobi_ctx *) p; + /* NOTES: This is a bit ugly. A tp area is passed to + gcd_subdiv_step, which stores q at the start of that area. We + now use the rest. 
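+	 The cast (mp_ptr) qp just below discards a const qualifier;
+	 that is legitimate here only because qp genuinely points into
+	 the writable tp area described above.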
*/ + mp_ptr tp = (mp_ptr) qp + qn; + + mpn_hgcd_matrix_update_q (ctx->M, qp, qn, d, tp); + *ctx->bitsp = mpn_jacobi_update (*ctx->bitsp, d, qp[0] & 3); + } +} + +/* Perform a few steps, using some of mpn_hgcd2, subtraction and + division. Reduces the size by almost one limb or more, but never + below the given size s. Return new size for a and b, or 0 if no + more steps are possible. + + If hgcd2 succeeds, needs temporary space for hgcd_matrix_mul_1, M->n + limbs, and hgcd_mul_matrix1_inverse_vector, n limbs. If hgcd2 + fails, needs space for the quotient, qn <= n - s + 1 limbs, for and + hgcd_matrix_update_q, qn + (size of the appropriate column of M) <= + resulting size of M. + + If N is the input size to the calling hgcd, then s = floor(N/2) + + 1, M->n < N, qn + matrix size <= n - s + 1 + n - s = 2 (n - s) + 1 + < N, so N is sufficient. +*/ + +static mp_size_t +hgcd_jacobi_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s, + struct hgcd_matrix *M, unsigned *bitsp, mp_ptr tp) +{ + struct hgcd_matrix1 M1; + mp_limb_t mask; + mp_limb_t ah, al, bh, bl; + + ASSERT (n > s); + + mask = ap[n-1] | bp[n-1]; + ASSERT (mask > 0); + + if (n == s + 1) + { + if (mask < 4) + goto subtract; + + ah = ap[n-1]; al = ap[n-2]; + bh = bp[n-1]; bl = bp[n-2]; + } + else if (mask & GMP_NUMB_HIGHBIT) + { + ah = ap[n-1]; al = ap[n-2]; + bh = bp[n-1]; bl = bp[n-2]; + } + else + { + int shift; + + count_leading_zeros (shift, mask); + ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]); + al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]); + bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]); + bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]); + } + + /* Try an mpn_hgcd2 step */ + if (mpn_hgcd2_jacobi (ah, al, bh, bl, &M1, bitsp)) + { + /* Multiply M <- M * M1 */ + mpn_hgcd_matrix_mul_1 (M, &M1, tp); + + /* Can't swap inputs, so we need to copy. */ + MPN_COPY (tp, ap, n); + /* Multiply M1^{-1} (a;b) */ + return mpn_matrix22_mul1_inverse_vector (&M1, ap, tp, bp, n); + } + + subtract: + { + struct hgcd_jacobi_ctx ctx; + ctx.M = M; + ctx.bitsp = bitsp; + + return mpn_gcd_subdiv_step (ap, bp, n, s, hgcd_jacobi_hook, &ctx, tp); + } +} + +/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M + with elements of size at most (n+1)/2 - 1. Returns new size of a, + b, or zero if no reduction is possible. */ + +/* Same scratch requirements as for mpn_hgcd. */ +mp_size_t +mpn_hgcd_jacobi (mp_ptr ap, mp_ptr bp, mp_size_t n, + struct hgcd_matrix *M, unsigned *bitsp, mp_ptr tp) +{ + mp_size_t s = n/2 + 1; + + mp_size_t nn; + int success = 0; + + if (n <= s) + /* Happens when n <= 2, a fairly uninteresting case but exercised + by the random inputs of the testsuite. */ + return 0; + + ASSERT ((ap[n-1] | bp[n-1]) > 0); + + ASSERT ((n+1)/2 - 1 < M->alloc); + + if (ABOVE_THRESHOLD (n, HGCD_THRESHOLD)) + { + mp_size_t n2 = (3*n)/4 + 1; + mp_size_t p = n/2; + + nn = mpn_hgcd_jacobi (ap + p, bp + p, n - p, M, bitsp, tp); + if (nn > 0) + { + /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1) + = 2 (n - 1) */ + n = mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp); + success = 1; + } + while (n > n2) + { + /* Needs n + 1 storage */ + nn = hgcd_jacobi_step (n, ap, bp, s, M, bitsp, tp); + if (!nn) + return success ? 
n : 0; + n = nn; + success = 1; + } + + if (n > s + 2) + { + struct hgcd_matrix M1; + mp_size_t scratch; + + p = 2*s - n + 1; + scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p); + + mpn_hgcd_matrix_init(&M1, n - p, tp); + nn = mpn_hgcd_jacobi (ap + p, bp + p, n - p, &M1, bitsp, tp + scratch); + if (nn > 0) + { + /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */ + ASSERT (M->n + 2 >= M1.n); + + /* Furthermore, assume M ends with a quotient (1, q; 0, 1), + then either q or q + 1 is a correct quotient, and M1 will + start with either (1, 0; 1, 1) or (2, 1; 1, 1). This + rules out the case that the size of M * M1 is much + smaller than the expected M->n + M1->n. */ + + ASSERT (M->n + M1.n < M->alloc); + + /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1) + = 2*s <= 2*(floor(n/2) + 1) <= n + 2. */ + n = mpn_hgcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch); + + /* We need a bound for of M->n + M1.n. Let n be the original + input size. Then + + ceil(n/2) - 1 >= size of product >= M.n + M1.n - 2 + + and it follows that + + M.n + M1.n <= ceil(n/2) + 1 + + Then 3*(M.n + M1.n) + 5 <= 3 * ceil(n/2) + 8 is the + amount of needed scratch space. */ + mpn_hgcd_matrix_mul (M, &M1, tp + scratch); + success = 1; + } + } + } + + for (;;) + { + /* Needs s+3 < n */ + nn = hgcd_jacobi_step (n, ap, bp, s, M, bitsp, tp); + if (!nn) + return success ? n : 0; + + n = nn; + success = 1; + } +} diff --git a/gmp-6.3.0/mpn/generic/hgcd_matrix.c b/gmp-6.3.0/mpn/generic/hgcd_matrix.c new file mode 100644 index 0000000..54c795d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/hgcd_matrix.c @@ -0,0 +1,265 @@ +/* hgcd_matrix.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003-2005, 2008, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* For input of size n, matrix elements are of size at most ceil(n/2) + - 1, but we need two limbs extra. */ +void +mpn_hgcd_matrix_init (struct hgcd_matrix *M, mp_size_t n, mp_ptr p) +{ + mp_size_t s = (n+1)/2 + 1; + M->alloc = s; + M->n = 1; + MPN_ZERO (p, 4 * s); + M->p[0][0] = p; + M->p[0][1] = p + s; + M->p[1][0] = p + 2 * s; + M->p[1][1] = p + 3 * s; + + M->p[0][0][0] = M->p[1][1][0] = 1; +} + +/* Update column COL, adding in Q * column (1-COL). Temporary storage: + * qn + n <= M->alloc, where n is the size of the largest element in + * column 1 - COL. 
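+ * Concretely, for col == 0 and a single-limb quotient q the matrix
+ * goes from (u00, u01; u10, u11) to (u00 + q u01, u01; u10 + q u11,
+ * u11), the multi-limb analogue of the single-limb updates in
+ * hgcd2.c.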
*/ +void +mpn_hgcd_matrix_update_q (struct hgcd_matrix *M, mp_srcptr qp, mp_size_t qn, + unsigned col, mp_ptr tp) +{ + ASSERT (col < 2); + + if (qn == 1) + { + mp_limb_t q = qp[0]; + mp_limb_t c0, c1; + + c0 = mpn_addmul_1 (M->p[0][col], M->p[0][1-col], M->n, q); + c1 = mpn_addmul_1 (M->p[1][col], M->p[1][1-col], M->n, q); + + M->p[0][col][M->n] = c0; + M->p[1][col][M->n] = c1; + + M->n += (c0 | c1) != 0; + } + else + { + unsigned row; + + /* Carries for the unlikely case that we get both high words + from the multiplication and carries from the addition. */ + mp_limb_t c[2]; + mp_size_t n; + + /* The matrix will not necessarily grow in size by qn, so we + need normalization in order not to overflow M. */ + + for (n = M->n; n + qn > M->n; n--) + { + ASSERT (n > 0); + if (M->p[0][1-col][n-1] > 0 || M->p[1][1-col][n-1] > 0) + break; + } + + ASSERT (qn + n <= M->alloc); + + for (row = 0; row < 2; row++) + { + if (qn <= n) + mpn_mul (tp, M->p[row][1-col], n, qp, qn); + else + mpn_mul (tp, qp, qn, M->p[row][1-col], n); + + ASSERT (n + qn >= M->n); + c[row] = mpn_add (M->p[row][col], tp, n + qn, M->p[row][col], M->n); + } + + n += qn; + + if (c[0] | c[1]) + { + M->p[0][col][n] = c[0]; + M->p[1][col][n] = c[1]; + n++; + } + else + { + n -= (M->p[0][col][n-1] | M->p[1][col][n-1]) == 0; + ASSERT (n >= M->n); + } + M->n = n; + } + + ASSERT (M->n < M->alloc); +} + +/* Multiply M by M1 from the right. Since the M1 elements fit in + GMP_NUMB_BITS - 1 bits, M grows by at most one limb. Needs + temporary space M->n */ +void +mpn_hgcd_matrix_mul_1 (struct hgcd_matrix *M, const struct hgcd_matrix1 *M1, + mp_ptr tp) +{ + mp_size_t n0, n1; + + /* Could avoid copy by some swapping of pointers. */ + MPN_COPY (tp, M->p[0][0], M->n); + n0 = mpn_hgcd_mul_matrix1_vector (M1, M->p[0][0], tp, M->p[0][1], M->n); + MPN_COPY (tp, M->p[1][0], M->n); + n1 = mpn_hgcd_mul_matrix1_vector (M1, M->p[1][0], tp, M->p[1][1], M->n); + + /* Depends on zero initialization */ + M->n = MAX(n0, n1); + ASSERT (M->n < M->alloc); +} + +/* Multiply M by M1 from the right. Needs 3*(M->n + M1->n) + 5 limbs + of temporary storage (see mpn_matrix22_mul_itch). */ +void +mpn_hgcd_matrix_mul (struct hgcd_matrix *M, const struct hgcd_matrix *M1, + mp_ptr tp) +{ + mp_size_t n; + + /* About the new size of M:s elements. Since M1's diagonal elements + are > 0, no element can decrease. The new elements are of size + M->n + M1->n, one limb more or less. The computation of the + matrix product produces elements of size M->n + M1->n + 1. But + the true size, after normalization, may be three limbs smaller. + + The reason that the product has normalized size >= M->n + M1->n - + 2 is subtle. It depends on the fact that M and M1 can be factored + as products of (1,1; 0,1) and (1,0; 1,1), and that we can't have + M ending with a large power and M1 starting with a large power of + the same matrix. */ + + /* FIXME: Strassen multiplication gives only a small speedup. In FFT + multiplication range, this function could be sped up quite a lot + using invariance. */ + ASSERT (M->n + M1->n < M->alloc); + + ASSERT ((M->p[0][0][M->n-1] | M->p[0][1][M->n-1] + | M->p[1][0][M->n-1] | M->p[1][1][M->n-1]) > 0); + + ASSERT ((M1->p[0][0][M1->n-1] | M1->p[0][1][M1->n-1] + | M1->p[1][0][M1->n-1] | M1->p[1][1][M1->n-1]) > 0); + + mpn_matrix22_mul (M->p[0][0], M->p[0][1], + M->p[1][0], M->p[1][1], M->n, + M1->p[0][0], M1->p[0][1], + M1->p[1][0], M1->p[1][1], M1->n, tp); + + /* Index of last potentially non-zero limb, size is one greater. 
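+     The three conditional decrements that follow match the analysis
+     above: the raw product occupies M->n + M1->n + 1 limbs, but its
+     normalized size may be up to three limbs smaller.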
*/ + n = M->n + M1->n; + + n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0); + n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0); + n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0); + + ASSERT ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) > 0); + + M->n = n + 1; +} + +/* Multiplies the least significant p limbs of (a;b) by M^-1. + Temporary space needed: 2 * (p + M->n)*/ +mp_size_t +mpn_hgcd_matrix_adjust (const struct hgcd_matrix *M, + mp_size_t n, mp_ptr ap, mp_ptr bp, + mp_size_t p, mp_ptr tp) +{ + /* M^-1 (a;b) = (r11, -r01; -r10, r00) (a ; b) + = (r11 a - r01 b; - r10 a + r00 b */ + + mp_ptr t0 = tp; + mp_ptr t1 = tp + p + M->n; + mp_limb_t ah, bh; + mp_limb_t cy; + + ASSERT (p + M->n < n); + + /* First compute the two values depending on a, before overwriting a */ + + if (M->n >= p) + { + mpn_mul (t0, M->p[1][1], M->n, ap, p); + mpn_mul (t1, M->p[1][0], M->n, ap, p); + } + else + { + mpn_mul (t0, ap, p, M->p[1][1], M->n); + mpn_mul (t1, ap, p, M->p[1][0], M->n); + } + + /* Update a */ + MPN_COPY (ap, t0, p); + ah = mpn_add (ap + p, ap + p, n - p, t0 + p, M->n); + + if (M->n >= p) + mpn_mul (t0, M->p[0][1], M->n, bp, p); + else + mpn_mul (t0, bp, p, M->p[0][1], M->n); + + cy = mpn_sub (ap, ap, n, t0, p + M->n); + ASSERT (cy <= ah); + ah -= cy; + + /* Update b */ + if (M->n >= p) + mpn_mul (t0, M->p[0][0], M->n, bp, p); + else + mpn_mul (t0, bp, p, M->p[0][0], M->n); + + MPN_COPY (bp, t0, p); + bh = mpn_add (bp + p, bp + p, n - p, t0 + p, M->n); + cy = mpn_sub (bp, bp, n, t1, p + M->n); + ASSERT (cy <= bh); + bh -= cy; + + if (ah > 0 || bh > 0) + { + ap[n] = ah; + bp[n] = bh; + n++; + } + else + { + /* The subtraction can reduce the size by at most one limb. */ + if (ap[n-1] == 0 && bp[n-1] == 0) + n--; + } + ASSERT (ap[n-1] > 0 || bp[n-1] > 0); + return n; +} diff --git a/gmp-6.3.0/mpn/generic/hgcd_reduce.c b/gmp-6.3.0/mpn/generic/hgcd_reduce.c new file mode 100644 index 0000000..3aee77d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/hgcd_reduce.c @@ -0,0 +1,242 @@ +/* hgcd_reduce.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Computes R -= A * B. Result must be non-negative. Normalized down + to size an, and resulting size is returned. 
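+   Below, hgcd_matrix_apply uses this for the triangular cases, where
+   M is (1, 0; q, 1) or (1, q; 0, 1) and applying M^{-1} reduces to a
+   single B -= q A or A -= q B.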
*/ +static mp_size_t +submul (mp_ptr rp, mp_size_t rn, + mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn) +{ + mp_ptr tp; + TMP_DECL; + + ASSERT (bn > 0); + ASSERT (an >= bn); + ASSERT (rn >= an); + ASSERT (an + bn <= rn + 1); + + TMP_MARK; + tp = TMP_ALLOC_LIMBS (an + bn); + + mpn_mul (tp, ap, an, bp, bn); + ASSERT ((an + bn <= rn) || (tp[rn] == 0)); + ASSERT_NOCARRY (mpn_sub (rp, rp, rn, tp, an + bn - (an + bn > rn))); + TMP_FREE; + + while (rn > an && (rp[rn-1] == 0)) + rn--; + + return rn; +} + +/* Computes (a, b) <-- M^{-1} (a; b) */ +/* FIXME: + x Take scratch parameter, and figure out scratch need. + + x Use some fallback for small M->n? +*/ +static mp_size_t +hgcd_matrix_apply (const struct hgcd_matrix *M, + mp_ptr ap, mp_ptr bp, + mp_size_t n) +{ + mp_size_t an, bn, un, vn, nn; + mp_size_t mn[2][2]; + mp_size_t modn; + mp_ptr tp, sp, scratch; + mp_limb_t cy; + unsigned i, j; + + TMP_DECL; + + ASSERT ( (ap[n-1] | bp[n-1]) > 0); + + an = n; + MPN_NORMALIZE (ap, an); + bn = n; + MPN_NORMALIZE (bp, bn); + + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + { + mp_size_t k; + k = M->n; + MPN_NORMALIZE (M->p[i][j], k); + mn[i][j] = k; + } + + ASSERT (mn[0][0] > 0); + ASSERT (mn[1][1] > 0); + ASSERT ( (mn[0][1] | mn[1][0]) > 0); + + TMP_MARK; + + if (mn[0][1] == 0) + { + /* A unchanged, M = (1, 0; q, 1) */ + ASSERT (mn[0][0] == 1); + ASSERT (M->p[0][0][0] == 1); + ASSERT (mn[1][1] == 1); + ASSERT (M->p[1][1][0] == 1); + + /* Put B <-- B - q A */ + nn = submul (bp, bn, ap, an, M->p[1][0], mn[1][0]); + } + else if (mn[1][0] == 0) + { + /* B unchanged, M = (1, q; 0, 1) */ + ASSERT (mn[0][0] == 1); + ASSERT (M->p[0][0][0] == 1); + ASSERT (mn[1][1] == 1); + ASSERT (M->p[1][1][0] == 1); + + /* Put A <-- A - q * B */ + nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]); + } + else + { + /* A = m00 a + m01 b ==> a <= A / m00, b <= A / m01. + B = m10 a + m11 b ==> a <= B / m10, b <= B / m11. */ + un = MIN (an - mn[0][0], bn - mn[1][0]) + 1; + vn = MIN (an - mn[0][1], bn - mn[1][1]) + 1; + + nn = MAX (un, vn); + /* In the range of interest, mulmod_bnm1 should always beat mullo. */ + modn = mpn_mulmod_bnm1_next_size (nn + 1); + + TMP_ALLOC_LIMBS_3 (tp, modn, + sp, modn, + scratch, mpn_mulmod_bnm1_itch (modn, modn, M->n)); + + ASSERT (n <= 2*modn); + + if (n > modn) + { + cy = mpn_add (ap, ap, modn, ap + modn, n - modn); + MPN_INCR_U (ap, modn, cy); + + cy = mpn_add (bp, bp, modn, bp + modn, n - modn); + MPN_INCR_U (bp, modn, cy); + + n = modn; + } + + mpn_mulmod_bnm1 (tp, modn, ap, n, M->p[1][1], mn[1][1], scratch); + mpn_mulmod_bnm1 (sp, modn, bp, n, M->p[0][1], mn[0][1], scratch); + + /* FIXME: Handle the small n case in some better way. 
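+	 The MPN_ZERO calls below pad the products out to modn limbs:
+	 when a product is shorter than modn limbs its high limbs are
+	 not written by mpn_mulmod_bnm1, while the mpn_sub_n that
+	 follows operates on full modn-limb residues.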
*/ + if (n + mn[1][1] < modn) + MPN_ZERO (tp + n + mn[1][1], modn - n - mn[1][1]); + if (n + mn[0][1] < modn) + MPN_ZERO (sp + n + mn[0][1], modn - n - mn[0][1]); + + cy = mpn_sub_n (tp, tp, sp, modn); + MPN_DECR_U (tp, modn, cy); + + ASSERT (mpn_zero_p (tp + nn, modn - nn)); + + mpn_mulmod_bnm1 (sp, modn, ap, n, M->p[1][0], mn[1][0], scratch); + MPN_COPY (ap, tp, nn); + mpn_mulmod_bnm1 (tp, modn, bp, n, M->p[0][0], mn[0][0], scratch); + + if (n + mn[1][0] < modn) + MPN_ZERO (sp + n + mn[1][0], modn - n - mn[1][0]); + if (n + mn[0][0] < modn) + MPN_ZERO (tp + n + mn[0][0], modn - n - mn[0][0]); + + cy = mpn_sub_n (tp, tp, sp, modn); + MPN_DECR_U (tp, modn, cy); + + ASSERT (mpn_zero_p (tp + nn, modn - nn)); + MPN_COPY (bp, tp, nn); + + while ( (ap[nn-1] | bp[nn-1]) == 0) + { + nn--; + ASSERT (nn > 0); + } + } + TMP_FREE; + + return nn; +} + +mp_size_t +mpn_hgcd_reduce_itch (mp_size_t n, mp_size_t p) +{ + mp_size_t itch; + if (BELOW_THRESHOLD (n, HGCD_REDUCE_THRESHOLD)) + { + itch = mpn_hgcd_itch (n-p); + + /* For arbitrary p, the storage for _adjust is 2*(p + M->n) = 2 * + (p + ceil((n-p)/2) - 1 <= n + p - 1 */ + if (itch < n + p - 1) + itch = n + p - 1; + } + else + { + itch = 2*(n-p) + mpn_hgcd_itch (n-p); + /* Currently, hgcd_matrix_apply allocates its own storage. */ + } + return itch; +} + +/* FIXME: Document storage need. */ +mp_size_t +mpn_hgcd_reduce (struct hgcd_matrix *M, + mp_ptr ap, mp_ptr bp, mp_size_t n, mp_size_t p, + mp_ptr tp) +{ + mp_size_t nn; + if (BELOW_THRESHOLD (n, HGCD_REDUCE_THRESHOLD)) + { + nn = mpn_hgcd (ap + p, bp + p, n - p, M, tp); + if (nn > 0) + /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1) + = 2 (n - 1) */ + return mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp); + } + else + { + MPN_COPY (tp, ap + p, n - p); + MPN_COPY (tp + n - p, bp + p, n - p); + if (mpn_hgcd_appr (tp, tp + n - p, n - p, M, tp + 2*(n-p))) + return hgcd_matrix_apply (M, ap, bp, n); + } + return 0; +} diff --git a/gmp-6.3.0/mpn/generic/hgcd_step.c b/gmp-6.3.0/mpn/generic/hgcd_step.c new file mode 100644 index 0000000..a978a88 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/hgcd_step.c @@ -0,0 +1,127 @@ +/* hgcd_step.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003-2005, 2008, 2011, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" +#include "longlong.h" + + +static void +hgcd_hook (void *p, mp_srcptr gp, mp_size_t gn, + mp_srcptr qp, mp_size_t qn, int d) +{ + ASSERT (!gp); + ASSERT (d >= 0); + ASSERT (d <= 1); + + MPN_NORMALIZE (qp, qn); + if (qn > 0) + { + struct hgcd_matrix *M = (struct hgcd_matrix *) p; + /* NOTES: This is a bit ugly. A tp area is passed to + gcd_subdiv_step, which stores q at the start of that area. We + now use the rest. */ + mp_ptr tp = (mp_ptr) qp + qn; + mpn_hgcd_matrix_update_q (M, qp, qn, d, tp); + } +} + +/* Perform a few steps, using some of mpn_hgcd2, subtraction and + division. Reduces the size by almost one limb or more, but never + below the given size s. Return new size for a and b, or 0 if no + more steps are possible. + + If hgcd2 succeeds, needs temporary space for hgcd_matrix_mul_1, M->n + limbs, and hgcd_mul_matrix1_inverse_vector, n limbs. If hgcd2 + fails, needs space for the quotient, qn <= n - s limbs, for and + hgcd_matrix_update_q, qn + (size of the appropriate column of M) <= + (resulting size of M) + 1. + + If N is the input size to the calling hgcd, then s = floor(N/2) + + 1, M->n < N, qn + product size <= n - s + n - s + 1 = 2 (n - s) + 1 + <= N. +*/ + +mp_size_t +mpn_hgcd_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s, + struct hgcd_matrix *M, mp_ptr tp) +{ + struct hgcd_matrix1 M1; + mp_limb_t mask; + mp_limb_t ah, al, bh, bl; + + ASSERT (n > s); + + mask = ap[n-1] | bp[n-1]; + ASSERT (mask > 0); + + if (n == s + 1) + { + if (mask < 4) + goto subtract; + + ah = ap[n-1]; al = ap[n-2]; + bh = bp[n-1]; bl = bp[n-2]; + } + else if (mask & GMP_NUMB_HIGHBIT) + { + ah = ap[n-1]; al = ap[n-2]; + bh = bp[n-1]; bl = bp[n-2]; + } + else + { + int shift; + + count_leading_zeros (shift, mask); + ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]); + al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]); + bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]); + bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]); + } + + /* Try an mpn_hgcd2 step */ + if (mpn_hgcd2 (ah, al, bh, bl, &M1)) + { + /* Multiply M <- M * M1 */ + mpn_hgcd_matrix_mul_1 (M, &M1, tp); + + /* Can't swap inputs, so we need to copy. */ + MPN_COPY (tp, ap, n); + /* Multiply M1^{-1} (a;b) */ + return mpn_matrix22_mul1_inverse_vector (&M1, ap, tp, bp, n); + } + + subtract: + + return mpn_gcd_subdiv_step (ap, bp, n, s, hgcd_hook, M, tp); +} diff --git a/gmp-6.3.0/mpn/generic/invert.c b/gmp-6.3.0/mpn/generic/invert.c new file mode 100644 index 0000000..157ff2b --- /dev/null +++ b/gmp-6.3.0/mpn/generic/invert.c @@ -0,0 +1,86 @@ +/* invert.c -- Compute floor((B^{2n}-1)/U) - B^n. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright (C) 2007, 2009, 2010, 2012, 2014-2016 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +void +mpn_invert (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch) +{ + ASSERT (n > 0); + ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT); + ASSERT (! MPN_OVERLAP_P (ip, n, dp, n)); + ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n))); + ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n))); + + if (n == 1) + invert_limb (*ip, *dp); + else if (BELOW_THRESHOLD (n, INV_APPR_THRESHOLD)) + { + /* Maximum scratch needed by this branch: 2*n */ + mp_ptr xp; + + xp = scratch; /* 2 * n limbs */ + /* n > 1 here */ + MPN_FILL (xp, n, GMP_NUMB_MAX); + mpn_com (xp + n, dp, n); + if (n == 2) { + mpn_divrem_2 (ip, 0, xp, 4, dp); + } else { + gmp_pi1_t inv; + invert_pi1 (inv, dp[n-1], dp[n-2]); + /* FIXME: should we use dcpi1_div_q, for big sizes? */ + mpn_sbpi1_div_q (ip, xp, 2 * n, dp, n, inv.inv32); + } + } + else { /* Use approximated inverse; correct the result if needed. */ + mp_limb_t e; /* The possible error in the approximate inverse */ + + ASSERT ( mpn_invert_itch (n) >= mpn_invertappr_itch (n) ); + e = mpn_ni_invertappr (ip, dp, n, scratch); + + if (UNLIKELY (e)) { /* Assume the error can only be "0" (no error) or "1". */ + /* Code to detect and correct the "off by one" approximation. */ + mpn_mul_n (scratch, ip, dp, n); + e = mpn_add_n (scratch, scratch, dp, n); /* FIXME: we only need e.*/ + if (LIKELY(e)) /* The high part can not give a carry by itself. */ + e = mpn_add_nc (scratch + n, scratch + n, dp, n, e); /* FIXME:e */ + /* If the value was wrong (no carry), correct it (increment). */ + e ^= CNST_LIMB (1); + MPN_INCR_U (ip, n, e); + } + } +} diff --git a/gmp-6.3.0/mpn/generic/invertappr.c b/gmp-6.3.0/mpn/generic/invertappr.c new file mode 100644 index 0000000..3be5596 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/invertappr.c @@ -0,0 +1,300 @@ +/* mpn_invertappr and helper functions. Compute I such that + floor((B^{2n}-1)/U - 1 <= I + B^n <= floor((B^{2n}-1)/U. + + Contributed to the GNU project by Marco Bodrato. + + The algorithm used here was inspired by ApproximateReciprocal from "Modern + Computer Arithmetic", by Richard P. Brent and Paul Zimmermann. Special + thanks to Paul Zimmermann for his very valuable suggestions on all the + theoretical aspects during the work on this code. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright (C) 2007, 2009, 2010, 2012, 2015, 2016 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. 
+ +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* FIXME: The iterative version splits the operand in two slightly unbalanced + parts, the use of log_2 (or counting the bits) underestimate the maximum + number of iterations. */ + +#if TUNE_PROGRAM_BUILD +#define NPOWS \ + ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t))) +#define MAYBE_dcpi1_divappr 1 +#else +#define NPOWS \ + ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)) - LOG2C (INV_NEWTON_THRESHOLD)) +#define MAYBE_dcpi1_divappr \ + (INV_NEWTON_THRESHOLD < DC_DIVAPPR_Q_THRESHOLD) +#if (INV_NEWTON_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD) && \ + (INV_APPR_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD) +#undef INV_MULMOD_BNM1_THRESHOLD +#define INV_MULMOD_BNM1_THRESHOLD 0 /* always when Newton */ +#endif +#endif + +/* All the three functions mpn{,_bc,_ni}_invertappr (ip, dp, n, scratch), take + the strictly normalised value {dp,n} (i.e., most significant bit must be set) + as an input, and compute {ip,n}: the approximate reciprocal of {dp,n}. + + Let e = mpn*_invertappr (ip, dp, n, scratch) be the returned value; the + following conditions are satisfied by the output: + 0 <= e <= 1; + {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1+e) . + I.e. e=0 means that the result {ip,n} equals the one given by mpn_invert. + e=1 means that the result _may_ be one less than expected. + + The _bc version returns e=1 most of the time. + The _ni version should return e=0 most of the time; only about 1% of + possible random input should give e=1. + + When the strict result is needed, i.e., e=0 in the relation above: + {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1) ; + the function mpn_invert (ip, dp, n, scratch) should be used instead. */ + +/* Maximum scratch needed by this branch (at xp): 2*n */ +static mp_limb_t +mpn_bc_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr xp) +{ + ASSERT (n > 0); + ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT); + ASSERT (! MPN_OVERLAP_P (ip, n, dp, n)); + ASSERT (! MPN_OVERLAP_P (ip, n, xp, mpn_invertappr_itch(n))); + ASSERT (! MPN_OVERLAP_P (dp, n, xp, mpn_invertappr_itch(n))); + + /* Compute a base value of r limbs. */ + if (n == 1) + invert_limb (*ip, *dp); + else { + /* n > 1 here */ + MPN_FILL (xp, n, GMP_NUMB_MAX); + mpn_com (xp + n, dp, n); + + /* Now xp contains B^2n - {dp,n}*B^n - 1 */ + + /* FIXME: if mpn_*pi1_divappr_q handles n==2, use it! */ + if (n == 2) { + mpn_divrem_2 (ip, 0, xp, 4, dp); + } else { + gmp_pi1_t inv; + invert_pi1 (inv, dp[n-1], dp[n-2]); + if (! MAYBE_dcpi1_divappr + || BELOW_THRESHOLD (n, DC_DIVAPPR_Q_THRESHOLD)) + mpn_sbpi1_divappr_q (ip, xp, 2 * n, dp, n, inv.inv32); + else + mpn_dcpi1_divappr_q (ip, xp, 2 * n, dp, n, &inv); + MPN_DECR_U(ip, n, CNST_LIMB (1)); + return 1; + } + } + return 0; +} + +/* mpn_ni_invertappr: computes the approximate reciprocal using Newton's + iterations (at least one). + + Inspired by Algorithm "ApproximateReciprocal", published in "Modern Computer + Arithmetic" by Richard P. Brent and Paul Zimmermann, algorithm 3.5, page 121 + in version 0.4 of the book. 
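+
+   The driving recurrence is the classical Newton step for a
+   reciprocal: if X approximates 1/D, then X' = X + X*(1 - D*X) holds
+   roughly twice as many correct digits.  That is why the sizes[]
+   array below is filled by repeated halving (rn = (rn >> 1) + 1) and
+   then walked back up, doubling the working precision at each pass.
+   A floating-point sketch of the same iteration (purely
+   illustrative; inv_newton is not a GMP function, and the real code
+   operates on limb vectors with wrapped products):
+
+     double inv_newton (double d)
+     {
+       double x = 1.0;
+       int i;
+       for (i = 0; i < 6; i++)
+         x += x * (1.0 - d * x);
+       return x;
+     }
+
+   Here d is assumed normalized to [0.5, 1), mirroring the most
+   significant bit requirement on {dp,n}; x = 1.0 is then a valid
+   crude seed, and six passes give more accuracy than a double holds.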
+ + Some adaptations were introduced, to allow product mod B^m-1 and return the + value e. + + We introduced a correction in such a way that "the value of + B^{n+h}-T computed at step 8 cannot exceed B^n-1" (the book reads + "2B^n-1"). + + Maximum scratch needed by this branch <= 2*n, but have to fit 3*rn + in the scratch, i.e. 3*rn <= 2*n: we require n>4. + + We use a wrapped product modulo B^m-1. NOTE: is there any normalisation + problem for the [0] class? It shouldn't: we compute 2*|A*X_h - B^{n+h}| < + B^m-1. We may get [0] if and only if we get AX_h = B^{n+h}. This can + happen only if A=B^{n}/2, but this implies X_h = B^{h}*2-1 i.e., AX_h = + B^{n+h} - A, then we get into the "negative" branch, where X_h is not + incremented (because A < B^n). + + FIXME: the scratch for mulmod_bnm1 does not currently fit in the scratch, it + is allocated apart. + */ + +mp_limb_t +mpn_ni_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch) +{ + mp_limb_t cy; + mp_size_t rn, mn; + mp_size_t sizes[NPOWS], *sizp; + mp_ptr tp; + TMP_DECL; +#define xp scratch + + ASSERT (n > 4); + ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT); + ASSERT (! MPN_OVERLAP_P (ip, n, dp, n)); + ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n))); + ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n))); + + /* Compute the computation precisions from highest to lowest, leaving the + base case size in 'rn'. */ + sizp = sizes; + rn = n; + do { + *sizp = rn; + rn = (rn >> 1) + 1; + ++sizp; + } while (ABOVE_THRESHOLD (rn, INV_NEWTON_THRESHOLD)); + + /* We search the inverse of 0.{dp,n}, we compute it as 1.{ip,n} */ + dp += n; + ip += n; + + /* Compute a base value of rn limbs. */ + mpn_bc_invertappr (ip - rn, dp - rn, rn, scratch); + + TMP_MARK; + + if (ABOVE_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD)) + { + mn = mpn_mulmod_bnm1_next_size (n + 1); + tp = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (mn, n, (n >> 1) + 1)); + } + /* Use Newton's iterations to get the desired precision.*/ + + while (1) { + n = *--sizp; + /* + v n v + +----+--+ + ^ rn ^ + */ + + /* Compute i_jd . */ + if (BELOW_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD) + || ((mn = mpn_mulmod_bnm1_next_size (n + 1)) > (n + rn))) { + /* FIXME: We do only need {xp,n+1}*/ + mpn_mul (xp, dp - n, n, ip - rn, rn); + mpn_add_n (xp + rn, xp + rn, dp - n, n - rn + 1); + cy = CNST_LIMB(1); /* Remember we truncated, Mod B^(n+1) */ + /* We computed (truncated) {xp,n+1} <- 1.{ip,rn} * 0.{dp,n} */ + } else { /* Use B^mn-1 wraparound */ + mpn_mulmod_bnm1 (xp, mn, dp - n, n, ip - rn, rn, tp); + /* We computed {xp,mn} <- {ip,rn} * {dp,n} mod (B^mn-1) */ + /* We know that 2*|ip*dp + dp*B^rn - B^{rn+n}| < B^mn-1 */ + /* Add dp*B^rn mod (B^mn-1) */ + ASSERT (n >= mn - rn); + cy = mpn_add_n (xp + rn, xp + rn, dp - n, mn - rn); + cy = mpn_add_nc (xp, xp, dp - (n - (mn - rn)), n - (mn - rn), cy); + /* Subtract B^{rn+n}, maybe only compensate the carry*/ + xp[mn] = CNST_LIMB (1); /* set a limit for DECR_U */ + MPN_DECR_U (xp + rn + n - mn, 2 * mn + 1 - rn - n, CNST_LIMB (1) - cy); + MPN_DECR_U (xp, mn, CNST_LIMB (1) - xp[mn]); /* if DECR_U eroded xp[mn] */ + cy = CNST_LIMB(0); /* Remember we are working Mod B^mn-1 */ + } + + if (xp[n] < CNST_LIMB (2)) { /* "positive" residue class */ + cy = xp[n]; /* 0 <= cy <= 1 here. 
*/ +#if HAVE_NATIVE_mpn_sublsh1_n + if (cy++) { + if (mpn_cmp (xp, dp - n, n) > 0) { + mp_limb_t chk; + chk = mpn_sublsh1_n (xp, xp, dp - n, n); + ASSERT (chk == xp[n]); + ++ cy; + } else + ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n)); + } +#else /* no mpn_sublsh1_n*/ + if (cy++ && !mpn_sub_n (xp, xp, dp - n, n)) { + ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n)); + ++cy; + } +#endif + /* 1 <= cy <= 3 here. */ +#if HAVE_NATIVE_mpn_rsblsh1_n + if (mpn_cmp (xp, dp - n, n) > 0) { + ASSERT_NOCARRY (mpn_rsblsh1_n (xp + n, xp, dp - n, n)); + ++cy; + } else + ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0)); +#else /* no mpn_rsblsh1_n*/ + if (mpn_cmp (xp, dp - n, n) > 0) { + ASSERT_NOCARRY (mpn_sub_n (xp, xp, dp - n, n)); + ++cy; + } + ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0)); +#endif + MPN_DECR_U(ip - rn, rn, cy); /* 1 <= cy <= 4 here. */ + } else { /* "negative" residue class */ + ASSERT (xp[n] >= GMP_NUMB_MAX - CNST_LIMB(1)); + MPN_DECR_U(xp, n + 1, cy); + if (xp[n] != GMP_NUMB_MAX) { + MPN_INCR_U(ip - rn, rn, CNST_LIMB (1)); + ASSERT_CARRY (mpn_add_n (xp, xp, dp - n, n)); + } + mpn_com (xp + 2 * n - rn, xp + n - rn, rn); + } + + /* Compute x_ju_j. FIXME:We need {xp+rn,rn}, mulhi? */ + mpn_mul_n (xp, xp + 2 * n - rn, ip - rn, rn); + cy = mpn_add_n (xp + rn, xp + rn, xp + 2 * n - rn, 2 * rn - n); + cy = mpn_add_nc (ip - n, xp + 3 * rn - n, xp + n + rn, n - rn, cy); + MPN_INCR_U (ip - rn, rn, cy); + if (sizp == sizes) { /* Get out of the cycle */ + /* Check for possible carry propagation from below. */ + cy = xp[3 * rn - n - 1] > GMP_NUMB_MAX - CNST_LIMB (7); /* Be conservative. */ + /* cy = mpn_add_1 (xp + rn, xp + rn, 2*rn - n, 4); */ + break; + } + rn = n; + } + TMP_FREE; + + return cy; +#undef xp +} + +mp_limb_t +mpn_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch) +{ + ASSERT (n > 0); + ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT); + ASSERT (! MPN_OVERLAP_P (ip, n, dp, n)); + ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n))); + ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n))); + + if (BELOW_THRESHOLD (n, INV_NEWTON_THRESHOLD)) + return mpn_bc_invertappr (ip, dp, n, scratch); + else + return mpn_ni_invertappr (ip, dp, n, scratch); +} diff --git a/gmp-6.3.0/mpn/generic/jacbase.c b/gmp-6.3.0/mpn/generic/jacbase.c new file mode 100644 index 0000000..391ceac --- /dev/null +++ b/gmp-6.3.0/mpn/generic/jacbase.c @@ -0,0 +1,242 @@ +/* mpn_jacobi_base -- limb/limb Jacobi symbol with restricted arguments. + + THIS INTERFACE IS PRELIMINARY AND MIGHT DISAPPEAR OR BE SUBJECT TO + INCOMPATIBLE CHANGES IN A FUTURE RELEASE OF GMP. + +Copyright 1999-2002, 2010, 2020 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* Use the simple loop by default.  The generic count_trailing_zeros is not
+   very fast, and the extra trickery of method 3 has proven to be of less
+   use than might have been thought.  */
+#ifndef JACOBI_BASE_METHOD
+#define JACOBI_BASE_METHOD  2
+#endif
+
+
+/* Use count_trailing_zeros.  */
+#if JACOBI_BASE_METHOD == 1
+#define PROCESS_TWOS_ANY                                \
+  {                                                     \
+    mp_limb_t  twos;                                    \
+    count_trailing_zeros (twos, a);                     \
+    result_bit1 ^= JACOBI_TWOS_U_BIT1 (twos, b);        \
+    a >>= twos;                                         \
+  }
+#define PROCESS_TWOS_EVEN  PROCESS_TWOS_ANY
+#endif
+
+/* Use a simple loop.  A disadvantage of this is that there's a branch on a
+   50/50 chance of a 0 or 1 low bit.  */
+#if JACOBI_BASE_METHOD == 2
+#define PROCESS_TWOS_EVEN               \
+  {                                     \
+    int  two;                           \
+    two = JACOBI_TWO_U_BIT1 (b);        \
+    do                                  \
+      {                                 \
+        a >>= 1;                        \
+        result_bit1 ^= two;             \
+        ASSERT (a != 0);                \
+      }                                 \
+    while ((a & 1) == 0);               \
+  }
+#define PROCESS_TWOS_ANY        \
+  if ((a & 1) == 0)             \
+    PROCESS_TWOS_EVEN;
+#endif
+
+/* Process one bit arithmetically, then a simple loop.  This cuts the loop
+   condition down to a 25/75 chance, which should branch predict better.
+   The CPU will need a reasonable variable left shift.  */
+#if JACOBI_BASE_METHOD == 3
+#define PROCESS_TWOS_EVEN               \
+  {                                     \
+    int  two, mask, shift;              \
+                                        \
+    two = JACOBI_TWO_U_BIT1 (b);        \
+    mask = (~a & 2);                    \
+    a >>= 1;                            \
+                                        \
+    shift = (~a & 1);                   \
+    a >>= shift;                        \
+    result_bit1 ^= two ^ (two & mask);  \
+                                        \
+    while ((a & 1) == 0)                \
+      {                                 \
+        a >>= 1;                        \
+        result_bit1 ^= two;             \
+        ASSERT (a != 0);                \
+      }                                 \
+  }
+#define PROCESS_TWOS_ANY                \
+  {                                     \
+    int  two, mask, shift;              \
+                                        \
+    two = JACOBI_TWO_U_BIT1 (b);        \
+    shift = (~a & 1);                   \
+    a >>= shift;                        \
+                                        \
+    mask = shift << 1;                  \
+    result_bit1 ^= (two & mask);        \
+                                        \
+    while ((a & 1) == 0)                \
+      {                                 \
+        a >>= 1;                        \
+        result_bit1 ^= two;             \
+        ASSERT (a != 0);                \
+      }                                 \
+  }
+#endif
+
+#if JACOBI_BASE_METHOD < 4
+/* Calculate the value of the Jacobi symbol (a/b) of two mp_limb_t's, but
+   with a restricted range of inputs accepted, namely b>1, b odd.
+
+   The initial result_bit1 is taken as a parameter for the convenience of
+   mpz_kronecker_ui() et al.  The sign changes both here and in those
+   routines accumulate nicely in bit 1, see the JACOBI macros.
+
+   The return value here is the normal +1, 0, or -1.  Note that +1 and -1
+   have bit 1 in the "BIT1" sense, which could be useful if the caller is
+   accumulating it into some extended calculation.
+
+   Duplicating the loop body to avoid the MP_LIMB_T_SWAP(a,b) would be
+   possible, but a couple of tests suggest it's not a significant speedup,
+   and may even be a slowdown, so what's here is good enough for now.  */
+
+int
+mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int result_bit1)
+{
+  ASSERT (b & 1);  /* b odd */
+  ASSERT (b != 1);
+
+  if (a == 0)
+    return 0;
+
+  PROCESS_TWOS_ANY;
+  if (a == 1)
+    goto done;
+
+  if (a >= b)
+    goto a_gt_b;
+
+  for (;;)
+    {
+      result_bit1 ^= JACOBI_RECIP_UU_BIT1 (a, b);
+      MP_LIMB_T_SWAP (a, b);
+
+    a_gt_b:
+      do
+        {
+          /* working on (a/b), a,b odd, a>=b */
+          ASSERT (a & 1);
+          ASSERT (b & 1);
+          ASSERT (a >= b);
+
+          if ((a -= b) == 0)
+            return 0;
+
+          PROCESS_TWOS_EVEN;
+          if (a == 1)
+            goto done;
+        }
+      while (a >= b);
+    }
+
+ done:
+  return JACOBI_BIT1_TO_PN (result_bit1);
+}
+#endif
+
+#if JACOBI_BASE_METHOD == 4
+/* Computes (a/b) for odd b > 1 and any a.
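+ * (Editorial example, not in the original source: mpn_jacobi_base (2, 7, 0)
+ * returns 1, matching (2|7) = +1 since 7 mod 8 = 7; compare the worked
+ * (2|7) detail in the jacobi.c comment below.)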
+ * The initial bit is taken as a parameter.  We have no need for the
+ * convention that the sign is in bit 1; internally we use bit 0. */
+
+/* FIXME: Could try table-based count_trailing_zeros. */
+int
+mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int bit)
+{
+  int c;
+
+  ASSERT (b & 1);
+  ASSERT (b > 1);
+
+  if (a == 0)
+    /* This is the only line which depends on b > 1 */
+    return 0;
+
+  bit >>= 1;
+
+  /* Below, we represent a and b shifted right so that the least
+     significant one bit is implicit. */
+
+  b >>= 1;
+
+  count_trailing_zeros (c, a);
+  bit ^= c & (b ^ (b >> 1));
+
+  /* We may have c==GMP_LIMB_BITS-1, so we can't use a>>c+1. */
+  a >>= c;
+  a >>= 1;
+
+  do
+    {
+      mp_limb_t t = a - b;
+      mp_limb_t bgta = LIMB_HIGHBIT_TO_MASK (t);
+
+      if (t == 0)
+        return 0;
+
+      /* If b > a, invoke reciprocity */
+      bit ^= (bgta & a & b);
+
+      /* b <-- min (a, b) */
+      b += (bgta & t);
+
+      /* a <-- |a - b| */
+      a = (t ^ bgta) - bgta;
+
+      /* Number of trailing zeros is the same no matter if we look at
+       * t or a, but using t gives more parallelism. */
+      count_trailing_zeros (c, t);
+      c++;
+      /* (2/b) = -1 if b = 3 or 5 mod 8 */
+      bit ^= c & (b ^ (b >> 1));
+      a >>= c;
+    }
+  while (a > 0);
+
+  return 1 - 2*(bit & 1);
+}
+#endif /* JACOBI_BASE_METHOD == 4 */
diff --git a/gmp-6.3.0/mpn/generic/jacobi.c b/gmp-6.3.0/mpn/generic/jacobi.c
new file mode 100644
index 0000000..d98b126
--- /dev/null
+++ b/gmp-6.3.0/mpn/generic/jacobi.c
@@ -0,0 +1,294 @@
+/* jacobi.c
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 1996, 1998, 2000-2004, 2008, 2010, 2011 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef JACOBI_DC_THRESHOLD
+#define JACOBI_DC_THRESHOLD GCD_DC_THRESHOLD
+#endif
+
+/* Schönhage's rules:
+ *
+ * Assume r0 = r1 q1 + r2, with r0 odd, and r1 = q2 r2 + r3
+ *
+ * If r1 is odd, then
+ *
+ * (r1 | r0) = s(r1, r0) (r0 | r1) = s(r1, r0) (r2 | r1)
+ *
+ * where s(x,y) = (-1)^{(x-1)(y-1)/4} = (-1)^{[x = y = 3 (mod 4)]}.
+ *
+ * If r1 is even, r2 must be odd.  We have
+ *
+ * (r1 | r0) = (r1 - r0 | r0) = (-1)^{(r0-1)/2} (r0 - r1 | r0)
+ *           = (-1)^{(r0-1)/2} s(r0, r0 - r1) (r0 | r0 - r1)
+ *           = (-1)^{(r0-1)/2} s(r0, r0 - r1) (r1 | r0 - r1)
+ *
+ * Now, if r1 = 0 (mod 4), then the sign factor is +1, and repeating
+ * q1 times gives
+ *
+ * (r1 | r0) = (r1 | r2) = (r3 | r2)
+ *
+ * On the other hand, if r1 = 2 (mod 4), the sign factor is
+ * (-1)^{(r0-1)/2}, and repeating q1 times gives the exponent
+ *
+ * (r0-1)/2 + (r0-r1-1)/2 + ... + (r0 - (q1-1) r1)/2
+ * = q1 (r0-1)/2 + q1 (q1-1)/2
+ *
+ * and we can summarize the even case as
+ *
+ * (r1 | r0) = t(r1, r0, q1) (r3 | r2)
+ *
+ * where t(x,y,q) = (-1)^{[x = 2 (mod 4)] (q(y-1)/2 + y(q-1)/2)}
+ *
+ * What about termination?  The remainder sequence ends with (0|1) = 1
+ * (or (0 | r) = 0 if r != 1).  What are the possible cases?  If r1 is
+ * odd, r2 may be zero.  If r1 is even, then r2 = r0 - q1 r1 is odd and
+ * hence non-zero.  We may have r3 = r1 - q2 r2 = 0.
+ *
+ * Examples: (11|15) = - (15|11) = - (4|11)
+ *            (4|11) =   (4| 3) =   (1| 3)
+ *            (1| 3) =   (3|1) = (0|1) = 1
+ *
+ *             (2|7) =   (2|1) = (0|1) = 1
+ *
+ * Detail: (2|7) = (2-7|7) = (-1|7)(5|7) = -(7|5) = -(2|5)
+ *         (2|5) = (2-5|5) = (-1|5)(3|5) =  (5|3) =  (2|3)
+ *         (2|3) = (2-3|3) = (-1|3)(1|3) = -(3|1) = -(2|1)
+ *
+ */
+
+/* In principle, the state consists of four variables: e (one bit), a,
+   b (two bits each), d (one bit).  Collected factors are (-1)^e.  a and
+   b are the least significant bits of the current remainders.  d
+   (denominator) is 0 if we're currently subtracting multiples of a
+   from b, and 1 if we're subtracting b from a.
+
+   e is stored in the least significant bit, while a, b and d are
+   coded as only 13 distinct values in bits 1-4, according to the
+   following table.  For rows not mentioning d, the value is either
+   implied, or it doesn't matter. */
+
+#if WANT_ASSERT
+static const struct
+{
+  unsigned char a;
+  unsigned char b;
+} decode_table[13] = {
+  /*  0 */ { 0, 1 },
+  /*  1 */ { 0, 3 },
+  /*  2 */ { 1, 1 },
+  /*  3 */ { 1, 3 },
+  /*  4 */ { 2, 1 },
+  /*  5 */ { 2, 3 },
+  /*  6 */ { 3, 1 },
+  /*  7 */ { 3, 3 }, /* d = 1 */
+  /*  8 */ { 1, 0 },
+  /*  9 */ { 1, 2 },
+  /* 10 */ { 3, 0 },
+  /* 11 */ { 3, 2 },
+  /* 12 */ { 3, 3 }, /* d = 0 */
+};
+#define JACOBI_A(bits) (decode_table[(bits)>>1].a)
+#define JACOBI_B(bits) (decode_table[(bits)>>1].b)
+#endif /* WANT_ASSERT */
+
+const unsigned char jacobi_table[208] = {
+#include "jacobitab.h"
+};
+
+#define BITS_FAIL 31
+
+static void
+jacobi_hook (void *p, mp_srcptr gp, mp_size_t gn,
+             mp_srcptr qp, mp_size_t qn, int d)
+{
+  unsigned *bitsp = (unsigned *) p;
+
+  if (gp)
+    {
+      ASSERT (gn > 0);
+      if (gn != 1 || gp[0] != 1)
+        {
+          *bitsp = BITS_FAIL;
+          return;
+        }
+    }
+
+  if (qp)
+    {
+      ASSERT (qn > 0);
+      ASSERT (d >= 0);
+      *bitsp = mpn_jacobi_update (*bitsp, d, qp[0] & 3);
+    }
+}
+
+#define CHOOSE_P(n) (2*(n) / 3)
+
+int
+mpn_jacobi_n (mp_ptr ap, mp_ptr bp, mp_size_t n, unsigned bits)
+{
+  mp_size_t scratch;
+  mp_size_t matrix_scratch;
+  mp_ptr tp;
+
+  TMP_DECL;
+
+  ASSERT (n > 0);
+  ASSERT ( (ap[n-1] | bp[n-1]) > 0);
+  ASSERT ( (bp[0] | ap[0]) & 1);
+
+  /* FIXME: Check for small sizes first, before setting up temporary
+     storage etc.
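+     (Editorial note, added: CHOOSE_P above picks the split point p = 2n/3,
+     so the recursive hgcd call below works on the top n - p, roughly n/3,
+     limbs; e.g. for n = 30 it gives p = 20 and hgcd sees 10 limbs.)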
*/
+  scratch = MPN_GCD_SUBDIV_STEP_ITCH(n);
+
+  if (ABOVE_THRESHOLD (n, JACOBI_DC_THRESHOLD))
+    {
+      mp_size_t hgcd_scratch;
+      mp_size_t update_scratch;
+      mp_size_t p = CHOOSE_P (n);
+      mp_size_t dc_scratch;
+
+      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p);
+      hgcd_scratch = mpn_hgcd_itch (n - p);
+      update_scratch = p + n - 1;
+
+      dc_scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
+      if (dc_scratch > scratch)
+        scratch = dc_scratch;
+    }
+
+  TMP_MARK;
+  tp = TMP_ALLOC_LIMBS(scratch);
+
+  while (ABOVE_THRESHOLD (n, JACOBI_DC_THRESHOLD))
+    {
+      struct hgcd_matrix M;
+      mp_size_t p = 2*n/3;
+      mp_size_t matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p);
+      mp_size_t nn;
+      mpn_hgcd_matrix_init (&M, n - p, tp);
+
+      nn = mpn_hgcd_jacobi (ap + p, bp + p, n - p, &M, &bits,
+                            tp + matrix_scratch);
+      if (nn > 0)
+        {
+          ASSERT (M.n <= (n - p - 1)/2);
+          ASSERT (M.n + p <= (p + n - 1) / 2);
+          /* Temporary storage 2 (p + M->n) <= p + n - 1. */
+          n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);
+        }
+      else
+        {
+          /* Temporary storage n */
+          n = mpn_gcd_subdiv_step (ap, bp, n, 0, jacobi_hook, &bits, tp);
+          if (!n)
+            {
+              TMP_FREE;
+              return bits == BITS_FAIL ? 0 : mpn_jacobi_finish (bits);
+            }
+        }
+    }
+
+  while (n > 2)
+    {
+      struct hgcd_matrix1 M;
+      mp_limb_t ah, al, bh, bl;
+      mp_limb_t mask;
+
+      mask = ap[n-1] | bp[n-1];
+      ASSERT (mask > 0);
+
+      if (mask & GMP_NUMB_HIGHBIT)
+        {
+          ah = ap[n-1]; al = ap[n-2];
+          bh = bp[n-1]; bl = bp[n-2];
+        }
+      else
+        {
+          int shift;
+
+          count_leading_zeros (shift, mask);
+          ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
+          al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
+          bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
+          bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
+        }
+
+      /* Try an mpn_hgcd2 step */
+      if (mpn_hgcd2_jacobi (ah, al, bh, bl, &M, &bits))
+        {
+          n = mpn_matrix22_mul1_inverse_vector (&M, tp, ap, bp, n);
+          MP_PTR_SWAP (ap, tp);
+        }
+      else
+        {
+          /* mpn_hgcd2 has failed.  Then either one of a or b is very
+             small, or the difference is very small.  Perform one
+             subtraction followed by one division. */
+          n = mpn_gcd_subdiv_step (ap, bp, n, 0, &jacobi_hook, &bits, tp);
+          if (!n)
+            {
+              TMP_FREE;
+              return bits == BITS_FAIL ? 0 : mpn_jacobi_finish (bits);
+            }
+        }
+    }
+
+  if (bits >= 16)
+    MP_PTR_SWAP (ap, bp);
+
+  ASSERT (bp[0] & 1);
+
+  if (n == 1)
+    {
+      mp_limb_t al, bl;
+      al = ap[0];
+      bl = bp[0];
+
+      TMP_FREE;
+      if (bl == 1)
+        return 1 - 2*(bits & 1);
+      else
+        return mpn_jacobi_base (al, bl, bits << 1);
+    }
+
+  else
+    {
+      int res = mpn_jacobi_2 (ap, bp, bits & 1);
+      TMP_FREE;
+      return res;
+    }
+}
diff --git a/gmp-6.3.0/mpn/generic/jacobi_2.c b/gmp-6.3.0/mpn/generic/jacobi_2.c
new file mode 100644
index 0000000..028b8a4
--- /dev/null
+++ b/gmp-6.3.0/mpn/generic/jacobi_2.c
@@ -0,0 +1,351 @@
+/* jacobi_2.c
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 1996, 1998, 2000-2004, 2008, 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef JACOBI_2_METHOD +#define JACOBI_2_METHOD 2 +#endif + +/* Computes (a / b) where b is odd, and a and b are otherwise arbitrary + two-limb numbers. */ +#if JACOBI_2_METHOD == 1 +int +mpn_jacobi_2 (mp_srcptr ap, mp_srcptr bp, unsigned bit) +{ + mp_limb_t ah, al, bh, bl; + int c; + + al = ap[0]; + ah = ap[1]; + bl = bp[0]; + bh = bp[1]; + + ASSERT (bl & 1); + + bl = ((bh << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK) | (bl >> 1); + bh >>= 1; + + if ( (bh | bl) == 0) + return 1 - 2*(bit & 1); + + if ( (ah | al) == 0) + return 0; + + if (al == 0) + { + al = ah; + ah = 0; + bit ^= GMP_NUMB_BITS & (bl ^ (bl >> 1)); + } + count_trailing_zeros (c, al); + bit ^= c & (bl ^ (bl >> 1)); + + c++; + if (UNLIKELY (c == GMP_NUMB_BITS)) + { + al = ah; + ah = 0; + } + else + { + al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c); + ah >>= c; + } + while ( (ah | bh) > 0) + { + mp_limb_t th, tl; + mp_limb_t bgta; + + sub_ddmmss (th, tl, ah, al, bh, bl); + if ( (tl | th) == 0) + return 0; + + bgta = LIMB_HIGHBIT_TO_MASK (th); + + /* If b > a, invoke reciprocity */ + bit ^= (bgta & al & bl); + + /* b <-- min (a, b) */ + add_ssaaaa (bh, bl, bh, bl, th & bgta, tl & bgta); + + if ( (bh | bl) == 0) + return 1 - 2*(bit & 1); + + /* a <-- |a - b| */ + al = (bgta ^ tl) - bgta; + ah = (bgta ^ th); + + if (UNLIKELY (al == 0)) + { + /* If b > a, al == 0 implies that we have a carry to + propagate. */ + al = ah - bgta; + ah = 0; + bit ^= GMP_NUMB_BITS & (bl ^ (bl >> 1)); + } + count_trailing_zeros (c, al); + c++; + bit ^= c & (bl ^ (bl >> 1)); + + if (UNLIKELY (c == GMP_NUMB_BITS)) + { + al = ah; + ah = 0; + } + else + { + al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c); + ah >>= c; + } + } + + ASSERT (bl > 0); + + while ( (al | bl) & GMP_LIMB_HIGHBIT) + { + /* Need an extra comparison to get the mask. */ + mp_limb_t t = al - bl; + mp_limb_t bgta = - (bl > al); + + if (t == 0) + return 0; + + /* If b > a, invoke reciprocity */ + bit ^= (bgta & al & bl); + + /* b <-- min (a, b) */ + bl += (bgta & t); + + /* a <-- |a - b| */ + al = (t ^ bgta) - bgta; + + /* Number of trailing zeros is the same no matter if we look at + * t or a, but using t gives more parallelism. */ + count_trailing_zeros (c, t); + c ++; + /* (2/b) = -1 if b = 3 or 5 mod 8 */ + bit ^= c & (bl ^ (bl >> 1)); + + if (UNLIKELY (c == GMP_NUMB_BITS)) + return 1 - 2*(bit & 1); + + al >>= c; + } + + /* Here we have a little impedance mismatch. Better to inline it? */ + return mpn_jacobi_base (2*al+1, 2*bl+1, bit << 1); +} +#elif JACOBI_2_METHOD == 2 +int +mpn_jacobi_2 (mp_srcptr ap, mp_srcptr bp, unsigned bit) +{ + mp_limb_t ah, al, bh, bl; + int c; + + al = ap[0]; + ah = ap[1]; + bl = bp[0]; + bh = bp[1]; + + ASSERT (bl & 1); + + /* Use bit 1. 
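+     (Editorial clarification: in this method the accumulated Jacobi sign
+     lives in bit 1 of 'bit' rather than bit 0, which is why the +-1 results
+     below are produced as 1 - (bit & 2).)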
*/ + bit <<= 1; + + if (bh == 0 && bl == 1) + /* (a|1) = 1 */ + return 1 - (bit & 2); + + if (al == 0) + { + if (ah == 0) + /* (0|b) = 0, b > 1 */ + return 0; + + count_trailing_zeros (c, ah); + bit ^= ((GMP_NUMB_BITS + c) << 1) & (bl ^ (bl >> 1)); + + al = bl; + bl = ah >> c; + + if (bl == 1) + /* (1|b) = 1 */ + return 1 - (bit & 2); + + ah = bh; + + bit ^= al & bl; + + goto b_reduced; + } + if ( (al & 1) == 0) + { + count_trailing_zeros (c, al); + + al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c); + ah >>= c; + bit ^= (c << 1) & (bl ^ (bl >> 1)); + } + if (ah == 0) + { + if (bh > 0) + { + bit ^= al & bl; + MP_LIMB_T_SWAP (al, bl); + ah = bh; + goto b_reduced; + } + goto ab_reduced; + } + + while (bh > 0) + { + /* Compute (a|b) */ + while (ah > bh) + { + sub_ddmmss (ah, al, ah, al, bh, bl); + if (al == 0) + { + count_trailing_zeros (c, ah); + bit ^= ((GMP_NUMB_BITS + c) << 1) & (bl ^ (bl >> 1)); + + al = bl; + bl = ah >> c; + ah = bh; + + bit ^= al & bl; + goto b_reduced; + } + count_trailing_zeros (c, al); + bit ^= (c << 1) & (bl ^ (bl >> 1)); + al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c); + ah >>= c; + } + if (ah == bh) + goto cancel_hi; + + if (ah == 0) + { + bit ^= al & bl; + MP_LIMB_T_SWAP (al, bl); + ah = bh; + break; + } + + bit ^= al & bl; + + /* Compute (b|a) */ + while (bh > ah) + { + sub_ddmmss (bh, bl, bh, bl, ah, al); + if (bl == 0) + { + count_trailing_zeros (c, bh); + bit ^= ((GMP_NUMB_BITS + c) << 1) & (al ^ (al >> 1)); + + bl = bh >> c; + bit ^= al & bl; + goto b_reduced; + } + count_trailing_zeros (c, bl); + bit ^= (c << 1) & (al ^ (al >> 1)); + bl = ((bh << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (bl >> c); + bh >>= c; + } + bit ^= al & bl; + + /* Compute (a|b) */ + if (ah == bh) + { + cancel_hi: + if (al < bl) + { + MP_LIMB_T_SWAP (al, bl); + bit ^= al & bl; + } + al -= bl; + if (al == 0) + return 0; + + count_trailing_zeros (c, al); + bit ^= (c << 1) & (bl ^ (bl >> 1)); + al >>= c; + + if (al == 1) + return 1 - (bit & 2); + + MP_LIMB_T_SWAP (al, bl); + bit ^= al & bl; + break; + } + } + + b_reduced: + /* Compute (a|b), with b a single limb. */ + ASSERT (bl & 1); + + if (bl == 1) + /* (a|1) = 1 */ + return 1 - (bit & 2); + + while (ah > 0) + { + ah -= (al < bl); + al -= bl; + if (al == 0) + { + if (ah == 0) + return 0; + count_trailing_zeros (c, ah); + bit ^= ((GMP_NUMB_BITS + c) << 1) & (bl ^ (bl >> 1)); + al = ah >> c; + goto ab_reduced; + } + count_trailing_zeros (c, al); + + al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c); + ah >>= c; + bit ^= (c << 1) & (bl ^ (bl >> 1)); + } + ab_reduced: + ASSERT (bl & 1); + ASSERT (bl > 1); + + return mpn_jacobi_base (al, bl, bit); +} +#else +#error Unsupported value for JACOBI_2_METHOD +#endif diff --git a/gmp-6.3.0/mpn/generic/logops_n.c b/gmp-6.3.0/mpn/generic/logops_n.c new file mode 100644 index 0000000..3adba2c --- /dev/null +++ b/gmp-6.3.0/mpn/generic/logops_n.c @@ -0,0 +1,77 @@ +/* mpn_and_n, mpn_ior_n, etc -- mpn logical operations. + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. 
+ +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#ifdef OPERATION_and_n +#define func __MPN(and_n) +#define call mpn_and_n +#endif + +#ifdef OPERATION_andn_n +#define func __MPN(andn_n) +#define call mpn_andn_n +#endif + +#ifdef OPERATION_nand_n +#define func __MPN(nand_n) +#define call mpn_nand_n +#endif + +#ifdef OPERATION_ior_n +#define func __MPN(ior_n) +#define call mpn_ior_n +#endif + +#ifdef OPERATION_iorn_n +#define func __MPN(iorn_n) +#define call mpn_iorn_n +#endif + +#ifdef OPERATION_nior_n +#define func __MPN(nior_n) +#define call mpn_nior_n +#endif + +#ifdef OPERATION_xor_n +#define func __MPN(xor_n) +#define call mpn_xor_n +#endif + +#ifdef OPERATION_xnor_n +#define func __MPN(xnor_n) +#define call mpn_xnor_n +#endif + +void +func (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + call (rp, up, vp, n); +} diff --git a/gmp-6.3.0/mpn/generic/lshift.c b/gmp-6.3.0/mpn/generic/lshift.c new file mode 100644 index 0000000..7e1fdef --- /dev/null +++ b/gmp-6.3.0/mpn/generic/lshift.c @@ -0,0 +1,72 @@ +/* mpn_lshift -- Shift left low level. + +Copyright 1991, 1993, 1994, 1996, 2000-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* Shift U (pointed to by up and n limbs long) cnt bits to the left + and store the n least significant limbs of the result at rp. + Return the bits shifted out from the most significant limb. + + Argument constraints: + 1. 0 < cnt < GMP_NUMB_BITS. + 2. If the result is to be written over the input, rp must be >= up. 
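+
+   (Worked example added in editing, assuming 64-bit limbs and no nails:
+   for n = 2, up = {1, 0x8000000000000000} and cnt = 4, mpn_lshift stores
+   {0x10, 0} at rp and returns 0x8, the four bits shifted out of the most
+   significant limb.)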
+*/ + +mp_limb_t +mpn_lshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned int cnt) +{ + mp_limb_t high_limb, low_limb; + unsigned int tnc; + mp_size_t i; + mp_limb_t retval; + + ASSERT (n >= 1); + ASSERT (cnt >= 1); + ASSERT (cnt < GMP_NUMB_BITS); + ASSERT (MPN_SAME_OR_DECR_P (rp, up, n)); + + up += n; + rp += n; + + tnc = GMP_NUMB_BITS - cnt; + low_limb = *--up; + retval = low_limb >> tnc; + high_limb = (low_limb << cnt) & GMP_NUMB_MASK; + + for (i = n - 1; i != 0; i--) + { + low_limb = *--up; + *--rp = high_limb | (low_limb >> tnc); + high_limb = (low_limb << cnt) & GMP_NUMB_MASK; + } + *--rp = high_limb; + + return retval; +} diff --git a/gmp-6.3.0/mpn/generic/lshiftc.c b/gmp-6.3.0/mpn/generic/lshiftc.c new file mode 100644 index 0000000..a583602 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/lshiftc.c @@ -0,0 +1,73 @@ +/* mpn_lshiftc -- Shift left low level with complement. + +Copyright 1991, 1993, 1994, 1996, 2000-2002, 2009 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* Shift U (pointed to by up and n limbs long) cnt bits to the left + and store the n least significant limbs of the result at rp. + Return the bits shifted out from the most significant limb. + + Argument constraints: + 1. 0 < cnt < GMP_NUMB_BITS. + 2. If the result is to be written over the input, rp must be >= up. +*/ + +mp_limb_t +mpn_lshiftc (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned int cnt) +{ + mp_limb_t high_limb, low_limb; + unsigned int tnc; + mp_size_t i; + mp_limb_t retval; + + ASSERT (n >= 1); + ASSERT (cnt >= 1); + ASSERT (cnt < GMP_NUMB_BITS); + ASSERT (MPN_SAME_OR_DECR_P (rp, up, n)); + + up += n; + rp += n; + + tnc = GMP_NUMB_BITS - cnt; + low_limb = *--up; + retval = low_limb >> tnc; + high_limb = (low_limb << cnt); + + for (i = n - 1; i != 0; i--) + { + low_limb = *--up; + *--rp = (~(high_limb | (low_limb >> tnc))) & GMP_NUMB_MASK; + high_limb = low_limb << cnt; + } + *--rp = (~high_limb) & GMP_NUMB_MASK; + + return retval; +} diff --git a/gmp-6.3.0/mpn/generic/matrix22_mul.c b/gmp-6.3.0/mpn/generic/matrix22_mul.c new file mode 100644 index 0000000..6a1299a --- /dev/null +++ b/gmp-6.3.0/mpn/generic/matrix22_mul.c @@ -0,0 +1,321 @@ +/* matrix22_mul.c. + + Contributed by Niels Möller and Marco Bodrato. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003-2005, 2008, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#define MUL(rp, ap, an, bp, bn) do { \ + if (an >= bn) \ + mpn_mul (rp, ap, an, bp, bn); \ + else \ + mpn_mul (rp, bp, bn, ap, an); \ +} while (0) + +/* Inputs are unsigned. */ +static int +abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n) +{ + int c; + MPN_CMP (c, ap, bp, n); + if (c >= 0) + { + mpn_sub_n (rp, ap, bp, n); + return 0; + } + else + { + mpn_sub_n (rp, bp, ap, n); + return 1; + } +} + +static int +add_signed_n (mp_ptr rp, + mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n) +{ + if (as != bs) + return as ^ abs_sub_n (rp, ap, bp, n); + else + { + ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n)); + return as; + } +} + +mp_size_t +mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn) +{ + if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD) + || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD)) + return 3*rn + 2*mn; + else + return 3*(rn + mn) + 5; +} + +/* Algorithm: + + / s0 \ / 1 0 0 0 \ / r0 \ + | s1 | | 0 1 0 1 | | r1 | + | s2 | | 0 0 -1 1 | | r2 | + | s3 | = | 0 1 -1 1 | \ r3 / + | s4 | | -1 1 -1 1 | + | s5 | | 0 1 0 0 | + \ s6 / \ 0 0 1 0 / + + / t0 \ / 1 0 0 0 \ / m0 \ + | t1 | | 0 1 0 1 | | m1 | + | t2 | | 0 0 -1 1 | | m2 | + | t3 | = | 0 1 -1 1 | \ m3 / + | t4 | | -1 1 -1 1 | + | t5 | | 0 1 0 0 | + \ t6 / \ 0 0 1 0 / + + Note: the two matrices above are the same, but s_i and t_i are used + in the same product, only for i<4, see "A Strassen-like Matrix + Multiplication suited for squaring and higher power computation" by + M. Bodrato, in Proceedings of ISSAC 2010. + + / r0 \ / 1 0 0 0 0 1 0 \ / s0*t0 \ + | r1 | = | 0 0 -1 1 -1 1 0 | | s1*t1 | + | r2 | | 0 1 0 -1 0 -1 -1 | | s2*t2 | + \ r3 / \ 0 1 1 -1 0 -1 0 / | s3*t3 | + | s4*t5 | + | s5*t6 | + \ s6*t4 / + + The scheduling uses two temporaries U0 and U1 to store products, and + two, S0 and T0, to store combinations of entries of the two + operands. +*/ + +/* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3). + * + * Resulting elements are of size up to rn + mn + 1. + * + * Temporary storage: 3 rn + 3 mn + 5. 
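+ *
+ * (Editorial note: the schedule above performs 7 products u0..u6 = s_i * t_j
+ * of roughly half-sized operands where the schoolbook 2x2 matrix product
+ * needs 8 multiplications, which is what makes this Strassen-like scheme
+ * win above MATRIX22_STRASSEN_THRESHOLD.)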
*/ +static void +mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn, + mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn, + mp_ptr tp) +{ + mp_ptr s0, t0, u0, u1; + int r1s, r3s, s0s, t0s, u1s; + s0 = tp; tp += rn + 1; + t0 = tp; tp += mn + 1; + u0 = tp; tp += rn + mn + 1; + u1 = tp; /* rn + mn + 2 */ + + MUL (u0, r1, rn, m2, mn); /* u5 = s5 * t6 */ + r3s = abs_sub_n (r3, r3, r2, rn); /* r3 - r2 */ + if (r3s) + { + r1s = abs_sub_n (r1, r1, r3, rn); + r1[rn] = 0; + } + else + { + r1[rn] = mpn_add_n (r1, r1, r3, rn); + r1s = 0; /* r1 - r2 + r3 */ + } + if (r1s) + { + s0[rn] = mpn_add_n (s0, r1, r0, rn); + s0s = 0; + } + else if (r1[rn] != 0) + { + s0[rn] = r1[rn] - mpn_sub_n (s0, r1, r0, rn); + s0s = 1; /* s4 = -r0 + r1 - r2 + r3 */ + /* Reverse sign! */ + } + else + { + s0s = abs_sub_n (s0, r0, r1, rn); + s0[rn] = 0; + } + MUL (u1, r0, rn, m0, mn); /* u0 = s0 * t0 */ + r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn); + ASSERT (r0[rn+mn] < 2); /* u0 + u5 */ + + t0s = abs_sub_n (t0, m3, m2, mn); + u1s = r3s^t0s^1; /* Reverse sign! */ + MUL (u1, r3, rn, t0, mn); /* u2 = s2 * t2 */ + u1[rn+mn] = 0; + if (t0s) + { + t0s = abs_sub_n (t0, m1, t0, mn); + t0[mn] = 0; + } + else + { + t0[mn] = mpn_add_n (t0, t0, m1, mn); + } + + /* FIXME: Could be simplified if we had space for rn + mn + 2 limbs + at r3. I'd expect that for matrices of random size, the high + words t0[mn] and r1[rn] are non-zero with a pretty small + probability. If that can be confirmed this should be done as an + unconditional rn x (mn+1) followed by an if (UNLIKELY (r1[rn])) + add_n. */ + if (t0[mn] != 0) + { + MUL (r3, r1, rn, t0, mn + 1); /* u3 = s3 * t3 */ + ASSERT (r1[rn] < 2); + if (r1[rn] != 0) + mpn_add_n (r3 + rn, r3 + rn, t0, mn + 1); + } + else + { + MUL (r3, r1, rn + 1, t0, mn); + } + + ASSERT (r3[rn+mn] < 4); + + u0[rn+mn] = 0; + if (r1s^t0s) + { + r3s = abs_sub_n (r3, u0, r3, rn + mn + 1); + } + else + { + ASSERT_NOCARRY (mpn_add_n (r3, r3, u0, rn + mn + 1)); + r3s = 0; /* u3 + u5 */ + } + + if (t0s) + { + t0[mn] = mpn_add_n (t0, t0, m0, mn); + } + else if (t0[mn] != 0) + { + t0[mn] -= mpn_sub_n (t0, t0, m0, mn); + } + else + { + t0s = abs_sub_n (t0, t0, m0, mn); + } + MUL (u0, r2, rn, t0, mn + 1); /* u6 = s6 * t4 */ + ASSERT (u0[rn+mn] < 2); + if (r1s) + { + ASSERT_NOCARRY (mpn_sub_n (r1, r2, r1, rn)); + } + else + { + r1[rn] += mpn_add_n (r1, r1, r2, rn); + } + rn++; + t0s = add_signed_n (r2, r3, r3s, u0, t0s, rn + mn); + /* u3 + u5 + u6 */ + ASSERT (r2[rn+mn-1] < 4); + r3s = add_signed_n (r3, r3, r3s, u1, u1s, rn + mn); + /* -u2 + u3 + u5 */ + ASSERT (r3[rn+mn-1] < 3); + MUL (u0, s0, rn, m1, mn); /* u4 = s4 * t5 */ + ASSERT (u0[rn+mn-1] < 2); + t0[mn] = mpn_add_n (t0, m3, m1, mn); + MUL (u1, r1, rn, t0, mn + 1); /* u1 = s1 * t1 */ + mn += rn; + ASSERT (u1[mn-1] < 4); + ASSERT (u1[mn] == 0); + ASSERT_NOCARRY (add_signed_n (r1, r3, r3s, u0, s0s, mn)); + /* -u2 + u3 - u4 + u5 */ + ASSERT (r1[mn-1] < 2); + if (r3s) + { + ASSERT_NOCARRY (mpn_add_n (r3, u1, r3, mn)); + } + else + { + ASSERT_NOCARRY (mpn_sub_n (r3, u1, r3, mn)); + /* u1 + u2 - u3 - u5 */ + } + ASSERT (r3[mn-1] < 2); + if (t0s) + { + ASSERT_NOCARRY (mpn_add_n (r2, u1, r2, mn)); + } + else + { + ASSERT_NOCARRY (mpn_sub_n (r2, u1, r2, mn)); + /* u1 - u3 - u5 - u6 */ + } + ASSERT (r2[mn-1] < 2); +} + +void +mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn, + mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn, + mp_ptr tp) +{ + if (BELOW_THRESHOLD (rn, 
MATRIX22_STRASSEN_THRESHOLD) + || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD)) + { + mp_ptr p0, p1; + unsigned i; + + /* Temporary storage: 3 rn + 2 mn */ + p0 = tp + rn; + p1 = p0 + rn + mn; + + for (i = 0; i < 2; i++) + { + MPN_COPY (tp, r0, rn); + + if (rn >= mn) + { + mpn_mul (p0, r0, rn, m0, mn); + mpn_mul (p1, r1, rn, m3, mn); + mpn_mul (r0, r1, rn, m2, mn); + mpn_mul (r1, tp, rn, m1, mn); + } + else + { + mpn_mul (p0, m0, mn, r0, rn); + mpn_mul (p1, m3, mn, r1, rn); + mpn_mul (r0, m2, mn, r1, rn); + mpn_mul (r1, m1, mn, tp, rn); + } + r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn); + r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn); + + r0 = r2; r1 = r3; + } + } + else + mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn, + m0, m1, m2, m3, mn, tp); +} diff --git a/gmp-6.3.0/mpn/generic/matrix22_mul1_inverse_vector.c b/gmp-6.3.0/mpn/generic/matrix22_mul1_inverse_vector.c new file mode 100644 index 0000000..68d50b7 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/matrix22_mul1_inverse_vector.c @@ -0,0 +1,64 @@ +/* matrix22_mul1_inverse_vector.c + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2008, 2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Sets (r;b) = M^{-1}(a;b), with M^{-1} = (u11, -u01; -u10, u00) from + the left. Uses three buffers, to avoid a copy. */ +mp_size_t +mpn_matrix22_mul1_inverse_vector (const struct hgcd_matrix1 *M, + mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n) +{ + mp_limb_t h0, h1; + + /* Compute (r;b) <-- (u11 a - u01 b; -u10 a + u00 b) as + + r = u11 * a + r -= u01 * b + b *= u00 + b -= u10 * a + */ + + h0 = mpn_mul_1 (rp, ap, n, M->u[1][1]); + h1 = mpn_submul_1 (rp, bp, n, M->u[0][1]); + ASSERT (h0 == h1); + + h0 = mpn_mul_1 (bp, bp, n, M->u[0][0]); + h1 = mpn_submul_1 (bp, ap, n, M->u[1][0]); + ASSERT (h0 == h1); + + n -= (rp[n-1] | bp[n-1]) == 0; + return n; +} diff --git a/gmp-6.3.0/mpn/generic/mod_1.c b/gmp-6.3.0/mpn/generic/mod_1.c new file mode 100644 index 0000000..f737bc2 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mod_1.c @@ -0,0 +1,278 @@ +/* mpn_mod_1(dividend_ptr, dividend_size, divisor_limb) -- + Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB. + Return the single-limb remainder. + There are no constraints on the value of the divisor. + +Copyright 1991, 1993, 1994, 1999, 2000, 2002, 2007-2009, 2012, 2020 +Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* The size where udiv_qrnnd_preinv should be used rather than udiv_qrnnd, + meaning the quotient size where that should happen, the quotient size + being how many udiv divisions will be done. + + The default is to use preinv always, CPUs where this doesn't suit have + tuned thresholds. Note in particular that preinv should certainly be + used if that's the only division available (USE_PREINV_ALWAYS). */ + +#ifndef MOD_1_NORM_THRESHOLD +#define MOD_1_NORM_THRESHOLD 0 +#endif + +#ifndef MOD_1_UNNORM_THRESHOLD +#define MOD_1_UNNORM_THRESHOLD 0 +#endif + +#ifndef MOD_1U_TO_MOD_1_1_THRESHOLD +#define MOD_1U_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* default is not to use mpn_mod_1s */ +#endif + +#ifndef MOD_1N_TO_MOD_1_1_THRESHOLD +#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* default is not to use mpn_mod_1s */ +#endif + +#ifndef MOD_1_1_TO_MOD_1_2_THRESHOLD +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10 +#endif + +#ifndef MOD_1_2_TO_MOD_1_4_THRESHOLD +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 20 +#endif + +#if TUNE_PROGRAM_BUILD && !HAVE_NATIVE_mpn_mod_1_1p +/* Duplicates declarations in tune/speed.h */ +mp_limb_t mpn_mod_1_1p_1 (mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t [4]); +mp_limb_t mpn_mod_1_1p_2 (mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t [4]); + +void mpn_mod_1_1p_cps_1 (mp_limb_t [4], mp_limb_t); +void mpn_mod_1_1p_cps_2 (mp_limb_t [4], mp_limb_t); + +#undef mpn_mod_1_1p +#define mpn_mod_1_1p(ap, n, b, pre) \ + (mod_1_1p_method == 1 ? mpn_mod_1_1p_1 (ap, n, b, pre) \ + : (mod_1_1p_method == 2 ? mpn_mod_1_1p_2 (ap, n, b, pre) \ + : __gmpn_mod_1_1p (ap, n, b, pre))) + +#undef mpn_mod_1_1p_cps +#define mpn_mod_1_1p_cps(pre, b) \ + (mod_1_1p_method == 1 ? mpn_mod_1_1p_cps_1 (pre, b) \ + : (mod_1_1p_method == 2 ? mpn_mod_1_1p_cps_2 (pre, b) \ + : __gmpn_mod_1_1p_cps (pre, b))) +#endif /* TUNE_PROGRAM_BUILD && !HAVE_NATIVE_mpn_mod_1_1p */ + + +/* The comments in mpn/generic/divrem_1.c apply here too. + + As noted in the algorithms section of the manual, the shifts in the loop + for the unnorm case can be avoided by calculating r = a%(d*2^n), followed + by a final (r*2^n)%(d*2^n). In fact if it happens that a%(d*2^n) can + skip a division where (a*2^n)%(d*2^n) can't then there's the same number + of divide steps, though how often that happens depends on the assumed + distributions of dividend and divisor. In any case this idea is left to + CPU specific implementations to consider. 
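+
+   (Concrete illustration added in editing: for d = 10 with 64-bit limbs,
+   the unnorm code below shifts d left by cnt = 60 so that its high bit is
+   set, folds the same shift into the running remainder, and undoes it with
+   the single final 'r >> cnt'.)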
*/
+
+static mp_limb_t
+mpn_mod_1_unnorm (mp_srcptr up, mp_size_t un, mp_limb_t d)
+{
+  mp_size_t i;
+  mp_limb_t n1, n0, r;
+  mp_limb_t dummy;
+  int cnt;
+
+  ASSERT (un > 0);
+  ASSERT (d != 0);
+
+  /* Skip a division if high < divisor.  Having the test here before
+     normalizing will still skip as often as possible.  */
+  r = up[un - 1];
+  if (r < d)
+    {
+      if (--un == 0)
+        return r;
+    }
+  else
+    r = 0;
+
+  d <<= GMP_NAIL_BITS;
+
+  /* If udiv_qrnnd doesn't need a normalized divisor, can use the simple
+     code above. */
+  if (! UDIV_NEEDS_NORMALIZATION
+      && BELOW_THRESHOLD (un, MOD_1_UNNORM_THRESHOLD))
+    {
+      for (i = un - 1; i >= 0; i--)
+        {
+          n0 = up[i] << GMP_NAIL_BITS;
+          udiv_qrnnd (dummy, r, r, n0, d);
+          r >>= GMP_NAIL_BITS;
+        }
+      return r;
+    }
+
+  count_leading_zeros (cnt, d);
+  d <<= cnt;
+
+  n1 = up[un - 1] << GMP_NAIL_BITS;
+  r = (r << cnt) | (n1 >> (GMP_LIMB_BITS - cnt));
+
+  if (UDIV_NEEDS_NORMALIZATION
+      && BELOW_THRESHOLD (un, MOD_1_UNNORM_THRESHOLD))
+    {
+      mp_limb_t nshift;
+      for (i = un - 2; i >= 0; i--)
+        {
+          n0 = up[i] << GMP_NAIL_BITS;
+          nshift = (n1 << cnt) | (n0 >> (GMP_NUMB_BITS - cnt));
+          udiv_qrnnd (dummy, r, r, nshift, d);
+          r >>= GMP_NAIL_BITS;
+          n1 = n0;
+        }
+      udiv_qrnnd (dummy, r, r, n1 << cnt, d);
+      r >>= GMP_NAIL_BITS;
+      return r >> cnt;
+    }
+  else
+    {
+      mp_limb_t inv, nshift;
+      invert_limb (inv, d);
+
+      for (i = un - 2; i >= 0; i--)
+        {
+          n0 = up[i] << GMP_NAIL_BITS;
+          nshift = (n1 << cnt) | (n0 >> (GMP_NUMB_BITS - cnt));
+          udiv_rnnd_preinv (r, r, nshift, d, inv);
+          r >>= GMP_NAIL_BITS;
+          n1 = n0;
+        }
+      udiv_rnnd_preinv (r, r, n1 << cnt, d, inv);
+      r >>= GMP_NAIL_BITS;
+      return r >> cnt;
+    }
+}
+
+static mp_limb_t
+mpn_mod_1_norm (mp_srcptr up, mp_size_t un, mp_limb_t d)
+{
+  mp_size_t i;
+  mp_limb_t n0, r;
+  mp_limb_t dummy;
+
+  ASSERT (un > 0);
+
+  d <<= GMP_NAIL_BITS;
+
+  ASSERT (d & GMP_LIMB_HIGHBIT);
+
+  /* High limb is initial remainder, possibly with one subtract of
+     d to get r<d.  */
+  r = up[un - 1] << GMP_NAIL_BITS;
+  if (r >= d)
+    r -= d;
+  r >>= GMP_NAIL_BITS;
+  un--;
+  if (un == 0)
+    return r;
+
+  if (BELOW_THRESHOLD (un, MOD_1_NORM_THRESHOLD))
+    {
+      for (i = un - 1; i >= 0; i--)
+        {
+          n0 = up[i] << GMP_NAIL_BITS;
+          udiv_qrnnd (dummy, r, r, n0, d);
+          r >>= GMP_NAIL_BITS;
+        }
+      return r;
+    }
+  else
+    {
+      mp_limb_t inv;
+      invert_limb (inv, d);
+      for (i = un - 1; i >= 0; i--)
+        {
+          n0 = up[i] << GMP_NAIL_BITS;
+          udiv_rnnd_preinv (r, r, n0, d, inv);
+          r >>= GMP_NAIL_BITS;
+        }
+      return r;
+    }
+}
+
+mp_limb_t
+mpn_mod_1 (mp_srcptr ap, mp_size_t n, mp_limb_t b)
+{
+  ASSERT (n >= 0);
+  ASSERT (b != 0);
+
+  /* Should this be handled at all?  Rely on callers?  Note un==0 is currently
+     required by mpz/fdiv_r_ui.c and possibly other places.
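+     (Illustrative example added in editing, with B = 2^64: for
+     {ap, 3} = {7, 0, 1}, i.e. the value B^2 + 7, mpn_mod_1 (ap, 3, 10)
+     returns 3, since 2^128 + 7 = 3 (mod 10).)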
*/ + if (n == 0) + return 0; + + if (UNLIKELY ((b & GMP_NUMB_HIGHBIT) != 0)) + { + if (BELOW_THRESHOLD (n, MOD_1N_TO_MOD_1_1_THRESHOLD)) + { + return mpn_mod_1_norm (ap, n, b); + } + else + { + mp_limb_t pre[4]; + mpn_mod_1_1p_cps (pre, b); + return mpn_mod_1_1p (ap, n, b, pre); + } + } + else + { + if (BELOW_THRESHOLD (n, MOD_1U_TO_MOD_1_1_THRESHOLD)) + { + return mpn_mod_1_unnorm (ap, n, b); + } + else if (BELOW_THRESHOLD (n, MOD_1_1_TO_MOD_1_2_THRESHOLD)) + { + mp_limb_t pre[4]; + mpn_mod_1_1p_cps (pre, b); + return mpn_mod_1_1p (ap, n, b << pre[1], pre); + } + else if (BELOW_THRESHOLD (n, MOD_1_2_TO_MOD_1_4_THRESHOLD) || UNLIKELY (b > GMP_NUMB_MASK / 4)) + { + mp_limb_t pre[5]; + mpn_mod_1s_2p_cps (pre, b); + return mpn_mod_1s_2p (ap, n, b << pre[1], pre); + } + else + { + mp_limb_t pre[7]; + mpn_mod_1s_4p_cps (pre, b); + return mpn_mod_1s_4p (ap, n, b << pre[1], pre); + } + } +} diff --git a/gmp-6.3.0/mpn/generic/mod_1_1.c b/gmp-6.3.0/mpn/generic/mod_1_1.c new file mode 100644 index 0000000..be199ff --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mod_1_1.c @@ -0,0 +1,341 @@ +/* mpn_mod_1_1p (ap, n, b, cps) + Divide (ap,,n) by b. Return the single-limb remainder. + + Contributed to the GNU project by Torbjorn Granlund and Niels Möller. + Based on a suggestion by Peter L. Montgomery. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2008-2011, 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef MOD_1_1P_METHOD +# define MOD_1_1P_METHOD 1 /* need to make sure this is 2 for asm testing */ +#endif + +/* Define some longlong.h-style macros, but for wider operations. + * add_mssaaaa is like longlong.h's add_ssaaaa, but also generates + * carry out, in the form of a mask. */ + +#if defined (__GNUC__) && ! 
defined (NO_ASM) + +#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32 +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "add %6, %k2\n\t" \ + "adc %4, %k1\n\t" \ + "sbb %k0, %k0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "1" ((USItype)(a1)), "g" ((USItype)(b1)), \ + "%2" ((USItype)(a0)), "g" ((USItype)(b0))) +#endif + +#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "add %6, %q2\n\t" \ + "adc %4, %q1\n\t" \ + "sbb %q0, %q0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \ + "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0))) +#endif + +#if defined (__sparc__) && W_TYPE_SIZE == 32 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "addcc %r5, %6, %2\n\t" \ + "addxcc %r3, %4, %1\n\t" \ + "subx %%g0, %%g0, %0" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \ + __CLOBBER_CC) +#endif + +#if defined (__sparc__) && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "addcc %r5, %6, %2\n\t" \ + "addccc %r7, %8, %%g0\n\t" \ + "addccc %r3, %4, %1\n\t" \ + "clr %0\n\t" \ + "movcs %%xcc, -1, %0" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \ + "rJ" ((al) >> 32), "rI" ((bl) >> 32) \ + __CLOBBER_CC) +#if __VIS__ >= 0x300 +#undef add_mssaaaa +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "addcc %r5, %6, %2\n\t" \ + "addxccc %r3, %4, %1\n\t" \ + "clr %0\n\t" \ + "movcs %%xcc, -1, %0" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \ + __CLOBBER_CC) +#endif +#endif + +#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB) +/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a + processor running in 32-bit mode, since the carry flag then gets the 32-bit + carry. 
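+   (Editorial recap, hedged: every add_mssaaaa variant in this block computes
+   the two-limb sum (s1,s0) = (a1,a0) + (b1,b0) and sets m to 0 or to all
+   ones according to the carry out, exactly as the generic C fallback at the
+   end of the block does.)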
*/ +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "add%I6c %2, %5, %6\n\t" \ + "adde %1, %3, %4\n\t" \ + "subfe %0, %0, %0\n\t" \ + "nor %0, %0, %0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "r" (a1), "r" (b1), "%r" (a0), "rI" (b0) \ + __CLOBBER_CC) +#endif + +#if defined (__s390x__) && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "algr %2, %6\n\t" \ + "alcgr %1, %4\n\t" \ + "lghi %0, 0\n\t" \ + "alcgr %0, %0\n\t" \ + "lcgr %0, %0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "1" ((UDItype)(a1)), "r" ((UDItype)(b1)), \ + "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC) +#endif + +#if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "adds %2, %5, %6\n\t" \ + "adcs %1, %3, %4\n\t" \ + "movcc %0, #0\n\t" \ + "movcs %0, #-1" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC) +#endif + +#if defined (__aarch64__) && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "adds %2, %x5, %6\n\t" \ + "adcs %1, %x3, %x4\n\t" \ + "csinv %0, xzr, xzr, cc\n\t" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rZ" (ah), "rZ" (bh), "%rZ" (al), "rI" (bl) __CLOBBER_CC) +#endif +#endif /* defined (__GNUC__) */ + +#ifndef add_mssaaaa +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + do { \ + UWtype __s0, __s1, __c0, __c1; \ + __s0 = (a0) + (b0); \ + __s1 = (a1) + (b1); \ + __c0 = __s0 < (a0); \ + __c1 = __s1 < (a1); \ + (s0) = __s0; \ + __s1 = __s1 + __c0; \ + (s1) = __s1; \ + (m) = - (__c1 + (__s1 < __c0)); \ + } while (0) +#endif + +#if MOD_1_1P_METHOD == 1 +void +mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b) +{ + mp_limb_t bi; + mp_limb_t B1modb, B2modb; + int cnt; + + count_leading_zeros (cnt, b); + + b <<= cnt; + invert_limb (bi, b); + + cps[0] = bi; + cps[1] = cnt; + + B1modb = -b; + if (LIKELY (cnt != 0)) + B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt)); + ASSERT (B1modb <= b); /* NB: not fully reduced mod b */ + cps[2] = B1modb >> cnt; + + /* In the normalized case, this can be simplified to + * + * B2modb = - b * bi; + * ASSERT (B2modb <= b); // NB: equality iff b = B/2 + */ + udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi); + cps[3] = B2modb >> cnt; +} + +mp_limb_t +mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4]) +{ + mp_limb_t rh, rl, bi, ph, pl, r; + mp_limb_t B1modb, B2modb; + mp_size_t i; + int cnt; + mp_limb_t mask; + + ASSERT (n >= 2); /* fix tuneup.c if this is changed */ + + B1modb = bmodb[2]; + B2modb = bmodb[3]; + + rl = ap[n - 1]; + umul_ppmm (ph, pl, rl, B1modb); + add_ssaaaa (rh, rl, ph, pl, CNST_LIMB(0), ap[n - 2]); + + for (i = n - 3; i >= 0; i -= 1) + { + /* rr = ap[i] < B + + LO(rr) * (B mod b) <= (B-1)(b-1) + + HI(rr) * (B^2 mod b) <= (B-1)(b-1) + */ + umul_ppmm (ph, pl, rl, B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i]); + + umul_ppmm (rh, rl, rh, B2modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + } + + cnt = bmodb[1]; + bi = bmodb[0]; + + if (LIKELY (cnt != 0)) + rh = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt)); + + mask = -(mp_limb_t) (rh >= b); + rh -= mask & b; + + udiv_rnnd_preinv (r, rh, rl << cnt, b, bi); + + return r >> cnt; +} +#endif /* MOD_1_1P_METHOD == 1 */ + +#if MOD_1_1P_METHOD == 2 +void +mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b) +{ + mp_limb_t bi; + mp_limb_t B2modb; + int cnt; + + count_leading_zeros (cnt, b); + + b <<= cnt; + invert_limb (bi, b); + + cps[0] = bi; + cps[1] = cnt; + + 
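+  /* Editorial note (not in the original source): as in method 1 above,
+     cps[2] caches a not-fully-reduced B mod b shifted right by cnt, and
+     cps[3] caches B^2 mod b; the shortcut B2modb = -b * bi is valid here
+     because b has been normalized to have its high bit set.  */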
if (LIKELY (cnt != 0)) + { + mp_limb_t B1modb = -b; + B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt)); + ASSERT (B1modb <= b); /* NB: not fully reduced mod b */ + cps[2] = B1modb >> cnt; + } + B2modb = - b * bi; + ASSERT (B2modb <= b); // NB: equality iff b = B/2 + cps[3] = B2modb; +} + +mp_limb_t +mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4]) +{ + int cnt; + mp_limb_t bi, B1modb; + mp_limb_t r0, r1; + mp_limb_t r; + + ASSERT (n >= 2); /* fix tuneup.c if this is changed */ + + r0 = ap[n-2]; + r1 = ap[n-1]; + + if (n > 2) + { + mp_limb_t B2modb, B2mb; + mp_limb_t p0, p1; + mp_limb_t r2; + mp_size_t j; + + B2modb = bmodb[3]; + B2mb = B2modb - b; + + umul_ppmm (p1, p0, r1, B2modb); + add_mssaaaa (r2, r1, r0, r0, ap[n-3], p1, p0); + + for (j = n-4; j >= 0; j--) + { + mp_limb_t cy; + /* mp_limb_t t = r0 + B2mb; */ + umul_ppmm (p1, p0, r1, B2modb); + + ADDC_LIMB (cy, r0, r0, r2 & B2modb); + /* Alternative, for cmov: if (cy) r0 = t; */ + r0 -= (-cy) & b; + add_mssaaaa (r2, r1, r0, r0, ap[j], p1, p0); + } + + r1 -= (r2 & b); + } + + cnt = bmodb[1]; + + if (LIKELY (cnt != 0)) + { + mp_limb_t t; + mp_limb_t B1modb = bmodb[2]; + + umul_ppmm (r1, t, r1, B1modb); + r0 += t; + r1 += (r0 < t); + + /* Normalize */ + r1 = (r1 << cnt) | (r0 >> (GMP_LIMB_BITS - cnt)); + r0 <<= cnt; + + /* NOTE: Might get r1 == b here, but udiv_rnnd_preinv allows that. */ + } + else + { + mp_limb_t mask = -(mp_limb_t) (r1 >= b); + r1 -= mask & b; + } + + bi = bmodb[0]; + + udiv_rnnd_preinv (r, r1, r0, b, bi); + return r >> cnt; +} +#endif /* MOD_1_1P_METHOD == 2 */ diff --git a/gmp-6.3.0/mpn/generic/mod_1_2.c b/gmp-6.3.0/mpn/generic/mod_1_2.c new file mode 100644 index 0000000..b00d19e --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mod_1_2.c @@ -0,0 +1,148 @@ +/* mpn_mod_1s_2p (ap, n, b, cps) + Divide (ap,,n) by b. Return the single-limb remainder. + Requires that b < B / 2. + + Contributed to the GNU project by Torbjorn Granlund. + Based on a suggestion by Peter L. Montgomery. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2008-2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" +#include "longlong.h" + +void +mpn_mod_1s_2p_cps (mp_limb_t cps[5], mp_limb_t b) +{ + mp_limb_t bi; + mp_limb_t B1modb, B2modb, B3modb; + int cnt; + + ASSERT (b <= (~(mp_limb_t) 0) / 2); + + count_leading_zeros (cnt, b); + + b <<= cnt; + invert_limb (bi, b); + + cps[0] = bi; + cps[1] = cnt; + + B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt)); + ASSERT (B1modb <= b); /* NB: not fully reduced mod b */ + cps[2] = B1modb >> cnt; + + udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi); + cps[3] = B2modb >> cnt; + + udiv_rnnd_preinv (B3modb, B2modb, CNST_LIMB(0), b, bi); + cps[4] = B3modb >> cnt; + +#if WANT_ASSERT + { + int i; + b = cps[2]; + for (i = 3; i <= 4; i++) + { + b += cps[i]; + ASSERT (b >= cps[i]); + } + } +#endif +} + +mp_limb_t +mpn_mod_1s_2p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[5]) +{ + mp_limb_t rh, rl, bi, ph, pl, ch, cl, r; + mp_limb_t B1modb, B2modb, B3modb; + mp_size_t i; + int cnt; + + ASSERT (n >= 1); + + B1modb = cps[2]; + B2modb = cps[3]; + B3modb = cps[4]; + + if ((n & 1) != 0) + { + if (n == 1) + { + rl = ap[n - 1]; + bi = cps[0]; + cnt = cps[1]; + udiv_rnnd_preinv (r, rl >> (GMP_LIMB_BITS - cnt), + rl << cnt, b, bi); + return r >> cnt; + } + + umul_ppmm (ph, pl, ap[n - 2], B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 3]); + umul_ppmm (rh, rl, ap[n - 1], B2modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + n--; + } + else + { + rh = ap[n - 1]; + rl = ap[n - 2]; + } + + for (i = n - 4; i >= 0; i -= 2) + { + /* rr = ap[i] < B + + ap[i+1] * (B mod b) <= (B-1)(b-1) + + LO(rr) * (B^2 mod b) <= (B-1)(b-1) + + HI(rr) * (B^3 mod b) <= (B-1)(b-1) + */ + umul_ppmm (ph, pl, ap[i + 1], B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i + 0]); + + umul_ppmm (ch, cl, rl, B2modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm (rh, rl, rh, B3modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + } + + umul_ppmm (rh, cl, rh, B1modb); + add_ssaaaa (rh, rl, rh, rl, CNST_LIMB(0), cl); + + cnt = cps[1]; + bi = cps[0]; + + r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt)); + udiv_rnnd_preinv (r, r, rl << cnt, b, bi); + + return r >> cnt; +} diff --git a/gmp-6.3.0/mpn/generic/mod_1_3.c b/gmp-6.3.0/mpn/generic/mod_1_3.c new file mode 100644 index 0000000..e4a908d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mod_1_3.c @@ -0,0 +1,155 @@ +/* mpn_mod_1s_3p (ap, n, b, cps) + Divide (ap,,n) by b. Return the single-limb remainder. + Requires that b < B / 3. + + Contributed to the GNU project by Torbjorn Granlund. + Based on a suggestion by Peter L. Montgomery. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2008-2010, 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +void +mpn_mod_1s_3p_cps (mp_limb_t cps[6], mp_limb_t b) +{ + mp_limb_t bi; + mp_limb_t B1modb, B2modb, B3modb, B4modb; + int cnt; + + ASSERT (b <= (~(mp_limb_t) 0) / 3); + + count_leading_zeros (cnt, b); + + b <<= cnt; + invert_limb (bi, b); + + cps[0] = bi; + cps[1] = cnt; + + B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt)); + ASSERT (B1modb <= b); /* NB: not fully reduced mod b */ + cps[2] = B1modb >> cnt; + + udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi); + cps[3] = B2modb >> cnt; + + udiv_rnnd_preinv (B3modb, B2modb, CNST_LIMB(0), b, bi); + cps[4] = B3modb >> cnt; + + udiv_rnnd_preinv (B4modb, B3modb, CNST_LIMB(0), b, bi); + cps[5] = B4modb >> cnt; + +#if WANT_ASSERT + { + int i; + b = cps[2]; + for (i = 3; i <= 5; i++) + { + b += cps[i]; + ASSERT (b >= cps[i]); + } + } +#endif +} + +mp_limb_t +mpn_mod_1s_3p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[6]) +{ + mp_limb_t rh, rl, bi, ph, pl, ch, cl, r; + mp_limb_t B1modb, B2modb, B3modb, B4modb; + mp_size_t i; + int cnt; + + ASSERT (n >= 1); + + B1modb = cps[2]; + B2modb = cps[3]; + B3modb = cps[4]; + B4modb = cps[5]; + + /* We compute n mod 3 in a tricky way, which works except for when n is so + close to the maximum size that we don't need to support it. The final + cast to int is a workaround for HP cc. */ + switch ((int) ((mp_limb_t) n * MODLIMB_INVERSE_3 >> (GMP_NUMB_BITS - 2))) + { + case 0: + umul_ppmm (ph, pl, ap[n - 2], B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 3]); + umul_ppmm (rh, rl, ap[n - 1], B2modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + n -= 3; + break; + default: /* n mod 3 = 1; (case 2)*/ + rh = 0; + rl = ap[--n]; + break; + case 1: /* n mod 3 = 2 */ + rh = ap[n - 1]; + rl = ap[n - 2]; + n -= 2; + break; + } + + for (i = n - 3; i >= 0; i -= 3) + { + /* rr = ap[i] < B + + ap[i+1] * (B mod b) <= (B-1)(b-1) + + ap[i+2] * (B^2 mod b) <= (B-1)(b-1) + + LO(rr) * (B^3 mod b) <= (B-1)(b-1) + + HI(rr) * (B^4 mod b) <= (B-1)(b-1) + */ + umul_ppmm (ph, pl, ap[i + 1], B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i + 0]); + + umul_ppmm (ch, cl, ap[i + 2], B2modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm (ch, cl, rl, B3modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm (rh, rl, rh, B4modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + } + + umul_ppmm (rh, cl, rh, B1modb); + add_ssaaaa (rh, rl, rh, rl, CNST_LIMB(0), cl); + + cnt = cps[1]; + bi = cps[0]; + + r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt)); + udiv_rnnd_preinv (r, r, rl << cnt, b, bi); + + return r >> cnt; +} diff --git a/gmp-6.3.0/mpn/generic/mod_1_4.c b/gmp-6.3.0/mpn/generic/mod_1_4.c new file mode 100644 index 0000000..80b42ba --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mod_1_4.c @@ -0,0 +1,170 @@ +/* mpn_mod_1s_4p (ap, n, b, cps) + Divide (ap,,n) by b. Return the single-limb remainder. + Requires that b < B / 4. + + Contributed to the GNU project by Torbjorn Granlund. + Based on a suggestion by Peter L. Montgomery. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. 
IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2008-2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +void +mpn_mod_1s_4p_cps (mp_limb_t cps[7], mp_limb_t b) +{ + mp_limb_t bi; + mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb; + int cnt; + + ASSERT (b <= (~(mp_limb_t) 0) / 4); + + count_leading_zeros (cnt, b); + + b <<= cnt; + invert_limb (bi, b); + + cps[0] = bi; + cps[1] = cnt; + + B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt)); + ASSERT (B1modb <= b); /* NB: not fully reduced mod b */ + cps[2] = B1modb >> cnt; + + udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi); + cps[3] = B2modb >> cnt; + + udiv_rnnd_preinv (B3modb, B2modb, CNST_LIMB(0), b, bi); + cps[4] = B3modb >> cnt; + + udiv_rnnd_preinv (B4modb, B3modb, CNST_LIMB(0), b, bi); + cps[5] = B4modb >> cnt; + + udiv_rnnd_preinv (B5modb, B4modb, CNST_LIMB(0), b, bi); + cps[6] = B5modb >> cnt; + +#if WANT_ASSERT + { + int i; + b = cps[2]; + for (i = 3; i <= 6; i++) + { + b += cps[i]; + ASSERT (b >= cps[i]); + } + } +#endif +} + +mp_limb_t +mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[7]) +{ + mp_limb_t rh, rl, bi, ph, pl, ch, cl, r; + mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb; + mp_size_t i; + int cnt; + + ASSERT (n >= 1); + + B1modb = cps[2]; + B2modb = cps[3]; + B3modb = cps[4]; + B4modb = cps[5]; + B5modb = cps[6]; + + switch (n & 3) + { + case 0: + umul_ppmm (ph, pl, ap[n - 3], B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 4]); + umul_ppmm (ch, cl, ap[n - 2], B2modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + umul_ppmm (rh, rl, ap[n - 1], B3modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + n -= 4; + break; + case 1: + rh = 0; + rl = ap[n - 1]; + n -= 1; + break; + case 2: + rh = ap[n - 1]; + rl = ap[n - 2]; + n -= 2; + break; + case 3: + umul_ppmm (ph, pl, ap[n - 2], B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 3]); + umul_ppmm (rh, rl, ap[n - 1], B2modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + n -= 3; + break; + } + + for (i = n - 4; i >= 0; i -= 4) + { + /* rr = ap[i] < B + + ap[i+1] * (B mod b) <= (B-1)(b-1) + + ap[i+2] * (B^2 mod b) <= (B-1)(b-1) + + ap[i+3] * (B^3 mod b) <= (B-1)(b-1) + + LO(rr) * (B^4 mod b) <= (B-1)(b-1) + + HI(rr) * (B^5 mod b) <= (B-1)(b-1) + */ + umul_ppmm (ph, pl, ap[i + 1], B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i + 0]); + + umul_ppmm (ch, cl, ap[i + 2], B2modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + 
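+      /* The remaining terms follow the bound comment above: ap[i+3] is
+	 folded in with weight B^3, and the low and high halves of the
+	 previous residue with weights B^4 and B^5.  */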
+ umul_ppmm (ch, cl, ap[i + 3], B3modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm (ch, cl, rl, B4modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm (rh, rl, rh, B5modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + } + + umul_ppmm (rh, cl, rh, B1modb); + add_ssaaaa (rh, rl, rh, rl, CNST_LIMB(0), cl); + + cnt = cps[1]; + bi = cps[0]; + + r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt)); + udiv_rnnd_preinv (r, r, rl << cnt, b, bi); + + return r >> cnt; +} diff --git a/gmp-6.3.0/mpn/generic/mod_34lsub1.c b/gmp-6.3.0/mpn/generic/mod_34lsub1.c new file mode 100644 index 0000000..af9c6c6 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mod_34lsub1.c @@ -0,0 +1,128 @@ +/* mpn_mod_34lsub1 -- remainder modulo 2^(GMP_NUMB_BITS*3/4)-1. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2000-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + + +/* Calculate a remainder from {p,n} divided by 2^(GMP_NUMB_BITS*3/4)-1. + The remainder is not fully reduced, it's any limb value congruent to + {p,n} modulo that divisor. + + This implementation is only correct when GMP_NUMB_BITS is a multiple of + 4. + + FIXME: If GMP_NAIL_BITS is some silly big value during development then + it's possible the carry accumulators c0,c1,c2 could overflow. + + General notes: + + The basic idea is to use a set of N accumulators (N=3 in this case) to + effectively get a remainder mod 2^(GMP_NUMB_BITS*N)-1 followed at the end + by a reduction to GMP_NUMB_BITS*N/M bits (M=4 in this case) for a + remainder mod 2^(GMP_NUMB_BITS*N/M)-1. N and M are chosen to give a good + set of small prime factors in 2^(GMP_NUMB_BITS*N/M)-1. + + N=3 M=4 suits GMP_NUMB_BITS==32 and GMP_NUMB_BITS==64 quite well, giving + a few more primes than a single accumulator N=1 does, and for no extra + cost (assuming the processor has a decent number of registers). + + For strange nailified values of GMP_NUMB_BITS the idea would be to look + for what N and M give good primes. With GMP_NUMB_BITS not a power of 2 + the choices for M may be opened up a bit. But such things are probably + best done in separate code, not grafted on here. 
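+
+   As a concrete instance, for GMP_NUMB_BITS==64 the divisor is
+   2^48-1 = 3^2 * 5 * 7 * 13 * 17 * 97 * 241 * 257 * 673, which is
+   exactly the sort of factor-rich modulus the choice N=3, M=4 is after.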
*/ + +#if GMP_NUMB_BITS % 4 == 0 + +#define B1 (GMP_NUMB_BITS / 4) +#define B2 (B1 * 2) +#define B3 (B1 * 3) + +#define M1 ((CNST_LIMB(1) << B1) - 1) +#define M2 ((CNST_LIMB(1) << B2) - 1) +#define M3 ((CNST_LIMB(1) << B3) - 1) + +#define LOW0(n) ((n) & M3) +#define HIGH0(n) ((n) >> B3) + +#define LOW1(n) (((n) & M2) << B1) +#define HIGH1(n) ((n) >> B2) + +#define LOW2(n) (((n) & M1) << B2) +#define HIGH2(n) ((n) >> B1) + +#define PARTS0(n) (LOW0(n) + HIGH0(n)) +#define PARTS1(n) (LOW1(n) + HIGH1(n)) +#define PARTS2(n) (LOW2(n) + HIGH2(n)) + +#define ADD(c,a,val) \ + do { \ + mp_limb_t new_c; \ + ADDC_LIMB (new_c, a, a, val); \ + (c) += new_c; \ + } while (0) + +mp_limb_t +mpn_mod_34lsub1 (mp_srcptr p, mp_size_t n) +{ + mp_limb_t c0, c1, c2; + mp_limb_t a0, a1, a2; + + ASSERT (n >= 1); + ASSERT (n/3 < GMP_NUMB_MAX); + + a0 = a1 = a2 = 0; + c0 = c1 = c2 = 0; + + while ((n -= 3) >= 0) + { + ADD (c0, a0, p[0]); + ADD (c1, a1, p[1]); + ADD (c2, a2, p[2]); + p += 3; + } + + if (n != -3) + { + ADD (c0, a0, p[0]); + if (n != -2) + ADD (c1, a1, p[1]); + } + + return + PARTS0 (a0) + PARTS1 (a1) + PARTS2 (a2) + + PARTS1 (c0) + PARTS2 (c1) + PARTS0 (c2); +} + +#endif diff --git a/gmp-6.3.0/mpn/generic/mode1o.c b/gmp-6.3.0/mpn/generic/mode1o.c new file mode 100644 index 0000000..9ba0ae1 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mode1o.c @@ -0,0 +1,235 @@ +/* mpn_modexact_1c_odd -- mpn by limb exact division style remainder. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2000-2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* Calculate an r satisfying + + r*B^k + a - c == q*d + + where B=2^GMP_LIMB_BITS, a is {src,size}, k is either size or size-1 + (the caller won't know which), and q is the quotient (discarded). d must + be odd, c can be any limb value. + + If c=d then 0<=r<=d. + + This slightly strange function suits the initial Nx1 reduction for GCDs + or Jacobi symbols since the factors of 2 in B^k can be ignored, leaving + -r == a mod d (by passing c=0). For a GCD the factor of -1 on r can be + ignored, or for the Jacobi symbol it can be accounted for. The function + also suits divisibility and congruence testing since if r=0 (or r=d) is + obtained then a==c mod d. + + + r is a bit like the remainder returned by mpn_divexact_by3c, and is the + sort of remainder mpn_divexact_1 might return. 
Like mpn_divexact_by3c, r
+   represents a borrow, since effectively quotient limbs are chosen so that
+   subtracting that multiple of d from src at each step will produce a zero
+   limb.
+
+   A long calculation can be done piece by piece from low to high by passing
+   the return value from one part as the carry parameter to the next part.
+   The effective final k becomes anything between size and size-n, if n
+   pieces are used.
+
+
+   A similar sort of routine could be constructed based on adding multiples
+   of d at each limb, much like redc in mpz_powm does.  Subtracting however
+   has a small advantage that when subtracting to cancel out l there's never
+   a borrow into h, whereas using an addition would put a carry into h
+   depending whether l==0 or l!=0.
+
+
+   In terms of efficiency, this function is similar to a mul-by-inverse
+   mpn_mod_1.  Both are essentially two multiplies and are best suited to
+   CPUs with low latency multipliers (in comparison to a divide instruction
+   at least.)  But modexact has a few less supplementary operations, only
+   needs low part and high part multiplies, and has fewer working quantities
+   (helping CPUs with few registers).
+
+
+   In the main loop it will be noted that the new carry (call it r) is the
+   sum of the high product h and any borrow from l=s-c.  If c<d then we will
+   have r<d too, for the following reasons.  Let q=l*inverse be the quotient
+   limb, so that q*d = B*h + l, where B=2^GMP_LIMB_BITS.  Now if h=d-1 then
+
+       l = q*d - B*(d-1) <= (B-1)*d - B*(d-1) = B-d
+
+   But if l=s-c produces a borrow when c<d, then l>=B-d+1 and hence will
+   never have h=d-1 and so r=h+borrow <= d-1.
+
+   When c>=d, on the other hand, h=d-1 can certainly occur together with a
+   borrow, thereby giving only r<=d, as per the function definition above.
+
+   As a design decision it's left to the caller to check for r=d if it might
+   be passing c>=d.  Several applications have c<d initially, and so don't
+   need such a check.  */
+
+
+mp_limb_t
+mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
+		     mp_limb_t orig_c)
+{
+  mp_limb_t  s, h, l, inverse, dummy, dmul, ret;
+  mp_limb_t  c = orig_c;
+  mp_size_t  i;
+
+  ASSERT (size >= 1);
+  ASSERT (d & 1);
+  ASSERT_MPN (src, size);
+  ASSERT_LIMB (d);
+  ASSERT_LIMB (c);
+
+  if (size == 1)
+    {
+      s = src[0];
+      if (s > c)
+	{
+	  l = s-c;
+	  h = l % d;
+	  if (h != 0)
+	    h = d - h;
+	}
+      else
+	{
+	  l = c-s;
+	  h = l % d;
+	}
+      return h;
+    }
+
+
+  binvert_limb (inverse, d);
+  dmul = d << GMP_NAIL_BITS;
+
+  i = 0;
+  do
+    {
+      s = src[i];
+      SUBC_LIMB (c, l, s, c);
+      l = (l * inverse) & GMP_NUMB_MASK;
+      umul_ppmm (h, dummy, l, dmul);
+      c += h;
+    }
+  while (++i < size-1);
+
+
+  s = src[i];
+  if (s <= d)
+    {
+      /* With high<=d the final step can be a subtract and addback.  If c==0
+	 then the addback will restore to l>=0.  If c==d then will get l==d
+	 if s==0, but that's ok per the function definition.  */
+
+      l = c - s;
+      if (c < s)
+	l += d;
+
+      ret = l;
+    }
+  else
+    {
+      /* Can't skip a divide, just do the loop code once more. */
+
+      SUBC_LIMB (c, l, s, c);
+      l = (l * inverse) & GMP_NUMB_MASK;
+      umul_ppmm (h, dummy, l, dmul);
+      c += h;
+      ret = c;
+    }
+
+  ASSERT (orig_c < d ? ret < d : ret <= d);
+  return ret;
+}
+
+
+
+#if 0
+
+/* The following is an alternate form that might shave one cycle on a
+   superscalar processor since it takes c+=h off the dependent chain,
+   leaving just a low product, high product, and a subtract.
+
+   This is for CPU specific implementations to consider.  A special case
+   for highs) could become
+   c=(x==0xFF..FF) too, if that helped.
*/ + +mp_limb_t +mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h) +{ + mp_limb_t s, x, y, inverse, dummy, dmul, c1, c2; + mp_limb_t c = 0; + mp_size_t i; + + ASSERT (size >= 1); + ASSERT (d & 1); + + binvert_limb (inverse, d); + dmul = d << GMP_NAIL_BITS; + + for (i = 0; i < size; i++) + { + ASSERT (c==0 || c==1); + + s = src[i]; + SUBC_LIMB (c1, x, s, c); + + SUBC_LIMB (c2, y, x, h); + c = c1 + c2; + + y = (y * inverse) & GMP_NUMB_MASK; + umul_ppmm (h, dummy, y, dmul); + } + + h += c; + return h; +} + +#endif diff --git a/gmp-6.3.0/mpn/generic/mu_bdiv_q.c b/gmp-6.3.0/mpn/generic/mu_bdiv_q.c new file mode 100644 index 0000000..0ef3bd8 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mu_bdiv_q.c @@ -0,0 +1,281 @@ +/* mpn_mu_bdiv_q(qp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^nn. + storing the result in {qp,nn}. Overlap allowed between Q and N; all other + overlap disallowed. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2005-2007, 2009, 2010, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +/* + The idea of the algorithm used herein is to compute a smaller inverted value + than used in the standard Barrett algorithm, and thus save time in the + Newton iterations, and pay just a small price when using the inverted value + for developing quotient bits. This algorithm was presented at ICMS 2006. +*/ + +#include "gmp-impl.h" + + +/* N = {np,nn} + D = {dp,dn} + + Requirements: N >= D + D >= 1 + D odd + dn >= 2 + nn >= 2 + scratch space as determined by mpn_mu_bdiv_q_itch(nn,dn). + + Write quotient to Q = {qp,nn}. + + FIXME: When iterating, perhaps do the small step before loop, not after. + FIXME: Try to avoid the scalar divisions when computing inverse size. + FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible. In + particular, when dn==in, tp and rp could use the same space. + FIXME: Trim final quotient calculation to qn limbs of precision. 
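+
+   Note, as a worked instance of the block partition used for qn > dn
+   below: nn = 100, dn = 30 gives qn = nn = 100, b = ceil(100/30) = 4
+   blocks, and in = ceil(100/4) = 25 limbs per block, with in <= dn as
+   the partition intends.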
+*/ +static void +mpn_mu_bdiv_q_old (mp_ptr qp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_ptr scratch) +{ + mp_size_t qn; + mp_size_t in; + int cy, c0; + mp_size_t tn, wn; + + qn = nn; + + ASSERT (dn >= 2); + ASSERT (qn >= 2); + + if (qn > dn) + { + mp_size_t b; + + /* |_______________________| dividend + |________| divisor */ + +#define ip scratch /* in */ +#define rp (scratch + in) /* dn or rest >= binvert_itch(in) */ +#define tp (scratch + in + dn) /* dn+in or next_size(dn) */ +#define scratch_out (scratch + in + dn + tn) /* mulmod_bnm1_itch(next_size(dn)) */ + + /* Compute an inverse size that is a nice partition of the quotient. */ + b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ + in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */ + + /* Some notes on allocation: + + When in = dn, R dies when mpn_mullo returns, if in < dn the low in + limbs of R dies at that point. We could save memory by letting T live + just under R, and let the upper part of T expand into R. These changes + should reduce itch to perhaps 3dn. + */ + + mpn_binvert (ip, dp, in, rp); + + cy = 0; + + MPN_COPY (rp, np, dn); + np += dn; + mpn_mullo_n (qp, rp, ip, in); + qn -= in; + + while (qn > in) + { + if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + mpn_mul (tp, dp, dn, qp, in); /* mulhi, need tp[dn+in-1...in] */ + else + { + tn = mpn_mulmod_bnm1_next_size (dn); + mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out); + wn = dn + in - tn; /* number of wrapped limbs */ + if (wn > 0) + { + c0 = mpn_sub_n (tp + tn, tp, rp, wn); + mpn_decr_u (tp + wn, c0); + } + } + + qp += in; + if (dn != in) + { + /* Subtract tp[dn-1...in] from partial remainder. */ + cy += mpn_sub_n (rp, rp + in, tp + in, dn - in); + if (cy == 2) + { + mpn_incr_u (tp + dn, 1); + cy = 1; + } + } + /* Subtract tp[dn+in-1...dn] from dividend. */ + cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy); + np += in; + mpn_mullo_n (qp, rp, ip, in); + qn -= in; + } + + /* Generate last qn limbs. + FIXME: It should be possible to limit precision here, since qn is + typically somewhat smaller than dn. No big gains expected. */ + + if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + mpn_mul (tp, dp, dn, qp, in); /* mulhi, need tp[qn+in-1...in] */ + else + { + tn = mpn_mulmod_bnm1_next_size (dn); + mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out); + wn = dn + in - tn; /* number of wrapped limbs */ + if (wn > 0) + { + c0 = mpn_sub_n (tp + tn, tp, rp, wn); + mpn_decr_u (tp + wn, c0); + } + } + + qp += in; + if (dn != in) + { + cy += mpn_sub_n (rp, rp + in, tp + in, dn - in); + if (cy == 2) + { + mpn_incr_u (tp + dn, 1); + cy = 1; + } + } + + mpn_sub_nc (rp + dn - in, np, tp + dn, qn - (dn - in), cy); + mpn_mullo_n (qp, rp, ip, qn); + +#undef ip +#undef rp +#undef tp +#undef scratch_out + } + else + { + /* |_______________________| dividend + |________________| divisor */ + +#define ip scratch /* in */ +#define tp (scratch + in) /* qn+in or next_size(qn) or rest >= binvert_itch(in) */ +#define scratch_out (scratch + in + tn)/* mulmod_bnm1_itch(next_size(qn)) */ + + /* Compute half-sized inverse. 
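+	 Here in = qn - (qn >> 1) = ceil(qn/2), so the quotient below is
+	 produced in exactly two mullo passes: the low `in' limbs first,
+	 then the remaining qn - in.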
*/ + in = qn - (qn >> 1); + + mpn_binvert (ip, dp, in, tp); + + mpn_mullo_n (qp, np, ip, in); /* low `in' quotient limbs */ + + if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + mpn_mul (tp, dp, qn, qp, in); /* mulhigh */ + else + { + tn = mpn_mulmod_bnm1_next_size (qn); + mpn_mulmod_bnm1 (tp, tn, dp, qn, qp, in, scratch_out); + wn = qn + in - tn; /* number of wrapped limbs */ + if (wn > 0) + { + c0 = mpn_cmp (tp, np, wn) < 0; + mpn_decr_u (tp + wn, c0); + } + } + + mpn_sub_n (tp, np + in, tp + in, qn - in); + mpn_mullo_n (qp + in, tp, ip, qn - in); /* high qn-in quotient limbs */ + +#undef ip +#undef tp +#undef scratch_out + } +} + +void +mpn_mu_bdiv_q (mp_ptr qp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_ptr scratch) +{ + mpn_mu_bdiv_q_old (qp, np, nn, dp, dn, scratch); + mpn_neg (qp, qp, nn); +} + +mp_size_t +mpn_mu_bdiv_q_itch (mp_size_t nn, mp_size_t dn) +{ + mp_size_t qn, in, tn, itch_binvert, itch_out, itches; + mp_size_t b; + + ASSERT_ALWAYS (DC_BDIV_Q_THRESHOLD < MU_BDIV_Q_THRESHOLD); + + qn = nn; + + if (qn > dn) + { + b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ + in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */ + if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + { + tn = dn + in; + itch_out = 0; + } + else + { + tn = mpn_mulmod_bnm1_next_size (dn); + itch_out = mpn_mulmod_bnm1_itch (tn, dn, in); + } + itches = dn + tn + itch_out; + } + else + { + in = qn - (qn >> 1); + if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + { + tn = qn + in; + itch_out = 0; + } + else + { + tn = mpn_mulmod_bnm1_next_size (qn); + itch_out = mpn_mulmod_bnm1_itch (tn, qn, in); + } + itches = tn + itch_out; + } + + itch_binvert = mpn_binvert_itch (in); + return in + MAX (itches, itch_binvert); +} diff --git a/gmp-6.3.0/mpn/generic/mu_bdiv_qr.c b/gmp-6.3.0/mpn/generic/mu_bdiv_qr.c new file mode 100644 index 0000000..540ad73 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mu_bdiv_qr.c @@ -0,0 +1,312 @@ +/* mpn_mu_bdiv_qr(qp,rp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^qn, + where qn = nn-dn, storing the result in {qp,qn}. Overlap allowed between Q + and N; all other overlap disallowed. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2005-2007, 2009, 2010, 2012, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +/* + The idea of the algorithm used herein is to compute a smaller inverted value + than used in the standard Barrett algorithm, and thus save time in the + Newton iterations, and pay just a small price when using the inverted value + for developing quotient bits. This algorithm was presented at ICMS 2006. +*/ + +#include "gmp-impl.h" + + +/* N = {np,nn} + D = {dp,dn} + + Requirements: N >= D + D >= 1 + D odd + dn >= 2 + nn >= 2 + scratch space as determined by mpn_mu_bdiv_qr_itch(nn,dn). + + Write quotient to Q = {qp,nn-dn}. + + FIXME: When iterating, perhaps do the small step before loop, not after. + FIXME: Try to avoid the scalar divisions when computing inverse size. + FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible. In + particular, when dn==in, tp and rp could use the same space. +*/ +static mp_limb_t +mpn_mu_bdiv_qr_old (mp_ptr qp, + mp_ptr rp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_ptr scratch) +{ + mp_size_t qn; + mp_size_t in; + mp_limb_t cy, c0; + mp_size_t tn, wn; + + qn = nn - dn; + + ASSERT (dn >= 2); + ASSERT (qn >= 2); + + if (qn > dn) + { + mp_size_t b; + + /* |_______________________| dividend + |________| divisor */ + +#define ip scratch /* in */ +#define tp (scratch + in) /* dn+in or next_size(dn) or rest >= binvert_itch(in) */ +#define scratch_out (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */ + + /* Compute an inverse size that is a nice partition of the quotient. */ + b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ + in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */ + + /* Some notes on allocation: + + When in = dn, R dies when mpn_mullo returns, if in < dn the low in + limbs of R dies at that point. We could save memory by letting T live + just under R, and let the upper part of T expand into R. These changes + should reduce itch to perhaps 3dn. + */ + + mpn_binvert (ip, dp, in, tp); + + MPN_COPY (rp, np, dn); + np += dn; + cy = 0; + + while (qn > in) + { + mpn_mullo_n (qp, rp, ip, in); + + if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + mpn_mul (tp, dp, dn, qp, in); /* mulhi, need tp[dn+in-1...in] */ + else + { + tn = mpn_mulmod_bnm1_next_size (dn); + mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out); + wn = dn + in - tn; /* number of wrapped limbs */ + if (wn > 0) + { + c0 = mpn_sub_n (tp + tn, tp, rp, wn); + mpn_decr_u (tp + wn, c0); + } + } + + qp += in; + qn -= in; + + if (dn != in) + { + /* Subtract tp[dn-1...in] from partial remainder. */ + cy += mpn_sub_n (rp, rp + in, tp + in, dn - in); + if (cy == 2) + { + mpn_incr_u (tp + dn, 1); + cy = 1; + } + } + /* Subtract tp[dn+in-1...dn] from dividend. */ + cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy); + np += in; + } + + /* Generate last qn limbs. 
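+	 The loop above ran while qn > in, so 0 < qn <= in here and a
+	 single mullo of qn limbs plus one more product-and-subtract pass
+	 finishes the quotient.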
*/ + mpn_mullo_n (qp, rp, ip, qn); + + if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + mpn_mul (tp, dp, dn, qp, qn); /* mulhi, need tp[qn+in-1...in] */ + else + { + tn = mpn_mulmod_bnm1_next_size (dn); + mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out); + wn = dn + qn - tn; /* number of wrapped limbs */ + if (wn > 0) + { + c0 = mpn_sub_n (tp + tn, tp, rp, wn); + mpn_decr_u (tp + wn, c0); + } + } + + if (dn != qn) + { + cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn); + if (cy == 2) + { + mpn_incr_u (tp + dn, 1); + cy = 1; + } + } + return mpn_sub_nc (rp + dn - qn, np, tp + dn, qn, cy); + +#undef ip +#undef tp +#undef scratch_out + } + else + { + /* |_______________________| dividend + |________________| divisor */ + +#define ip scratch /* in */ +#define tp (scratch + in) /* dn+in or next_size(dn) or rest >= binvert_itch(in) */ +#define scratch_out (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */ + + /* Compute half-sized inverse. */ + in = qn - (qn >> 1); + + mpn_binvert (ip, dp, in, tp); + + mpn_mullo_n (qp, np, ip, in); /* low `in' quotient limbs */ + + if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + mpn_mul (tp, dp, dn, qp, in); /* mulhigh */ + else + { + tn = mpn_mulmod_bnm1_next_size (dn); + mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out); + wn = dn + in - tn; /* number of wrapped limbs */ + if (wn > 0) + { + c0 = mpn_sub_n (tp + tn, tp, np, wn); + mpn_decr_u (tp + wn, c0); + } + } + + qp += in; + qn -= in; + + cy = mpn_sub_n (rp, np + in, tp + in, dn); + mpn_mullo_n (qp, rp, ip, qn); /* high qn quotient limbs */ + + if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + mpn_mul (tp, dp, dn, qp, qn); /* mulhigh */ + else + { + tn = mpn_mulmod_bnm1_next_size (dn); + mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out); + wn = dn + qn - tn; /* number of wrapped limbs */ + if (wn > 0) + { + c0 = mpn_sub_n (tp + tn, tp, rp, wn); + mpn_decr_u (tp + wn, c0); + } + } + + cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn); + if (cy == 2) + { + mpn_incr_u (tp + dn, 1); + cy = 1; + } + return mpn_sub_nc (rp + dn - qn, np + dn + in, tp + dn, qn, cy); + +#undef ip +#undef tp +#undef scratch_out + } +} + +mp_limb_t +mpn_mu_bdiv_qr (mp_ptr qp, + mp_ptr rp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_ptr scratch) +{ + mp_limb_t cy = mpn_mu_bdiv_qr_old (qp, rp, np, nn, dp, dn, scratch); + + /* R' B^{qn} = U - Q' D + * + * Q = B^{qn} - Q' (assuming Q' != 0) + * + * R B^{qn} = U + Q D = U + B^{qn} D - Q' D + * = B^{qn} D + R' + */ + + if (UNLIKELY (!mpn_neg (qp, qp, nn - dn))) + { + /* Zero quotient. 
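+	 Q' == 0 is the one case where the identity Q = B^{qn} - Q' above
+	 does not apply; here Q = 0, the remainder is R' unchanged, and no
+	 addback of D is wanted.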
*/ + ASSERT (cy == 0); + return 0; + } + else + { + mp_limb_t cy2 = mpn_add_n (rp, rp, dp, dn); + ASSERT (cy2 >= cy); + + return cy2 - cy; + } +} + + +mp_size_t +mpn_mu_bdiv_qr_itch (mp_size_t nn, mp_size_t dn) +{ + mp_size_t qn, in, tn, itch_binvert, itch_out, itches; + mp_size_t b; + + ASSERT_ALWAYS (DC_BDIV_Q_THRESHOLD < MU_BDIV_Q_THRESHOLD); + + qn = nn - dn; + + if (qn > dn) + { + b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ + in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */ + } + else + { + in = qn - (qn >> 1); + } + + if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + { + tn = dn + in; + itch_out = 0; + } + else + { + tn = mpn_mulmod_bnm1_next_size (dn); + itch_out = mpn_mulmod_bnm1_itch (tn, dn, in); + } + + itch_binvert = mpn_binvert_itch (in); + itches = tn + itch_out; + return in + MAX (itches, itch_binvert); +} diff --git a/gmp-6.3.0/mpn/generic/mu_div_q.c b/gmp-6.3.0/mpn/generic/mu_div_q.c new file mode 100644 index 0000000..44cfb40 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mu_div_q.c @@ -0,0 +1,184 @@ +/* mpn_mu_div_q. + + Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2005-2007, 2009, 2010, 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +/* + The idea of the algorithm used herein is to compute a smaller inverted value + than used in the standard Barrett algorithm, and thus save time in the + Newton iterations, and pay just a small price when using the inverted value + for developing quotient bits. This algorithm was presented at ICMS 2006. +*/ + +/* + Things to work on: + + 1. This is a rudimentary implementation of mpn_mu_div_q. The algorithm is + probably close to optimal, except when mpn_mu_divappr_q fails. + + 2. We used to fall back to mpn_mu_div_qr when we detect a possible + mpn_mu_divappr_q rounding problem, now we multiply and compare. + Unfortunately, since mpn_mu_divappr_q does not return the partial + remainder, this also doesn't become optimal. A mpn_mu_divappr_qr could + solve that. + + 3. The allocations done here should be made from the scratch area, which + then would need to be amended. 
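+
+   A note on the check against the +4 error bound used below: the extra
+   low limb of the approximate quotient acts as a guard limb, so when it
+   exceeds 4 the error cannot reach the limbs that are returned; only
+   otherwise is the quotient multiplied back and compared against N, and
+   it can then overshoot by at most one.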
+*/
+
+#include <stdlib.h>		/* for NULL */
+#include "gmp-impl.h"
+
+
+mp_limb_t
+mpn_mu_div_q (mp_ptr qp,
+	      mp_srcptr np, mp_size_t nn,
+	      mp_srcptr dp, mp_size_t dn,
+	      mp_ptr scratch)
+{
+  mp_ptr tp, rp;
+  mp_size_t qn;
+  mp_limb_t cy, qh;
+  TMP_DECL;
+
+  TMP_MARK;
+
+  qn = nn - dn;
+
+  tp = TMP_BALLOC_LIMBS (qn + 1);
+
+  if (qn >= dn)			/* nn >= 2*dn + 1 */
+    {
+       /* |_______________________|   dividend
+			 |________|   divisor  */
+
+      rp = TMP_BALLOC_LIMBS (nn + 1);
+      MPN_COPY (rp + 1, np, nn);
+      rp[0] = 0;
+
+      qh = mpn_cmp (rp + 1 + nn - dn, dp, dn) >= 0;
+      if (qh != 0)
+	mpn_sub_n (rp + 1 + nn - dn, rp + 1 + nn - dn, dp, dn);
+
+      cy = mpn_mu_divappr_q (tp, rp, nn + 1, dp, dn, scratch);
+
+      if (UNLIKELY (cy != 0))
+	{
+	  /* Since the partial remainder fed to mpn_preinv_mu_divappr_q was
+	     canonically reduced, replace the returned value of B^(qn-dn)+eps
+	     by the largest possible value. */
+	  mp_size_t i;
+	  for (i = 0; i < qn + 1; i++)
+	    tp[i] = GMP_NUMB_MAX;
+	}
+
+      /* The max error of mpn_mu_divappr_q is +4.  If the low quotient limb is
+	 smaller than the max error, we cannot trust the quotient. */
+      if (tp[0] > 4)
+	{
+	  MPN_COPY (qp, tp + 1, qn);
+	}
+      else
+	{
+	  mp_limb_t cy;
+	  mp_ptr pp;
+
+	  pp = rp;
+	  mpn_mul (pp, tp + 1, qn, dp, dn);
+
+	  cy = (qh != 0) ? mpn_add_n (pp + qn, pp + qn, dp, dn) : 0;
+
+	  if (cy || mpn_cmp (pp, np, nn) > 0) /* At most is wrong by one, no cycle. */
+	    qh -= mpn_sub_1 (qp, tp + 1, qn, 1);
+	  else /* Same as above */
+	    MPN_COPY (qp, tp + 1, qn);
+	}
+    }
+  else
+    {
+       /* |_______________________|   dividend
+		 |________________|   divisor  */
+
+      /* FIXME: When nn = 2dn-1, qn becomes dn-1, and the numerator size passed
+	 here becomes 2dn, i.e., more than nn.  This shouldn't hurt, since only
+	 the most significant dn-1 limbs will actually be read, but it is not
+	 pretty. */
+
+      qh = mpn_mu_divappr_q (tp, np + nn - (2 * qn + 2), 2 * qn + 2,
+			     dp + dn - (qn + 1), qn + 1, scratch);
+
+      /* The max error of mpn_mu_divappr_q is +4, but we get an additional
+	 error from the divisor truncation. */
+      if (tp[0] > 6)
+	{
+	  MPN_COPY (qp, tp + 1, qn);
+	}
+      else
+	{
+	  mp_limb_t cy;
+
+	  /* FIXME: a shorter product should be enough; we may use already
+	     allocated space... */
+	  rp = TMP_BALLOC_LIMBS (nn);
+	  mpn_mul (rp, dp, dn, tp + 1, qn);
+
+	  cy = (qh != 0) ? mpn_add_n (rp + qn, rp + qn, dp, dn) : 0;
+
+	  if (cy || mpn_cmp (rp, np, nn) > 0) /* At most is wrong by one, no cycle. */
+	    qh -= mpn_sub_1 (qp, tp + 1, qn, 1);
+	  else /* Same as above */
+	    MPN_COPY (qp, tp + 1, qn);
+	}
+    }
+
+  TMP_FREE;
+  return qh;
+}
+
+mp_size_t
+mpn_mu_div_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
+{
+  mp_size_t qn;
+
+  qn = nn - dn;
+  if (qn >= dn)
+    {
+      return mpn_mu_divappr_q_itch (nn + 1, dn, mua_k);
+    }
+  else
+    {
+      return mpn_mu_divappr_q_itch (2 * qn + 2, qn + 1, mua_k);
+    }
+}
diff --git a/gmp-6.3.0/mpn/generic/mu_div_qr.c b/gmp-6.3.0/mpn/generic/mu_div_qr.c
new file mode 100644
index 0000000..8b9c702
--- /dev/null
+++ b/gmp-6.3.0/mpn/generic/mu_div_qr.c
@@ -0,0 +1,417 @@
+/* mpn_mu_div_qr, mpn_preinv_mu_div_qr.
+
+   Compute Q = floor(N / D) and R = N-QD.  N is nn limbs and D is dn limbs and
+   must be normalized, and Q must be nn-dn limbs.  The requirement that Q is
+   nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us to
+   let N be unmodified during the operation.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+/*
+   The idea of the algorithm used herein is to compute a smaller inverted value
+   than used in the standard Barrett algorithm, and thus save time in the
+   Newton iterations, and pay just a small price when using the inverted value
+   for developing quotient bits.  This algorithm was presented at ICMS 2006.
+*/
+
+/* CAUTION: This code and the code in mu_divappr_q.c should be edited in sync.
+
+ Things to work on:
+
+  * This isn't optimal when the quotient isn't needed, as it might take a lot
+    of space.  The computation is always needed, though, so there is no time to
+    save with special code.
+
+  * The itch/scratch scheme isn't perhaps such a good idea as it once seemed,
+    demonstrated by the fact that the mpn_invertappr function's scratch needs
+    mean that we need to keep a large allocation long after it is needed.
+    Things are worse as mpn_mul_fft does not accept any scratch parameter,
+    which means we'll have a large memory hole while in mpn_mul_fft.  In
+    general, a peak scratch need in the beginning of a function isn't
+    well-handled by the itch/scratch scheme.
+*/
+
+#ifdef STAT
+#undef STAT
+#define STAT(x) x
+#else
+#define STAT(x)
+#endif
+
+#include <stdlib.h>		/* for NULL */
+#include "gmp-impl.h"
+
+
+/* FIXME: The MU_DIV_QR_SKEW_THRESHOLD was not analysed properly.  It gives a
+   speedup according to old measurements, but does the decision mechanism
+   really make sense?  It seems like the ratio between dn and qn might be
+   what we really should be checking.  */
+#ifndef MU_DIV_QR_SKEW_THRESHOLD
+#define MU_DIV_QR_SKEW_THRESHOLD 100
+#endif
+
+#ifdef CHECK				/* FIXME: Enable in minithres */
+#undef  MU_DIV_QR_SKEW_THRESHOLD
+#define MU_DIV_QR_SKEW_THRESHOLD 1
+#endif
+
+
+static mp_limb_t mpn_mu_div_qr2 (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr);
+static mp_size_t mpn_mu_div_qr_choose_in (mp_size_t, mp_size_t, int);
+
+
+mp_limb_t
+mpn_mu_div_qr (mp_ptr qp,
+	       mp_ptr rp,
+	       mp_srcptr np,
+	       mp_size_t nn,
+	       mp_srcptr dp,
+	       mp_size_t dn,
+	       mp_ptr scratch)
+{
+  mp_size_t qn;
+  mp_limb_t cy, qh;
+
+  qn = nn - dn;
+  if (qn + MU_DIV_QR_SKEW_THRESHOLD < dn)
+    {
+      /* |______________|_ign_first__|   dividend                       nn
+	       |_______|_ign_first__|   divisor                         dn
+
+	       |______|  quotient (prel)                                qn
+
+		|___________________|  quotient * ignored-divisor-part  dn-1
+      */
+
+      /* Compute a preliminary quotient and a partial remainder by dividing the
+	 most significant limbs of each operand.
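+	 The most significant 2*qn+1 limbs of N are divided by the most
+	 significant qn+1 limbs of D, so the preliminary quotient has all
+	 qn limbs but ignores the dn-(qn+1) low divisor limbs; the
+	 product-and-subtract below corrects for them.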
*/ + qh = mpn_mu_div_qr2 (qp, rp + nn - (2 * qn + 1), + np + nn - (2 * qn + 1), 2 * qn + 1, + dp + dn - (qn + 1), qn + 1, + scratch); + + /* Multiply the quotient by the divisor limbs ignored above. */ + if (dn - (qn + 1) > qn) + mpn_mul (scratch, dp, dn - (qn + 1), qp, qn); /* prod is dn-1 limbs */ + else + mpn_mul (scratch, qp, qn, dp, dn - (qn + 1)); /* prod is dn-1 limbs */ + + if (qh) + cy = mpn_add_n (scratch + qn, scratch + qn, dp, dn - (qn + 1)); + else + cy = 0; + scratch[dn - 1] = cy; + + cy = mpn_sub_n (rp, np, scratch, nn - (2 * qn + 1)); + cy = mpn_sub_nc (rp + nn - (2 * qn + 1), + rp + nn - (2 * qn + 1), + scratch + nn - (2 * qn + 1), + qn + 1, cy); + if (cy) + { + qh -= mpn_sub_1 (qp, qp, qn, 1); + mpn_add_n (rp, rp, dp, dn); + } + } + else + { + qh = mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch); + } + + return qh; +} + +static mp_limb_t +mpn_mu_div_qr2 (mp_ptr qp, + mp_ptr rp, + mp_srcptr np, + mp_size_t nn, + mp_srcptr dp, + mp_size_t dn, + mp_ptr scratch) +{ + mp_size_t qn, in; + mp_limb_t cy, qh; + mp_ptr ip, tp; + + ASSERT (dn > 1); + + qn = nn - dn; + + /* Compute the inverse size. */ + in = mpn_mu_div_qr_choose_in (qn, dn, 0); + ASSERT (in <= dn); + +#if 1 + /* This alternative inverse computation method gets slightly more accurate + results. FIXMEs: (1) Temp allocation needs not analysed (2) itch function + not adapted (3) mpn_invertappr scratch needs not met. */ + ip = scratch; + tp = scratch + in + 1; + + /* compute an approximate inverse on (in+1) limbs */ + if (dn == in) + { + MPN_COPY (tp + 1, dp, in); + tp[0] = 1; + mpn_invertappr (ip, tp, in + 1, tp + in + 1); + MPN_COPY_INCR (ip, ip + 1, in); + } + else + { + cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1); + if (UNLIKELY (cy != 0)) + MPN_ZERO (ip, in); + else + { + mpn_invertappr (ip, tp, in + 1, tp + in + 1); + MPN_COPY_INCR (ip, ip + 1, in); + } + } +#else + /* This older inverse computation method gets slightly worse results than the + one above. */ + ip = scratch; + tp = scratch + in; + + /* Compute inverse of D to in+1 limbs, then round to 'in' limbs. Ideally the + inversion function should do this automatically. */ + if (dn == in) + { + tp[in + 1] = 0; + MPN_COPY (tp + in + 2, dp, in); + mpn_invertappr (tp, tp + in + 1, in + 1, NULL); + } + else + { + mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL); + } + cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT); + if (UNLIKELY (cy != 0)) + MPN_ZERO (tp + 1, in); + MPN_COPY (ip, tp + 1, in); +#endif + + qh = mpn_preinv_mu_div_qr (qp, rp, np, nn, dp, dn, ip, in, scratch + in); + + return qh; +} + +mp_limb_t +mpn_preinv_mu_div_qr (mp_ptr qp, + mp_ptr rp, + mp_srcptr np, + mp_size_t nn, + mp_srcptr dp, + mp_size_t dn, + mp_srcptr ip, + mp_size_t in, + mp_ptr scratch) +{ + mp_size_t qn; + mp_limb_t cy, cx, qh; + mp_limb_t r; + mp_size_t tn, wn; + +#define tp scratch +#define scratch_out (scratch + tn) + + qn = nn - dn; + + np += qn; + qp += qn; + + qh = mpn_cmp (np, dp, dn) >= 0; + if (qh != 0) + mpn_sub_n (rp, np, dp, dn); + else + MPN_COPY_INCR (rp, np, dn); + + /* if (qn == 0) */ /* The while below handles this case */ + /* return qh; */ /* Degenerate use. Should we allow this? */ + + while (qn > 0) + { + if (qn < in) + { + ip += in - qn; + in = qn; + } + np -= in; + qp -= in; + + /* Compute the next block of quotient limbs by multiplying the inverse I + by the upper part of the partial remainder R. 
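+	 The stored inverse I has an implicit most significant limb of 1,
+	 so the quotient block is the high product plus the high part of
+	 R itself, which is what the add just below accounts for.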
*/ + mpn_mul_n (tp, rp + dn - in, ip, in); /* mulhi */ + cy = mpn_add_n (qp, tp + in, rp + dn - in, in); /* I's msb implicit */ + ASSERT_ALWAYS (cy == 0); + + qn -= in; + + /* Compute the product of the quotient block and the divisor D, to be + subtracted from the partial remainder combined with new limbs from the + dividend N. We only really need the low dn+1 limbs. */ + + if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + mpn_mul (tp, dp, dn, qp, in); /* dn+in limbs, high 'in' cancels */ + else + { + tn = mpn_mulmod_bnm1_next_size (dn + 1); + mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out); + wn = dn + in - tn; /* number of wrapped limbs */ + if (wn > 0) + { + cy = mpn_sub_n (tp, tp, rp + dn - wn, wn); + cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy); + cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0; + ASSERT_ALWAYS (cx >= cy); + mpn_incr_u (tp, cx - cy); + } + } + + r = rp[dn - in] - tp[dn]; + + /* Subtract the product from the partial remainder combined with new + limbs from the dividend N, generating a new partial remainder R. */ + if (dn != in) + { + cy = mpn_sub_n (tp, np, tp, in); /* get next 'in' limbs from N */ + cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy); + MPN_COPY (rp, tp, dn); /* FIXME: try to avoid this */ + } + else + { + cy = mpn_sub_n (rp, np, tp, in); /* get next 'in' limbs from N */ + } + + STAT (int i; int err = 0; + static int errarr[5]; static int err_rec; static int tot); + + /* Check the remainder R and adjust the quotient as needed. */ + r -= cy; + while (r != 0) + { + /* We loop 0 times with about 69% probability, 1 time with about 31% + probability, 2 times with about 0.6% probability, if inverse is + computed as recommended. */ + mpn_incr_u (qp, 1); + cy = mpn_sub_n (rp, rp, dp, dn); + r -= cy; + STAT (err++); + } + if (mpn_cmp (rp, dp, dn) >= 0) + { + /* This is executed with about 76% probability. */ + mpn_incr_u (qp, 1); + cy = mpn_sub_n (rp, rp, dp, dn); + STAT (err++); + } + + STAT ( + tot++; + errarr[err]++; + if (err > err_rec) + err_rec = err; + if (tot % 0x10000 == 0) + { + for (i = 0; i <= err_rec; i++) + printf (" %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot); + printf ("\n"); + } + ); + } + + return qh; +} + +/* In case k=0 (automatic choice), we distinguish 3 cases: + (a) dn < qn: in = ceil(qn / ceil(qn/dn)) + (b) dn/3 < qn <= dn: in = ceil(qn / 2) + (c) qn < dn/3: in = qn + In all cases we have in <= dn. + */ +static mp_size_t +mpn_mu_div_qr_choose_in (mp_size_t qn, mp_size_t dn, int k) +{ + mp_size_t in; + + if (k == 0) + { + mp_size_t b; + if (qn > dn) + { + /* Compute an inverse size that is a nice partition of the quotient. 
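+	     Both divisions below are ceilings, written with the
+	     ceil(x/y) = (x-1)/y + 1 identity for x > 0: b = ceil(qn/dn)
+	     blocks, then in = ceil(qn/b) limbs per block, which also
+	     guarantees in <= dn.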
*/ + b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ + in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */ + } + else if (3 * qn > dn) + { + in = (qn - 1) / 2 + 1; /* b = 2 */ + } + else + { + in = (qn - 1) / 1 + 1; /* b = 1 */ + } + } + else + { + mp_size_t xn; + xn = MIN (dn, qn); + in = (xn - 1) / k + 1; + } + + return in; +} + +mp_size_t +mpn_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, int mua_k) +{ + mp_size_t in = mpn_mu_div_qr_choose_in (nn - dn, dn, mua_k); + mp_size_t itch_preinv = mpn_preinv_mu_div_qr_itch (nn, dn, in); + mp_size_t itch_invapp = mpn_invertappr_itch (in + 1) + in + 2; /* 3in + 4 */ + + ASSERT (itch_preinv >= itch_invapp); + return in + MAX (itch_invapp, itch_preinv); +} + +mp_size_t +mpn_preinv_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, mp_size_t in) +{ + mp_size_t itch_local = mpn_mulmod_bnm1_next_size (dn + 1); + mp_size_t itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in); + + return itch_local + itch_out; +} diff --git a/gmp-6.3.0/mpn/generic/mu_divappr_q.c b/gmp-6.3.0/mpn/generic/mu_divappr_q.c new file mode 100644 index 0000000..0ef7e03 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mu_divappr_q.c @@ -0,0 +1,368 @@ +/* mpn_mu_divappr_q, mpn_preinv_mu_divappr_q. + + Compute Q = floor(N / D) + e. N is nn limbs, D is dn limbs and must be + normalized, and Q must be nn-dn limbs, 0 <= e <= 4. The requirement that Q + is nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us + to let N be unmodified during the operation. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +/* + The idea of the algorithm used herein is to compute a smaller inverted value + than used in the standard Barrett algorithm, and thus save time in the + Newton iterations, and pay just a small price when using the inverted value + for developing quotient bits. This algorithm was presented at ICMS 2006. +*/ + +/* CAUTION: This code and the code in mu_div_qr.c should be edited in sync. + + Things to work on: + + * The itch/scratch scheme isn't perhaps such a good idea as it once seemed, + demonstrated by the fact that the mpn_invertappr function's scratch needs + mean that we need to keep a large allocation long after it is needed. 
+   Things are worse as mpn_mul_fft does not accept any scratch parameter,
+   which means we'll have a large memory hole while in mpn_mul_fft.  In
+   general, a peak scratch need in the beginning of a function isn't
+   well-handled by the itch/scratch scheme.
+*/
+
+#ifdef STAT
+#undef STAT
+#define STAT(x) x
+#else
+#define STAT(x)
+#endif
+
+#include <stdlib.h>		/* for NULL */
+#include "gmp-impl.h"
+
+static mp_limb_t mpn_preinv_mu_divappr_q (mp_ptr, mp_srcptr, mp_size_t,
+					  mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr);
+static mp_size_t mpn_mu_divappr_q_choose_in (mp_size_t, mp_size_t, int);
+
+mp_limb_t
+mpn_mu_divappr_q (mp_ptr qp,
+		  mp_srcptr np,
+		  mp_size_t nn,
+		  mp_srcptr dp,
+		  mp_size_t dn,
+		  mp_ptr scratch)
+{
+  mp_size_t qn, in;
+  mp_limb_t cy, qh;
+  mp_ptr ip, tp;
+
+  ASSERT (dn > 1);
+
+  qn = nn - dn;
+
+  /* If Q is smaller than D, truncate operands. */
+  if (qn + 1 < dn)
+    {
+      np += dn - (qn + 1);
+      nn -= dn - (qn + 1);
+      dp += dn - (qn + 1);
+      dn = qn + 1;
+    }
+
+  /* Compute the inverse size.  */
+  in = mpn_mu_divappr_q_choose_in (qn, dn, 0);
+  ASSERT (in <= dn);
+
+#if 1
+  /* This alternative inverse computation method gets slightly more accurate
+     results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
+     not adapted (3) mpn_invertappr scratch needs not met.  */
+  ip = scratch;
+  tp = scratch + in + 1;
+
+  /* compute an approximate inverse on (in+1) limbs */
+  if (dn == in)
+    {
+      MPN_COPY (tp + 1, dp, in);
+      tp[0] = 1;
+      mpn_invertappr (ip, tp, in + 1, tp + in + 1);
+      MPN_COPY_INCR (ip, ip + 1, in);
+    }
+  else
+    {
+      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
+      if (UNLIKELY (cy != 0))
+	MPN_ZERO (ip, in);
+      else
+	{
+	  mpn_invertappr (ip, tp, in + 1, tp + in + 1);
+	  MPN_COPY_INCR (ip, ip + 1, in);
+	}
+    }
+#else
+  /* This older inverse computation method gets slightly worse results than the
+     one above.  */
+  ip = scratch;
+  tp = scratch + in;
+
+  /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally the
+     inversion function should do this automatically.  */
+  if (dn == in)
+    {
+      tp[in + 1] = 0;
+      MPN_COPY (tp + in + 2, dp, in);
+      mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
+    }
+  else
+    {
+      mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
+    }
+  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
+  if (UNLIKELY (cy != 0))
+    MPN_ZERO (tp + 1, in);
+  MPN_COPY (ip, tp + 1, in);
+#endif
+
+  qh = mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in);
+
+  return qh;
+}
+
+static mp_limb_t
+mpn_preinv_mu_divappr_q (mp_ptr qp,
+			 mp_srcptr np,
+			 mp_size_t nn,
+			 mp_srcptr dp,
+			 mp_size_t dn,
+			 mp_srcptr ip,
+			 mp_size_t in,
+			 mp_ptr scratch)
+{
+  mp_size_t qn;
+  mp_limb_t cy, cx, qh;
+  mp_limb_t r;
+  mp_size_t tn, wn;
+
+#define rp           scratch
+#define tp           (scratch + dn)
+#define scratch_out  (scratch + dn + tn)
+
+  qn = nn - dn;
+
+  np += qn;
+  qp += qn;
+
+  qh = mpn_cmp (np, dp, dn) >= 0;
+  if (qh != 0)
+    mpn_sub_n (rp, np, dp, dn);
+  else
+    MPN_COPY (rp, np, dn);
+
+  if (UNLIKELY (qn == 0))
+    return qh;			/* Degenerate use.  Should we allow this? */
+
+  for (;;)	/* The exit condition (qn == 0) is verified in the loop. */
+    {
+      if (qn < in)
+	{
+	  ip += in - qn;
+	  in = qn;
+	}
+      np -= in;
+      qp -= in;
+
+      /* Compute the next block of quotient limbs by multiplying the inverse I
+	 by the upper part of the partial remainder R.
*/ + mpn_mul_n (tp, rp + dn - in, ip, in); /* mulhi */ + cy = mpn_add_n (qp, tp + in, rp + dn - in, in); /* I's msb implicit */ + ASSERT_ALWAYS (cy == 0); + + qn -= in; + if (qn == 0) + break; + + /* Compute the product of the quotient block and the divisor D, to be + subtracted from the partial remainder combined with new limbs from the + dividend N. We only really need the low dn limbs. */ + + if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + mpn_mul (tp, dp, dn, qp, in); /* dn+in limbs, high 'in' cancels */ + else + { + tn = mpn_mulmod_bnm1_next_size (dn + 1); + mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out); + wn = dn + in - tn; /* number of wrapped limbs */ + if (wn > 0) + { + cy = mpn_sub_n (tp, tp, rp + dn - wn, wn); + cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy); + cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0; + ASSERT_ALWAYS (cx >= cy); + mpn_incr_u (tp, cx - cy); + } + } + + r = rp[dn - in] - tp[dn]; + + /* Subtract the product from the partial remainder combined with new + limbs from the dividend N, generating a new partial remainder R. */ + if (dn != in) + { + cy = mpn_sub_n (tp, np, tp, in); /* get next 'in' limbs from N */ + cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy); + MPN_COPY (rp, tp, dn); /* FIXME: try to avoid this */ + } + else + { + cy = mpn_sub_n (rp, np, tp, in); /* get next 'in' limbs from N */ + } + + STAT (int i; int err = 0; + static int errarr[5]; static int err_rec; static int tot); + + /* Check the remainder R and adjust the quotient as needed. */ + r -= cy; + while (r != 0) + { + /* We loop 0 times with about 69% probability, 1 time with about 31% + probability, 2 times with about 0.6% probability, if inverse is + computed as recommended. */ + mpn_incr_u (qp, 1); + cy = mpn_sub_n (rp, rp, dp, dn); + r -= cy; + STAT (err++); + } + if (mpn_cmp (rp, dp, dn) >= 0) + { + /* This is executed with about 76% probability. */ + mpn_incr_u (qp, 1); + cy = mpn_sub_n (rp, rp, dp, dn); + STAT (err++); + } + + STAT ( + tot++; + errarr[err]++; + if (err > err_rec) + err_rec = err; + if (tot % 0x10000 == 0) + { + for (i = 0; i <= err_rec; i++) + printf (" %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot); + printf ("\n"); + } + ); + } + + /* FIXME: We should perhaps be somewhat more elegant in our rounding of the + quotient. For now, just make sure the returned quotient is >= the real + quotient; add 3 with saturating arithmetic. */ + qn = nn - dn; + cy += mpn_add_1 (qp, qp, qn, 3); + if (cy != 0) + { + if (qh != 0) + { + /* Return a quotient of just 1-bits, with qh set. */ + mp_size_t i; + for (i = 0; i < qn; i++) + qp[i] = GMP_NUMB_MAX; + } + else + { + /* Propagate carry into qh. */ + qh = 1; + } + } + + return qh; +} + +/* In case k=0 (automatic choice), we distinguish 3 cases: + (a) dn < qn: in = ceil(qn / ceil(qn/dn)) + (b) dn/3 < qn <= dn: in = ceil(qn / 2) + (c) qn < dn/3: in = qn + In all cases we have in <= dn. + */ +static mp_size_t +mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k) +{ + mp_size_t in; + + if (k == 0) + { + mp_size_t b; + if (qn > dn) + { + /* Compute an inverse size that is a nice partition of the quotient. 
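The partition described in that comment is plain ceiling arithmetic. A standalone check with assumed sizes (qn = 100 quotient limbs against dn = 30 divisor limbs) makes the case analysis concrete:

#include <stdio.h>

int
main (void)
{
  long qn = 100, dn = 30;
  long b  = (qn - 1) / dn + 1;   /* ceil(qn/dn) = 4 blocks */
  long in = (qn - 1) / b + 1;    /* ceil(qn/b)  = 25 limbs per block */
  printf ("b=%ld in=%ld\n", b, in);
  return 0;
}

Four blocks of 25 limbs cover the 100 quotient limbs evenly, and in = 25 stays below dn, matching the ASSERT (in <= dn) in the caller.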
*/ + b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ + in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */ + } + else if (3 * qn > dn) + { + in = (qn - 1) / 2 + 1; /* b = 2 */ + } + else + { + in = (qn - 1) / 1 + 1; /* b = 1 */ + } + } + else + { + mp_size_t xn; + xn = MIN (dn, qn); + in = (xn - 1) / k + 1; + } + + return in; +} + +mp_size_t +mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k) +{ + mp_size_t qn, in, itch_local, itch_out, itch_invapp; + + qn = nn - dn; + if (qn + 1 < dn) + { + dn = qn + 1; + } + in = mpn_mu_divappr_q_choose_in (qn, dn, mua_k); + + itch_local = mpn_mulmod_bnm1_next_size (dn + 1); + itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in); + itch_invapp = mpn_invertappr_itch (in + 1) + in + 2; /* 3in + 4 */ + + ASSERT (dn + itch_local + itch_out >= itch_invapp); + return in + MAX (dn + itch_local + itch_out, itch_invapp); +} diff --git a/gmp-6.3.0/mpn/generic/mul.c b/gmp-6.3.0/mpn/generic/mul.c new file mode 100644 index 0000000..37444e9 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mul.c @@ -0,0 +1,441 @@ +/* mpn_mul -- Multiply two natural numbers. + + Contributed to the GNU project by Torbjorn Granlund. + +Copyright 1991, 1993, 1994, 1996, 1997, 1999-2003, 2005-2007, 2009, 2010, 2012, +2014, 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
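Callers of these internal routines are expected to size the scratch block with the matching itch function before the call. The following is a hypothetical caller sketch, not GMP code: malloc stands in for whatever allocation policy the caller actually uses, and the prototypes are the ones visible in the file above (both are internal, reachable only through gmp-impl.h).

#include <stdlib.h>
#include "gmp-impl.h"

/* Divide {np,nn} by {dp,dn}, writing the approximate quotient to
   {qp,nn-dn}; returns -1 on allocation failure, else the high
   quotient limb reported by mpn_mu_divappr_q. */
static int
divappr_with_scratch (mp_ptr qp, mp_srcptr np, mp_size_t nn,
                      mp_srcptr dp, mp_size_t dn)
{
  mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
  mp_ptr scratch = malloc (itch * sizeof (mp_limb_t));
  mp_limb_t qh;

  if (scratch == NULL)
    return -1;
  qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, scratch);
  free (scratch);
  return (int) qh;
}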
*/ + +#include "gmp-impl.h" + + +#ifndef MUL_BASECASE_MAX_UN +#define MUL_BASECASE_MAX_UN 500 +#endif + +/* Areas where the different toom algorithms can be called (extracted + from the t-toom*.c files, and ignoring small constant offsets): + + 1/6 1/5 1/4 4/13 1/3 3/8 2/5 5/11 1/2 3/5 2/3 3/4 4/5 1 vn/un + 4/7 6/7 + 6/11 + |--------------------| toom22 (small) + || toom22 (large) + |xxxx| toom22 called + |-------------------------------------| toom32 + |xxxxxxxxxxxxxxxx| | toom32 called + |------------| toom33 + |x| toom33 called + |---------------------------------| | toom42 + |xxxxxxxxxxxxxxxxxxxxxxxx| | toom42 called + |--------------------| toom43 + |xxxxxxxxxx| toom43 called + |-----------------------------| toom52 (unused) + |--------| toom44 + |xxxxxxxx| toom44 called + |--------------------| | toom53 + |xxxxxx| toom53 called + |-------------------------| toom62 (unused) + |----------------| toom54 (unused) + |--------------------| toom63 + |xxxxxxxxx| | toom63 called + |---------------------------------| toom6h + |xxxxxxxx| toom6h called + |-------------------------| toom8h (32 bit) + |------------------------------------------| toom8h (64 bit) + |xxxxxxxx| toom8h called +*/ + +#define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn) +#define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn) + +/* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v + (pointed to by VP, with VN limbs), and store the result at PRODP. The + result is UN + VN limbs. Return the most significant limb of the result. + + NOTE: The space pointed to by PRODP is overwritten before finished with U + and V, so overlap is an error. + + Argument constraints: + 1. UN >= VN. + 2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from + the multiplier and the multiplicand. */ + +/* + * The cutoff lines in the toomX2 and toomX3 code are now exactly between the + ideal lines of the surrounding algorithms. Is that optimal? + + * The toomX3 code now uses a structure similar to the one of toomX2, except + that it loops longer in the unbalanced case. The result is that the + remaining area might have un < vn. Should we fix the toomX2 code in a + similar way? + + * The toomX3 code is used for the largest non-FFT unbalanced operands. It + therefore calls mpn_mul recursively for certain cases. + + * Allocate static temp space using THRESHOLD variables (except for toom44 + when !WANT_FFT). That way, we can typically have no TMP_ALLOC at all. + + * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42 + have the same vn threshold. This is not true, we should actually use + mul_basecase for slightly larger operands for toom32 than for toom22, and + even larger for toom42. + + * That problem is even more prevalent for toomX3. We therefore use special + THRESHOLD variables there. +*/ + +mp_limb_t +mpn_mul (mp_ptr prodp, + mp_srcptr up, mp_size_t un, + mp_srcptr vp, mp_size_t vn) +{ + ASSERT (un >= vn); + ASSERT (vn >= 1); + ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un)); + ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn)); + + if (BELOW_THRESHOLD (un, MUL_TOOM22_THRESHOLD)) + { + /* When un (and thus vn) is below the toom22 range, do mul_basecase. + Test un and not vn here not to thwart the un >> vn code below. + This special case is not necessary, but cuts the overhead for the + smallest operands. 
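The two OK macros encode the balance requirement read off the diagram: toom33 wants vn above roughly two thirds of un, toom44 above roughly three quarters. A standalone check with assumed operand sizes:

#include <stdio.h>

#define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
#define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)

int
main (void)
{
  printf ("toom33 ok for 100x80: %d\n", TOOM33_OK (100, 80)); /* 1: 206 < 240 */
  printf ("toom33 ok for 100x60: %d\n", TOOM33_OK (100, 60)); /* 0: 206 > 180 */
  printf ("toom44 ok for 100x80: %d\n", TOOM44_OK (100, 80)); /* 1: 312 < 320 */
  return 0;
}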
*/ + mpn_mul_basecase (prodp, up, un, vp, vn); + } + else if (un == vn) + { + mpn_mul_n (prodp, up, vp, un); + } + else if (vn < MUL_TOOM22_THRESHOLD) + { /* plain schoolbook multiplication */ + + /* Unless un is very large, or else if have an applicable mpn_mul_N, + perform basecase multiply directly. */ + if (un <= MUL_BASECASE_MAX_UN +#if HAVE_NATIVE_mpn_mul_2 + || vn <= 2 +#else + || vn == 1 +#endif + ) + mpn_mul_basecase (prodp, up, un, vp, vn); + else + { + /* We have un >> MUL_BASECASE_MAX_UN > vn. For better memory + locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply + these pieces with the vp[] operand. After each such partial + multiplication (but the last) we copy the most significant vn + limbs into a temporary buffer since that part would otherwise be + overwritten by the next multiplication. After the next + multiplication, we add it back. This illustrates the situation: + + -->vn<-- + | |<------- un ------->| + _____________________| + X /| + /XX__________________/ | + _____________________ | + X / | + /XX__________________/ | + _____________________ | + / / | + /____________________/ | + ================================================================== + + The parts marked with X are the parts whose sums are copied into + the temporary buffer. */ + + mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT]; + mp_limb_t cy; + ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT); + + mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn); + prodp += MUL_BASECASE_MAX_UN; + MPN_COPY (tp, prodp, vn); /* preserve high triangle */ + up += MUL_BASECASE_MAX_UN; + un -= MUL_BASECASE_MAX_UN; + while (un > MUL_BASECASE_MAX_UN) + { + mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn); + cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */ + mpn_incr_u (prodp + vn, cy); + prodp += MUL_BASECASE_MAX_UN; + MPN_COPY (tp, prodp, vn); /* preserve high triangle */ + up += MUL_BASECASE_MAX_UN; + un -= MUL_BASECASE_MAX_UN; + } + if (un > vn) + { + mpn_mul_basecase (prodp, up, un, vp, vn); + } + else + { + ASSERT (un > 0); + mpn_mul_basecase (prodp, vp, vn, up, un); + } + cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */ + mpn_incr_u (prodp + vn, cy); + } + } + else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD)) + { + /* Use ToomX2 variants */ + mp_ptr scratch; + TMP_SDECL; TMP_SMARK; + +#define ITCH_TOOMX2 (9 * vn / 2 + GMP_NUMB_BITS * 2) + scratch = TMP_SALLOC_LIMBS (ITCH_TOOMX2); + ASSERT (mpn_toom22_mul_itch ((5*vn-1)/4, vn) <= ITCH_TOOMX2); /* 5vn/2+ */ + ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX2); /* 7vn/6+ */ + ASSERT (mpn_toom42_mul_itch (3 * vn - 1, vn) <= ITCH_TOOMX2); /* 9vn/2+ */ +#undef ITCH_TOOMX2 + + /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn + square to a (3vn-1)*vn rectangle. Leaving such a rectangle is hardly + wise; we would get better balance by slightly moving the bound. We + will sometimes end up with un < vn, like in the X3 arm below. */ + if (un >= 3 * vn) + { + mp_limb_t cy; + mp_ptr ws; + + /* The maximum ws usage is for the mpn_mul result. 
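The copy-and-add-back scheme pictured above can be modelled in a few lines. This is a standalone toy, not GMP's code: it assumes 32-bit limbs, a plain schoolbook inner multiply, and a small stand-in chunk size, but it preserves and re-adds the vn-limb overlap between consecutive partial products exactly as described.

#include <stdint.h>
#include <string.h>

#define CHUNK 4                 /* stand-in for MUL_BASECASE_MAX_UN */

/* Schoolbook u*v into rp[0..un+vn-1]; rp must not overlap up/vp. */
static void
mul_school (uint32_t *rp, const uint32_t *up, int un,
            const uint32_t *vp, int vn)
{
  memset (rp, 0, (un + vn) * sizeof *rp);
  for (int j = 0; j < vn; j++)
    {
      uint64_t cy = 0;
      for (int i = 0; i < un; i++)
        {
          uint64_t t = (uint64_t) up[i] * vp[j] + rp[i + j] + cy;
          rp[i + j] = (uint32_t) t;
          cy = t >> 32;
        }
      rp[un + j] = (uint32_t) cy;
    }
}

/* Multiply a long {up,un} by a short {vp,vn} in CHUNK-limb slices.
   Requires un > CHUNK >= vn. */
static void
mul_chunked (uint32_t *rp, const uint32_t *up, int un,
             const uint32_t *vp, int vn)
{
  uint32_t tp[CHUNK];

  mul_school (rp, up, CHUNK, vp, vn);
  rp += CHUNK; up += CHUNK; un -= CHUNK;
  while (un > 0)
    {
      int step = un < CHUNK ? un : CHUNK;
      memcpy (tp, rp, vn * sizeof *tp);    /* preserve high triangle */
      mul_school (rp, up, step, vp, vn);   /* clobbers those vn limbs */
      uint64_t cy = 0;
      for (int i = 0; i < vn; i++)         /* add the triangle back */
        {
          uint64_t t = (uint64_t) rp[i] + tp[i] + cy;
          rp[i] = (uint32_t) t;
          cy = t >> 32;
        }
      for (int i = vn; cy != 0; i++)       /* propagate, cf. mpn_incr_u */
        {
          uint64_t t = (uint64_t) rp[i] + cy;
          rp[i] = (uint32_t) t;
          cy = t >> 32;
        }
      rp += step; up += step; un -= step;
    }
}

As in the real code, the carry propagation is guaranteed to stop inside the limbs already written, because the partial sum accumulated so far always fits in them.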
*/ + ws = TMP_SALLOC_LIMBS (4 * vn); + + mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch); + un -= 2 * vn; + up += 2 * vn; + prodp += 2 * vn; + + while (un >= 3 * vn) + { + mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch); + un -= 2 * vn; + up += 2 * vn; + cy = mpn_add_n (prodp, prodp, ws, vn); + MPN_COPY (prodp + vn, ws + vn, 2 * vn); + mpn_incr_u (prodp + vn, cy); + prodp += 2 * vn; + } + + /* vn <= un < 3vn */ + + if (4 * un < 5 * vn) + mpn_toom22_mul (ws, up, un, vp, vn, scratch); + else if (4 * un < 7 * vn) + mpn_toom32_mul (ws, up, un, vp, vn, scratch); + else + mpn_toom42_mul (ws, up, un, vp, vn, scratch); + + cy = mpn_add_n (prodp, prodp, ws, vn); + MPN_COPY (prodp + vn, ws + vn, un); + mpn_incr_u (prodp + vn, cy); + } + else + { + if (4 * un < 5 * vn) + mpn_toom22_mul (prodp, up, un, vp, vn, scratch); + else if (4 * un < 7 * vn) + mpn_toom32_mul (prodp, up, un, vp, vn, scratch); + else + mpn_toom42_mul (prodp, up, un, vp, vn, scratch); + } + TMP_SFREE; + } + else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) || + BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD)) + { + /* Handle the largest operands that are not in the FFT range. The 2nd + condition makes very unbalanced operands avoid the FFT code (except + perhaps as coefficient products of the Toom code. */ + + if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn)) + { + /* Use ToomX3 variants */ + mp_ptr scratch; + TMP_DECL; TMP_MARK; + +#define ITCH_TOOMX3 (4 * vn + GMP_NUMB_BITS) + scratch = TMP_ALLOC_LIMBS (ITCH_TOOMX3); + ASSERT (mpn_toom33_mul_itch ((7*vn-1)/6, vn) <= ITCH_TOOMX3); /* 7vn/2+ */ + ASSERT (mpn_toom43_mul_itch ((3*vn-1)/2, vn) <= ITCH_TOOMX3); /* 9vn/4+ */ + ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX3); /* 7vn/6+ */ + ASSERT (mpn_toom53_mul_itch ((11*vn-1)/6, vn) <= ITCH_TOOMX3); /* 11vn/3+ */ + ASSERT (mpn_toom42_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */ + ASSERT (mpn_toom63_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */ +#undef ITCH_TOOMX3 + + if (2 * un >= 5 * vn) + { + mp_limb_t cy; + mp_ptr ws; + + /* The maximum ws usage is for the mpn_mul result. 
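For reading the ratio tests in this arm, here they are collected in one place (a sketch with the thresholds elided): the cutoffs 5/4 and 7/4 sit between the ideal shapes 1, 3/2 and 2 of the three variants, which is exactly the "cutoff lines exactly between the ideal lines" policy discussed at the top of the file.

#include <stdio.h>

static const char *
toomx2_pick (long un, long vn)          /* assumes vn <= un < 3*vn */
{
  if (4 * un < 5 * vn) return "toom22"; /* un/vn < 1.25 */
  if (4 * un < 7 * vn) return "toom32"; /* un/vn < 1.75 */
  return "toom42";
}

int
main (void)
{
  printf ("%s\n", toomx2_pick (100, 90));   /* toom22 */
  printf ("%s\n", toomx2_pick (150, 100));  /* toom32 */
  printf ("%s\n", toomx2_pick (250, 100));  /* toom42 */
  return 0;
}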
*/ + ws = TMP_ALLOC_LIMBS (7 * vn >> 1); + + if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD)) + mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch); + else + mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch); + un -= 2 * vn; + up += 2 * vn; + prodp += 2 * vn; + + while (2 * un >= 5 * vn) /* un >= 2.5vn */ + { + if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD)) + mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch); + else + mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch); + un -= 2 * vn; + up += 2 * vn; + cy = mpn_add_n (prodp, prodp, ws, vn); + MPN_COPY (prodp + vn, ws + vn, 2 * vn); + mpn_incr_u (prodp + vn, cy); + prodp += 2 * vn; + } + + /* vn / 2 <= un < 2.5vn */ + + if (un < vn) + mpn_mul (ws, vp, vn, up, un); + else + mpn_mul (ws, up, un, vp, vn); + + cy = mpn_add_n (prodp, prodp, ws, vn); + MPN_COPY (prodp + vn, ws + vn, un); + mpn_incr_u (prodp + vn, cy); + } + else + { + if (6 * un < 7 * vn) + mpn_toom33_mul (prodp, up, un, vp, vn, scratch); + else if (2 * un < 3 * vn) + { + if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD)) + mpn_toom32_mul (prodp, up, un, vp, vn, scratch); + else + mpn_toom43_mul (prodp, up, un, vp, vn, scratch); + } + else if (6 * un < 11 * vn) + { + if (4 * un < 7 * vn) + { + if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD)) + mpn_toom32_mul (prodp, up, un, vp, vn, scratch); + else + mpn_toom53_mul (prodp, up, un, vp, vn, scratch); + } + else + { + if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD)) + mpn_toom42_mul (prodp, up, un, vp, vn, scratch); + else + mpn_toom53_mul (prodp, up, un, vp, vn, scratch); + } + } + else + { + if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD)) + mpn_toom42_mul (prodp, up, un, vp, vn, scratch); + else + mpn_toom63_mul (prodp, up, un, vp, vn, scratch); + } + } + TMP_FREE; + } + else + { + mp_ptr scratch; + TMP_DECL; TMP_MARK; + + if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD)) + { + scratch = TMP_SALLOC_LIMBS (mpn_toom44_mul_itch (un, vn)); + mpn_toom44_mul (prodp, up, un, vp, vn, scratch); + } + else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD)) + { + scratch = TMP_SALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn)); + mpn_toom6h_mul (prodp, up, un, vp, vn, scratch); + } + else + { + scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn)); + mpn_toom8h_mul (prodp, up, un, vp, vn, scratch); + } + TMP_FREE; + } + } + else + { + if (un >= 8 * vn) + { + mp_limb_t cy; + mp_ptr ws; + TMP_DECL; TMP_MARK; + + /* The maximum ws usage is for the mpn_mul result. */ + ws = TMP_BALLOC_LIMBS (9 * vn >> 1); + + mpn_fft_mul (prodp, up, 3 * vn, vp, vn); + un -= 3 * vn; + up += 3 * vn; + prodp += 3 * vn; + + while (2 * un >= 7 * vn) /* un >= 3.5vn */ + { + mpn_fft_mul (ws, up, 3 * vn, vp, vn); + un -= 3 * vn; + up += 3 * vn; + cy = mpn_add_n (prodp, prodp, ws, vn); + MPN_COPY (prodp + vn, ws + vn, 3 * vn); + mpn_incr_u (prodp + vn, cy); + prodp += 3 * vn; + } + + /* vn / 2 <= un < 3.5vn */ + + if (un < vn) + mpn_mul (ws, vp, vn, up, un); + else + mpn_mul (ws, up, un, vp, vn); + + cy = mpn_add_n (prodp, prodp, ws, vn); + MPN_COPY (prodp + vn, ws + vn, un); + mpn_incr_u (prodp + vn, cy); + + TMP_FREE; + } + else + mpn_fft_mul (prodp, up, un, vp, vn); + } + + return prodp[un + vn - 1]; /* historic */ +} diff --git a/gmp-6.3.0/mpn/generic/mul_1.c b/gmp-6.3.0/mpn/generic/mul_1.c new file mode 100644 index 0000000..52d46da --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mul_1.c @@ -0,0 +1,96 @@ +/* mpn_mul_1 -- Multiply a limb vector with a single limb and store the + product in a second limb vector. 
+ +Copyright 1991-1994, 1996, 2000-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +#if GMP_NAIL_BITS == 0 + +mp_limb_t +mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl) +{ + mp_limb_t ul, cl, hpl, lpl; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_INCR_P (rp, up, n)); + + cl = 0; + do + { + ul = *up++; + umul_ppmm (hpl, lpl, ul, vl); + + lpl += cl; + cl = (lpl < cl) + hpl; + + *rp++ = lpl; + } + while (--n != 0); + + return cl; +} + +#endif + +#if GMP_NAIL_BITS >= 1 + +mp_limb_t +mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl) +{ + mp_limb_t shifted_vl, ul, lpl, hpl, prev_hpl, xw, cl, xl; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_INCR_P (rp, up, n)); + ASSERT_MPN (up, n); + ASSERT_LIMB (vl); + + shifted_vl = vl << GMP_NAIL_BITS; + cl = 0; + prev_hpl = 0; + do + { + ul = *up++; + + umul_ppmm (hpl, lpl, ul, shifted_vl); + lpl >>= GMP_NAIL_BITS; + xw = prev_hpl + lpl + cl; + cl = xw >> GMP_NUMB_BITS; + xl = xw & GMP_NUMB_MASK; + *rp++ = xl; + prev_hpl = hpl; + } + while (--n != 0); + + return prev_hpl + cl; +} + +#endif diff --git a/gmp-6.3.0/mpn/generic/mul_basecase.c b/gmp-6.3.0/mpn/generic/mul_basecase.c new file mode 100644 index 0000000..2487fba --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mul_basecase.c @@ -0,0 +1,165 @@ +/* mpn_mul_basecase -- Internal routine to multiply two natural numbers + of length m and n. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + +Copyright 1991-1994, 1996, 1997, 2000-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
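A condensed model of the nail-free loop above, assuming 64-bit limbs and using the compiler's unsigned __int128 in place of the umul_ppmm macro (an assumption for portability of the sketch; GMP itself uses longlong.h):

#include <stdint.h>
#include <stddef.h>

static uint64_t
mul_1 (uint64_t *rp, const uint64_t *up, size_t n, uint64_t vl)
{
  uint64_t cl = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * vl + cl;
      rp[i] = (uint64_t) t;        /* low product limb plus carry in */
      cl = (uint64_t) (t >> 64);   /* next carry: high product limb */
    }
  return cl;                       /* most significant limb */
}

The sum up[i]*vl + cl cannot overflow the double limb, since (B-1)^2 + (B-1) < B^2, which is why the carry handling in the real loop needs only the single (lpl < cl) test.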
*/ + +#include "gmp-impl.h" + + +/* Multiply {up,usize} by {vp,vsize} and write the result to + {prodp,usize+vsize}. Must have usize>=vsize. + + Note that prodp gets usize+vsize limbs stored, even if the actual result + only needs usize+vsize-1. + + There's no good reason to call here with vsize>=MUL_TOOM22_THRESHOLD. + Currently this is allowed, but it might not be in the future. + + This is the most critical code for multiplication. All multiplies rely + on this, both small and huge. Small ones arrive here immediately, huge + ones arrive here as this is the base case for Karatsuba's recursive + algorithm. */ + +void +mpn_mul_basecase (mp_ptr rp, + mp_srcptr up, mp_size_t un, + mp_srcptr vp, mp_size_t vn) +{ + ASSERT (un >= vn); + ASSERT (vn >= 1); + ASSERT (! MPN_OVERLAP_P (rp, un+vn, up, un)); + ASSERT (! MPN_OVERLAP_P (rp, un+vn, vp, vn)); + + /* We first multiply by the low order limb (or depending on optional function + availability, limbs). This result can be stored, not added, to rp. We + also avoid a loop for zeroing this way. */ + +#if HAVE_NATIVE_mpn_mul_2 + if (vn >= 2) + { + rp[un + 1] = mpn_mul_2 (rp, up, un, vp); + rp += 2, vp += 2, vn -= 2; + } + else + { + rp[un] = mpn_mul_1 (rp, up, un, vp[0]); + return; + } +#else + rp[un] = mpn_mul_1 (rp, up, un, vp[0]); + rp += 1, vp += 1, vn -= 1; +#endif + + /* Now accumulate the product of up[] and the next higher limb (or depending + on optional function availability, limbs) from vp[]. */ + +#define MAX_LEFT MP_SIZE_T_MAX /* Used to simplify loops into if statements */ + + +#if HAVE_NATIVE_mpn_addmul_6 + while (vn >= 6) + { + rp[un + 6 - 1] = mpn_addmul_6 (rp, up, un, vp); + if (MAX_LEFT == 6) + return; + rp += 6, vp += 6, vn -= 6; + if (MAX_LEFT < 2 * 6) + break; + } +#undef MAX_LEFT +#define MAX_LEFT (6 - 1) +#endif + +#if HAVE_NATIVE_mpn_addmul_5 + while (vn >= 5) + { + rp[un + 5 - 1] = mpn_addmul_5 (rp, up, un, vp); + if (MAX_LEFT == 5) + return; + rp += 5, vp += 5, vn -= 5; + if (MAX_LEFT < 2 * 5) + break; + } +#undef MAX_LEFT +#define MAX_LEFT (5 - 1) +#endif + +#if HAVE_NATIVE_mpn_addmul_4 + while (vn >= 4) + { + rp[un + 4 - 1] = mpn_addmul_4 (rp, up, un, vp); + if (MAX_LEFT == 4) + return; + rp += 4, vp += 4, vn -= 4; + if (MAX_LEFT < 2 * 4) + break; + } +#undef MAX_LEFT +#define MAX_LEFT (4 - 1) +#endif + +#if HAVE_NATIVE_mpn_addmul_3 + while (vn >= 3) + { + rp[un + 3 - 1] = mpn_addmul_3 (rp, up, un, vp); + if (MAX_LEFT == 3) + return; + rp += 3, vp += 3, vn -= 3; + if (MAX_LEFT < 2 * 3) + break; + } +#undef MAX_LEFT +#define MAX_LEFT (3 - 1) +#endif + +#if HAVE_NATIVE_mpn_addmul_2 + while (vn >= 2) + { + rp[un + 2 - 1] = mpn_addmul_2 (rp, up, un, vp); + if (MAX_LEFT == 2) + return; + rp += 2, vp += 2, vn -= 2; + if (MAX_LEFT < 2 * 2) + break; + } +#undef MAX_LEFT +#define MAX_LEFT (2 - 1) +#endif + + while (vn >= 1) + { + rp[un] = mpn_addmul_1 (rp, up, un, vp[0]); + if (MAX_LEFT == 1) + return; + rp += 1, vp += 1, vn -= 1; + } +} diff --git a/gmp-6.3.0/mpn/generic/mul_fft.c b/gmp-6.3.0/mpn/generic/mul_fft.c new file mode 100644 index 0000000..76a2106 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mul_fft.c @@ -0,0 +1,1105 @@ +/* Schoenhage's fast multiplication modulo 2^N+1. + + Contributed by Paul Zimmermann. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 1998-2010, 2012, 2013, 2018, 2020, 2022 Free Software +Foundation, Inc. 
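The structure of mpn_mul_basecase above, in miniature: one addmul row per limb of v, each returning the limb that lands at rp[un + j]. The sketch assumes 64-bit limbs and __int128 instead of the native addmul_N ladder, and it zeroes the result first, an initialization the real code avoids by storing the first row with mul_1.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

static uint64_t
addmul_1 (uint64_t *rp, const uint64_t *up, size_t n, uint64_t vl)
{
  uint64_t cl = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * vl + rp[i] + cl;
      rp[i] = (uint64_t) t;
      cl = (uint64_t) (t >> 64);
    }
  return cl;
}

static void
mul_basecase (uint64_t *rp, const uint64_t *up, size_t un,
              const uint64_t *vp, size_t vn)      /* un >= vn >= 1 */
{
  memset (rp, 0, (un + vn) * sizeof *rp);
  for (size_t j = 0; j < vn; j++)
    rp[un + j] = addmul_1 (rp + j, up, un, vp[j]);
}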
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+/* References:
+
+   Schnelle Multiplikation grosser Zahlen, by Arnold Schoenhage and Volker
+   Strassen, Computing 7, p. 281-292, 1971.
+
+   Asymptotically fast algorithms for the numerical multiplication and division
+   of polynomials with complex coefficients, by Arnold Schoenhage, Computer
+   Algebra, EUROCAM'82, LNCS 144, p. 3-15, 1982.
+
+   Tapes versus Pointers, a study in implementing fast algorithms, by Arnold
+   Schoenhage, Bulletin of the EATCS, 30, p. 23-32, 1986.
+
+   TODO:
+
+   Implement some of the tricks published at ISSAC'2007 by Gaudry, Kruppa, and
+   Zimmermann.
+
+   It might be possible to avoid a small number of MPN_COPYs by using a
+   rotating temporary or two.
+
+   Cleanup and simplify the code!
+*/
+
+#ifdef TRACE
+#undef TRACE
+#define TRACE(x) x
+#include <stdio.h>
+#else
+#define TRACE(x)
+#endif
+
+#include "gmp-impl.h"
+
+#ifdef WANT_ADDSUB
+#include "generic/add_n_sub_n.c"
+#define HAVE_NATIVE_mpn_add_n_sub_n 1
+#endif
+
+static mp_limb_t mpn_mul_fft_internal (mp_ptr, mp_size_t, int, mp_ptr *,
+				       mp_ptr *, mp_ptr, mp_ptr, mp_size_t,
+				       mp_size_t, mp_size_t, int **, mp_ptr, int);
+static void mpn_mul_fft_decompose (mp_ptr, mp_ptr *, mp_size_t, mp_size_t, mp_srcptr,
+				   mp_size_t, mp_size_t, mp_size_t, mp_ptr);
+
+
+/* Find the best k to use for a mod 2^(m*GMP_NUMB_BITS)+1 FFT for m >= n.
+   We have sqr=0 for a multiply, sqr=1 for a square.
+   There are three generations of this code; we keep the old ones as long as
+   some gmp-mparam.h is not updated.
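One helper worth isolating before the code that follows: transform sizes must be multiples of 2^k, which mpn_fft_next_size enforces by rounding up. A standalone copy of its arithmetic with two worked values:

#include <stdio.h>

static long
fft_next_size (long pl, int k)
{
  pl = 1 + ((pl - 1) >> k);   /* ceil (pl/2^k) */
  return pl << k;
}

int
main (void)
{
  printf ("%ld\n", fft_next_size (1000, 4));  /* 1008 */
  printf ("%ld\n", fft_next_size (1024, 5));  /* 1024, already aligned */
  return 0;
}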
*/ + + +/*****************************************************************************/ + +#if TUNE_PROGRAM_BUILD || (defined (MUL_FFT_TABLE3) && defined (SQR_FFT_TABLE3)) + +#ifndef FFT_TABLE3_SIZE /* When tuning this is defined in gmp-impl.h */ +#if defined (MUL_FFT_TABLE3_SIZE) && defined (SQR_FFT_TABLE3_SIZE) +#if MUL_FFT_TABLE3_SIZE > SQR_FFT_TABLE3_SIZE +#define FFT_TABLE3_SIZE MUL_FFT_TABLE3_SIZE +#else +#define FFT_TABLE3_SIZE SQR_FFT_TABLE3_SIZE +#endif +#endif +#endif + +#ifndef FFT_TABLE3_SIZE +#define FFT_TABLE3_SIZE 200 +#endif + +FFT_TABLE_ATTRS struct fft_table_nk mpn_fft_table3[2][FFT_TABLE3_SIZE] = +{ + MUL_FFT_TABLE3, + SQR_FFT_TABLE3 +}; + +int +mpn_fft_best_k (mp_size_t n, int sqr) +{ + const struct fft_table_nk *fft_tab, *tab; + mp_size_t tab_n, thres; + int last_k; + + fft_tab = mpn_fft_table3[sqr]; + last_k = fft_tab->k; + for (tab = fft_tab + 1; ; tab++) + { + tab_n = tab->n; + thres = tab_n << last_k; + if (n <= thres) + break; + last_k = tab->k; + } + return last_k; +} + +#define MPN_FFT_BEST_READY 1 +#endif + +/*****************************************************************************/ + +#if ! defined (MPN_FFT_BEST_READY) +FFT_TABLE_ATTRS mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE] = +{ + MUL_FFT_TABLE, + SQR_FFT_TABLE +}; + +int +mpn_fft_best_k (mp_size_t n, int sqr) +{ + int i; + + for (i = 0; mpn_fft_table[sqr][i] != 0; i++) + if (n < mpn_fft_table[sqr][i]) + return i + FFT_FIRST_K; + + /* treat 4*last as one further entry */ + if (i == 0 || n < 4 * mpn_fft_table[sqr][i - 1]) + return i + FFT_FIRST_K; + else + return i + FFT_FIRST_K + 1; +} +#endif + +/*****************************************************************************/ + + +/* Returns smallest possible number of limbs >= pl for a fft of size 2^k, + i.e. smallest multiple of 2^k >= pl. + + Don't declare static: needed by tuneup. +*/ + +mp_size_t +mpn_fft_next_size (mp_size_t pl, int k) +{ + pl = 1 + ((pl - 1) >> k); /* ceil (pl/2^k) */ + return pl << k; +} + + +/* Initialize l[i][j] with bitrev(j) */ +static void +mpn_fft_initl (int **l, int k) +{ + int i, j, K; + int *li; + + l[0][0] = 0; + for (i = 1, K = 1; i <= k; i++, K *= 2) + { + li = l[i]; + for (j = 0; j < K; j++) + { + li[j] = 2 * l[i - 1][j]; + li[K + j] = 1 + li[j]; + } + } +} + + +/* r <- a*2^d mod 2^(n*GMP_NUMB_BITS)+1 with a = {a, n+1} + Assumes a is semi-normalized, i.e. a[n] <= 1. + r and a must have n+1 limbs, and not overlap. +*/ +static void +mpn_fft_mul_2exp_modF (mp_ptr r, mp_srcptr a, mp_bitcnt_t d, mp_size_t n) +{ + unsigned int sh; + mp_size_t m; + mp_limb_t cc, rd; + + sh = d % GMP_NUMB_BITS; + m = d / GMP_NUMB_BITS; + + if (m >= n) /* negate */ + { + /* r[0..m-1] <-- lshift(a[n-m]..a[n-1], sh) + r[m..n-1] <-- -lshift(a[0]..a[n-m-1], sh) */ + + m -= n; + if (sh != 0) + { + /* no out shift below since a[n] <= 1 */ + mpn_lshift (r, a + n - m, m + 1, sh); + rd = r[m]; + cc = mpn_lshiftc (r + m, a, n - m, sh); + } + else + { + MPN_COPY (r, a + n - m, m); + rd = a[n]; + mpn_com (r + m, a, n - m); + cc = 0; + } + + /* add cc to r[0], and add rd to r[m] */ + + /* now add 1 in r[m], subtract 1 in r[n], i.e. 
add 1 in r[0] */ + + r[n] = 0; + /* cc < 2^sh <= 2^(GMP_NUMB_BITS-1) thus no overflow here */ + ++cc; + MPN_INCR_U (r, n + 1, cc); + + ++rd; + /* rd might overflow when sh=GMP_NUMB_BITS-1 */ + cc = rd + (rd == 0); + r = r + m + (rd == 0); + MPN_INCR_U (r, n + 1 - m - (rd == 0), cc); + } + else + { + /* r[0..m-1] <-- -lshift(a[n-m]..a[n-1], sh) + r[m..n-1] <-- lshift(a[0]..a[n-m-1], sh) */ + if (sh != 0) + { + /* no out bits below since a[n] <= 1 */ + mpn_lshiftc (r, a + n - m, m + 1, sh); + rd = ~r[m]; + /* {r, m+1} = {a+n-m, m+1} << sh */ + cc = mpn_lshift (r + m, a, n - m, sh); /* {r+m, n-m} = {a, n-m}<GMP_NUMB_MAX+1. Never triggered. + Is it actually possible? */ + r[n] = 0; + MPN_INCR_U (r, n + 1, cy); + } + } +} + +#if HAVE_NATIVE_mpn_add_n_sub_n +static inline void +mpn_fft_add_sub_modF (mp_ptr A0, mp_ptr Ai, mp_srcptr tp, mp_size_t n) +{ + mp_limb_t cyas, c, x; + + cyas = mpn_add_n_sub_n (A0, Ai, A0, tp, n); + + c = A0[n] - tp[n] - (cyas & 1); + x = (-c) & -((c & GMP_LIMB_HIGHBIT) != 0); + Ai[n] = x + c; + MPN_INCR_U (Ai, n + 1, x); + + c = A0[n] + tp[n] + (cyas >> 1); + x = (c - 1) & -(c != 0); + A0[n] = c - x; + MPN_DECR_U (A0, n + 1, x); +} + +#else /* ! HAVE_NATIVE_mpn_add_n_sub_n */ + +/* r <- a+b mod 2^(n*GMP_NUMB_BITS)+1. + Assumes a and b are semi-normalized. +*/ +static inline void +mpn_fft_add_modF (mp_ptr r, mp_srcptr a, mp_srcptr b, mp_size_t n) +{ + mp_limb_t c, x; + + c = a[n] + b[n] + mpn_add_n (r, a, b, n); + /* 0 <= c <= 3 */ + +#if 1 + /* GCC 4.1 outsmarts most expressions here, and generates a 50% branch. The + result is slower code, of course. But the following outsmarts GCC. */ + x = (c - 1) & -(c != 0); + r[n] = c - x; + MPN_DECR_U (r, n + 1, x); +#endif +#if 0 + if (c > 1) + { + r[n] = 1; /* r[n] - c = 1 */ + MPN_DECR_U (r, n + 1, c - 1); + } + else + { + r[n] = c; + } +#endif +} + +/* r <- a-b mod 2^(n*GMP_NUMB_BITS)+1. + Assumes a and b are semi-normalized. +*/ +static inline void +mpn_fft_sub_modF (mp_ptr r, mp_srcptr a, mp_srcptr b, mp_size_t n) +{ + mp_limb_t c, x; + + c = a[n] - b[n] - mpn_sub_n (r, a, b, n); + /* -2 <= c <= 1 */ + +#if 1 + /* GCC 4.1 outsmarts most expressions here, and generates a 50% branch. The + result is slower code, of course. But the following outsmarts GCC. */ + x = (-c) & -((c & GMP_LIMB_HIGHBIT) != 0); + r[n] = x + c; + MPN_INCR_U (r, n + 1, x); +#endif +#if 0 + if ((c & GMP_LIMB_HIGHBIT) != 0) + { + r[n] = 0; + MPN_INCR_U (r, n + 1, -c); + } + else + { + r[n] = c; + } +#endif +} +#endif /* HAVE_NATIVE_mpn_add_n_sub_n */ + +/* input: A[0] ... 
A[inc*(K-1)] are residues mod 2^N+1 where + N=n*GMP_NUMB_BITS, and 2^omega is a primitive root mod 2^N+1 + output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */ + +static void +mpn_fft_fft (mp_ptr *Ap, mp_size_t K, int **ll, + mp_size_t omega, mp_size_t n, mp_size_t inc, mp_ptr tp) +{ + if (K == 2) + { + mp_limb_t cy; +#if HAVE_NATIVE_mpn_add_n_sub_n + cy = mpn_add_n_sub_n (Ap[0], Ap[inc], Ap[0], Ap[inc], n + 1) & 1; +#else + MPN_COPY (tp, Ap[0], n + 1); + mpn_add_n (Ap[0], Ap[0], Ap[inc], n + 1); + cy = mpn_sub_n (Ap[inc], tp, Ap[inc], n + 1); +#endif + if (Ap[0][n] > 1) /* can be 2 or 3 */ + { /* Ap[0][n] = 1 - mpn_sub_1 (Ap[0], Ap[0], n, Ap[0][n] - 1); */ + mp_limb_t cc = Ap[0][n] - 1; + Ap[0][n] = 1; + MPN_DECR_U (Ap[0], n + 1, cc); + } + if (cy) /* Ap[inc][n] can be -1 or -2 */ + { /* Ap[inc][n] = mpn_add_1 (Ap[inc], Ap[inc], n, ~Ap[inc][n] + 1); */ + mp_limb_t cc = ~Ap[inc][n] + 1; + Ap[inc][n] = 0; + MPN_INCR_U (Ap[inc], n + 1, cc); + } + } + else + { + mp_size_t j, K2 = K >> 1; + int *lk = *ll; + + mpn_fft_fft (Ap, K2, ll-1, 2 * omega, n, inc * 2, tp); + mpn_fft_fft (Ap+inc, K2, ll-1, 2 * omega, n, inc * 2, tp); + /* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc] + A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */ + for (j = 0; j < K2; j++, lk += 2, Ap += 2 * inc) + { + /* Ap[inc] <- Ap[0] + Ap[inc] * 2^(lk[1] * omega) + Ap[0] <- Ap[0] + Ap[inc] * 2^(lk[0] * omega) */ + mpn_fft_mul_2exp_modF (tp, Ap[inc], lk[0] * omega, n); +#if HAVE_NATIVE_mpn_add_n_sub_n + mpn_fft_add_sub_modF (Ap[0], Ap[inc], tp, n); +#else + mpn_fft_sub_modF (Ap[inc], Ap[0], tp, n); + mpn_fft_add_modF (Ap[0], Ap[0], tp, n); +#endif + } + } +} + +/* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where + N=n*GMP_NUMB_BITS, and 2^omega is a primitive root mod 2^N+1 + output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 + tp must have space for 2*(n+1) limbs. +*/ + + +/* Given ap[0..n] with ap[n]<=1, reduce it modulo 2^(n*GMP_NUMB_BITS)+1, + by subtracting that modulus if necessary. + + If ap[0..n] is exactly 2^(n*GMP_NUMB_BITS) then mpn_sub_1 produces a + borrow and the limbs must be zeroed out again. This will occur very + infrequently. */ + +static inline void +mpn_fft_normalize (mp_ptr ap, mp_size_t n) +{ + if (ap[n] != 0) + { + MPN_DECR_U (ap, n + 1, CNST_LIMB(1)); + if (ap[n] == 0) + { + /* This happens with very low probability; we have yet to trigger it, + and thereby make sure this code is correct. */ + MPN_ZERO (ap, n); + ap[n] = 1; + } + else + ap[n] = 0; + } +} + +/* a[i] <- a[i]*b[i] mod 2^(n*GMP_NUMB_BITS)+1 for 0 <= i < K */ +static void +mpn_fft_mul_modF_K (mp_ptr *ap, mp_ptr *bp, mp_size_t n, mp_size_t K) +{ + int i; + unsigned k; + int sqr = (ap == bp); + TMP_DECL; + + TMP_MARK; + + if (n >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) + { + mp_size_t K2, nprime2, Nprime2, M2, maxLK, l, Mp2; + int k; + int **fft_l, *tmp; + mp_ptr *Ap, *Bp, A, B, T; + + k = mpn_fft_best_k (n, sqr); + K2 = (mp_size_t) 1 << k; + ASSERT_ALWAYS((n & (K2 - 1)) == 0); + maxLK = (K2 > GMP_NUMB_BITS) ? K2 : GMP_NUMB_BITS; + M2 = n * GMP_NUMB_BITS >> k; + l = n >> k; + Nprime2 = ((2 * M2 + k + 2 + maxLK) / maxLK) * maxLK; + /* Nprime2 = ceil((2*M2+k+3)/maxLK)*maxLK*/ + nprime2 = Nprime2 / GMP_NUMB_BITS; + + /* we should ensure that nprime2 is a multiple of the next K */ + if (nprime2 >= (sqr ? 
SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) + { + mp_size_t K3; + for (;;) + { + K3 = (mp_size_t) 1 << mpn_fft_best_k (nprime2, sqr); + if ((nprime2 & (K3 - 1)) == 0) + break; + nprime2 = (nprime2 + K3 - 1) & -K3; + Nprime2 = nprime2 * GMP_LIMB_BITS; + /* warning: since nprime2 changed, K3 may change too! */ + } + } + ASSERT_ALWAYS(nprime2 < n); /* otherwise we'll loop */ + + Mp2 = Nprime2 >> k; + + Ap = TMP_BALLOC_MP_PTRS (K2); + Bp = TMP_BALLOC_MP_PTRS (K2); + A = TMP_BALLOC_LIMBS (2 * (nprime2 + 1) << k); + T = TMP_BALLOC_LIMBS (2 * (nprime2 + 1)); + B = A + ((nprime2 + 1) << k); + fft_l = TMP_BALLOC_TYPE (k + 1, int *); + tmp = TMP_BALLOC_TYPE ((size_t) 2 << k, int); + for (i = 0; i <= k; i++) + { + fft_l[i] = tmp; + tmp += (mp_size_t) 1 << i; + } + + mpn_fft_initl (fft_l, k); + + TRACE (printf ("recurse: %ldx%ld limbs -> %ld times %ldx%ld (%1.2f)\n", n, + n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2)); + for (i = 0; i < K; i++, ap++, bp++) + { + mp_limb_t cy; + mpn_fft_normalize (*ap, n); + if (!sqr) + mpn_fft_normalize (*bp, n); + + mpn_mul_fft_decompose (A, Ap, K2, nprime2, *ap, (l << k) + 1, l, Mp2, T); + if (!sqr) + mpn_mul_fft_decompose (B, Bp, K2, nprime2, *bp, (l << k) + 1, l, Mp2, T); + + cy = mpn_mul_fft_internal (*ap, n, k, Ap, Bp, A, B, nprime2, + l, Mp2, fft_l, T, sqr); + (*ap)[n] = cy; + } + } +#if ! TUNE_PROGRAM_BUILD + else if (MPN_MULMOD_BKNP1_USABLE (n, k, MUL_FFT_MODF_THRESHOLD)) + { + mp_ptr a; + mp_size_t n_k = n / k; + + if (sqr) + { + mp_ptr tp = TMP_SALLOC_LIMBS (mpn_sqrmod_bknp1_itch (n)); + for (i = 0; i < K; i++) + { + a = *ap++; + mpn_sqrmod_bknp1 (a, a, n_k, k, tp); + } + } + else + { + mp_ptr b, tp = TMP_SALLOC_LIMBS (mpn_mulmod_bknp1_itch (n)); + for (i = 0; i < K; i++) + { + a = *ap++; + b = *bp++; + mpn_mulmod_bknp1 (a, a, b, n_k, k, tp); + } + } + } +#endif + else + { + mp_ptr a, b, tp, tpn; + mp_limb_t cc; + mp_size_t n2 = 2 * n; + tp = TMP_BALLOC_LIMBS (n2); + tpn = tp + n; + TRACE (printf (" mpn_mul_n %ld of %ld limbs\n", K, n)); + for (i = 0; i < K; i++) + { + a = *ap++; + b = *bp++; + if (sqr) + mpn_sqr (tp, a, n); + else + mpn_mul_n (tp, b, a, n); + if (a[n] != 0) + cc = mpn_add_n (tpn, tpn, b, n); + else + cc = 0; + if (b[n] != 0) + cc += mpn_add_n (tpn, tpn, a, n) + a[n]; + if (cc != 0) + { + cc = mpn_add_1 (tp, tp, n2, cc); + /* If mpn_add_1 give a carry (cc != 0), + the result (tp) is at most GMP_NUMB_MAX - 1, + so the following addition can't overflow. + */ + tp[0] += cc; + } + cc = mpn_sub_n (a, tp, tpn, n); + a[n] = 0; + MPN_INCR_U (a, n + 1, cc); + } + } + TMP_FREE; +} + + +/* input: A^[l[k][0]] A^[l[k][1]] ... A^[l[k][K-1]] + output: K*A[0] K*A[K-1] ... K*A[1]. + Assumes the Ap[] are pseudo-normalized, i.e. 0 <= Ap[][n] <= 1. + This condition is also fulfilled at exit. 
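All of the modF helpers here lean on one identity: modulo F = 2^(n*GMP_NUMB_BITS) + 1, the block weight 2^(n*GMP_NUMB_BITS) equals -1, so high blocks are subtracted rather than added. A standalone sketch on the toy modulus F = 2^32 + 1, chosen purely for illustration; mpn_fft_norm_modF does the same thing block-wise:

#include <stdint.h>
#include <stdio.h>

static uint64_t
mod_F (uint64_t x)                     /* F = 2^32 + 1 */
{
  const uint64_t F = ((uint64_t) 1 << 32) + 1;
  uint64_t hi = x >> 32, lo = (uint32_t) x;
  return (lo >= hi) ? lo - hi : lo + F - hi;   /* x = hi*2^32 + lo == lo - hi */
}

int
main (void)
{
  const uint64_t F = ((uint64_t) 1 << 32) + 1;
  uint64_t x = 0x123456789abcdef0u;
  printf ("%d\n", mod_F (x) == x % F);   /* prints 1 */
  return 0;
}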
+*/ +static void +mpn_fft_fftinv (mp_ptr *Ap, mp_size_t K, mp_size_t omega, mp_size_t n, mp_ptr tp) +{ + if (K == 2) + { + mp_limb_t cy; +#if HAVE_NATIVE_mpn_add_n_sub_n + cy = mpn_add_n_sub_n (Ap[0], Ap[1], Ap[0], Ap[1], n + 1) & 1; +#else + MPN_COPY (tp, Ap[0], n + 1); + mpn_add_n (Ap[0], Ap[0], Ap[1], n + 1); + cy = mpn_sub_n (Ap[1], tp, Ap[1], n + 1); +#endif + if (Ap[0][n] > 1) /* can be 2 or 3 */ + { /* Ap[0][n] = 1 - mpn_sub_1 (Ap[0], Ap[0], n, Ap[0][n] - 1); */ + mp_limb_t cc = Ap[0][n] - 1; + Ap[0][n] = 1; + MPN_DECR_U (Ap[0], n + 1, cc); + } + if (cy) /* Ap[1][n] can be -1 or -2 */ + { /* Ap[1][n] = mpn_add_1 (Ap[1], Ap[1], n, ~Ap[1][n] + 1); */ + mp_limb_t cc = ~Ap[1][n] + 1; + Ap[1][n] = 0; + MPN_INCR_U (Ap[1], n + 1, cc); + } + } + else + { + mp_size_t j, K2 = K >> 1; + + mpn_fft_fftinv (Ap, K2, 2 * omega, n, tp); + mpn_fft_fftinv (Ap + K2, K2, 2 * omega, n, tp); + /* A[j] <- A[j] + omega^j A[j+K/2] + A[j+K/2] <- A[j] + omega^(j+K/2) A[j+K/2] */ + for (j = 0; j < K2; j++, Ap++) + { + /* Ap[K2] <- Ap[0] + Ap[K2] * 2^((j + K2) * omega) + Ap[0] <- Ap[0] + Ap[K2] * 2^(j * omega) */ + mpn_fft_mul_2exp_modF (tp, Ap[K2], j * omega, n); +#if HAVE_NATIVE_mpn_add_n_sub_n + mpn_fft_add_sub_modF (Ap[0], Ap[K2], tp, n); +#else + mpn_fft_sub_modF (Ap[K2], Ap[0], tp, n); + mpn_fft_add_modF (Ap[0], Ap[0], tp, n); +#endif + } + } +} + + +/* R <- A/2^k mod 2^(n*GMP_NUMB_BITS)+1 */ +static void +mpn_fft_div_2exp_modF (mp_ptr r, mp_srcptr a, mp_bitcnt_t k, mp_size_t n) +{ + mp_bitcnt_t i; + + ASSERT (r != a); + i = (mp_bitcnt_t) 2 * n * GMP_NUMB_BITS - k; + mpn_fft_mul_2exp_modF (r, a, i, n); + /* 1/2^k = 2^(2nL-k) mod 2^(n*GMP_NUMB_BITS)+1 */ + /* normalize so that R < 2^(n*GMP_NUMB_BITS)+1 */ + mpn_fft_normalize (r, n); +} + + +/* {rp,n} <- {ap,an} mod 2^(n*GMP_NUMB_BITS)+1, n <= an <= 3*n. + Returns carry out, i.e. 1 iff {ap,an} = -1 mod 2^(n*GMP_NUMB_BITS)+1, + then {rp,n}=0. +*/ +static mp_size_t +mpn_fft_norm_modF (mp_ptr rp, mp_size_t n, mp_ptr ap, mp_size_t an) +{ + mp_size_t l, m, rpn; + mp_limb_t cc; + + ASSERT ((n <= an) && (an <= 3 * n)); + m = an - 2 * n; + if (m > 0) + { + l = n; + /* add {ap, m} and {ap+2n, m} in {rp, m} */ + cc = mpn_add_n (rp, ap, ap + 2 * n, m); + /* copy {ap+m, n-m} to {rp+m, n-m} */ + rpn = mpn_add_1 (rp + m, ap + m, n - m, cc); + } + else + { + l = an - n; /* l <= n */ + MPN_COPY (rp, ap, n); + rpn = 0; + } + + /* remains to subtract {ap+n, l} from {rp, n+1} */ + rpn -= mpn_sub (rp, rp, n, ap + n, l); + if (rpn < 0) /* necessarily rpn = -1 */ + rpn = mpn_add_1 (rp, rp, n, CNST_LIMB(1)); + return rpn; +} + +/* store in A[0..nprime] the first M bits from {n, nl}, + in A[nprime+1..] the following M bits, ... + Assumes M is a multiple of GMP_NUMB_BITS (M = l * GMP_NUMB_BITS). + T must have space for at least (nprime + 1) limbs. + We must have nl <= 2*K*l. +*/ +static void +mpn_mul_fft_decompose (mp_ptr A, mp_ptr *Ap, mp_size_t K, mp_size_t nprime, + mp_srcptr n, mp_size_t nl, mp_size_t l, mp_size_t Mp, + mp_ptr T) +{ + mp_size_t i, j; + mp_ptr tmp; + mp_size_t Kl = K * l; + TMP_DECL; + TMP_MARK; + + if (nl > Kl) /* normalize {n, nl} mod 2^(Kl*GMP_NUMB_BITS)+1 */ + { + mp_size_t dif = nl - Kl; + + tmp = TMP_BALLOC_LIMBS(Kl + 1); + tmp[Kl] = 0; + +#if ! WANT_OLD_FFT_FULL + ASSERT_ALWAYS (dif <= Kl); +#else + /* The comment "We must have nl <= 2*K*l." says that + ((dif = nl - Kl) > Kl) should never happen. 
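mpn_fft_div_2exp_modF above works because 2^(2N) = (2^N)^2 = (-1)^2 = 1 mod 2^N+1, so dividing by 2^k is the same as multiplying by 2^(2N-k), which is just another shift. The toy modulus F = 2^8 + 1 = 257 shows the identity end to end (standalone, illustration only):

#include <stdio.h>

int
main (void)
{
  const int F = 257, N = 8, k = 3;
  int inv = 1;
  for (int i = 0; i < 2 * N - k; i++)   /* inv = 2^(2N-k) mod F */
    inv = (inv * 2) % F;
  int x = 100;
  int y = (x * inv) % F;                /* y = x / 2^k mod F */
  printf ("%d\n", (y * (1 << k)) % F == x);   /* prints 1 */
  return 0;
}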
*/ + if (UNLIKELY (dif > Kl)) + { + mp_limb_signed_t cy; + int subp = 0; + + cy = mpn_sub_n (tmp, n, n + Kl, Kl); + n += 2 * Kl; + dif -= Kl; + + /* now dif > 0 */ + while (dif > Kl) + { + if (subp) + cy += mpn_sub_n (tmp, tmp, n, Kl); + else + cy -= mpn_add_n (tmp, tmp, n, Kl); + subp ^= 1; + n += Kl; + dif -= Kl; + } + /* now dif <= Kl */ + if (subp) + cy += mpn_sub (tmp, tmp, Kl, n, dif); + else + cy -= mpn_add (tmp, tmp, Kl, n, dif); + if (cy >= 0) + MPN_INCR_U (tmp, Kl + 1, cy); + else + { + tmp[Kl] = 1; + MPN_DECR_U (tmp, Kl + 1, -cy - 1); + } + } + else /* dif <= Kl, i.e. nl <= 2 * Kl */ +#endif + { + mp_limb_t cy; + cy = mpn_sub (tmp, n, Kl, n + Kl, dif); + MPN_INCR_U (tmp, Kl + 1, cy); + } + nl = Kl + 1; + n = tmp; + } + for (i = 0; i < K; i++) + { + Ap[i] = A; + /* store the next M bits of n into A[0..nprime] */ + if (nl > 0) /* nl is the number of remaining limbs */ + { + j = (l <= nl && i < K - 1) ? l : nl; /* store j next limbs */ + nl -= j; + MPN_COPY (T, n, j); + MPN_ZERO (T + j, nprime + 1 - j); + n += l; + mpn_fft_mul_2exp_modF (A, T, i * Mp, nprime); + } + else + MPN_ZERO (A, nprime + 1); + A += nprime + 1; + } + ASSERT_ALWAYS (nl == 0); + TMP_FREE; +} + +/* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*GMP_NUMB_BITS + op is pl limbs, its high bit is returned. + One must have pl = mpn_fft_next_size (pl, k). + T must have space for 2 * (nprime + 1) limbs. +*/ + +static mp_limb_t +mpn_mul_fft_internal (mp_ptr op, mp_size_t pl, int k, + mp_ptr *Ap, mp_ptr *Bp, mp_ptr unusedA, mp_ptr B, + mp_size_t nprime, mp_size_t l, mp_size_t Mp, + int **fft_l, mp_ptr T, int sqr) +{ + mp_size_t K, i, pla, lo, sh, j; + mp_ptr p; + mp_limb_t cc; + + K = (mp_size_t) 1 << k; + + /* direct fft's */ + mpn_fft_fft (Ap, K, fft_l + k, 2 * Mp, nprime, 1, T); + if (!sqr) + mpn_fft_fft (Bp, K, fft_l + k, 2 * Mp, nprime, 1, T); + + /* term to term multiplications */ + mpn_fft_mul_modF_K (Ap, sqr ? Ap : Bp, nprime, K); + + /* inverse fft's */ + mpn_fft_fftinv (Ap, K, 2 * Mp, nprime, T); + + /* division of terms after inverse fft */ + Bp[0] = T + nprime + 1; + mpn_fft_div_2exp_modF (Bp[0], Ap[0], k, nprime); + for (i = 1; i < K; i++) + { + Bp[i] = Ap[i - 1]; + mpn_fft_div_2exp_modF (Bp[i], Ap[i], k + (K - i) * Mp, nprime); + } + + /* addition of terms in result p */ + MPN_ZERO (T, nprime + 1); + pla = l * (K - 1) + nprime + 1; /* number of required limbs for p */ + p = B; /* B has K*(n' + 1) limbs, which is >= pla, i.e. enough */ + MPN_ZERO (p, pla); + cc = 0; /* will accumulate the (signed) carry at p[pla] */ + for (i = K - 1, lo = l * i + nprime,sh = l * i; i >= 0; i--,lo -= l,sh -= l) + { + mp_ptr n = p + sh; + + j = (K - i) & (K - 1); + + cc += mpn_add (n, n, pla - sh, Bp[j], nprime + 1); + T[2 * l] = i + 1; /* T = (i + 1)*2^(2*M) */ + if (mpn_cmp (Bp[j], T, nprime + 1) > 0) + { /* subtract 2^N'+1 */ + cc -= mpn_sub_1 (n, n, pla - sh, CNST_LIMB(1)); + cc -= mpn_sub_1 (p + lo, p + lo, pla - lo, CNST_LIMB(1)); + } + } + if (cc == -CNST_LIMB(1)) + { + if ((cc = mpn_add_1 (p + pla - pl, p + pla - pl, pl, CNST_LIMB(1)))) + { + /* p[pla-pl]...p[pla-1] are all zero */ + mpn_sub_1 (p + pla - pl - 1, p + pla - pl - 1, pl + 1, CNST_LIMB(1)); + mpn_sub_1 (p + pla - 1, p + pla - 1, 1, CNST_LIMB(1)); + } + } + else if (cc == 1) + { + if (pla >= 2 * pl) + { + while ((cc = mpn_add_1 (p + pla - 2 * pl, p + pla - 2 * pl, 2 * pl, cc))) + ; + } + else + { + MPN_DECR_U (p + pla - pl, pl, cc); + } + } + else + ASSERT (cc == 0); + + /* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... 
] + < K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ] + < K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */ + return mpn_fft_norm_modF (op, pl, p, pla); +} + +/* return the lcm of a and 2^k */ +static mp_bitcnt_t +mpn_mul_fft_lcm (mp_bitcnt_t a, int k) +{ + mp_bitcnt_t l = k; + + while (a % 2 == 0 && k > 0) + { + a >>= 1; + k --; + } + return a << l; +} + + +mp_limb_t +mpn_mul_fft (mp_ptr op, mp_size_t pl, + mp_srcptr n, mp_size_t nl, + mp_srcptr m, mp_size_t ml, + int k) +{ + int i; + mp_size_t K, maxLK; + mp_size_t N, Nprime, nprime, M, Mp, l; + mp_ptr *Ap, *Bp, A, T, B; + int **fft_l, *tmp; + int sqr = (n == m && nl == ml); + mp_limb_t h; + TMP_DECL; + + TRACE (printf ("\nmpn_mul_fft pl=%ld nl=%ld ml=%ld k=%d\n", pl, nl, ml, k)); + ASSERT_ALWAYS (mpn_fft_next_size (pl, k) == pl); + + TMP_MARK; + N = pl * GMP_NUMB_BITS; + fft_l = TMP_BALLOC_TYPE (k + 1, int *); + tmp = TMP_BALLOC_TYPE ((size_t) 2 << k, int); + for (i = 0; i <= k; i++) + { + fft_l[i] = tmp; + tmp += (mp_size_t) 1 << i; + } + + mpn_fft_initl (fft_l, k); + K = (mp_size_t) 1 << k; + M = N >> k; /* N = 2^k M */ + l = 1 + (M - 1) / GMP_NUMB_BITS; + maxLK = mpn_mul_fft_lcm (GMP_NUMB_BITS, k); /* lcm (GMP_NUMB_BITS, 2^k) */ + + Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK; + /* Nprime = ceil((2*M+k+3)/maxLK)*maxLK; */ + nprime = Nprime / GMP_NUMB_BITS; + TRACE (printf ("N=%ld K=%ld, M=%ld, l=%ld, maxLK=%ld, Np=%ld, np=%ld\n", + N, K, M, l, maxLK, Nprime, nprime)); + /* we should ensure that recursively, nprime is a multiple of the next K */ + if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) + { + mp_size_t K2; + for (;;) + { + K2 = (mp_size_t) 1 << mpn_fft_best_k (nprime, sqr); + if ((nprime & (K2 - 1)) == 0) + break; + nprime = (nprime + K2 - 1) & -K2; + Nprime = nprime * GMP_LIMB_BITS; + /* warning: since nprime changed, K2 may change too! */ + } + TRACE (printf ("new maxLK=%ld, Np=%ld, np=%ld\n", maxLK, Nprime, nprime)); + } + ASSERT_ALWAYS (nprime < pl); /* otherwise we'll loop */ + + T = TMP_BALLOC_LIMBS (2 * (nprime + 1)); + Mp = Nprime >> k; + + TRACE (printf ("%ldx%ld limbs -> %ld times %ldx%ld limbs (%1.2f)\n", + pl, pl, K, nprime, nprime, 2.0 * (double) N / Nprime / K); + printf (" temp space %ld\n", 2 * K * (nprime + 1))); + + A = TMP_BALLOC_LIMBS (K * (nprime + 1)); + Ap = TMP_BALLOC_MP_PTRS (K); + Bp = TMP_BALLOC_MP_PTRS (K); + mpn_mul_fft_decompose (A, Ap, K, nprime, n, nl, l, Mp, T); + if (sqr) + { + mp_size_t pla; + pla = l * (K - 1) + nprime + 1; /* number of required limbs for p */ + B = TMP_BALLOC_LIMBS (pla); + } + else + { + B = TMP_BALLOC_LIMBS (K * (nprime + 1)); + mpn_mul_fft_decompose (B, Bp, K, nprime, m, ml, l, Mp, T); + } + h = mpn_mul_fft_internal (op, pl, k, Ap, Bp, A, B, nprime, l, Mp, fft_l, T, sqr); + + TMP_FREE; + return h; +} + +#if WANT_OLD_FFT_FULL +/* multiply {n, nl} by {m, ml}, and put the result in {op, nl+ml} */ +void +mpn_mul_fft_full (mp_ptr op, + mp_srcptr n, mp_size_t nl, + mp_srcptr m, mp_size_t ml) +{ + mp_ptr pad_op; + mp_size_t pl, pl2, pl3, l; + mp_size_t cc, c2, oldcc; + int k2, k3; + int sqr = (n == m && nl == ml); + + pl = nl + ml; /* total number of limbs of the result */ + + /* perform a fft mod 2^(2N)+1 and one mod 2^(3N)+1. + We must have pl3 = 3/2 * pl2, with pl2 a multiple of 2^k2, and + pl3 a multiple of 2^k3. Since k3 >= k2, both are multiples of 2^k2, + and pl2 must be an even multiple of 2^k2. Thus (pl2,pl3) = + (2*j*2^k2,3*j*2^k2), which works for 3*j <= pl/2^k2 <= 5*j. + We need that consecutive intervals overlap, i.e. 
5*j >= 3*(j+1), + which requires j>=2. Thus this scheme requires pl >= 6 * 2^FFT_FIRST_K. */ + + /* ASSERT_ALWAYS(pl >= 6 * (1 << FFT_FIRST_K)); */ + + pl2 = (2 * pl - 1) / 5; /* ceil (2pl/5) - 1 */ + do + { + pl2++; + k2 = mpn_fft_best_k (pl2, sqr); /* best fft size for pl2 limbs */ + pl2 = mpn_fft_next_size (pl2, k2); + pl3 = 3 * pl2 / 2; /* since k>=FFT_FIRST_K=4, pl2 is a multiple of 2^4, + thus pl2 / 2 is exact */ + k3 = mpn_fft_best_k (pl3, sqr); + } + while (mpn_fft_next_size (pl3, k3) != pl3); + + TRACE (printf ("mpn_mul_fft_full nl=%ld ml=%ld -> pl2=%ld pl3=%ld k=%d\n", + nl, ml, pl2, pl3, k2)); + + ASSERT_ALWAYS(pl3 <= pl); + cc = mpn_mul_fft (op, pl3, n, nl, m, ml, k3); /* mu */ + ASSERT(cc == 0); + pad_op = __GMP_ALLOCATE_FUNC_LIMBS (pl2); + cc = mpn_mul_fft (pad_op, pl2, n, nl, m, ml, k2); /* lambda */ + cc = -cc + mpn_sub_n (pad_op, pad_op, op, pl2); /* lambda - low(mu) */ + /* 0 <= cc <= 1 */ + ASSERT(0 <= cc && cc <= 1); + l = pl3 - pl2; /* l = pl2 / 2 since pl3 = 3/2 * pl2 */ + c2 = mpn_add_n (pad_op, pad_op, op + pl2, l); + cc = mpn_add_1 (pad_op + l, pad_op + l, l, (mp_limb_t) c2) - cc; + ASSERT(-1 <= cc && cc <= 1); + if (cc < 0) + cc = mpn_add_1 (pad_op, pad_op, pl2, (mp_limb_t) -cc); + ASSERT(0 <= cc && cc <= 1); + /* now lambda-mu = {pad_op, pl2} - cc mod 2^(pl2*GMP_NUMB_BITS)+1 */ + oldcc = cc; +#if HAVE_NATIVE_mpn_add_n_sub_n + c2 = mpn_add_n_sub_n (pad_op + l, pad_op, pad_op, pad_op + l, l); + cc += c2 >> 1; /* carry out from high <- low + high */ + c2 = c2 & 1; /* borrow out from low <- low - high */ +#else + { + mp_ptr tmp; + TMP_DECL; + + TMP_MARK; + tmp = TMP_BALLOC_LIMBS (l); + MPN_COPY (tmp, pad_op, l); + c2 = mpn_sub_n (pad_op, pad_op, pad_op + l, l); + cc += mpn_add_n (pad_op + l, tmp, pad_op + l, l); + TMP_FREE; + } +#endif + c2 += oldcc; + /* first normalize {pad_op, pl2} before dividing by 2: c2 is the borrow + at pad_op + l, cc is the carry at pad_op + pl2 */ + /* 0 <= cc <= 2 */ + cc -= mpn_sub_1 (pad_op + l, pad_op + l, l, (mp_limb_t) c2); + /* -1 <= cc <= 2 */ + if (cc > 0) + cc = -mpn_sub_1 (pad_op, pad_op, pl2, (mp_limb_t) cc); + /* now -1 <= cc <= 0 */ + if (cc < 0) + cc = mpn_add_1 (pad_op, pad_op, pl2, (mp_limb_t) -cc); + /* now {pad_op, pl2} is normalized, with 0 <= cc <= 1 */ + if (pad_op[0] & 1) /* if odd, add 2^(pl2*GMP_NUMB_BITS)+1 */ + cc += 1 + mpn_add_1 (pad_op, pad_op, pl2, CNST_LIMB(1)); + /* now 0 <= cc <= 2, but cc=2 cannot occur since it would give a carry + out below */ + mpn_rshift (pad_op, pad_op, pl2, 1); /* divide by two */ + if (cc) /* then cc=1 */ + pad_op [pl2 - 1] |= (mp_limb_t) 1 << (GMP_NUMB_BITS - 1); + /* now {pad_op,pl2}-cc = (lambda-mu)/(1-2^(l*GMP_NUMB_BITS)) + mod 2^(pl2*GMP_NUMB_BITS) + 1 */ + c2 = mpn_add_n (op, op, pad_op, pl2); /* no need to add cc (is 0) */ + /* since pl2+pl3 >= pl, necessary the extra limbs (including cc) are zero */ + MPN_COPY (op + pl3, pad_op, pl - pl3); + ASSERT_MPN_ZERO_P (pad_op + pl - pl3, pl2 + pl3 - pl); + __GMP_FREE_FUNC_LIMBS (pad_op, pl2); + /* since the final result has at most pl limbs, no carry out below */ + MPN_INCR_U (op + pl2, pl - pl2, (mp_limb_t) c2); +} +#endif diff --git a/gmp-6.3.0/mpn/generic/mul_n.c b/gmp-6.3.0/mpn/generic/mul_n.c new file mode 100644 index 0000000..36bd923 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mul_n.c @@ -0,0 +1,96 @@ +/* mpn_mul_n -- multiply natural numbers. + +Copyright 1991, 1993, 1994, 1996-2003, 2005, 2008, 2009 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +void +mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n) +{ + ASSERT (n >= 1); + ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n)); + ASSERT (! MPN_OVERLAP_P (p, 2 * n, b, n)); + + if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) + { + mpn_mul_basecase (p, a, n, b, n); + } + else if (BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) + { + /* Allocate workspace of fixed size on stack: fast! */ + mp_limb_t ws[mpn_toom22_mul_itch (MUL_TOOM33_THRESHOLD_LIMIT-1, + MUL_TOOM33_THRESHOLD_LIMIT-1)]; + ASSERT (MUL_TOOM33_THRESHOLD <= MUL_TOOM33_THRESHOLD_LIMIT); + mpn_toom22_mul (p, a, n, b, n, ws); + } + else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD)) + { + mp_ptr ws; + TMP_SDECL; + TMP_SMARK; + ws = TMP_SALLOC_LIMBS (mpn_toom33_mul_itch (n, n)); + mpn_toom33_mul (p, a, n, b, n, ws); + TMP_SFREE; + } + else if (BELOW_THRESHOLD (n, MUL_TOOM6H_THRESHOLD)) + { + mp_ptr ws; + TMP_SDECL; + TMP_SMARK; + ws = TMP_SALLOC_LIMBS (mpn_toom44_mul_itch (n, n)); + mpn_toom44_mul (p, a, n, b, n, ws); + TMP_SFREE; + } + else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD)) + { + mp_ptr ws; + TMP_SDECL; + TMP_SMARK; + ws = TMP_SALLOC_LIMBS (mpn_toom6_mul_n_itch (n)); + mpn_toom6h_mul (p, a, n, b, n, ws); + TMP_SFREE; + } + else if (BELOW_THRESHOLD (n, MUL_FFT_THRESHOLD)) + { + mp_ptr ws; + TMP_DECL; + TMP_MARK; + ws = TMP_ALLOC_LIMBS (mpn_toom8_mul_n_itch (n)); + mpn_toom8h_mul (p, a, n, b, n, ws); + TMP_FREE; + } + else + { + /* The current FFT code allocates its own space. That should probably + change. */ + mpn_fft_mul (p, a, n, b, n); + } +} diff --git a/gmp-6.3.0/mpn/generic/mullo_basecase.c b/gmp-6.3.0/mpn/generic/mullo_basecase.c new file mode 100644 index 0000000..9a4cd3d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mullo_basecase.c @@ -0,0 +1,90 @@ +/* mpn_mullo_basecase -- Internal routine to multiply two natural + numbers of length n and return the low part. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + + +Copyright (C) 2000, 2002, 2004, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* FIXME: Should optionally use mpn_mul_2/mpn_addmul_2. */ + +#ifndef MULLO_VARIANT +#define MULLO_VARIANT 2 +#endif + + +#if MULLO_VARIANT == 1 +void +mpn_mullo_basecase (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + mp_size_t i; + + mpn_mul_1 (rp, up, n, vp[0]); + + for (i = n - 1; i > 0; i--) + { + vp++; + rp++; + mpn_addmul_1 (rp, up, i, vp[0]); + } +} +#endif + + +#if MULLO_VARIANT == 2 +void +mpn_mullo_basecase (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + mp_limb_t h; + + h = up[0] * vp[n - 1]; + + if (n != 1) + { + mp_size_t i; + mp_limb_t v0; + + v0 = *vp++; + h += up[n - 1] * v0 + mpn_mul_1 (rp, up, n - 1, v0); + rp++; + + for (i = n - 2; i > 0; i--) + { + v0 = *vp++; + h += up[i] * v0 + mpn_addmul_1 (rp, up, i, v0); + rp++; + } + } + + rp[0] = h; +} +#endif diff --git a/gmp-6.3.0/mpn/generic/mullo_n.c b/gmp-6.3.0/mpn/generic/mullo_n.c new file mode 100644 index 0000000..6f4e7ae --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mullo_n.c @@ -0,0 +1,243 @@ +/* mpn_mullo_n -- multiply two n-limb numbers and return the low n limbs + of their products. + + Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + + THIS IS (FOR NOW) AN INTERNAL FUNCTION. IT IS ONLY SAFE TO REACH THIS + FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED + THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2004, 2005, 2009, 2010, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY +#define MAYBE_range_basecase 1 +#define MAYBE_range_toom22 1 +#else +#define MAYBE_range_basecase \ + ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM22_THRESHOLD*36/(36-11)) +#define MAYBE_range_toom22 \ + ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM33_THRESHOLD*36/(36-11) ) +#endif + +/* THINK: The DC strategy uses different constants in different Toom's + ranges. Something smoother? +*/ + +/* + Compute the least significant half of the product {xy,n}*{yp,n}, or + formally {rp,n} = {xy,n}*{yp,n} Mod (B^n). 
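Both basecase variants above compute only the triangle of partial products that can influence the low n limbs. A standalone model of that truncation, assuming 64-bit limbs and __int128 (an illustration in the shape of MULLO_VARIANT 1, not GMP's code):

#include <stdint.h>
#include <stddef.h>

static void
mullo (uint64_t *rp, const uint64_t *up, const uint64_t *vp, size_t n)
{
  for (size_t i = 0; i < n; i++)
    rp[i] = 0;
  for (size_t j = 0; j < n; j++)
    {
      uint64_t cl = 0;
      for (size_t i = 0; i + j < n; i++)    /* truncated addmul_1 row */
        {
          unsigned __int128 t =
            (unsigned __int128) up[i] * vp[j] + rp[i + j] + cl;
          rp[i + j] = (uint64_t) t;
          cl = (uint64_t) (t >> 64);
        }
      /* the carry out of limb n-1 is discarded: we work mod B^n */
    }
}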
+
+  Above the given threshold, the Divide and Conquer strategy is used.
+  The operands are split in two, and a full product plus two mullo
+  are used to obtain the final result.  The more natural strategy is to
+  split in two halves, but this is far from optimal when a
+  sub-quadratic multiplication is used.
+
+  Mulders suggests an unbalanced split in favour of the full product,
+  split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
+
+  To compute the value of a, we assume that the cost of mullo for a
+  given size ML(n) is a fraction of the cost of a full product with
+  the same size M(n), and the cost M(n) = n^e for some exponent
+  1 < e <= 2; then we can write:
+
+  ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
+
+  Given a value for e, we want to minimise the value of k, i.e. the
+  function k = (1-a)^e/(1-2*a^e).
+
+  With e = 2, the exponent for schoolbook multiplication, the minimum is
+  given by the values a = 1-a = 1/2.
+
+  With e = log(3)/log(2), the exponent for Karatsuba (aka toom22),
+  Mulders computes (1-a) = 0.694... and we approximate a with 11/36.
+
+  Other possible approximations follow:
+  e = log(5)/log(3) [Toom-3] -> a ~= 9/40
+  e = log(7)/log(4) [Toom-4] -> a ~= 7/39
+  e = log(11)/log(6) [Toom-6] -> a ~= 1/8
+  e = log(15)/log(8) [Toom-8] -> a ~= 1/10
+
+  The values above were obtained with the following trivial commands
+  in the gp-pari shell:
+
+fun(e,a)=(1-a)^e/(1-2*a^e)
+mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,x+(b-c)/8,x-(b-c)/8))}
+cont(e)=bestappr(mul(e,1/2,0),1000)
+*/
+
+#define mpn_mullo_n_itch(n) (2*(n))
+
+/* mpn_dc_mullo_n requires a scratch space of 2*n limbs at tp.
+   It accepts tp == rp.  */
+static void
+mpn_dc_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, mp_ptr tp)
+{
+  mp_size_t n2, n1;
+  ASSERT (n >= 2);
+  ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
+  ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
+  ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
+
+  /* Divide-and-conquer */
+
+  /* We need fractional approximation of the value 0 < a <= 1/2
+     giving the minimum in the function k=(1-a)^e/(1-2*a^e).
+  */
+  if (MAYBE_range_basecase && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD*36/(36-11)))
+    n1 = n >> 1;
+  else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD*36/(36-11)))
+    n1 = n * 11 / (size_t) 36;	/* n1 ~= n*(1-.694...) */
+  else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD*40/(40-9)))
+    n1 = n * 9 / (size_t) 40;	/* n1 ~= n*(1-.775...) */
+  else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD*10/9))
+    n1 = n * 7 / (size_t) 39;	/* n1 ~= n*(1-.821...) */
+  /* n1 = n * 4 / (size_t) 31;	// n1 ~= n*(1-.871...) [TOOM66] */
+  else
+    n1 = n / (size_t) 10;	/* n1 ~= n*(1-.899...) [TOOM88] */
+
+  n2 = n - n1;
+
+  /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0,
+	      y = y1 2^(n2 GMP_NUMB_BITS) + y0 */
+
+  /* x0 * y0 */
+  mpn_mul_n (tp, xp, yp, n2);
+  MPN_COPY (rp, tp, n2);
+
+  /* x1 * y0 * 2^(n2 GMP_NUMB_BITS) */
+  if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
+    mpn_mul_basecase (tp + n, xp + n2, n1, yp, n1);
+  else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
+    mpn_mullo_basecase (tp + n, xp + n2, yp, n1);
+  else
+    mpn_dc_mullo_n (tp + n, xp + n2, yp, n1, tp + n);
+  mpn_add_n (rp + n2, tp + n2, tp + n, n1);
+
+  /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */
+  if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
+    mpn_mul_basecase (tp + n, xp, n1, yp + n2, n1);
+  else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
+    mpn_mullo_basecase (tp + n, xp, yp + n2, n1);
+  else
+    mpn_dc_mullo_n (tp + n, xp, yp + n2, n1, tp + n);
+  mpn_add_n (rp + n2, rp + n2, tp + n, n1);
+}
+
+/* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0.  */
+#define MUL_BASECASE_ALLOC \
+ (MULLO_BASECASE_THRESHOLD_LIMIT == 0 ?
1 : 2*MULLO_BASECASE_THRESHOLD_LIMIT) + +/* FIXME: This function should accept a temporary area; dc_mullow_n + accepts a pointer tp, and handle the case tp == rp, do the same here. + Maybe recombine the two functions. + THINK: If mpn_mul_basecase is always faster than mpn_mullo_basecase + (typically thanks to mpn_addmul_2) should we unconditionally use + mpn_mul_n? +*/ + +void +mpn_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) +{ + ASSERT (n >= 1); + ASSERT (! MPN_OVERLAP_P (rp, n, xp, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp, n)); + + if (BELOW_THRESHOLD (n, MULLO_BASECASE_THRESHOLD)) + { + /* Allocate workspace of fixed size on stack: fast! */ + mp_limb_t tp[MUL_BASECASE_ALLOC]; + mpn_mul_basecase (tp, xp, n, yp, n); + MPN_COPY (rp, tp, n); + } + else if (BELOW_THRESHOLD (n, MULLO_DC_THRESHOLD)) + { + mpn_mullo_basecase (rp, xp, yp, n); + } + else + { + mp_ptr tp; + TMP_DECL; + TMP_MARK; + tp = TMP_ALLOC_LIMBS (mpn_mullo_n_itch (n)); + if (BELOW_THRESHOLD (n, MULLO_MUL_N_THRESHOLD)) + { + mpn_dc_mullo_n (rp, xp, yp, n, tp); + } + else + { + /* For really large operands, use plain mpn_mul_n but throw away upper n + limbs of result. */ +#if !TUNE_PROGRAM_BUILD && (MULLO_MUL_N_THRESHOLD > MUL_FFT_THRESHOLD) + mpn_fft_mul (tp, xp, n, yp, n); +#else + mpn_mul_n (tp, xp, yp, n); +#endif + MPN_COPY (rp, tp, n); + } + TMP_FREE; + } +} diff --git a/gmp-6.3.0/mpn/generic/mulmid.c b/gmp-6.3.0/mpn/generic/mulmid.c new file mode 100644 index 0000000..f35c5fb --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mulmid.c @@ -0,0 +1,255 @@ +/* mpn_mulmid -- middle product + + Contributed by David Harvey. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + + +#define CHUNK (200 + MULMID_TOOM42_THRESHOLD) + + +void +mpn_mulmid (mp_ptr rp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn) +{ + mp_size_t rn, k; + mp_ptr scratch, temp; + + ASSERT (an >= bn); + ASSERT (bn >= 1); + ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, ap, an)); + ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, bp, bn)); + + if (bn < MULMID_TOOM42_THRESHOLD) + { + /* region not tall enough to make toom42 worthwhile for any portion */ + + if (an < CHUNK) + { + /* region not too wide either, just call basecase directly */ + mpn_mulmid_basecase (rp, ap, an, bp, bn); + return; + } + + /* Region quite wide. 
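+	 A single basecase call here would sweep all of {ap,an} for every
+	 limb of {bp,bn}, with poor cache locality when an is large.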
For better locality, use basecase on chunks: + + AAABBBCC.. + .AAABBBCC. + ..AAABBBCC + */ + + k = CHUNK - bn + 1; /* number of diagonals per chunk */ + + /* first chunk (marked A in the above diagram) */ + mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn); + + /* remaining chunks (B, C, etc) */ + an -= k; + + while (an >= CHUNK) + { + mp_limb_t t0, t1, cy; + ap += k, rp += k; + t0 = rp[0], t1 = rp[1]; + mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn); + ADDC_LIMB (cy, rp[0], rp[0], t0); /* add back saved limbs */ + MPN_INCR_U (rp + 1, k + 1, t1 + cy); + an -= k; + } + + if (an >= bn) + { + /* last remaining chunk */ + mp_limb_t t0, t1, cy; + ap += k, rp += k; + t0 = rp[0], t1 = rp[1]; + mpn_mulmid_basecase (rp, ap, an, bp, bn); + ADDC_LIMB (cy, rp[0], rp[0], t0); + MPN_INCR_U (rp + 1, an - bn + 2, t1 + cy); + } + + return; + } + + /* region is tall enough for toom42 */ + + rn = an - bn + 1; + + if (rn < MULMID_TOOM42_THRESHOLD) + { + /* region not wide enough to make toom42 worthwhile for any portion */ + + TMP_DECL; + + if (bn < CHUNK) + { + /* region not too tall either, just call basecase directly */ + mpn_mulmid_basecase (rp, ap, an, bp, bn); + return; + } + + /* Region quite tall. For better locality, use basecase on chunks: + + AAAAA.... + .AAAAA... + ..BBBBB.. + ...BBBBB. + ....CCCCC + */ + + TMP_MARK; + + temp = TMP_ALLOC_LIMBS (rn + 2); + + /* first chunk (marked A in the above diagram) */ + bp += bn - CHUNK, an -= bn - CHUNK; + mpn_mulmid_basecase (rp, ap, an, bp, CHUNK); + + /* remaining chunks (B, C, etc) */ + bn -= CHUNK; + + while (bn >= CHUNK) + { + ap += CHUNK, bp -= CHUNK; + mpn_mulmid_basecase (temp, ap, an, bp, CHUNK); + mpn_add_n (rp, rp, temp, rn + 2); + bn -= CHUNK; + } + + if (bn) + { + /* last remaining chunk */ + ap += CHUNK, bp -= bn; + mpn_mulmid_basecase (temp, ap, rn + bn - 1, bp, bn); + mpn_add_n (rp, rp, temp, rn + 2); + } + + TMP_FREE; + return; + } + + /* we're definitely going to use toom42 somewhere */ + + if (bn > rn) + { + /* slice region into chunks, use toom42 on all chunks except possibly + the last: + + AA.... + .AA... + ..BB.. + ...BB. + ....CC + */ + + TMP_DECL; + TMP_MARK; + + temp = TMP_ALLOC_LIMBS (rn + 2 + mpn_toom42_mulmid_itch (rn)); + scratch = temp + rn + 2; + + /* first chunk (marked A in the above diagram) */ + bp += bn - rn; + mpn_toom42_mulmid (rp, ap, bp, rn, scratch); + + /* remaining chunks (B, C, etc) */ + bn -= rn; + + while (bn >= rn) + { + ap += rn, bp -= rn; + mpn_toom42_mulmid (temp, ap, bp, rn, scratch); + mpn_add_n (rp, rp, temp, rn + 2); + bn -= rn; + } + + if (bn) + { + /* last remaining chunk */ + ap += rn, bp -= bn; + mpn_mulmid (temp, ap, rn + bn - 1, bp, bn); + mpn_add_n (rp, rp, temp, rn + 2); + } + + TMP_FREE; + } + else + { + /* slice region into chunks, use toom42 on all chunks except possibly + the last: + + AAABBBCC.. + .AAABBBCC. 
+ ..AAABBBCC + */ + + TMP_DECL; + TMP_MARK; + + scratch = TMP_ALLOC_LIMBS (mpn_toom42_mulmid_itch (bn)); + + /* first chunk (marked A in the above diagram) */ + mpn_toom42_mulmid (rp, ap, bp, bn, scratch); + + /* remaining chunks (B, C, etc) */ + rn -= bn; + + while (rn >= bn) + { + mp_limb_t t0, t1, cy; + ap += bn, rp += bn; + t0 = rp[0], t1 = rp[1]; + mpn_toom42_mulmid (rp, ap, bp, bn, scratch); + ADDC_LIMB (cy, rp[0], rp[0], t0); /* add back saved limbs */ + MPN_INCR_U (rp + 1, bn + 1, t1 + cy); + rn -= bn; + } + + TMP_FREE; + + if (rn) + { + /* last remaining chunk */ + mp_limb_t t0, t1, cy; + ap += bn, rp += bn; + t0 = rp[0], t1 = rp[1]; + mpn_mulmid (rp, ap, rn + bn - 1, bp, bn); + ADDC_LIMB (cy, rp[0], rp[0], t0); + MPN_INCR_U (rp + 1, rn + 1, t1 + cy); + } + } +} diff --git a/gmp-6.3.0/mpn/generic/mulmid_basecase.c b/gmp-6.3.0/mpn/generic/mulmid_basecase.c new file mode 100644 index 0000000..d5434ea --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mulmid_basecase.c @@ -0,0 +1,82 @@ +/* mpn_mulmid_basecase -- classical middle product algorithm + + Contributed by David Harvey. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" +#include "longlong.h" + +/* Middle product of {up,un} and {vp,vn}, write result to {rp,un-vn+3}. + Must have un >= vn >= 1. + + Neither input buffer may overlap with the output buffer. */ + +void +mpn_mulmid_basecase (mp_ptr rp, + mp_srcptr up, mp_size_t un, + mp_srcptr vp, mp_size_t vn) +{ + mp_limb_t lo, hi; /* last two limbs of output */ + mp_limb_t cy; + + ASSERT (un >= vn); + ASSERT (vn >= 1); + ASSERT (! MPN_OVERLAP_P (rp, un - vn + 3, up, un)); + ASSERT (! 
MPN_OVERLAP_P (rp, un - vn + 3, vp, vn));
+
+  up += vn - 1;
+  un -= vn - 1;
+
+  /* multiply by first limb, store result */
+  lo = mpn_mul_1 (rp, up, un, vp[0]);
+  hi = 0;
+
+  /* accumulate remaining rows */
+  for (vn--; vn; vn--)
+    {
+      up--, vp++;
+      cy = mpn_addmul_1 (rp, up, un, vp[0]);
+      add_ssaaaa (hi, lo, hi, lo, CNST_LIMB(0), cy);
+    }
+
+  /* store final limbs */
+#if GMP_NAIL_BITS != 0
+  hi = (hi << GMP_NAIL_BITS) + (lo >> GMP_NUMB_BITS);
+  lo &= GMP_NUMB_MASK;
+#endif
+
+  rp[un] = lo;
+  rp[un + 1] = hi;
+}
diff --git a/gmp-6.3.0/mpn/generic/mulmid_n.c b/gmp-6.3.0/mpn/generic/mulmid_n.c
new file mode 100644
index 0000000..ac7e8f1
--- /dev/null
+++ b/gmp-6.3.0/mpn/generic/mulmid_n.c
@@ -0,0 +1,61 @@
+/* mpn_mulmid_n -- balanced middle product
+
+   Contributed by David Harvey.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+
+void
+mpn_mulmid_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
+{
+  ASSERT (n >= 1);
+  ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1));
+  ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n));
+
+  if (n < MULMID_TOOM42_THRESHOLD)
+    {
+      mpn_mulmid_basecase (rp, ap, 2*n - 1, bp, n);
+    }
+  else
+    {
+      mp_ptr scratch;
+      TMP_DECL;
+      TMP_MARK;
+      scratch = TMP_ALLOC_LIMBS (mpn_toom42_mulmid_itch (n));
+      mpn_toom42_mulmid (rp, ap, bp, n, scratch);
+      TMP_FREE;
+    }
+}
diff --git a/gmp-6.3.0/mpn/generic/mulmod_bknp1.c b/gmp-6.3.0/mpn/generic/mulmod_bknp1.c
new file mode 100644
index 0000000..feb10eb
--- /dev/null
+++ b/gmp-6.3.0/mpn/generic/mulmod_bknp1.c
@@ -0,0 +1,502 @@
+/* Multiplication mod B^n+1, for small operands.
+
+   Contributed to the GNU project by Marco Bodrato.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2020-2022 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#ifndef MOD_BKNP1_USE11 +#define MOD_BKNP1_USE11 ((GMP_NUMB_BITS % 8 != 0) && (GMP_NUMB_BITS % 2 == 0)) +#endif +#ifndef MOD_BKNP1_ONLY3 +#define MOD_BKNP1_ONLY3 0 +#endif + +/* {rp, (k - 1) * n} = {op, k * n + 1} % (B^{k*n}+1) / (B^n+1) */ +static void +_mpn_modbknp1dbnp1_n (mp_ptr rp, mp_srcptr op, mp_size_t n, unsigned k) +{ + mp_limb_t hl; + mp_srcptr hp; + unsigned i; + +#if MOD_BKNP1_ONLY3 + ASSERT (k == 3); + k = 3; +#endif + ASSERT (k > 2); + ASSERT (k % 2 == 1); + + --k; + + rp += k * n; + op += k * n; + hp = op; + hl = hp[n]; /* initial op[k*n]. */ + ASSERT (hl < GMP_NUMB_MAX - 1); + +#if MOD_BKNP1_ONLY3 == 0 + /* The first MPN_INCR_U (rp + n, 1, cy); in the loop should be + rp[n] = cy; */ + *rp = 0; +#endif + + i = k >> 1; + do + { + mp_limb_t cy, bw; + rp -= n; + op -= n; + cy = hl + mpn_add_n (rp, op, hp, n); +#if MOD_BKNP1_ONLY3 + rp[n] = cy; +#else + MPN_INCR_U (rp + n, (k - i * 2) * n + 1, cy); +#endif + rp -= n; + op -= n; + bw = hl + mpn_sub_n (rp, op, hp, n); + MPN_DECR_U (rp + n, (k - i * 2 + 1) * n + 1, bw); + } + while (--i != 0); + + for (; (hl = *(rp += k * n)) != 0; ) /* Should run only once... */ + { + *rp = 0; + i = k >> 1; + do + { + rp -= n; + MPN_INCR_U (rp, (k - i * 2 + 1) * n + 1, hl); + rp -= n; + MPN_DECR_U (rp, (k - i * 2 + 2) * n + 1, hl); + } + while (--i != 0); + } +} + +static void +_mpn_modbnp1_pn_ip (mp_ptr r, mp_size_t n, mp_limb_t h) +{ + ASSERT (r[n] == h); + + /* Fully normalise */ + MPN_DECR_U (r, n + 1, h); + h -= r[n]; + r[n] = 0; + MPN_INCR_U (r, n + 1, h); +} + +static void +_mpn_modbnp1_neg_ip (mp_ptr r, mp_size_t n, mp_limb_t h) +{ + r[n] = 0; + MPN_INCR_U (r, n + 1, -h); + if (UNLIKELY (r[n] != 0)) + _mpn_modbnp1_pn_ip (r, n, 1); +} + +static void +_mpn_modbnp1_nc_ip (mp_ptr r, mp_size_t n, mp_limb_t h) +{ + if (h & GMP_NUMB_HIGHBIT) /* This means h < 0 */ + { + _mpn_modbnp1_neg_ip (r, n, h); + } + else + { + r[n] = h; + if (h) + _mpn_modbnp1_pn_ip(r, n, h); + } +} + +/* {rp, rn + 1} = {op, on} mod (B^{rn}+1) */ +/* Used when rn < on < 2*rn. */ +static void +_mpn_modbnp1 (mp_ptr rp, mp_size_t rn, mp_srcptr op, mp_size_t on) +{ + mp_limb_t bw; + +#if 0 + if (UNLIKELY (on <= rn)) + { + MPN_COPY (rp, op, on); + MPN_ZERO (rp + on, rn - on); + return; + } +#endif + + ASSERT (on > rn); + ASSERT (on <= 2 * rn); + + bw = mpn_sub (rp, op, rn, op + rn, on - rn); + rp[rn] = 0; + MPN_INCR_U (rp, rn + 1, bw); +} + +/* {rp, rn + 1} = {op, k * rn + 1} % (B^{rn}+1) */ +/* With odd k >= 3. 
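+   Only odd k are usable: X^k+1 is divisible by X+1 exactly when k is
+   odd, which is what makes B^n+1 a divisor of B^{k*n}+1.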
*/ +static void +_mpn_modbnp1_kn (mp_ptr rp, mp_srcptr op, mp_size_t rn, unsigned k) +{ + mp_limb_t cy; + +#if MOD_BKNP1_ONLY3 + ASSERT (k == 3); + k = 3; +#endif + ASSERT (k & 1); + k >>= 1; + ASSERT (0 < k && k < GMP_NUMB_HIGHBIT - 3); + ASSERT (op[(1 + 2 * k) * rn] < GMP_NUMB_HIGHBIT - 2 - k); + + cy = - mpn_sub_n (rp, op, op + rn, rn); + for (;;) { + op += 2 * rn; + cy += mpn_add_n (rp, rp, op, rn); + if (--k == 0) + break; + cy -= mpn_sub_n (rp, rp, op + rn, rn); + }; + + cy += op[rn]; + _mpn_modbnp1_nc_ip (rp, rn, cy); +} + +/* For the various mpn_divexact_byN here, fall back to using either + mpn_pi1_bdiv_q_1 or mpn_divexact_1. The former has less overhead and is + faster if it is native. For now, since mpn_divexact_1 is native on + platforms where mpn_pi1_bdiv_q_1 does not yet exist, do not use + mpn_pi1_bdiv_q_1 unconditionally. FIXME. */ + +#ifndef mpn_divexact_by5 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define BINVERT_5 \ + ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 5 * 3 << 3) + 5) & GMP_NUMB_MAX) +#define mpn_divexact_by5(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,5,BINVERT_5,0) +#else +#define mpn_divexact_by5(dst,src,size) mpn_divexact_1(dst,src,size,5) +#endif +#endif + +#ifndef mpn_divexact_by7 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define BINVERT_7 \ + ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 3)) / 7 * 3 << 4) + 7) & GMP_NUMB_MAX) +#define mpn_divexact_by7(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,7,BINVERT_7,0) +#else +#define mpn_divexact_by7(dst,src,size) mpn_divexact_1(dst,src,size,7) +#endif +#endif + +#ifndef mpn_divexact_by11 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define BINVERT_11 \ + ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 10)) / 11 << 5) + 3) & GMP_NUMB_MAX) +#define mpn_divexact_by11(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,11,BINVERT_11,0) +#else +#define mpn_divexact_by11(dst,src,size) mpn_divexact_1(dst,src,size,11) +#endif +#endif + +#ifndef mpn_divexact_by13 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define BINVERT_13 \ + ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 12)) / 13 * 3 << 14) + 3781) & GMP_NUMB_MAX) +#define mpn_divexact_by13(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,13,BINVERT_13,0) +#else +#define mpn_divexact_by13(dst,src,size) mpn_divexact_1(dst,src,size,13) +#endif +#endif + +#ifndef mpn_divexact_by17 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define BINVERT_17 \ + ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 8)) / 17 * 15 << 7) + 113) & GMP_NUMB_MAX) +#define mpn_divexact_by17(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,17,BINVERT_17,0) +#else +#define mpn_divexact_by17(dst,src,size) mpn_divexact_1(dst,src,size,17) +#endif +#endif + +/* Thanks to Chinese remainder theorem, store + in {rp, k*n+1} the value mod (B^(k*n)+1), given + {ap, k*n+1} mod ((B^(k*n)+1)/(B^n+1)) and + {bp, n+1} mod (B^n+1) . + {tp, n+1} is a scratch area. + tp == rp or rp == ap are possible. 
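+   The exact division by k below relies on
+   (B^{k*n}+1)/(B^n+1) = B^{(k-1)n} - B^{(k-2)n} + ... + 1 == k (mod B^n+1),
+   a consequence of B^n == -1 (mod B^n+1) and k odd.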
+*/ +static void +_mpn_crt (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, + mp_size_t n, unsigned k, mp_ptr tp) +{ + mp_limb_t mod; + unsigned i; + +#if MOD_BKNP1_ONLY3 + ASSERT (k == 3); + k = 3; +#endif + _mpn_modbnp1_kn (tp, ap, n, k); + if (mpn_sub_n (tp, bp, tp, n + 1)) + _mpn_modbnp1_neg_ip (tp, n, tp[n]); + +#if MOD_BKNP1_USE11 + if (UNLIKELY (k == 11)) + { + ASSERT (GMP_NUMB_BITS % 2 == 0); + /* mod <- -Mod(B^n+1,11)^-1 */ + mod = n * (GMP_NUMB_BITS % 5) % 5; + if ((mod > 2) || UNLIKELY (mod == 0)) + mod += 5; + + mod *= mpn_mod_1 (tp, n + 1, 11); + } + else +#endif + { +#if GMP_NUMB_BITS % 8 == 0 + /* (2^6 - 1) | (2^{GMP_NUMB_BITS*3/4} - 1) */ + /* (2^6 - 1) = 3^2 * 7 */ + mod = mpn_mod_34lsub1 (tp, n + 1); + ASSERT ((GMP_NUMB_MAX >> (GMP_NUMB_BITS >> 2)) % k == 0); + /* (2^12 - 1) = 3^2 * 5 * 7 * 13 */ + /* (2^24 - 1) = 3^2 * 5 * 7 * 13 * 17 * 241 */ + ASSERT (k == 3 || k == 5 || k == 7 || k == 13 || k == 17); + +#if GMP_NUMB_BITS % 3 != 0 + if (UNLIKELY (k != 3)) + { + ASSERT ((GMP_NUMB_MAX % k == 0) || (n % 3 != 0)); + if ((GMP_NUMB_BITS % 16 == 0) && LIKELY (k == 5)) + mod <<= 1; /* k >> 1 = 1 << 1 */ + else if ((GMP_NUMB_BITS % 16 != 0) || LIKELY (k == 7)) + mod <<= (n << (GMP_NUMB_BITS % 3 >> 1)) % 3; + else if ((GMP_NUMB_BITS % 32 != 0) || LIKELY (k == 13)) + mod *= ((n << (GMP_NUMB_BITS % 3 >> 1)) % 3 == 1) ? 3 : 9; + else /* k == 17 */ + mod <<= 3; /* k >> 1 = 1 << 3 */ +#if 0 + if ((GMP_NUMB_BITS == 8) /* && (k == 7) */ || + (GMP_NUMB_BITS == 16) && (k == 13)) + mod = ((mod & (GMP_NUMB_MAX >> (GMP_NUMB_BITS >> 2))) + + (mod >> (3 * GMP_NUMB_BITS >> 2))); +#endif + } +#else + ASSERT (GMP_NUMB_MAX % k == 0); + /* 2^{GMP_NUMB_BITS} - 1 = 0 (mod k) */ + /* 2^{GMP_NUMB_BITS} = 1 (mod k) */ + /* 2^{n*GMP_NUMB_BITS} + 1 = 2 (mod k) */ + /* -2^{-1} = k >> 1 (mod k) */ + mod *= k >> 1; +#endif +#else + ASSERT_ALWAYS (k == 0); /* Not implemented, should not be used. */ +#endif + } + + MPN_INCR_U (tp, n + 1, mod); + tp[n] += mod; + + if (LIKELY (k == 3)) + ASSERT_NOCARRY (mpn_divexact_by3 (tp, tp, n + 1)); + else if ((GMP_NUMB_BITS % 16 == 0) && LIKELY (k == 5)) + mpn_divexact_by5 (tp, tp, n + 1); + else if (((! 
MOD_BKNP1_USE11) && (GMP_NUMB_BITS % 16 != 0)) + || LIKELY (k == 7)) + mpn_divexact_by7 (tp, tp, n + 1); +#if MOD_BKNP1_USE11 + else if (k == 11) + mpn_divexact_by11 (tp, tp, n + 1); +#endif + else if ((GMP_NUMB_BITS % 32 != 0) || LIKELY (k == 13)) + mpn_divexact_by13 (tp, tp, n + 1); + else /* (k == 17) */ + mpn_divexact_by17 (tp, tp, n + 1); + + rp += k * n; + ap += k * n; /* tp - 1 */ + + rp -= n; + ap -= n; + ASSERT_NOCARRY (mpn_add_n (rp, ap, tp, n + 1)); + + i = k >> 1; + do + { + mp_limb_t cy, bw; + rp -= n; + ap -= n; + bw = mpn_sub_n (rp, ap, tp, n) + tp[n]; + MPN_DECR_U (rp + n, (k - i * 2) * n + 1, bw); + rp -= n; + ap -= n; + cy = mpn_add_n (rp, ap, tp, n) + tp[n]; + MPN_INCR_U (rp + n, (k - i * 2 + 1) * n + 1, cy); + } + while (--i != 0); + + /* if (LIKELY (rp[k * n])) */ + _mpn_modbnp1_pn_ip (rp, k * n, rp[k * n]); +} + + +static void +_mpn_mulmod_bnp1_tp (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn, + mp_ptr tp) +{ + mp_limb_t cy; + unsigned k; + + ASSERT (0 < rn); + ASSERT ((ap[rn] | bp[rn]) <= 1); + + if (UNLIKELY (ap[rn] | bp[rn])) + { + if (ap[rn]) + cy = bp[rn] + mpn_neg (rp, bp, rn); + else /* ap[rn] == 0 */ + cy = mpn_neg (rp, ap, rn); + } + else if (MPN_MULMOD_BKNP1_USABLE(rn, k, MUL_FFT_MODF_THRESHOLD / 3)) + { + rn /= k; + mpn_mulmod_bknp1 (rp, ap, bp, rn, k, tp); + return; + } + else + { + mpn_mul_n (tp, ap, bp, rn); + cy = mpn_sub_n (rp, tp, tp + rn, rn); + } + rp[rn] = 0; + MPN_INCR_U (rp, rn + 1, cy); +} + +/* {rp, kn + 1} = {ap, kn + 1} * {bp, kn + 1} % (B^kn + 1) */ +/* tp must point to at least 4*(k-1)*n+1 limbs*/ +void +mpn_mulmod_bknp1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, + mp_size_t n, unsigned k, mp_ptr tp) +{ + mp_ptr hp; + +#if MOD_BKNP1_ONLY3 + ASSERT (k == 3); + k = 3; +#endif + ASSERT (k > 2); + ASSERT (k % 2 == 1); + + /* a % (B^{nn}+1)/(B^{nn/k}+1) */ + _mpn_modbknp1dbnp1_n (tp + (k - 1) * n * 2, ap, n, k); + /* b % (B^{nn}+1)/(B^{nn/k}+1) */ + _mpn_modbknp1dbnp1_n (tp + (k - 1) * n * 3, bp, n, k); + mpn_mul_n (tp, tp + (k - 1) * n * 2, tp + (k - 1) * n * 3, (k - 1) * n); + _mpn_modbnp1 (tp, k * n, tp, (k - 1) * n * 2); + + hp = tp + k * n + 1; + /* a % (B^{nn/k}+1) */ + ASSERT (ap[k * n] <= 1); + _mpn_modbnp1_kn (hp, ap, n, k); + /* b % (B^{nn/k}+1) */ + ASSERT (bp[k * n] <= 1); + _mpn_modbnp1_kn (hp + n + 1, bp, n, k); + _mpn_mulmod_bnp1_tp (hp + (n + 1) * 2, hp, hp + n + 1, n, hp + (n + 1) * 2); + + _mpn_crt (rp, tp, hp + (n + 1) * 2, n, k, hp); +} + + +static void +_mpn_sqrmod_bnp1_tp (mp_ptr rp, mp_srcptr ap, mp_size_t rn, + mp_ptr tp) +{ + mp_limb_t cy; + unsigned k; + + ASSERT (0 < rn); + + if (UNLIKELY (ap[rn])) + { + ASSERT (ap[rn] == 1); + *rp = 1; + MPN_FILL (rp + 1, rn, 0); + return; + } + else if (MPN_SQRMOD_BKNP1_USABLE(rn, k, MUL_FFT_MODF_THRESHOLD / 3)) + { + rn /= k; + mpn_sqrmod_bknp1 (rp, ap, rn, k, tp); + return; + } + else + { + mpn_sqr (tp, ap, rn); + cy = mpn_sub_n (rp, tp, tp + rn, rn); + } + rp[rn] = 0; + MPN_INCR_U (rp, rn + 1, cy); +} + +/* {rp, kn + 1} = {ap, kn + 1}^2 % (B^kn + 1) */ +/* tp must point to at least 3*(k-1)*n+1 limbs*/ +void +mpn_sqrmod_bknp1 (mp_ptr rp, mp_srcptr ap, + mp_size_t n, unsigned k, mp_ptr tp) +{ + mp_ptr hp; + +#if MOD_BKNP1_ONLY3 + ASSERT (k == 3); + k = 3; +#endif + ASSERT (k > 2); + ASSERT (k % 2 == 1); + + /* a % (B^{nn}+1)/(B^{nn/k}+1) */ + _mpn_modbknp1dbnp1_n (tp + (k - 1) * n * 2, ap, n, k); + mpn_sqr (tp, tp + (k - 1) * n * 2, (k - 1) * n); + _mpn_modbnp1 (tp, k * n, tp, (k - 1) * n * 2); + + hp = tp + k * n + 1; + /* a % (B^{nn/k}+1) */ + ASSERT (ap[k * n] <= 1); + 
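+  /* The two residues are recombined by _mpn_crt below.  */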
_mpn_modbnp1_kn (hp, ap, n, k); + _mpn_sqrmod_bnp1_tp (hp + (n + 1), hp, n, hp + (n + 1)); + + _mpn_crt (rp, tp, hp + (n + 1), n, k, hp); +} diff --git a/gmp-6.3.0/mpn/generic/mulmod_bnm1.c b/gmp-6.3.0/mpn/generic/mulmod_bnm1.c new file mode 100644 index 0000000..8229ede --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mulmod_bnm1.c @@ -0,0 +1,374 @@ +/* mulmod_bnm1.c -- multiplication mod B^n-1. + + Contributed to the GNU project by Niels Möller, Torbjorn Granlund and + Marco Bodrato. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2010, 2012, 2013, 2020, 2022 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" +#include "longlong.h" + +/* Inputs are {ap,rn} and {bp,rn}; output is {rp,rn}, computation is + mod B^rn - 1, and values are semi-normalised; zero is represented + as either 0 or B^n - 1. Needs a scratch of 2rn limbs at tp. + tp==rp is allowed. */ +void +mpn_bc_mulmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn, + mp_ptr tp) +{ + mp_limb_t cy; + + ASSERT (0 < rn); + + mpn_mul_n (tp, ap, bp, rn); + cy = mpn_add_n (rp, tp, tp + rn, rn); + /* If cy == 1, then the value of rp is at most B^rn - 2, so there can + * be no overflow when adding in the carry. */ + MPN_INCR_U (rp, rn, cy); +} + + +/* Inputs are {ap,rn+1} and {bp,rn+1}; output is {rp,rn+1}, in + normalised representation, computation is mod B^rn + 1. Needs + a scratch area of 2rn limbs at tp; tp == rp is allowed. + Output is normalised. */ +static void +mpn_bc_mulmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn, + mp_ptr tp) +{ + mp_limb_t cy; + unsigned k; + + ASSERT (0 < rn); + + if (UNLIKELY (ap[rn] | bp [rn])) + { + if (ap[rn]) + cy = bp [rn] + mpn_neg (rp, bp, rn); + else /* ap[rn] == 0 */ + cy = mpn_neg (rp, ap, rn); + } + else if (MPN_MULMOD_BKNP1_USABLE (rn, k, MUL_FFT_MODF_THRESHOLD)) + { + mp_size_t n_k = rn / k; + TMP_DECL; + + TMP_MARK; + mpn_mulmod_bknp1 (rp, ap, bp, n_k, k, + TMP_ALLOC_LIMBS (mpn_mulmod_bknp1_itch (rn))); + TMP_FREE; + return; + } + else + { + mpn_mul_n (tp, ap, bp, rn); + cy = mpn_sub_n (rp, tp, tp + rn, rn); + } + rp[rn] = 0; + MPN_INCR_U (rp, rn + 1, cy); +} + + +/* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1) + * + * The result is expected to be ZERO if and only if one of the operand + * already is. Otherwise the class [0] Mod(B^rn-1) is represented by + * B^rn-1. 
This should not be a problem if mulmod_bnm1 is used to
+ * combine results and obtain a natural number when one knows in
+ * advance that the final value is less than (B^rn-1).
+ * Moreover it should not be a problem if mulmod_bnm1 is used to
+ * compute the full product with an+bn <= rn, because this condition
+ * implies (B^an-1)(B^bn-1) < (B^rn-1) .
+ *
+ * Requires 0 < bn <= an <= rn  and  an + bn > rn/2
+ * Scratch need: rn + (need for recursive call OR rn + 4). This gives
+ *
+ * S(n) <= rn + MAX (rn + 4, S(n/2)) <= 2rn + 4
+ */
+void
+mpn_mulmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr tp)
+{
+  ASSERT (0 < bn);
+  ASSERT (bn <= an);
+  ASSERT (an <= rn);
+
+  if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, MULMOD_BNM1_THRESHOLD))
+    {
+      if (UNLIKELY (bn < rn))
+	{
+	  if (UNLIKELY (an + bn <= rn))
+	    {
+	      mpn_mul (rp, ap, an, bp, bn);
+	    }
+	  else
+	    {
+	      mp_limb_t cy;
+	      mpn_mul (tp, ap, an, bp, bn);
+	      cy = mpn_add (rp, tp, rn, tp + rn, an + bn - rn);
+	      MPN_INCR_U (rp, rn, cy);
+	    }
+	}
+      else
+	mpn_bc_mulmod_bnm1 (rp, ap, bp, rn, tp);
+    }
+  else
+    {
+      mp_size_t n;
+      mp_limb_t cy;
+      mp_limb_t hi;
+
+      n = rn >> 1;
+
+      /* We need at least an + bn >= n, to be able to fit one of the
+	 recursive products at rp.  Requiring strict inequality makes
+	 the code slightly simpler.  If desired, we could avoid this
+	 restriction by initially halving rn as long as rn is even and
+	 an + bn <= rn/2.  */
+
+      ASSERT (an + bn > n);
+
+      /* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1)
+	 and crt together as
+
+	 x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
+      */
+
+#define a0 ap
+#define a1 (ap + n)
+#define b0 bp
+#define b1 (bp + n)
+
+#define xp  tp	/* 2n + 2 */
+      /* am1  maybe in {xp, n} */
+      /* bm1  maybe in {xp + n, n} */
+#define sp1 (tp + 2*n + 2)
+      /* ap1  maybe in {sp1, n + 1} */
+      /* bp1  maybe in {sp1 + n + 1, n + 1} */
+
+      {
+	mp_srcptr am1, bm1;
+	mp_size_t anm, bnm;
+	mp_ptr so;
+
+	bm1 = b0;
+	bnm = bn;
+	if (LIKELY (an > n))
+	  {
+	    am1 = xp;
+	    cy = mpn_add (xp, a0, n, a1, an - n);
+	    MPN_INCR_U (xp, n, cy);
+	    anm = n;
+	    so = xp + n;
+	    if (LIKELY (bn > n))
+	      {
+		bm1 = so;
+		cy = mpn_add (so, b0, n, b1, bn - n);
+		MPN_INCR_U (so, n, cy);
+		bnm = n;
+		so += n;
+	      }
+	  }
+	else
+	  {
+	    so = xp;
+	    am1 = a0;
+	    anm = an;
+	  }
+
+	mpn_mulmod_bnm1 (rp, n, am1, anm, bm1, bnm, so);
+      }
+
+      {
+	int k;
+	mp_srcptr ap1, bp1;
+	mp_size_t anp, bnp;
+
+	bp1 = b0;
+	bnp = bn;
+	if (LIKELY (an > n)) {
+	  ap1 = sp1;
+	  cy = mpn_sub (sp1, a0, n, a1, an - n);
+	  sp1[n] = 0;
+	  MPN_INCR_U (sp1, n + 1, cy);
+	  anp = n + ap1[n];
+	  if (LIKELY (bn > n)) {
+	    bp1 = sp1 + n + 1;
+	    cy = mpn_sub (sp1 + n + 1, b0, n, b1, bn - n);
+	    sp1[2*n+1] = 0;
+	    MPN_INCR_U (sp1 + n + 1, n + 1, cy);
+	    bnp = n + bp1[n];
+	  }
+	} else {
+	  ap1 = a0;
+	  anp = an;
+	}
+
+	if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
+	  k=0;
+	else
+	  {
+	    int mask;
+	    k = mpn_fft_best_k (n, 0);
+	    mask = (1<<k) - 1;
+	    while (n & mask) {k--; mask >>= 1;};
+	  }
+	if (k >= FFT_FIRST_K)
+	  xp[n] = mpn_mul_fft (xp, n, ap1, anp, bp1, bnp, k);
+	else if (UNLIKELY (bp1 == b0))
+	  {
+	    ASSERT (anp + bnp <= 2*n+1);
+	    ASSERT (anp + bnp > n);
+	    ASSERT (anp >= bnp);
+	    mpn_mul (xp, ap1, anp, bp1, bnp);
+	    anp = anp + bnp - n;
+	    ASSERT (anp <= n || xp[2*n]==0);
+	    anp -= anp > n;
+	    cy = mpn_sub (xp, xp, n, xp + n, anp);
+	    xp[n] = 0;
+	    MPN_INCR_U (xp, n+1, cy);
+	  }
+	else
+	  mpn_bc_mulmod_bnp1 (xp, ap1, bp1, n, xp);
+      }
+
+      /* Here the CRT recomposition begins.
+
+	 xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
+	 Division by 2 is a bitwise rotation.
+
+	 Assumes xp normalised mod (B^n+1).
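+
+	 To verify the CRT formula above: mod B^n-1 (where B^n == 1) it
+	 gives -xp + 2*(xp + xm)/2 = xm, and mod B^n+1 (where B^n == -1)
+	 it gives xp, as required.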
+ + The residue class [0] is represented by [B^n-1]; except when + both input are ZERO. + */ + +#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc +#if HAVE_NATIVE_mpn_rsh1add_nc + cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */ + hi = cy << (GMP_NUMB_BITS - 1); + cy = 0; + /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi + overflows, i.e. a further increment will not overflow again. */ +#else /* ! _nc */ + cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */ + hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */ + cy >>= 1; + /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that + the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */ +#endif +#if GMP_NAIL_BITS == 0 + add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi); +#else + cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1); + rp[n-1] ^= hi; +#endif +#else /* ! HAVE_NATIVE_mpn_rsh1add_n */ +#if HAVE_NATIVE_mpn_add_nc + cy = mpn_add_nc(rp, rp, xp, n, xp[n]); +#else /* ! _nc */ + cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */ +#endif + cy += (rp[0]&1); + mpn_rshift(rp, rp, n, 1); + ASSERT (cy <= 2); + hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */ + cy >>= 1; + /* We can have cy != 0 only if hi = 0... */ + ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0); + rp[n-1] |= hi; + /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */ +#endif + ASSERT (cy <= 1); + /* Next increment can not overflow, read the previous comments about cy. */ + ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0)); + MPN_INCR_U(rp, n, cy); + + /* Compute the highest half: + ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n + */ + if (UNLIKELY (an + bn < rn)) + { + /* Note that in this case, the only way the result can equal + zero mod B^{rn} - 1 is if one of the inputs is zero, and + then the output of both the recursive calls and this CRT + reconstruction is zero, not B^{rn} - 1. Which is good, + since the latter representation doesn't fit in the output + area.*/ + cy = mpn_sub_n (rp + n, rp, xp, an + bn - n); + + /* FIXME: This subtraction of the high parts is not really + necessary, we do it to get the carry out, and for sanity + checking. */ + cy = xp[n] + mpn_sub_nc (xp + an + bn - n, rp + an + bn - n, + xp + an + bn - n, rn - (an + bn), cy); + ASSERT (an + bn == rn - 1 || + mpn_zero_p (xp + an + bn - n + 1, rn - 1 - (an + bn))); + cy = mpn_sub_1 (rp, rp, an + bn, cy); + ASSERT (cy == (xp + an + bn - n)[0]); + } + else + { + cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n); + /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO. + DECR will affect _at most_ the lowest n limbs. */ + MPN_DECR_U (rp, 2*n, cy); + } +#undef a0 +#undef a1 +#undef b0 +#undef b1 +#undef xp +#undef sp1 + } +} + +mp_size_t +mpn_mulmod_bnm1_next_size (mp_size_t n) +{ + mp_size_t nh; + + if (BELOW_THRESHOLD (n, MULMOD_BNM1_THRESHOLD)) + return n; + if (BELOW_THRESHOLD (n, 4 * (MULMOD_BNM1_THRESHOLD - 1) + 1)) + return (n + (2-1)) & (-2); + if (BELOW_THRESHOLD (n, 8 * (MULMOD_BNM1_THRESHOLD - 1) + 1)) + return (n + (4-1)) & (-4); + + nh = (n + 1) >> 1; + + if (BELOW_THRESHOLD (nh, MUL_FFT_MODF_THRESHOLD)) + return (n + (8-1)) & (-8); + + return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 0)); +} diff --git a/gmp-6.3.0/mpn/generic/neg.c b/gmp-6.3.0/mpn/generic/neg.c new file mode 100644 index 0000000..bec2a32 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/neg.c @@ -0,0 +1,33 @@ +/* mpn_neg - negate an mpn. + +Copyright 2001, 2009 Free Software Foundation, Inc. 
+ +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define __GMP_FORCE_mpn_neg 1 + +#include "gmp-impl.h" diff --git a/gmp-6.3.0/mpn/generic/nussbaumer_mul.c b/gmp-6.3.0/mpn/generic/nussbaumer_mul.c new file mode 100644 index 0000000..3e0cf27 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/nussbaumer_mul.c @@ -0,0 +1,70 @@ +/* mpn_nussbaumer_mul -- Multiply {ap,an} and {bp,bn} using + Nussbaumer's negacyclic convolution. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Multiply {ap,an} by {bp,bn}, and put the result in {pp, an+bn} */ +void +mpn_nussbaumer_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn) +{ + mp_size_t rn; + mp_ptr tp; + TMP_DECL; + + ASSERT (an >= bn); + ASSERT (bn > 0); + + TMP_MARK; + + if ((ap == bp) && (an == bn)) + { + rn = mpn_sqrmod_bnm1_next_size (2*an); + tp = TMP_ALLOC_LIMBS (mpn_sqrmod_bnm1_itch (rn, an)); + mpn_sqrmod_bnm1 (pp, rn, ap, an, tp); + } + else + { + rn = mpn_mulmod_bnm1_next_size (an + bn); + tp = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (rn, an, bn)); + mpn_mulmod_bnm1 (pp, rn, ap, an, bp, bn, tp); + } + + TMP_FREE; +} diff --git a/gmp-6.3.0/mpn/generic/perfpow.c b/gmp-6.3.0/mpn/generic/perfpow.c new file mode 100644 index 0000000..9d46477 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/perfpow.c @@ -0,0 +1,342 @@ +/* mpn_perfect_power_p -- mpn perfect power detection. 
+ + Contributed to the GNU project by Martin Boij. + +Copyright 2009, 2010, 2012, 2014 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#define SMALL 20 +#define MEDIUM 100 + +/* Return non-zero if {np,nn} == {xp,xn} ^ k. + Algorithm: + For s = 1, 2, 4, ..., s_max, compute the s least significant limbs of + {xp,xn}^k. Stop if they don't match the s least significant limbs of + {np,nn}. + + FIXME: Low xn limbs can be expected to always match, if computed as a mod + B^{xn} root. So instead of using mpn_powlo, compute an approximation of the + most significant (normalized) limb of {xp,xn} ^ k (and an error bound), and + compare to {np, nn}. Or use an even cruder approximation based on fix-point + base 2 logarithm. */ +static int +pow_equals (mp_srcptr np, mp_size_t n, + mp_srcptr xp,mp_size_t xn, + mp_limb_t k, mp_bitcnt_t f, + mp_ptr tp) +{ + mp_bitcnt_t y, z; + mp_size_t bn; + mp_limb_t h, l; + + ASSERT (n > 1 || (n == 1 && np[0] > 1)); + ASSERT (np[n - 1] > 0); + ASSERT (xn > 0); + + if (xn == 1 && xp[0] == 1) + return 0; + + z = 1 + (n >> 1); + for (bn = 1; bn < z; bn <<= 1) + { + mpn_powlo (tp, xp, &k, 1, bn, tp + bn); + if (mpn_cmp (tp, np, bn) != 0) + return 0; + } + + /* Final check. Estimate the size of {xp,xn}^k before computing the power + with full precision. Optimization: It might pay off to make a more + accurate estimation of the logarithm of {xp,xn}, rather than using the + index of the MSB. */ + + MPN_SIZEINBASE_2EXP(y, xp, xn, 1); + y -= 1; /* msb_index (xp, xn) */ + + umul_ppmm (h, l, k, y); + h -= l == 0; --l; /* two-limb decrement */ + + z = f - 1; /* msb_index (np, n) */ + if (h == 0 && l <= z) + { + mp_limb_t *tp2; + mp_size_t i; + int ans; + mp_limb_t size; + TMP_DECL; + + size = l + k; + ASSERT_ALWAYS (size >= k); + + TMP_MARK; + y = 2 + size / GMP_LIMB_BITS; + tp2 = TMP_ALLOC_LIMBS (y); + + i = mpn_pow_1 (tp, xp, xn, k, tp2); + if (i == n && mpn_cmp (tp, np, n) == 0) + ans = 1; + else + ans = 0; + TMP_FREE; + return ans; + } + + return 0; +} + + +/* Return non-zero if N = {np,n} is a kth power. + I = {ip,n} = N^(-1) mod B^n. 
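+   The candidate root is found 2-adically from I: is_kth_power computes
+   N^{1/k} as (N^{-1})^{-1/k} via mpn_bsqrtinv/mpn_brootinv (see the
+   FIXME in perfpow below).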
*/ +static int +is_kth_power (mp_ptr rp, mp_srcptr np, + mp_limb_t k, mp_srcptr ip, + mp_size_t n, mp_bitcnt_t f, + mp_ptr tp) +{ + mp_bitcnt_t b; + mp_size_t rn, xn; + + ASSERT (n > 0); + ASSERT ((k & 1) != 0 || k == 2); + ASSERT ((np[0] & 1) != 0); + + if (k == 2) + { + b = (f + 1) >> 1; + rn = 1 + b / GMP_LIMB_BITS; + if (mpn_bsqrtinv (rp, ip, b, tp) != 0) + { + rp[rn - 1] &= (CNST_LIMB(1) << (b % GMP_LIMB_BITS)) - 1; + xn = rn; + MPN_NORMALIZE (rp, xn); + if (pow_equals (np, n, rp, xn, k, f, tp) != 0) + return 1; + + /* Check if (2^b - r)^2 == n */ + mpn_neg (rp, rp, rn); + rp[rn - 1] &= (CNST_LIMB(1) << (b % GMP_LIMB_BITS)) - 1; + MPN_NORMALIZE (rp, rn); + if (pow_equals (np, n, rp, rn, k, f, tp) != 0) + return 1; + } + } + else + { + b = 1 + (f - 1) / k; + rn = 1 + (b - 1) / GMP_LIMB_BITS; + mpn_brootinv (rp, ip, rn, k, tp); + if ((b % GMP_LIMB_BITS) != 0) + rp[rn - 1] &= (CNST_LIMB(1) << (b % GMP_LIMB_BITS)) - 1; + MPN_NORMALIZE (rp, rn); + if (pow_equals (np, n, rp, rn, k, f, tp) != 0) + return 1; + } + MPN_ZERO (rp, rn); /* Untrash rp */ + return 0; +} + +static int +perfpow (mp_srcptr np, mp_size_t n, + mp_limb_t ub, mp_limb_t g, + mp_bitcnt_t f, int neg) +{ + mp_ptr ip, tp, rp; + mp_limb_t k; + int ans; + mp_bitcnt_t b; + gmp_primesieve_t ps; + TMP_DECL; + + ASSERT (n > 0); + ASSERT ((np[0] & 1) != 0); + ASSERT (ub > 0); + + TMP_MARK; + gmp_init_primesieve (&ps); + b = (f + 3) >> 1; + + TMP_ALLOC_LIMBS_3 (ip, n, rp, n, tp, 5 * n); + + MPN_ZERO (rp, n); + + /* FIXME: It seems the inverse in ninv is needed only to get non-inverted + roots. I.e., is_kth_power computes n^{1/2} as (n^{-1})^{-1/2} and + similarly for nth roots. It should be more efficient to compute n^{1/2} as + n * n^{-1/2}, with a mullo instead of a binvert. And we can do something + similar for kth roots if we switch to an iteration converging to n^{1/k - + 1}, and we can then eliminate this binvert call. */ + mpn_binvert (ip, np, 1 + (b - 1) / GMP_LIMB_BITS, tp); + if (b % GMP_LIMB_BITS) + ip[(b - 1) / GMP_LIMB_BITS] &= (CNST_LIMB(1) << (b % GMP_LIMB_BITS)) - 1; + + if (neg) + gmp_nextprime (&ps); + + ans = 0; + if (g > 0) + { + ub = MIN (ub, g + 1); + while ((k = gmp_nextprime (&ps)) < ub) + { + if ((g % k) == 0) + { + if (is_kth_power (rp, np, k, ip, n, f, tp) != 0) + { + ans = 1; + goto ret; + } + } + } + } + else + { + while ((k = gmp_nextprime (&ps)) < ub) + { + if (is_kth_power (rp, np, k, ip, n, f, tp) != 0) + { + ans = 1; + goto ret; + } + } + } + ret: + TMP_FREE; + return ans; +} + +static const unsigned short nrtrial[] = { 100, 500, 1000 }; + +/* Table of (log_{p_i} 2) values, where p_i is the (nrtrial[i] + 1)'th prime + number. */ +static const double logs[] = + { 0.1099457228193620, 0.0847016403115322, 0.0772048195144415 }; + +int +mpn_perfect_power_p (mp_srcptr np, mp_size_t n) +{ + mp_limb_t *nc, factor, g; + mp_limb_t exp, d; + mp_bitcnt_t twos, count; + int ans, where, neg, trial; + TMP_DECL; + + neg = n < 0; + if (neg) + { + n = -n; + } + + if (n == 0 || (n == 1 && np[0] == 1)) /* Valgrind doesn't like + (n <= (np[0] == 1)) */ + return 1; + + TMP_MARK; + + count = 0; + + twos = mpn_scan1 (np, 0); + if (twos != 0) + { + mp_size_t s; + if (twos == 1) + { + return 0; + } + s = twos / GMP_LIMB_BITS; + if (s + 1 == n && POW2_P (np[s])) + { + return ! 
(neg && POW2_P (twos));
+	}
+      count = twos % GMP_LIMB_BITS;
+      n -= s;
+      np += s;
+      if (count > 0)
+	{
+	  nc = TMP_ALLOC_LIMBS (n);
+	  mpn_rshift (nc, np, n, count);
+	  n -= (nc[n - 1] == 0);
+	  np = nc;
+	}
+    }
+  g = twos;
+
+  trial = (n > SMALL) + (n > MEDIUM);
+
+  where = 0;
+  factor = mpn_trialdiv (np, n, nrtrial[trial], &where);
+
+  if (factor != 0)
+    {
+      if (count == 0) /* We did not allocate nc yet. */
+	{
+	  nc = TMP_ALLOC_LIMBS (n);
+	}
+
+      /* Remove factors found by trialdiv.  Optimization: if mpn_remove
+	 defined an _itch function, we could allocate its scratch just
+	 once.  */
+
+      do
+	{
+	  binvert_limb (d, factor);
+
+	  /* After the first round we always have nc == np */
+	  exp = mpn_remove (nc, &n, np, n, &d, 1, ~(mp_bitcnt_t)0);
+
+	  if (g == 0)
+	    g = exp;
+	  else
+	    g = mpn_gcd_1 (&g, 1, exp);
+
+	  if (g == 1)
+	    {
+	      ans = 0;
+	      goto ret;
+	    }
+
+	  if ((n == 1) & (nc[0] == 1))
+	    {
+	      ans = ! (neg && POW2_P (g));
+	      goto ret;
+	    }
+
+	  np = nc;
+	  factor = mpn_trialdiv (np, n, nrtrial[trial], &where);
+	}
+      while (factor != 0);
+    }
+
+  MPN_SIZEINBASE_2EXP(count, np, n, 1); /* log (np) + 1 */
+  d = (mp_limb_t) (count * logs[trial] + 1e-9) + 1;
+  ans = perfpow (np, n, d, g, count, neg);
+
+ ret:
+  TMP_FREE;
+  return ans;
+}
diff --git a/gmp-6.3.0/mpn/generic/perfsqr.c b/gmp-6.3.0/mpn/generic/perfsqr.c
new file mode 100644
index 0000000..1ea5c84
--- /dev/null
+++ b/gmp-6.3.0/mpn/generic/perfsqr.c
@@ -0,0 +1,238 @@
+/* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square,
+   zero otherwise.
+
+Copyright 1991, 1993, 1994, 1996, 1997, 2000-2002, 2005, 2012 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include <stdio.h> /* for NULL */
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#include "perfsqr.h"
+
+
+/* change this to "#define TRACE(x) x" for diagnostics */
+#define TRACE(x)
+
+
+
+/* PERFSQR_MOD_* detects non-squares using residue tests.
+
+   A macro PERFSQR_MOD_TEST is set up by gen-psqr.c in perfsqr.h.  It takes
+   {up,usize} modulo a selected modulus to get a remainder r.  For 32-bit or
+   64-bit limbs this modulus will be 2^24-1 or 2^48-1 using PERFSQR_MOD_34,
+   or for other limb or nail sizes a PERFSQR_PP is chosen and PERFSQR_MOD_PP
+   used.  PERFSQR_PP_NORM and PERFSQR_PP_INVERTED are pre-calculated in this
+   case too.
+
+   PERFSQR_MOD_TEST then makes various calls to PERFSQR_MOD_1 or
+   PERFSQR_MOD_2 with divisors d which are factors of the modulus, and table
+   data indicating residues and non-residues modulo those divisors.  The
+   table data is in 1 or 2 limbs worth of bits respectively, per the size of
+   each d.
+
+   A "modexact" style remainder is taken to reduce r modulo d.
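+   (Concretely, with inv = d^-1 mod 2^PERFSQR_MOD_BITS: q = r*inv kept
+   to PERFSQR_MOD_BITS bits satisfies q*d == r in those bits, and
+   q*d >> PERFSQR_MOD_BITS is the table index.)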
PERFSQR_MOD_IDX implements this, producing an index "idx" for use with
+   the table data.  Notice there's just one multiplication by a constant
+   "inv", for each d.
+
+   The modexact doesn't produce a true r%d remainder, instead idx satisfies
+   "-(idx<<PERFSQR_MOD_BITS) == r mod d".  Because d is odd, this factor
+   -2^PERFSQR_MOD_BITS is a one-to-one mapping between r and idx, and is
+   accounted for by having the table data suitably permuted.
+
+   The remainder r fits within PERFSQR_MOD_BITS which is less than a limb.
+   In fact the GMP_LIMB_BITS - PERFSQR_MOD_BITS spare bits are enough to
+   fit each divisor d, meaning the modexact multiply can take place
+   entirely within one limb.  */
+
+#define MOD34_BITS  (GMP_NUMB_BITS / 4 * 3)
+#define MOD34_MASK  ((CNST_LIMB(1) << MOD34_BITS) - 1)
+
+#define PERFSQR_MOD_34(r, up, usize)				\
+  do {								\
+    (r) = mpn_mod_34lsub1 (up, usize);				\
+    (r) = ((r) & MOD34_MASK) + ((r) >> MOD34_BITS);		\
+  } while (0)
+
+/* FIXME: The %= here isn't good, and might destroy any savings from keeping
+   the PERFSQR_MOD_IDX stuff within a limb (rather than needing umul_ppmm).
+   Maybe a new sort of mpn_preinv_mod_1 could accept an unnormalized divisor
+   and a shift count, like mpn_preinv_divrem_1.  But mod_34lsub1 is our
+   normal case, so let's not worry too much about mod_1.  */
+#define PERFSQR_MOD_PP(r, up, usize)					\
+  do {									\
+    if (BELOW_THRESHOLD (usize, PREINV_MOD_1_TO_MOD_1_THRESHOLD))	\
+      {									\
+	(r) = mpn_preinv_mod_1 (up, usize, PERFSQR_PP_NORM,		\
+				PERFSQR_PP_INVERTED);			\
+	(r) %= PERFSQR_PP;						\
+      }									\
+    else								\
+      {									\
+	(r) = mpn_mod_1 (up, usize, PERFSQR_PP);			\
+      }									\
+  } while (0)
+
+#define PERFSQR_MOD_IDX(idx, r, d, inv)				\
+  do {								\
+    mp_limb_t  q;						\
+    ASSERT ((r) <= PERFSQR_MOD_MASK);				\
+    ASSERT ((((inv) * (d)) & PERFSQR_MOD_MASK) == 1);		\
+    ASSERT (MP_LIMB_T_MAX / (d) >= PERFSQR_MOD_MASK);		\
+								\
+    q = ((r) * (inv)) & PERFSQR_MOD_MASK;			\
+    ASSERT (r == ((q * (d)) & PERFSQR_MOD_MASK));		\
+    (idx) = (q * (d)) >> PERFSQR_MOD_BITS;			\
+  } while (0)
+
+#define PERFSQR_MOD_1(r, d, inv, mask)				\
+  do {								\
+    unsigned   idx;						\
+    ASSERT ((d) <= GMP_LIMB_BITS);				\
+    PERFSQR_MOD_IDX(idx, r, d, inv);				\
+    TRACE (printf ("  PERFSQR_MOD_1 d=%u r=%lu idx=%u\n",	\
+		   d, r%d, idx));				\
+    if ((((mask) >> idx) & 1) == 0)				\
+      {								\
+	TRACE (printf ("  non-square\n"));			\
+	return 0;						\
+      }								\
+  } while (0)
+
+/* The expression "(int) idx - GMP_LIMB_BITS < 0" lets the compiler use the
+   sign bit from "idx-GMP_LIMB_BITS", which might help avoid a branch.  */
+#define PERFSQR_MOD_2(r, d, inv, mhi, mlo)			\
+  do {								\
+    mp_limb_t  m;						\
+    unsigned   idx;						\
+    ASSERT ((d) <= 2*GMP_LIMB_BITS);				\
+								\
+    PERFSQR_MOD_IDX (idx, r, d, inv);				\
+    TRACE (printf ("  PERFSQR_MOD_2 d=%u r=%lu idx=%u\n",	\
+		   d, r%d, idx));				\
+    m = ((int) idx - GMP_LIMB_BITS < 0 ? (mlo) : (mhi));	\
+    idx %= GMP_LIMB_BITS;					\
+    if (((m >> idx) & 1) == 0)					\
+      {								\
+	TRACE (printf ("  non-square\n"));			\
+	return 0;						\
+      }								\
+  } while (0)
+
+
+int
+mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
+{
+  ASSERT (usize >= 1);
+
+  TRACE (gmp_printf ("mpn_perfect_square_p %Nd\n", up, usize));
+
+  /* The first test excludes 212/256 (82.8%) of the perfect square candidates
+     in O(1) time.  */
+  {
+    unsigned  idx = up[0] % 0x100;
+    if (((sq_res_0x100[idx / GMP_LIMB_BITS]
+	  >> (idx % GMP_LIMB_BITS)) & 1) == 0)
+      return 0;
+  }
+
+#if 0
+  /* Check that we have even multiplicity of 2, and then check that the rest
+     is a possible perfect square.  Leave disabled until we can determine this
+     really is an improvement.  If it is, it could completely replace the
+     simple probe above, since this should throw out more non-squares, but at
+     the expense of somewhat more cycles.  */
+  {
+    mp_limb_t  lo;
+    int        cnt;
+    lo = up[0];
+    while (lo == 0)
+      up++, lo = up[0], usize--;
+    count_trailing_zeros (cnt, lo);
+    if ((cnt & 1) != 0)
+      return 0;           /* return if not even multiplicity of 2 */
+    lo >>= cnt;           /* shift down to align lowest non-zero bit */
+    if ((lo & 6) != 0)
+      return 0;
+  }
+#endif
+
+
+  /* The second test uses mpn_mod_34lsub1 or mpn_mod_1 to detect non-squares
+     according to their residues modulo small primes (or powers of
+     primes).  See perfsqr.h.
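+     For example, squares are congruent to 0, 1 or 4 mod 5, so a
+     remainder of 2 or 3 mod 5 proves non-squareness without computing
+     any square root.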
*/ + PERFSQR_MOD_TEST (up, usize); + + + /* For the third and last test, we finally compute the square root, + to make sure we've really got a perfect square. */ + { + mp_ptr root_ptr; + int res; + TMP_DECL; + + TMP_MARK; + root_ptr = TMP_ALLOC_LIMBS ((usize + 1) / 2); + + /* Iff mpn_sqrtrem returns zero, the square is perfect. */ + res = ! mpn_sqrtrem (root_ptr, NULL, up, usize); + TMP_FREE; + + return res; + } +} diff --git a/gmp-6.3.0/mpn/generic/popham.c b/gmp-6.3.0/mpn/generic/popham.c new file mode 100644 index 0000000..87974d7 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/popham.c @@ -0,0 +1,125 @@ +/* mpn_popcount, mpn_hamdist -- mpn bit population count/hamming distance. + +Copyright 1994, 1996, 2000-2002, 2005, 2011, 2012 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#if OPERATION_popcount +#define FNAME mpn_popcount +#define POPHAM(u,v) u +#endif + +#if OPERATION_hamdist +#define FNAME mpn_hamdist +#define POPHAM(u,v) u ^ v +#endif + +mp_bitcnt_t +FNAME (mp_srcptr up, +#if OPERATION_hamdist + mp_srcptr vp, +#endif + mp_size_t n) __GMP_NOTHROW +{ + mp_bitcnt_t result = 0; + mp_limb_t p0, p1, p2, p3, x, p01, p23; + mp_size_t i; + + ASSERT (n >= 1); /* Actually, this code handles any n, but some + assembly implementations do not. 
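+     The masks below are MP_LIMB_T_MAX/3 = 0x55..55, MP_LIMB_T_MAX/5
+     = 0x33..33 and MP_LIMB_T_MAX/17 = 0x0f..0f; each step of this
+     SWAR reduction doubles the counter field width while the partial
+     sums stay below overflow.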
*/ + + for (i = n >> 2; i != 0; i--) + { + p0 = POPHAM (up[0], vp[0]); + p0 -= (p0 >> 1) & MP_LIMB_T_MAX/3; /* 2 0-2 */ + p0 = ((p0 >> 2) & MP_LIMB_T_MAX/5) + (p0 & MP_LIMB_T_MAX/5); /* 4 0-4 */ + + p1 = POPHAM (up[1], vp[1]); + p1 -= (p1 >> 1) & MP_LIMB_T_MAX/3; /* 2 0-2 */ + p1 = ((p1 >> 2) & MP_LIMB_T_MAX/5) + (p1 & MP_LIMB_T_MAX/5); /* 4 0-4 */ + + p01 = p0 + p1; /* 8 0-8 */ + p01 = ((p01 >> 4) & MP_LIMB_T_MAX/17) + (p01 & MP_LIMB_T_MAX/17); /* 8 0-16 */ + + p2 = POPHAM (up[2], vp[2]); + p2 -= (p2 >> 1) & MP_LIMB_T_MAX/3; /* 2 0-2 */ + p2 = ((p2 >> 2) & MP_LIMB_T_MAX/5) + (p2 & MP_LIMB_T_MAX/5); /* 4 0-4 */ + + p3 = POPHAM (up[3], vp[3]); + p3 -= (p3 >> 1) & MP_LIMB_T_MAX/3; /* 2 0-2 */ + p3 = ((p3 >> 2) & MP_LIMB_T_MAX/5) + (p3 & MP_LIMB_T_MAX/5); /* 4 0-4 */ + + p23 = p2 + p3; /* 8 0-8 */ + p23 = ((p23 >> 4) & MP_LIMB_T_MAX/17) + (p23 & MP_LIMB_T_MAX/17); /* 8 0-16 */ + + x = p01 + p23; /* 8 0-32 */ + x = (x >> 8) + x; /* 8 0-64 */ + x = (x >> 16) + x; /* 8 0-128 */ +#if GMP_LIMB_BITS > 32 + x = ((x >> 32) & 0xff) + (x & 0xff); /* 8 0-256 */ + result += x; +#else + result += x & 0xff; +#endif + up += 4; +#if OPERATION_hamdist + vp += 4; +#endif + } + + n &= 3; + if (n != 0) + { + x = 0; + do + { + p0 = POPHAM (up[0], vp[0]); + p0 -= (p0 >> 1) & MP_LIMB_T_MAX/3; /* 2 0-2 */ + p0 = ((p0 >> 2) & MP_LIMB_T_MAX/5) + (p0 & MP_LIMB_T_MAX/5); /* 4 0-4 */ + p0 = ((p0 >> 4) + p0) & MP_LIMB_T_MAX/17; /* 8 0-8 */ + + x += p0; + up += 1; +#if OPERATION_hamdist + vp += 1; +#endif + } + while (--n); + + x = (x >> 8) + x; + x = (x >> 16) + x; +#if GMP_LIMB_BITS > 32 + x = (x >> 32) + x; +#endif + result += x & 0xff; + } + + return result; +} diff --git a/gmp-6.3.0/mpn/generic/pow_1.c b/gmp-6.3.0/mpn/generic/pow_1.c new file mode 100644 index 0000000..de11cd2 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/pow_1.c @@ -0,0 +1,135 @@ +/* mpn_pow_1 -- Compute powers R = U^exp. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2002, 2014 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#include "gmp-impl.h" +#include "longlong.h" + +mp_size_t +mpn_pow_1 (mp_ptr rp, mp_srcptr bp, mp_size_t bn, mp_limb_t exp, mp_ptr tp) +{ + mp_limb_t x; + int cnt, i; + mp_size_t rn; + int par; + + ASSERT (bn >= 1); + /* FIXME: Add operand overlap criteria */ + + if (exp <= 1) + { + if (exp == 0) + { + rp[0] = 1; + return 1; + } + else + { + MPN_COPY (rp, bp, bn); + return bn; + } + } + + /* Count number of bits in exp, and compute where to put initial square in + order to magically get results in the entry rp. Use simple code, + optimized for small exp. For large exp, the bignum operations will take + so much time that the slowness of this code will be negligible. */ + par = 0; + cnt = GMP_LIMB_BITS; + x = exp; + do + { + par ^= x; + cnt--; + x >>= 1; + } while (x != 0); + exp <<= cnt; + + if (bn == 1) + { + mp_limb_t rl, rh, bl = bp[0]; + + if ((cnt & 1) != 0) + MP_PTR_SWAP (rp, tp); + + umul_ppmm (rh, rl, bl, bl << GMP_NAIL_BITS); + rp[0] = rl >> GMP_NAIL_BITS; + rp[1] = rh; + rn = 1 + (rh != 0); + + for (i = GMP_LIMB_BITS - cnt - 1;;) + { + exp <<= 1; + if ((exp & GMP_LIMB_HIGHBIT) != 0) + { + rp[rn] = rh = mpn_mul_1 (rp, rp, rn, bl); + rn += rh != 0; + } + + if (--i == 0) + break; + + mpn_sqr (tp, rp, rn); + rn = 2 * rn; rn -= tp[rn - 1] == 0; + MP_PTR_SWAP (rp, tp); + } + } + else + { + if (((par ^ cnt) & 1) == 0) + MP_PTR_SWAP (rp, tp); + + mpn_sqr (rp, bp, bn); + rn = 2 * bn; rn -= rp[rn - 1] == 0; + + for (i = GMP_LIMB_BITS - cnt - 1;;) + { + exp <<= 1; + if ((exp & GMP_LIMB_HIGHBIT) != 0) + { + rn = rn + bn - (mpn_mul (tp, rp, rn, bp, bn) == 0); + MP_PTR_SWAP (rp, tp); + } + + if (--i == 0) + break; + + mpn_sqr (tp, rp, rn); + rn = 2 * rn; rn -= tp[rn - 1] == 0; + MP_PTR_SWAP (rp, tp); + } + } + + return rn; +} diff --git a/gmp-6.3.0/mpn/generic/powlo.c b/gmp-6.3.0/mpn/generic/powlo.c new file mode 100644 index 0000000..c109512 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/powlo.c @@ -0,0 +1,188 @@ +/* mpn_powlo -- Compute R = U^E mod B^n, where B is the limb base. + +Copyright 2007-2009, 2012, 2015, 2016, 2018, 2020 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#include "gmp-impl.h" +#include "longlong.h" + + +#define getbit(p,bi) \ + ((p[(bi - 1) / GMP_LIMB_BITS] >> (bi - 1) % GMP_LIMB_BITS) & 1) + +static inline mp_limb_t +getbits (const mp_limb_t *p, mp_bitcnt_t bi, unsigned nbits) +{ + unsigned nbits_in_r; + mp_limb_t r; + mp_size_t i; + + if (bi <= nbits) + { + return p[0] & (((mp_limb_t) 1 << bi) - 1); + } + else + { + bi -= nbits; /* bit index of low bit to extract */ + i = bi / GMP_NUMB_BITS; /* word index of low bit to extract */ + bi %= GMP_NUMB_BITS; /* bit index in low word */ + r = p[i] >> bi; /* extract (low) bits */ + nbits_in_r = GMP_NUMB_BITS - bi; /* number of bits now in r */ + if (nbits_in_r < nbits) /* did we get enough bits? */ + r += p[i + 1] << nbits_in_r; /* prepend bits from higher word */ + return r & (((mp_limb_t) 1 << nbits) - 1); + } +} + +static inline unsigned +win_size (mp_bitcnt_t eb) +{ + unsigned k; + static mp_bitcnt_t x[] = {7,25,81,241,673,1793,4609,11521,28161,~(mp_bitcnt_t)0}; + ASSERT (eb > 1); + for (k = 0; eb > x[k++];) + ; + return k; +} + +/* rp[n-1..0] = bp[n-1..0] ^ ep[en-1..0] mod B^n, B is the limb base. + Requires that ep[en-1] is non-zero. + Uses scratch space tp[3n-1..0], i.e., 3n words. */ +/* We only use n words in the scratch space, we should pass tp + n to + mullo/sqrlo as a temporary area, it is needed. */ +void +mpn_powlo (mp_ptr rp, mp_srcptr bp, + mp_srcptr ep, mp_size_t en, + mp_size_t n, mp_ptr tp) +{ + unsigned cnt; + mp_bitcnt_t ebi; + unsigned windowsize, this_windowsize; + mp_limb_t expbits; + mp_limb_t *pp; + long i; + int flipflop; + TMP_DECL; + + ASSERT (en > 1 || (en == 1 && ep[0] > 1)); + + TMP_MARK; + + MPN_SIZEINBASE_2EXP(ebi, ep, en, 1); + + windowsize = win_size (ebi); + if (windowsize > 1) + { + mp_limb_t *this_pp, *last_pp; + ASSERT (windowsize < ebi); + + pp = TMP_ALLOC_LIMBS ((n << (windowsize - 1))); + + this_pp = pp; + + MPN_COPY (this_pp, bp, n); + + /* Store b^2 in tp. */ + mpn_sqrlo (tp, bp, n); + + /* Precompute odd powers of b and put them in the temporary area at pp. */ + i = (1 << (windowsize - 1)) - 1; + do + { + last_pp = this_pp; + this_pp += n; + mpn_mullo_n (this_pp, last_pp, tp, n); + } while (--i != 0); + + expbits = getbits (ep, ebi, windowsize); + ebi -= windowsize; + + /* THINK: Should we initialise the case expbits % 4 == 0 with a mullo? */ + count_trailing_zeros (cnt, expbits); + ebi += cnt; + expbits >>= cnt; + + MPN_COPY (rp, pp + n * (expbits >> 1), n); + } + else + { + pp = tp + n; + MPN_COPY (pp, bp, n); + MPN_COPY (rp, bp, n); + --ebi; + } + + flipflop = 0; + + do + { + while (getbit (ep, ebi) == 0) + { + mpn_sqrlo (tp, rp, n); + MP_PTR_SWAP (rp, tp); + flipflop = ! flipflop; + if (--ebi == 0) + goto done; + } + + /* The next bit of the exponent is 1. Now extract the largest block of + bits <= windowsize, and such that the least significant bit is 1. */ + + expbits = getbits (ep, ebi, windowsize); + this_windowsize = MIN (windowsize, ebi); + + count_trailing_zeros (cnt, expbits); + this_windowsize -= cnt; + ebi -= this_windowsize; + expbits >>= cnt; + + while (this_windowsize > 1) + { + mpn_sqrlo (tp, rp, n); + mpn_sqrlo (rp, tp, n); + this_windowsize -= 2; + } + + if (this_windowsize != 0) + mpn_sqrlo (tp, rp, n); + else + { + MP_PTR_SWAP (rp, tp); + flipflop = ! 
flipflop; + } + + mpn_mullo_n (rp, tp, pp + n * (expbits >> 1), n); + } while (ebi != 0); + + done: + if (flipflop) + MPN_COPY (tp, rp, n); + TMP_FREE; +} diff --git a/gmp-6.3.0/mpn/generic/powm.c b/gmp-6.3.0/mpn/generic/powm.c new file mode 100644 index 0000000..1e30f2f --- /dev/null +++ b/gmp-6.3.0/mpn/generic/powm.c @@ -0,0 +1,1003 @@ +/* mpn_powm -- Compute R = U^E mod M. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2007-2012, 2019-2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +/* + BASIC ALGORITHM, Compute U^E mod M, where M < B^n is odd. + + 1. W <- U + + 2. T <- (B^n * U) mod M Convert to REDC form + + 3. Compute table U^1, U^3, U^5... of E-dependent size + + 4. While there are more bits in E + W <- power left-to-right base-k + + + TODO: + + * Make getbits a macro, thereby allowing it to update the index operand. + That will simplify the code using getbits. (Perhaps make getbits' sibling + getbit then have similar form, for symmetry.) + + * Write an itch function. Or perhaps get rid of tp parameter since the huge + pp area is allocated locally anyway? + + * Choose window size without looping. (Superoptimize or think(tm).) + + * Handle small bases with initial, reduction-free exponentiation. + + * Call new division functions, not mpn_tdiv_qr. + + * Consider special code for one-limb M. + + * How should we handle the redc1/redc2/redc_n choice? + - redc1: T(binvert_1limb) + e * (n) * (T(mullo-1x1) + n*T(addmul_1)) + - redc2: T(binvert_2limbs) + e * (n/2) * (T(mullo-2x2) + n*T(addmul_2)) + - redc_n: T(binvert_nlimbs) + e * (T(mullo-nxn) + T(M(n))) + This disregards the addmul_N constant term, but we could think of + that as part of the respective mullo. + + * When U (the base) is small, we should start the exponentiation with plain + operations, then convert that partial result to REDC form. + + * When U is just one limb, should it be handled without the k-ary tricks? + We could keep a factor of B^n in W, but use U' = BU as base. After + multiplying by this (pseudo two-limb) number, we need to multiply by 1/B + mod M. 
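+
+  For orientation, the REDC step that the MPN_REDC_* variants below
+  implement, sketched in its textbook additive form for a single limb
+  (B = 2^GMP_LIMB_BITS, m odd, minv = -1/m mod B; the macros actually
+  use an equivalent subtractive variant):
+
+    q = ((t mod B) * minv) mod B;   /* makes t + q*m divisible by B */
+    r = (t + q*m) / B;              /* exact; r == t/B mod m, r < 2m */
+    if (r >= m)
+      r -= m;                       /* canonical residue */
+
+  Each multiplication of REDC-form operands is followed by one such
+  division by B, cancelling the extra factor of B; hence the initial
+  redcify conversion and the final conversion back at the end.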
+*/ + +#include "gmp-impl.h" +#include "longlong.h" + +#undef MPN_REDC_0 +#define MPN_REDC_0(r0, u1, u0, m0, invm) \ + do { \ + mp_limb_t _p1, _u1, _u0, _m0, _r0, _dummy; \ + _u0 = (u0); \ + _m0 = (m0); \ + umul_ppmm (_p1, _dummy, _m0, (_u0 * (invm)) & GMP_NUMB_MASK); \ + ASSERT (((_u0 - _dummy) & GMP_NUMB_MASK) == 0); \ + _u1 = (u1); \ + _r0 = _u1 - _p1; \ + _r0 = _u1 < _p1 ? _r0 + _m0 : _r0; /* _u1 < _r0 */ \ + (r0) = _r0 & GMP_NUMB_MASK; \ + } while (0) + +#undef MPN_REDC_1 +#if HAVE_NATIVE_mpn_sbpi1_bdiv_r +#define MPN_REDC_1(rp, up, mp, n, invm) \ + do { \ + mp_limb_t cy; \ + cy = mpn_sbpi1_bdiv_r (up, 2 * n, mp, n, invm); \ + if (cy != 0) \ + mpn_sub_n (rp, up + n, mp, n); \ + else \ + MPN_COPY (rp, up + n, n); \ + } while (0) +#else +#define MPN_REDC_1(rp, up, mp, n, invm) \ + do { \ + mp_limb_t cy; \ + cy = mpn_redc_1 (rp, up, mp, n, invm); \ + if (cy != 0) \ + mpn_sub_n (rp, rp, mp, n); \ + } while (0) +#endif + +#undef MPN_REDC_2 +#define MPN_REDC_2(rp, up, mp, n, mip) \ + do { \ + mp_limb_t cy; \ + cy = mpn_redc_2 (rp, up, mp, n, mip); \ + if (cy != 0) \ + mpn_sub_n (rp, rp, mp, n); \ + } while (0) + +#if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2 +#define WANT_REDC_2 1 +#endif + +#define getbit(p,bi) \ + ((p[(bi - 1) / GMP_LIMB_BITS] >> (bi - 1) % GMP_LIMB_BITS) & 1) + +static inline mp_limb_t +getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits) +{ + int nbits_in_r; + mp_limb_t r; + mp_size_t i; + + if (bi <= nbits) + { + return p[0] & (((mp_limb_t) 1 << bi) - 1); + } + else + { + bi -= nbits; /* bit index of low bit to extract */ + i = bi / GMP_NUMB_BITS; /* word index of low bit to extract */ + bi %= GMP_NUMB_BITS; /* bit index in low word */ + r = p[i] >> bi; /* extract (low) bits */ + nbits_in_r = GMP_NUMB_BITS - bi; /* number of bits now in r */ + if (nbits_in_r < nbits) /* did we get enough bits? */ + r += p[i + 1] << nbits_in_r; /* prepend bits from higher word */ + return r & (((mp_limb_t) 1 << nbits) - 1); + } +} + +static inline int +win_size (mp_bitcnt_t eb) +{ + int k; + static mp_bitcnt_t x[] = {7,25,81,241,673,1793,4609,11521,28161,~(mp_bitcnt_t)0}; + for (k = 0; eb > x[k++]; ) + ; + return k; +} + +/* Convert U to REDC form, U_r = B^n * U mod M */ +static void +redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n) +{ + mp_ptr tp, qp; + TMP_DECL; + TMP_MARK; + + TMP_ALLOC_LIMBS_2 (tp, un + n, qp, un + 1); + + MPN_ZERO (tp, n); + MPN_COPY (tp + n, up, un); + mpn_tdiv_qr (qp, rp, 0L, tp, un + n, mp, n); + TMP_FREE; +} + +#if ! HAVE_NATIVE_mpn_rsblsh1_n_ip2 +#undef mpn_rsblsh1_n_ip2 +#if HAVE_NATIVE_mpn_rsblsh1_n +#define mpn_rsblsh1_n_ip2(a,b,n) mpn_rsblsh1_n(a,b,a,n) +#else +#define mpn_rsblsh1_n_ip2(a,b,n) \ + do \ + { \ + mpn_lshift (a, a, n, 1); \ + mpn_sub_n (a, a, b, n); \ + } while (0) +#endif +#endif + +#define INNERLOOP2 \ + do \ + { \ + MPN_SQR (tp, rp, n); \ + MPN_REDUCE (rp, tp, mp, n, mip); \ + if (mpn_cmp (rp, mp, n) >= 0) \ + ASSERT_NOCARRY (mpn_sub_n (rp, rp, mp, n)); \ + if (getbit (ep, ebi) != 0) \ + { \ + if (rp[n - 1] >> (mbi - 1) % GMP_LIMB_BITS == 0) \ + ASSERT_NOCARRY (mpn_lshift (rp, rp, n, 1)); \ + else \ + mpn_rsblsh1_n_ip2 (rp, mp, n); \ + } \ + } while (--ebi != 0) + +/* rp[n-1..0] = 2 ^ ep[en-1..0] mod mp[n-1..0] + Requires that mp[n-1..0] is odd and > 1. + Requires that ep[en-1..0] is > 1. + Uses scratch space at tp of MAX(mpn_binvert_itch(n),2n) limbs. 
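+
+   Why base 2 gets its own routine: the window multiplications of the
+   general mpn_powm degenerate into doublings, so once the leading
+   exponent bits have seeded r the loop body is schematically just
+   (everything mod m, in REDC form)
+
+     r = r * r;            /* MPN_SQR + MPN_REDUCE */
+     if (next exponent bit is 1)
+       r = 2 * r;          /* mpn_lshift, or 2*r - m via mpn_rsblsh1_n_ip2 */
+
+   which is what INNERLOOP2 above expands to in each threshold regime.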
*/ +static void +mpn_2powm (mp_ptr rp, mp_srcptr ep, mp_size_t en, + mp_srcptr mp, mp_size_t n, mp_ptr tp) +{ + mp_limb_t ip[2], *mip; + mp_bitcnt_t ebi, mbi, tbi; + mp_size_t tn; + int count; + TMP_DECL; + + ASSERT (en > 1 || (en == 1 && ep[0] > 1)); + ASSERT (n > 0 && (mp[0] & 1) != 0); + + MPN_SIZEINBASE_2EXP(ebi, ep, en, 1); + MPN_SIZEINBASE_2EXP(mbi, mp, n, 1); + + if (LIKELY (mbi <= GMP_NUMB_MAX)) + { + count_leading_zeros(count, (mp_limb_t) mbi); + count = GMP_NUMB_BITS - (count - GMP_NAIL_BITS); + } + else + { + mp_bitcnt_t tc = mbi; + + count = 0; + do { ++count; } while ((tc >>= 1) != 0); + } + + tbi = getbits (ep, ebi, count); + if (tbi >= mbi) + { + --count; + ASSERT ((tbi >> count) == 1); + tbi >>= 1; + ASSERT (tbi < mbi); + ASSERT (ebi > count); + } + else if (ebi <= count) + { + MPN_FILL (rp, n, 0); + rp[tbi / GMP_LIMB_BITS] = CNST_LIMB (1) << (tbi % GMP_LIMB_BITS); + return; + } + ebi -= count; + + if (n == 1) + { + mp_limb_t r0, m0, invm; + m0 = *mp; + + /* redcify (rp, tp, tn + 1, mp, n); */ + /* TODO: test direct use of udiv_qrnnd */ + ASSERT (tbi < GMP_LIMB_BITS); + tp[1] = CNST_LIMB (1) << tbi; + tp[0] = CNST_LIMB (0); + r0 = mpn_mod_1 (tp, 2, m0); + + binvert_limb (invm, m0); + do + { + mp_limb_t t0, t1, t2; + /* MPN_SQR (tp, rp, n); */ + umul_ppmm (t1, t0, r0, r0); + /* MPN_REDUCE (rp, tp, mp, n, mip); */ + MPN_REDC_0(r0, t1, t0, m0, invm); + + t2 = r0 << 1; + t2 = r0 > (m0 >> 1) ? t2 - m0 : t2; + r0 = getbit (ep, ebi) != 0 ? t2 : r0; + } while (--ebi != 0); + + /* tp[1] = 0; tp[0] = r0; */ + /* MPN_REDUCE (rp, tp, mp, n, mip); */ + MPN_REDC_0(*rp, 0, r0, m0, invm); + + return; + } + + TMP_MARK; + +#if WANT_REDC_2 + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + { + mip = ip; + binvert_limb (ip[0], mp[0]); + ip[0] = -ip[0]; + } + else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) + { + mip = ip; + mpn_binvert (ip, mp, 2, tp); + ip[0] = -ip[0]; ip[1] = ~ip[1]; + } +#else + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) + { + mip = ip; + binvert_limb (ip[0], mp[0]); + ip[0] = -ip[0]; + } +#endif + else + { + mip = TMP_ALLOC_LIMBS (n); + mpn_binvert (mip, mp, n, tp); + } + + tn = tbi / GMP_LIMB_BITS; + MPN_ZERO (tp, tn); + tp[tn] = CNST_LIMB (1) << (tbi % GMP_LIMB_BITS); + + redcify (rp, tp, tn + 1, mp, n); + +#if WANT_REDC_2 + if (REDC_1_TO_REDC_2_THRESHOLD < MUL_TOOM22_THRESHOLD) + { + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + { + if (REDC_1_TO_REDC_2_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP2; + } + else + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP2; + } + } + else if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) + { + if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip) + INNERLOOP2; + } + else + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip) + INNERLOOP2; + } + } + else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) 
mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip) + INNERLOOP2; + } + else + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP2; + } + } + else + { + if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) + { + if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP2; + } + else + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP2; + } + } + else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP2; + } + else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip) + INNERLOOP2; + } + else + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP2; + } + } + +#else /* WANT_REDC_2 */ + + if (REDC_1_TO_REDC_N_THRESHOLD < MUL_TOOM22_THRESHOLD) + { + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) + { + if (REDC_1_TO_REDC_N_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP2; + } + else + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP2; + } + } + else if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) + { + if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP2; + } + else + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP2; + } + } + else + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP2; + } + } + else + { + if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) + { + if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP2; + } + else + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP2; + } + } + else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP2; + } + else + { +#undef 
MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP2; + } + } +#endif /* WANT_REDC_2 */ + + MPN_COPY (tp, rp, n); + MPN_FILL (tp + n, n, 0); + +#if WANT_REDC_2 + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + MPN_REDC_1 (rp, tp, mp, n, ip[0]); + else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) + MPN_REDC_2 (rp, tp, mp, n, mip); +#else + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) + MPN_REDC_1 (rp, tp, mp, n, ip[0]); +#endif + else + mpn_redc_n (rp, tp, mp, n, mip); + + if (mpn_cmp (rp, mp, n) >= 0) + mpn_sub_n (rp, rp, mp, n); + + TMP_FREE; +} + +/* rp[n-1..0] = bp[bn-1..0] ^ ep[en-1..0] mod mp[n-1..0] + Requires that mp[n-1..0] is odd. + Requires that ep[en-1..0] is > 1. + Uses scratch space at tp of MAX(mpn_binvert_itch(n),2n) limbs. */ +void +mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn, + mp_srcptr ep, mp_size_t en, + mp_srcptr mp, mp_size_t n, mp_ptr tp) +{ + mp_limb_t ip[2], *mip; + int cnt; + mp_bitcnt_t ebi; + int windowsize, this_windowsize; + mp_limb_t expbits; + mp_ptr pp, this_pp; + long i; + TMP_DECL; + + ASSERT (en > 1 || (en == 1 && ep[0] > 1)); + ASSERT (n >= 1 && ((mp[0] & 1) != 0)); + + if (bn == 1 && bp[0] == 2) + { + mpn_2powm (rp, ep, en, mp, n, tp); + return; + } + + TMP_MARK; + + MPN_SIZEINBASE_2EXP(ebi, ep, en, 1); + +#if 0 + if (bn < n) + { + /* Do the first few exponent bits without mod reductions, + until the result is greater than the mod argument. */ + for (;;) + { + mpn_sqr (tp, this_pp, tn); + tn = tn * 2 - 1, tn += tp[tn] != 0; + if (getbit (ep, ebi) != 0) + mpn_mul (..., tp, tn, bp, bn); + ebi--; + } + } +#endif + + windowsize = win_size (ebi); + +#if WANT_REDC_2 + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + { + mip = ip; + binvert_limb (mip[0], mp[0]); + mip[0] = -mip[0]; + } + else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) + { + mip = ip; + mpn_binvert (mip, mp, 2, tp); + mip[0] = -mip[0]; mip[1] = ~mip[1]; + } +#else + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) + { + mip = ip; + binvert_limb (mip[0], mp[0]); + mip[0] = -mip[0]; + } +#endif + else + { + mip = TMP_ALLOC_LIMBS (n); + mpn_binvert (mip, mp, n, tp); + } + + pp = TMP_ALLOC_LIMBS (n << (windowsize - 1)); + + this_pp = pp; + redcify (this_pp, bp, bn, mp, n); + + /* Store b^2 at rp. */ + mpn_sqr (tp, this_pp, n); +#if 0 + if (n == 1) { + MPN_REDC_0 (rp[0], tp[1], tp[0], mp[0], -mip[0]); + } else +#endif +#if WANT_REDC_2 + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + MPN_REDC_1 (rp, tp, mp, n, mip[0]); + else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) + MPN_REDC_2 (rp, tp, mp, n, mip); +#else + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) + MPN_REDC_1 (rp, tp, mp, n, mip[0]); +#endif + else + mpn_redc_n (rp, tp, mp, n, mip); + + /* Precompute odd powers of b and put them in the temporary area at pp. 
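+
+     Table layout: entry k, at pp + k*n, holds b^(2k+1) in REDC form, so
+     an odd window value e is fetched from pp + n * (e >> 1).  E.g. with
+     windowsize = 4 the entries are b^1, b^3, ..., b^15, and e = 11 is
+     entry 11 >> 1 = 5.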
*/ + for (i = (1 << (windowsize - 1)) - 1; i > 0; i--) +#if 1 + if (n == 1) { + umul_ppmm((tp)[1], *(tp), *(this_pp), *(rp)); + ++this_pp ; + MPN_REDC_0 (*this_pp, tp[1], tp[0], *mp, -mip[0]); + } else +#endif + { + mpn_mul_n (tp, this_pp, rp, n); + this_pp += n; +#if WANT_REDC_2 + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + MPN_REDC_1 (this_pp, tp, mp, n, mip[0]); + else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) + MPN_REDC_2 (this_pp, tp, mp, n, mip); +#else + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) + MPN_REDC_1 (this_pp, tp, mp, n, mip[0]); +#endif + else + mpn_redc_n (this_pp, tp, mp, n, mip); + } + + expbits = getbits (ep, ebi, windowsize); + ebi -= windowsize; + + /* THINK: Should we initialise the case expbits % 4 == 0 with a mul? */ + count_trailing_zeros (cnt, expbits); + ebi += cnt; + expbits >>= cnt; + + MPN_COPY (rp, pp + n * (expbits >> 1), n); + +#define INNERLOOP \ + while (ebi != 0) \ + { \ + while (getbit (ep, ebi) == 0) \ + { \ + MPN_SQR (tp, rp, n); \ + MPN_REDUCE (rp, tp, mp, n, mip); \ + if (--ebi == 0) \ + goto done; \ + } \ + \ + /* The next bit of the exponent is 1. Now extract the largest \ + block of bits <= windowsize, and such that the least \ + significant bit is 1. */ \ + \ + expbits = getbits (ep, ebi, windowsize); \ + this_windowsize = MIN (ebi, windowsize); \ + \ + count_trailing_zeros (cnt, expbits); \ + this_windowsize -= cnt; \ + ebi -= this_windowsize; \ + expbits >>= cnt; \ + \ + do \ + { \ + MPN_SQR (tp, rp, n); \ + MPN_REDUCE (rp, tp, mp, n, mip); \ + } \ + while (--this_windowsize != 0); \ + \ + MPN_MUL_N (tp, rp, pp + n * (expbits >> 1), n); \ + MPN_REDUCE (rp, tp, mp, n, mip); \ + } + + + if (n == 1) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) umul_ppmm((r)[1], *(r), *(a), *(b)) +#define MPN_SQR(r,a,n) umul_ppmm((r)[1], *(r), *(a), *(a)) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_0(*(rp), (tp)[1], (tp)[0], *(mp), - *(mip)) + INNERLOOP; + } + else +#if WANT_REDC_2 + if (REDC_1_TO_REDC_2_THRESHOLD < MUL_TOOM22_THRESHOLD) + { + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + { + if (REDC_1_TO_REDC_2_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + else + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + } + else if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) + { + if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip) + INNERLOOP; + } + else + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip) + INNERLOOP; + } + } + else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE 
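+/* (Throughout this dispatch, each branch merely re-binds MPN_MUL_N,
+   MPN_SQR and MPN_REDUCE to the cheapest routines for the current size
+   class and then expands the same INNERLOOP.) */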
+#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n) +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip) + INNERLOOP; + } + else + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n) +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP; + } + } + else + { + if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) + { + if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + else + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + } + else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n) +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n) +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip) + INNERLOOP; + } + else + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n) +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP; + } + } + +#else /* WANT_REDC_2 */ + + if (REDC_1_TO_REDC_N_THRESHOLD < MUL_TOOM22_THRESHOLD) + { + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) + { + if (REDC_1_TO_REDC_N_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + else + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + } + else if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) + { + if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP; + } + else + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP; + } + } + else + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n) +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) 
mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP; + } + } + else + { + if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) + { + if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + else + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + } + else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n) +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + else + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n) +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP; + } + } +#endif /* WANT_REDC_2 */ + + done: + + MPN_COPY (tp, rp, n); + MPN_ZERO (tp + n, n); + +#if WANT_REDC_2 + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + MPN_REDC_1 (rp, tp, mp, n, mip[0]); + else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) + MPN_REDC_2 (rp, tp, mp, n, mip); +#else + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) + MPN_REDC_1 (rp, tp, mp, n, mip[0]); +#endif + else + mpn_redc_n (rp, tp, mp, n, mip); + + if (mpn_cmp (rp, mp, n) >= 0) + mpn_sub_n (rp, rp, mp, n); + + TMP_FREE; +} diff --git a/gmp-6.3.0/mpn/generic/pre_divrem_1.c b/gmp-6.3.0/mpn/generic/pre_divrem_1.c new file mode 100644 index 0000000..3b29d77 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/pre_divrem_1.c @@ -0,0 +1,145 @@ +/* mpn_preinv_divrem_1 -- mpn by limb division with pre-inverted divisor. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2000-2003 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* Don't bloat a shared library with unused code. */ +#if USE_PREINV_DIVREM_1 + +/* Same test here for skipping one divide step as in mpn_divrem_1. 
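+
+   (The pre-inversion relied on throughout: for the normalized divisor
+   d, with high bit set, dinv = floor ((B^2 - 1) / d) - B where
+   B = 2^GMP_LIMB_BITS.  Each udiv_qrnnd_preinv step then derives a
+   candidate quotient from a single umul_ppmm, roughly
+
+     q ~= high (n1 * dinv) + n1,
+
+   and at most two +-d adjustments of the remainder make the step exact,
+   so the loop needs no hardware divide instruction.)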
+ + The main reason for a separate shift==0 case is that not all CPUs give + zero for "n0 >> GMP_LIMB_BITS" which would arise in the general case + code used on shift==0. shift==0 is also reasonably common in mp_bases + big_base, for instance base==10 on a 64-bit limb. + + Under shift!=0 it would be possible to call mpn_lshift to adjust the + dividend all in one go (into the quotient space say), rather than + limb-by-limb in the loop. This might help if mpn_lshift is a lot faster + than what the compiler can generate for EXTRACT. But this is left to CPU + specific implementations to consider, especially since EXTRACT isn't on + the dependent chain. + + If size==0 then the result is simply xsize limbs of zeros, but nothing + special is done for that, since it wouldn't be a usual call, and + certainly never arises from mpn_get_str which is our main caller. */ + +mp_limb_t +mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t xsize, + mp_srcptr ap, mp_size_t size, mp_limb_t d_unnorm, + mp_limb_t dinv, int shift) +{ + mp_limb_t ahigh, qhigh, r; + mp_size_t i; + mp_limb_t n1, n0; + mp_limb_t d; + + ASSERT (xsize >= 0); + ASSERT (size >= 1); + ASSERT (d_unnorm != 0); +#if WANT_ASSERT + { + int want_shift; + mp_limb_t want_dinv; + count_leading_zeros (want_shift, d_unnorm); + ASSERT (shift == want_shift); + invert_limb (want_dinv, d_unnorm << shift); + ASSERT (dinv == want_dinv); + } +#endif + /* FIXME: What's the correct overlap rule when xsize!=0? */ + ASSERT (MPN_SAME_OR_SEPARATE_P (qp+xsize, ap, size)); + + ahigh = ap[size-1]; + d = d_unnorm << shift; + qp += (size + xsize - 1); /* dest high limb */ + + if (shift == 0) + { + /* High quotient limb is 0 or 1, and skip a divide step. */ + r = ahigh; + qhigh = (r >= d); + r = (qhigh ? r-d : r); + *qp-- = qhigh; + size--; + + for (i = size-1; i >= 0; i--) + { + n0 = ap[i]; + udiv_qrnnd_preinv (*qp, r, r, n0, d, dinv); + qp--; + } + } + else + { + r = 0; + if (ahigh < d_unnorm) + { + r = ahigh << shift; + *qp-- = 0; + size--; + if (size == 0) + goto done_integer; + } + + n1 = ap[size-1]; + r |= n1 >> (GMP_LIMB_BITS - shift); + + for (i = size-2; i >= 0; i--) + { + ASSERT (r < d); + n0 = ap[i]; + udiv_qrnnd_preinv (*qp, r, r, + ((n1 << shift) | (n0 >> (GMP_LIMB_BITS - shift))), + d, dinv); + qp--; + n1 = n0; + } + udiv_qrnnd_preinv (*qp, r, r, n1 << shift, d, dinv); + qp--; + } + + done_integer: + for (i = 0; i < xsize; i++) + { + udiv_qrnnd_preinv (*qp, r, r, CNST_LIMB(0), d, dinv); + qp--; + } + + return r >> shift; +} + +#endif /* USE_PREINV_DIVREM_1 */ diff --git a/gmp-6.3.0/mpn/generic/pre_mod_1.c b/gmp-6.3.0/mpn/generic/pre_mod_1.c new file mode 100644 index 0000000..78ae308 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/pre_mod_1.c @@ -0,0 +1,61 @@ +/* mpn_preinv_mod_1 (up, un, d, dinv) -- Divide (UP,,UN) by the normalized D. + DINV should be 2^(2*GMP_LIMB_BITS) / D - 2^GMP_LIMB_BITS. + Return the single-limb remainder. + +Copyright 1991, 1993, 1994, 2000-2002, 2004, 2005 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* This function used to be documented, but is now considered obsolete. It + continues to exist for binary compatibility, even when not required + internally. */ + +mp_limb_t +mpn_preinv_mod_1 (mp_srcptr up, mp_size_t un, mp_limb_t d, mp_limb_t dinv) +{ + mp_size_t i; + mp_limb_t n0, r; + + ASSERT (un >= 1); + ASSERT (d & GMP_LIMB_HIGHBIT); + + r = up[un - 1]; + if (r >= d) + r -= d; + + for (i = un - 2; i >= 0; i--) + { + n0 = up[i]; + udiv_rnnd_preinv (r, r, n0, d, dinv); + } + return r; +} diff --git a/gmp-6.3.0/mpn/generic/random.c b/gmp-6.3.0/mpn/generic/random.c new file mode 100644 index 0000000..485f9eb --- /dev/null +++ b/gmp-6.3.0/mpn/generic/random.c @@ -0,0 +1,50 @@ +/* mpn_random -- Generate random numbers. + +Copyright 2001, 2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +void +mpn_random (mp_ptr ptr, mp_size_t size) +{ + gmp_randstate_ptr rands; + + /* FIXME: Is size==0 supposed to be allowed? */ + ASSERT (size >= 0); + + if (size == 0) + return; + + rands = RANDS; + _gmp_rand (ptr, rands, size * GMP_NUMB_BITS); + + /* Make sure the most significant limb is non-zero. */ + while (ptr[size-1] == 0) + _gmp_rand (&ptr[size-1], rands, GMP_NUMB_BITS); +} diff --git a/gmp-6.3.0/mpn/generic/random2.c b/gmp-6.3.0/mpn/generic/random2.c new file mode 100644 index 0000000..1eede67 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/random2.c @@ -0,0 +1,105 @@ +/* mpn_random2 -- Generate random numbers with relatively long strings + of ones and zeroes. Suitable for border testing. + +Copyright 1992-1994, 1996, 2000-2002, 2004, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +static void gmp_rrandomb (mp_ptr, gmp_randstate_t, mp_bitcnt_t); + +/* Ask _gmp_rand for 32 bits per call unless that's more than a limb can hold. + Thus, we get the same random number sequence in the common cases. + FIXME: We should always generate the same random number sequence! */ +#if GMP_NUMB_BITS < 32 +#define BITS_PER_RANDCALL GMP_NUMB_BITS +#else +#define BITS_PER_RANDCALL 32 +#endif + +void +mpn_random2 (mp_ptr rp, mp_size_t n) +{ + gmp_randstate_ptr rstate = RANDS; + int bit_pos; /* bit number of least significant bit where + next bit field to be inserted */ + mp_limb_t ran, ranm; /* buffer for random bits */ + + /* FIXME: Is n==0 supposed to be allowed? */ + ASSERT (n >= 0); + + _gmp_rand (&ranm, rstate, BITS_PER_RANDCALL); + ran = ranm; + + /* Start off at a random bit position in the most significant limb. */ + bit_pos = ran % GMP_NUMB_BITS; + + gmp_rrandomb (rp, rstate, n * GMP_NUMB_BITS - bit_pos); +} + +static void +gmp_rrandomb (mp_ptr rp, gmp_randstate_t rstate, mp_bitcnt_t nbits) +{ + mp_bitcnt_t bi; + mp_limb_t ranm; /* buffer for random bits */ + unsigned cap_chunksize, chunksize; + mp_size_t i; + + /* Set entire result to 111..1 */ + i = BITS_TO_LIMBS (nbits) - 1; + rp[i] = GMP_NUMB_MAX >> (GMP_NUMB_BITS - (nbits % GMP_NUMB_BITS)) % GMP_NUMB_BITS; + for (i = i - 1; i >= 0; i--) + rp[i] = GMP_NUMB_MAX; + + _gmp_rand (&ranm, rstate, BITS_PER_RANDCALL); + cap_chunksize = nbits / (ranm % 4 + 1); + cap_chunksize += cap_chunksize == 0; /* make it at least 1 */ + + bi = nbits; + + for (;;) + { + _gmp_rand (&ranm, rstate, BITS_PER_RANDCALL); + chunksize = 1 + ranm % cap_chunksize; + bi = (bi < chunksize) ? 0 : bi - chunksize; + + if (bi == 0) + break; /* low chunk is ...1 */ + + rp[bi / GMP_NUMB_BITS] ^= CNST_LIMB (1) << bi % GMP_NUMB_BITS; + + _gmp_rand (&ranm, rstate, BITS_PER_RANDCALL); + chunksize = 1 + ranm % cap_chunksize; + bi = (bi < chunksize) ? 0 : bi - chunksize; + + mpn_incr_u (rp + bi / GMP_NUMB_BITS, CNST_LIMB (1) << bi % GMP_NUMB_BITS); + + if (bi == 0) + break; /* low chunk is ...0 */ + } +} diff --git a/gmp-6.3.0/mpn/generic/redc_1.c b/gmp-6.3.0/mpn/generic/redc_1.c new file mode 100644 index 0000000..eab128f --- /dev/null +++ b/gmp-6.3.0/mpn/generic/redc_1.c @@ -0,0 +1,56 @@ +/* mpn_redc_1. Set rp[] <- up[]/R^n mod mp[]. Clobber up[]. + mp[] is n limbs; up[] is 2n limbs. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + +Copyright (C) 2000-2002, 2004, 2008, 2009, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +mp_limb_t +mpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm) +{ + mp_size_t j; + mp_limb_t cy; + + ASSERT (n > 0); + ASSERT_MPN (up, 2*n); + + for (j = n - 1; j >= 0; j--) + { + cy = mpn_addmul_1 (up, mp, n, (up[0] * invm) & GMP_NUMB_MASK); + ASSERT (up[0] == 0); + up[0] = cy; + up++; + } + + cy = mpn_add_n (rp, up, up - n, n); + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/redc_2.c b/gmp-6.3.0/mpn/generic/redc_2.c new file mode 100644 index 0000000..8d15589 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/redc_2.c @@ -0,0 +1,110 @@ +/* mpn_redc_2. Set rp[] <- up[]/R^n mod mp[]. Clobber up[]. + mp[] is n limbs; up[] is 2n limbs. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + +Copyright (C) 2000-2002, 2004, 2008, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +#if GMP_NAIL_BITS != 0 +you lose +#endif + +/* For testing purposes, define our own mpn_addmul_2 if there is none already + available. */ +#ifndef HAVE_NATIVE_mpn_addmul_2 +#undef mpn_addmul_2 +static mp_limb_t +mpn_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp) +{ + rp[n] = mpn_addmul_1 (rp, up, n, vp[0]); + return mpn_addmul_1 (rp + 1, up, n, vp[1]); +} +#endif + +#if defined (__GNUC__) && ! 
defined (NO_ASM) \ + && defined (__ia64) && W_TYPE_SIZE == 64 +#define umul2low(ph, pl, uh, ul, vh, vl) \ + do { \ + mp_limb_t _ph, _pl; \ + __asm__ ("xma.hu %0 = %3, %5, f0\n\t" \ + "xma.l %1 = %3, %5, f0\n\t" \ + ";;\n\t" \ + "xma.l %0 = %3, %4, %0\n\t" \ + ";;\n\t" \ + "xma.l %0 = %2, %5, %0" \ + : "=&f" (ph), "=&f" (pl) \ + : "f" (uh), "f" (ul), "f" (vh), "f" (vl)); \ + } while (0) +#endif + +#ifndef umul2low +#define umul2low(ph, pl, uh, ul, vh, vl) \ + do { \ + mp_limb_t _ph, _pl; \ + umul_ppmm (_ph, _pl, ul, vl); \ + (ph) = _ph + (ul) * (vh) + (uh) * (vl); \ + (pl) = _pl; \ + } while (0) +#endif + +mp_limb_t +mpn_redc_2 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_srcptr mip) +{ + mp_limb_t q[2]; + mp_size_t j; + mp_limb_t upn; + mp_limb_t cy; + + ASSERT (n > 0); + ASSERT_MPN (up, 2*n); + + if ((n & 1) != 0) + { + up[0] = mpn_addmul_1 (up, mp, n, (up[0] * mip[0]) & GMP_NUMB_MASK); + up++; + } + + for (j = n - 2; j >= 0; j -= 2) + { + umul2low (q[1], q[0], mip[1], mip[0], up[1], up[0]); + upn = up[n]; /* mpn_addmul_2 overwrites this */ + up[1] = mpn_addmul_2 (up, mp, n, q); + up[0] = up[n]; + up[n] = upn; + up += 2; + } + + cy = mpn_add_n (rp, up, up - n, n); + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/redc_n.c b/gmp-6.3.0/mpn/generic/redc_n.c new file mode 100644 index 0000000..0c94b7c --- /dev/null +++ b/gmp-6.3.0/mpn/generic/redc_n.c @@ -0,0 +1,80 @@ +/* mpn_redc_n. Set rp[] <- up[]/R^n mod mp[]. Clobber up[]. + mp[] is n limbs; up[] is 2n limbs, the inverse ip[] is n limbs. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + +Copyright 2009, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* + TODO + + * We assume mpn_mulmod_bnm1 is always faster than plain mpn_mul_n (or a + future mpn_mulhi) for the range we will be called. Follow up that + assumption. + + * Decrease scratch usage. + + * Consider removing the residue canonicalisation. 
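+
+  * For reference, why this works: with ip = 1/M mod B^n and
+    x = lowhalf (U * ip), the product x*M agrees with U on the low n
+    limbs, so U - x*M is divisible by B^n and (U - x*M) / B^n is
+    congruent to U/B^n mod M, up to one conditional add of M.
+    mpn_mulmod_bnm1 supplies x*M mod B^rn - 1 cheaply, and the
+    wrap-around can be undone by subtraction precisely because the low
+    limbs of x*M are already known to equal those of U.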
+*/ + +void +mpn_redc_n (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_srcptr ip) +{ + mp_ptr xp, yp, scratch; + mp_limb_t cy; + mp_size_t rn; + TMP_DECL; + TMP_MARK; + + ASSERT (n > 8); + + rn = mpn_mulmod_bnm1_next_size (n); + + scratch = TMP_ALLOC_LIMBS (n + rn + mpn_mulmod_bnm1_itch (rn, n, n)); + + xp = scratch; + mpn_mullo_n (xp, up, ip, n); + + yp = scratch + n; + mpn_mulmod_bnm1 (yp, rn, xp, n, mp, n, scratch + n + rn); + + ASSERT_ALWAYS (2 * n > rn); /* could handle this */ + + cy = mpn_sub_n (yp + rn, yp, up, 2*n - rn); /* undo wrap around */ + MPN_DECR_U (yp + 2*n - rn, rn, cy); + + cy = mpn_sub_n (rp, up + n, yp + n, n); + if (cy != 0) + mpn_add_n (rp, rp, mp, n); + + TMP_FREE; +} diff --git a/gmp-6.3.0/mpn/generic/remove.c b/gmp-6.3.0/mpn/generic/remove.c new file mode 100644 index 0000000..cbb0742 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/remove.c @@ -0,0 +1,182 @@ +/* mpn_remove -- divide out all multiples of odd mpn number from another mpn + number. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2009, 2012-2014, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#if GMP_LIMB_BITS > 50 +#define LOG 50 +#else +#define LOG GMP_LIMB_BITS +#endif + + +/* Input: U = {up,un}, V = {vp,vn} must be odd, cap + Ouput W = {wp,*wn} allocation need is exactly *wn + + Set W = U / V^k, where k is the largest integer <= cap such that the + division yields an integer. + + FIXME: We currently allow any operand overlap. This is quite non mpn-ish + and might be changed, since it cost significant temporary space. + * If we require W to have space for un + 1 limbs, we could save qp or qp2 + (but we will still need to copy things into wp 50% of the time). + * If we allow ourselves to clobber U, we could save the other of qp and qp2, + and the initial COPY (but also here we would need un + 1 limbs). +*/ + +/* FIXME: We need to wrap mpn_bdiv_qr due to the itch interface. This need + indicates a flaw in the current itch mechanism: Which operands not greater + than un,un will incur the worst itch? We need a parallel foo_maxitch set + of functions. 
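+
+   Strategy of the code below, for orientation: divide out V, V^2, V^4,
+   ... for as long as the 2-adic division stays exact, squaring the
+   stored power each round; then walk the saved powers back down to
+   collect the remaining factors.  E.g. for U = V^13 * W (V not dividing
+   W) the first phase strips V^(1+2+4) = V^7 and the descent then strips
+   V^4 and V^2, accumulating pwr = 7 + 4 + 2 = 13.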
*/ +static void +mpn_bdiv_qr_wrap (mp_ptr qp, mp_ptr rp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn) +{ + mp_ptr scratch_out; + TMP_DECL; + + TMP_MARK; + scratch_out = TMP_ALLOC_LIMBS (mpn_bdiv_qr_itch (nn, dn)); + mpn_bdiv_qr (qp, rp, np, nn, dp, dn, scratch_out); + + TMP_FREE; +} + +mp_bitcnt_t +mpn_remove (mp_ptr wp, mp_size_t *wn, + mp_srcptr up, mp_size_t un, mp_srcptr vp, mp_size_t vn, + mp_bitcnt_t cap) +{ + mp_srcptr pwpsp[LOG]; + mp_size_t pwpsn[LOG]; + mp_size_t npowers; + mp_ptr tp, qp, np, qp2; + mp_srcptr pp; + mp_size_t pn, nn, qn, i; + mp_bitcnt_t pwr; + TMP_DECL; + + ASSERT (un > 0); + ASSERT (vn > 0); + ASSERT (vp[0] % 2 != 0); /* 2-adic division wants odd numbers */ + ASSERT (vn > 1 || vp[0] > 1); /* else we would loop indefinitely */ + + TMP_MARK; + + TMP_ALLOC_LIMBS_3 (qp, un + 1, /* quotient, alternating */ + qp2, un + 1, /* quotient, alternating */ + tp, (un + 1 + vn) / 2); /* remainder */ + pp = vp; + pn = vn; + + MPN_COPY (qp, up, un); + qn = un; + + npowers = 0; + while (qn >= pn) + { + qp[qn] = 0; + mpn_bdiv_qr_wrap (qp2, tp, qp, qn + 1, pp, pn); + if (!mpn_zero_p (tp, pn)) + { + if (mpn_cmp (tp, pp, pn) != 0) + break; /* could not divide by V^npowers */ + } + + MP_PTR_SWAP (qp, qp2); + qn = qn - pn; + mpn_neg (qp, qp, qn+1); + + qn += qp[qn] != 0; + + pwpsp[npowers] = pp; + pwpsn[npowers] = pn; + ++npowers; + + if (((mp_bitcnt_t) 2 << npowers) - 1 > cap) + break; + + nn = 2 * pn - 1; /* next power will be at least this large */ + if (nn > qn) + break; /* next power would be overlarge */ + + if (npowers == 1) /* Alloc once, but only if it's needed */ + np = TMP_ALLOC_LIMBS (qn + LOG); /* powers of V */ + else + np += pn; + + mpn_sqr (np, pp, pn); + pn = nn + (np[nn] != 0); + pp = np; + } + + pwr = ((mp_bitcnt_t) 1 << npowers) - 1; + + for (i = npowers; --i >= 0;) + { + pn = pwpsn[i]; + if (qn < pn) + continue; + + if (pwr + ((mp_bitcnt_t) 1 << i) > cap) + continue; /* V^i would bring us past cap */ + + qp[qn] = 0; + mpn_bdiv_qr_wrap (qp2, tp, qp, qn + 1, pwpsp[i], pn); + if (!mpn_zero_p (tp, pn)) + { + if (mpn_cmp (tp, pwpsp[i], pn) != 0) + continue; /* could not divide by V^i */ + } + + MP_PTR_SWAP (qp, qp2); + qn = qn - pn; + mpn_neg (qp, qp, qn+1); + + qn += qp[qn] != 0; + + pwr += (mp_bitcnt_t) 1 << i; + } + + MPN_COPY (wp, qp, qn); + *wn = qn; + + TMP_FREE; + + return pwr; +} diff --git a/gmp-6.3.0/mpn/generic/rootrem.c b/gmp-6.3.0/mpn/generic/rootrem.c new file mode 100644 index 0000000..a79099e --- /dev/null +++ b/gmp-6.3.0/mpn/generic/rootrem.c @@ -0,0 +1,515 @@ +/* mpn_rootrem(rootp,remp,ap,an,nth) -- Compute the nth root of {ap,an}, and + store the truncated integer part at rootp and the remainder at remp. + + Contributed by Paul Zimmermann (algorithm) and + Paul Zimmermann and Torbjorn Granlund (implementation). + Marco Bodrato wrote logbased_root to seed the loop. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL, AND HAVE MUTABLE INTERFACES. IT'S + ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT'S ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2002, 2005, 2009-2012, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* FIXME: + This implementation is not optimal when remp == NULL, since the complexity + is M(n), whereas it should be M(n/k) on average. +*/ + +#include <stdio.h> /* for NULL */ + +#include "gmp-impl.h" +#include "longlong.h" + +static mp_size_t mpn_rootrem_internal (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, + mp_limb_t, int); + +#define MPN_RSHIFT(rp,up,un,cnt) \ + do { \ + if ((cnt) != 0) \ + mpn_rshift (rp, up, un, cnt); \ + else \ + { \ + MPN_COPY_INCR (rp, up, un); \ + } \ + } while (0) + +#define MPN_LSHIFT(cy,rp,up,un,cnt) \ + do { \ + if ((cnt) != 0) \ + cy = mpn_lshift (rp, up, un, cnt); \ + else \ + { \ + MPN_COPY_DECR (rp, up, un); \ + cy = 0; \ + } \ + } while (0) + + +/* Put in {rootp, ceil(un/k)} the kth root of {up, un}, rounded toward zero. + If remp <> NULL, put in {remp, un} the remainder. + Return the size (in limbs) of the remainder if remp <> NULL, + or a non-zero value iff the remainder is non-zero when remp = NULL. + Assumes: + (a) up[un-1] is not zero + (b) rootp has at least space for ceil(un/k) limbs + (c) remp has at least space for un limbs (in case remp <> NULL) + (d) the operands do not overlap. + + The auxiliary memory usage is 3*un+2 if remp = NULL, + and 2*un+2 if remp <> NULL. FIXME: This is an incorrect comment. +*/ +mp_size_t +mpn_rootrem (mp_ptr rootp, mp_ptr remp, + mp_srcptr up, mp_size_t un, mp_limb_t k) +{ + ASSERT (un > 0); + ASSERT (up[un - 1] != 0); + ASSERT (k > 1); + + if (UNLIKELY (k == 2)) + return mpn_sqrtrem (rootp, remp, up, un); + /* (un-1)/k > 2 <=> un > 3k <=> (un + 2)/3 > k */ + if (remp == NULL && (un + 2) / 3 > k) + /* Pad {up,un} with k zero limbs. This will produce an approximate root + with one more limb, allowing us to compute the exact integral result. */ + { + mp_ptr sp, wp; + mp_size_t rn, sn, wn; + TMP_DECL; + TMP_MARK; + wn = un + k; + sn = (un - 1) / k + 2; /* ceil(un/k) + 1 */ + TMP_ALLOC_LIMBS_2 (wp, wn, /* will contain the padded input */ + sp, sn); /* approximate root of padded input */ + MPN_COPY (wp + k, up, un); + MPN_FILL (wp, k, 0); + rn = mpn_rootrem_internal (sp, NULL, wp, wn, k, 1); + /* The approximate root S = {sp,sn} is either the correct root of + {wp,wn}, or 1 too large. Thus unless the least significant limb of + S is 0 or 1, we can deduce the root of {up,un} is S truncated by one + limb. (In case sp[0]=1, we can deduce the root, but not decide + whether it is exact or not.) */ + MPN_COPY (rootp, sp + 1, sn - 1); + TMP_FREE; + return rn; + } + else + { + return mpn_rootrem_internal (rootp, remp, up, un, k, 0); + } +} + +#define LOGROOT_USED_BITS 8 +#define LOGROOT_NEEDS_TWO_CORRECTIONS 1 +#define LOGROOT_RETURNED_BITS (LOGROOT_USED_BITS + LOGROOT_NEEDS_TWO_CORRECTIONS) +/* Puts in *rootp some bits of the kth root of the number + 2^bitn * 1.op; where op represents the "fractional" bits. + + The returned value is the number of bits of the root minus one; + i.e.
an approximation of the root will be + (*rootp) * 2^(retval-LOGROOT_RETURNED_BITS+1). + + Currently, only LOGROOT_USED_BITS bits of op are used (the implicit + one is not counted). + */ +static unsigned +logbased_root (mp_ptr rootp, mp_limb_t op, mp_bitcnt_t bitn, mp_limb_t k) +{ + /* vlog=vector(256,i,floor((log(256+i)/log(2)-8)*256)-(i>255)) */ + static const + unsigned char vlog[] = {1, 2, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 18, 19, 21, 22, + 23, 25, 26, 27, 29, 30, 31, 33, 34, 35, 37, 38, 39, 40, 42, 43, + 44, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57, 58, 59, 61, 62, 63, + 64, 65, 67, 68, 69, 70, 71, 73, 74, 75, 76, 77, 78, 80, 81, 82, + 83, 84, 85, 87, 88, 89, 90, 91, 92, 93, 94, 96, 97, 98, 99, 100, + 101, 102, 103, 104, 105, 106, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, + 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 181, 182, 183, 184, 185, 186, 187, 188, 188, 189, 190, 191, 192, 193, + 194, 194, 195, 196, 197, 198, 199, 200, 200, 201, 202, 203, 204, 205, 205, 206, + 207, 208, 209, 209, 210, 211, 212, 213, 214, 214, 215, 216, 217, 218, 218, 219, + 220, 221, 222, 222, 223, 224, 225, 225, 226, 227, 228, 229, 229, 230, 231, 232, + 232, 233, 234, 235, 235, 236, 237, 238, 239, 239, 240, 241, 242, 242, 243, 244, + 245, 245, 246, 247, 247, 248, 249, 250, 250, 251, 252, 253, 253, 254, 255, 255}; + + /* vexp=vector(256,i,floor(2^(8+i/256)-256)-(i>255)) */ + static const + unsigned char vexp[] = {0, 1, 2, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 9, 10, 11, + 12, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, + 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, + 36, 37, 37, 38, 39, 40, 41, 41, 42, 43, 44, 45, 45, 46, 47, 48, + 49, 50, 50, 51, 52, 53, 54, 55, 55, 56, 57, 58, 59, 60, 61, 61, + 62, 63, 64, 65, 66, 67, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, + 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 86, 87, 88, 89, 90, + 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 119, 120, 121, 122, + 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, + 139, 140, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 154, 155, 156, + 157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 171, 172, 173, 174, + 175, 176, 178, 179, 180, 181, 182, 183, 185, 186, 187, 188, 189, 191, 192, 193, + 194, 196, 197, 198, 199, 200, 202, 203, 204, 205, 207, 208, 209, 210, 212, 213, + 214, 216, 217, 218, 219, 221, 222, 223, 225, 226, 227, 229, 230, 231, 232, 234, + 235, 236, 238, 239, 240, 242, 243, 245, 246, 247, 249, 250, 251, 253, 254, 255}; + mp_bitcnt_t retval; + + if (UNLIKELY (bitn > (~ (mp_bitcnt_t) 0) >> LOGROOT_USED_BITS)) + { + /* In the unlikely case, we use two divisions and a modulo. */ + retval = bitn / k; + bitn %= k; + bitn = (bitn << LOGROOT_USED_BITS | + vlog[op >> (GMP_NUMB_BITS - LOGROOT_USED_BITS)]) / k; + } + else + { + bitn = (bitn << LOGROOT_USED_BITS | + vlog[op >> (GMP_NUMB_BITS - LOGROOT_USED_BITS)]) / k; + retval = bitn >> LOGROOT_USED_BITS; + bitn &= (CNST_LIMB (1) << LOGROOT_USED_BITS) - 1; + } + ASSERT(bitn < CNST_LIMB (1) << LOGROOT_USED_BITS); + *rootp = CNST_LIMB(1) << (LOGROOT_USED_BITS - ! LOGROOT_NEEDS_TWO_CORRECTIONS) + | vexp[bitn] >> ! 
LOGROOT_NEEDS_TWO_CORRECTIONS; + return retval; +} + +/* if approx is non-zero, does not compute the final remainder */ +static mp_size_t +mpn_rootrem_internal (mp_ptr rootp, mp_ptr remp, mp_srcptr up, mp_size_t un, + mp_limb_t k, int approx) +{ + mp_ptr qp, rp, sp, wp, scratch; + mp_size_t qn, rn, sn, wn, nl, bn; + mp_limb_t save, save2, cy, uh; + mp_bitcnt_t unb; /* number of significant bits of {up,un} */ + mp_bitcnt_t xnb; /* number of significant bits of the result */ + mp_bitcnt_t b, kk; + mp_bitcnt_t sizes[GMP_NUMB_BITS + 1]; + int ni; + int perf_pow; + unsigned ulz, snb, c, logk; + TMP_DECL; + + /* MPN_SIZEINBASE_2EXP(unb, up, un, 1); --unb; */ + uh = up[un - 1]; + count_leading_zeros (ulz, uh); + ulz = ulz - GMP_NAIL_BITS + 1; /* Ignore the first 1. */ + unb = (mp_bitcnt_t) un * GMP_NUMB_BITS - ulz; + /* unb is the (truncated) logarithm of the input U in base 2 */ + + if (unb < k) /* root is 1 */ + { + rootp[0] = 1; + if (remp == NULL) + un -= (*up == CNST_LIMB (1)); /* Non-zero iff {up,un} > 1 */ + else + { + mpn_sub_1 (remp, up, un, CNST_LIMB (1)); + un -= (remp [un - 1] == 0); /* There should be at most one zero limb, + if we demand u to be normalized */ + } + return un; + } + /* if (unb - k < k/2 + k/16) // root is 2 */ + + if (ulz == GMP_NUMB_BITS) + uh = up[un - 2]; + else + uh = (uh << ulz & GMP_NUMB_MASK) | up[un - 1 - (un != 1)] >> (GMP_NUMB_BITS - ulz); + ASSERT (un != 1 || up[un - 1 - (un != 1)] >> (GMP_NUMB_BITS - ulz) == 1); + + xnb = logbased_root (rootp, uh, unb, k); + snb = LOGROOT_RETURNED_BITS - 1; + /* xnb+1 is the number of bits of the root R */ + /* snb+1 is the number of bits of the current approximation S */ + + kk = k * xnb; /* number of truncated bits in the input */ + + /* FIXME: Should we skip the next two loops when xnb <= snb ? */ + for (uh = (k - 1) / 2, logk = 3; (uh >>= 1) != 0; ++logk) + ; + /* logk = ceil(log(k)/log(2)) + 1 */ + + /* xnb is the number of remaining bits to determine in the kth root */ + for (ni = 0; (sizes[ni] = xnb) > snb; ++ni) + { + /* invariant: here we want xnb+1 total bits for the kth root */ + + /* if c is the new value of xnb, this means that we'll go from a + root of c+1 bits (say s') to a root of xnb+1 bits. + It is proved in the book "Modern Computer Arithmetic" by Brent + and Zimmermann, Chapter 1, that + if s' >= k*beta, then at most one correction is necessary. + Here beta = 2^(xnb-c), and s' >= 2^c, thus it suffices that + c >= ceil((xnb + log2(k))/2). */ + if (xnb > logk) + xnb = (xnb + logk) / 2; + else + --xnb; /* add just one bit at a time */ + } + + *rootp >>= snb - xnb; + kk -= xnb; + + ASSERT_ALWAYS (ni < GMP_NUMB_BITS + 1); + /* We have sizes[0] = b > sizes[1] > ... > sizes[ni] = 0 with + sizes[i] <= 2 * sizes[i+1]. + Newton iteration will first compute sizes[ni-1] extra bits, + then sizes[ni-2], ..., then sizes[0] = b. */ + + TMP_MARK; + /* qp and wp need enough space to store S'^k where S' is an approximate + root. Since S' can be as large as S+2, the worst case is when S=2 and + S'=4. But then since we know the number of bits of S in advance, S' + can only be 3 at most. Similarly for S=4, then S' can be 6 at most. + So the worst case is S'/S=3/2, thus S'^k <= (3/2)^k * S^k. Since S^k + fits in un limbs, the number of extra limbs needed is bounded by + ceil(k*log2(3/2)/GMP_NUMB_BITS). */ + /* THINK: with the use of logbased_root, maybe the constant is + 258/256 instead of 3/2 ?
log2(258/256) < 1/89 < 1/64 */ +#define EXTRA 2 + (mp_size_t) (0.585 * (double) k / (double) GMP_NUMB_BITS) + TMP_ALLOC_LIMBS_3 (scratch, un + 1, /* used by mpn_div_q */ + qp, un + EXTRA, /* will contain quotient and remainder + of R/(k*S^(k-1)), and S^k */ + wp, un + EXTRA); /* will contain S^(k-1), k*S^(k-1), + and temporary for mpn_pow_1 */ + + if (remp == NULL) + rp = scratch; /* will contain the remainder */ + else + rp = remp; + sp = rootp; + + sn = 1; /* Initial approximation has one limb */ + + for (b = xnb; ni != 0; --ni) + { + /* 1: loop invariant: + {sp, sn} is the current approximation of the root, which has + exactly 1 + sizes[ni] bits. + {rp, rn} is the current remainder + {wp, wn} = {sp, sn}^(k-1) + kk = number of truncated bits of the input + */ + + /* Since each iteration treats b bits from the root and thus k*b bits + from the input, and we already considered b bits from the input, + we now have to take another (k-1)*b bits from the input. */ + kk -= (k - 1) * b; /* remaining input bits */ + /* {rp, rn} = floor({up, un} / 2^kk) */ + rn = un - kk / GMP_NUMB_BITS; + MPN_RSHIFT (rp, up + kk / GMP_NUMB_BITS, rn, kk % GMP_NUMB_BITS); + rn -= rp[rn - 1] == 0; + + /* 9: current buffers: {sp,sn}, {rp,rn} */ + + for (c = 0;; c++) + { + /* Compute S^k in {qp,qn}. */ + /* W <- S^(k-1) for the next iteration, + and S^k = W * S. */ + wn = mpn_pow_1 (wp, sp, sn, k - 1, qp); + mpn_mul (qp, wp, wn, sp, sn); + qn = wn + sn; + qn -= qp[qn - 1] == 0; + + perf_pow = 1; + /* if S^k > floor(U/2^kk), the root approximation was too large */ + if (qn > rn || (qn == rn && (perf_pow=mpn_cmp (qp, rp, rn)) > 0)) + MPN_DECR_U (sp, sn, 1); + else + break; + } + + /* 10: current buffers: {sp,sn}, {rp,rn}, {qp,qn}, {wp,wn} */ + + /* sometimes two corrections are needed with logbased_root*/ + ASSERT (c <= 1 + LOGROOT_NEEDS_TWO_CORRECTIONS); + ASSERT_ALWAYS (rn >= qn); + + b = sizes[ni - 1] - sizes[ni]; /* number of bits to compute in the + next iteration */ + bn = b / GMP_NUMB_BITS; /* lowest limb from high part of rp[], after shift */ + + kk = kk - b; + /* nl is the number of limbs in U which contain bits [kk,kk+b-1] */ + nl = 1 + (kk + b - 1) / GMP_NUMB_BITS - (kk / GMP_NUMB_BITS); + /* nl = 1 + floor((kk + b - 1) / GMP_NUMB_BITS) + - floor(kk / GMP_NUMB_BITS) + <= 1 + (kk + b - 1) / GMP_NUMB_BITS + - (kk - GMP_NUMB_BITS + 1) / GMP_NUMB_BITS + = 2 + (b - 2) / GMP_NUMB_BITS + thus since nl is an integer: + nl <= 2 + floor(b/GMP_NUMB_BITS) <= 2 + bn. */ + + /* 11: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */ + + /* R = R - Q = floor(U/2^kk) - S^k */ + if (perf_pow != 0) + { + mpn_sub (rp, rp, rn, qp, qn); + MPN_NORMALIZE_NOT_ZERO (rp, rn); + + /* first multiply the remainder by 2^b */ + MPN_LSHIFT (cy, rp + bn, rp, rn, b % GMP_NUMB_BITS); + rn = rn + bn; + if (cy != 0) + { + rp[rn] = cy; + rn++; + } + + save = rp[bn]; + /* we have to save rp[bn] up to rp[nl-1], i.e. 
1 or 2 limbs */ + if (nl - 1 > bn) + save2 = rp[bn + 1]; + } + else + { + rn = bn; + save2 = save = 0; + } + /* 2: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */ + + /* Now insert bits [kk,kk+b-1] from the input U */ + MPN_RSHIFT (rp, up + kk / GMP_NUMB_BITS, nl, kk % GMP_NUMB_BITS); + /* set to zero high bits of rp[bn] */ + rp[bn] &= (CNST_LIMB (1) << (b % GMP_NUMB_BITS)) - 1; + /* restore corresponding bits */ + rp[bn] |= save; + if (nl - 1 > bn) + rp[bn + 1] = save2; /* the low b bits go in rp[0..bn] only, since + they start by bit 0 in rp[0], so they use + at most ceil(b/GMP_NUMB_BITS) limbs */ + /* FIXME: Should we normalise {rp,rn} here ?*/ + + /* 3: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */ + + /* compute {wp, wn} = k * {sp, sn}^(k-1) */ + cy = mpn_mul_1 (wp, wp, wn, k); + wp[wn] = cy; + wn += cy != 0; + + /* 6: current buffers: {sp,sn}, {qp,qn} */ + + /* multiply the root approximation by 2^b */ + MPN_LSHIFT (cy, sp + b / GMP_NUMB_BITS, sp, sn, b % GMP_NUMB_BITS); + sn = sn + b / GMP_NUMB_BITS; + if (cy != 0) + { + sp[sn] = cy; + sn++; + } + + save = sp[b / GMP_NUMB_BITS]; + + /* Number of limbs used by b bits, when least significant bit is + aligned to least limb */ + bn = (b - 1) / GMP_NUMB_BITS + 1; + + /* 4: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */ + + /* now divide {rp, rn} by {wp, wn} to get the low part of the root */ + if (UNLIKELY (rn < wn)) + { + MPN_FILL (sp, bn, 0); + } + else + { + qn = rn - wn; /* expected quotient size */ + if (qn <= bn) { /* Divide only if result is not too big. */ + mpn_div_q (qp, rp, rn, wp, wn, scratch); + qn += qp[qn] != 0; + } + + /* 5: current buffers: {sp,sn}, {qp,qn}. + Note: {rp,rn} is not needed any more since we'll compute it from + scratch at the end of the loop. + */ + + /* the quotient should be smaller than 2^b, since the previous + approximation was correctly rounded toward zero */ + if (qn > bn || (qn == bn && (b % GMP_NUMB_BITS != 0) && + qp[qn - 1] >= (CNST_LIMB (1) << (b % GMP_NUMB_BITS)))) + { + for (qn = 1; qn < bn; ++qn) + sp[qn - 1] = GMP_NUMB_MAX; + sp[qn - 1] = GMP_NUMB_MAX >> (GMP_NUMB_BITS - 1 - ((b - 1) % GMP_NUMB_BITS)); + } + else + { + /* 7: current buffers: {sp,sn}, {qp,qn} */ + + /* Combine sB and q to form sB + q. */ + MPN_COPY (sp, qp, qn); + MPN_ZERO (sp + qn, bn - qn); + } + } + sp[b / GMP_NUMB_BITS] |= save; + + /* 8: current buffer: {sp,sn} */ + + } + + /* otherwise we have rn > 0, thus the return value is ok */ + if (!approx || sp[0] <= CNST_LIMB (1)) + { + for (c = 0;; c++) + { + /* Compute S^k in {qp,qn}. */ + /* Last iteration: we don't need W anymore. */ + /* mpn_pow_1 requires that both qp and wp have enough + space to store the result {sp,sn}^k + 1 limb */ + qn = mpn_pow_1 (qp, sp, sn, k, wp); + + perf_pow = 1; + if (qn > un || (qn == un && (perf_pow=mpn_cmp (qp, up, un)) > 0)) + MPN_DECR_U (sp, sn, 1); + else + break; + }; + + /* sometimes two corrections are needed with logbased_root*/ + ASSERT (c <= 1 + LOGROOT_NEEDS_TWO_CORRECTIONS); + + rn = perf_pow != 0; + if (rn != 0 && remp != NULL) + { + mpn_sub (remp, up, un, qp, qn); + rn = un; + MPN_NORMALIZE_NOT_ZERO (remp, rn); + } + } + + TMP_FREE; + return rn; +} diff --git a/gmp-6.3.0/mpn/generic/rshift.c b/gmp-6.3.0/mpn/generic/rshift.c new file mode 100644 index 0000000..15d427d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/rshift.c @@ -0,0 +1,69 @@ +/* mpn_rshift -- Shift right low level. + +Copyright 1991, 1993, 1994, 1996, 2000-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* Shift U (pointed to by up and N limbs long) cnt bits to the right + and store the n least significant limbs of the result at rp. + The bits shifted out to the right are returned. + + Argument constraints: + 1. 0 < cnt < GMP_NUMB_BITS. + 2. If the result is to be written over the input, rp must be <= up. +*/ + +mp_limb_t +mpn_rshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned int cnt) +{ + mp_limb_t high_limb, low_limb; + unsigned int tnc; + mp_size_t i; + mp_limb_t retval; + + ASSERT (n >= 1); + ASSERT (cnt >= 1); + ASSERT (cnt < GMP_NUMB_BITS); + ASSERT (MPN_SAME_OR_INCR_P (rp, up, n)); + + tnc = GMP_NUMB_BITS - cnt; + high_limb = *up++; + retval = (high_limb << tnc) & GMP_NUMB_MASK; + low_limb = high_limb >> cnt; + + for (i = n - 1; i != 0; i--) + { + high_limb = *up++; + *rp++ = low_limb | ((high_limb << tnc) & GMP_NUMB_MASK); + low_limb = high_limb >> cnt; + } + *rp = low_limb; + + return retval; +} diff --git a/gmp-6.3.0/mpn/generic/sbpi1_bdiv_q.c b/gmp-6.3.0/mpn/generic/sbpi1_bdiv_q.c new file mode 100644 index 0000000..850e593 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sbpi1_bdiv_q.c @@ -0,0 +1,96 @@ +/* mpn_sbpi1_bdiv_q -- schoolbook Hensel division with precomputed inverse, + returning quotient only. + + Contributed to the GNU project by Niels Möller and Torbjörn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. + IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS + ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2005, 2006, 2009, 2011, 2012, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" + +/* Computes Q = - U / D mod B^un, destroys U. + + D must be odd. dinv is (-D)^-1 mod B. + +*/ + +void +mpn_sbpi1_bdiv_q (mp_ptr qp, + mp_ptr up, mp_size_t un, + mp_srcptr dp, mp_size_t dn, + mp_limb_t dinv) +{ + mp_size_t i; + mp_limb_t q; + + ASSERT (dn > 0); + ASSERT (un >= dn); + ASSERT ((dp[0] & 1) != 0); + ASSERT (-(dp[0] * dinv) == 1); + ASSERT (up == qp || !MPN_OVERLAP_P (up, un, qp, un - dn)); + + if (un > dn) + { + mp_limb_t cy, hi; + for (i = un - dn - 1, cy = 0; i > 0; i--) + { + q = dinv * up[0]; + hi = mpn_addmul_1 (up, dp, dn, q); + + ASSERT (up[0] == 0); + *qp++ = q; + hi += cy; + cy = hi < cy; + hi += up[dn]; + cy += hi < up[dn]; + up[dn] = hi; + up++; + } + q = dinv * up[0]; + hi = cy + mpn_addmul_1 (up, dp, dn, q); + ASSERT (up[0] == 0); + *qp++ = q; + up[dn] += hi; + up++; + } + for (i = dn; i > 1; i--) + { + mp_limb_t q = dinv * up[0]; + mpn_addmul_1 (up, dp, i, q); + ASSERT (up[0] == 0); + *qp++ = q; + up++; + } + + /* Final limb */ + *qp = dinv * up[0]; +} diff --git a/gmp-6.3.0/mpn/generic/sbpi1_bdiv_qr.c b/gmp-6.3.0/mpn/generic/sbpi1_bdiv_qr.c new file mode 100644 index 0000000..6146c45 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sbpi1_bdiv_qr.c @@ -0,0 +1,82 @@ +/* mpn_sbpi1_bdiv_qr -- schoolbook Hensel division with precomputed inverse, + returning quotient and remainder. + + Contributed to the GNU project by Niels Möller and Torbjörn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. + IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS + ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2009, 2011, 2012, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +/* Computes a binary quotient of size qn = un - dn. + Output: + + Q = -U * D^{-1} mod B^qn, + + R = (U + Q * D) * B^(-qn) + + Stores the dn least significant limbs of R at {up + un - dn, dn}, + and returns the carry from the addition N + Q*D. + + D must be odd. dinv is (-D)^-1 mod B. 
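+
+   (Illustration, not part of the GMP sources: each pass of the loop below
+   picks q = dinv * up[0] mod B, the unique limb value making the low limb
+   of U + q * D vanish, then advances up by one limb; after un - dn such
+   passes the qn low limbs of U have been cleared.)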
*/ + +mp_limb_t +mpn_sbpi1_bdiv_qr (mp_ptr qp, + mp_ptr up, mp_size_t un, + mp_srcptr dp, mp_size_t dn, mp_limb_t dinv) +{ + mp_size_t i; + mp_limb_t cy; + + ASSERT (dn > 0); + ASSERT (un > dn); + ASSERT ((dp[0] & 1) != 0); + ASSERT (-(dp[0] * dinv) == 1); + ASSERT (up == qp || !MPN_OVERLAP_P (up, un, qp, un - dn)); + + for (i = un - dn, cy = 0; i != 0; i--) + { + mp_limb_t q = dinv * up[0]; + mp_limb_t hi = mpn_addmul_1 (up, dp, dn, q); + *qp++ = q; + + hi += cy; + cy = hi < cy; + hi += up[dn]; + cy += hi < up[dn]; + up[dn] = hi; + up++; + } + + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/sbpi1_bdiv_r.c b/gmp-6.3.0/mpn/generic/sbpi1_bdiv_r.c new file mode 100644 index 0000000..a609951 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sbpi1_bdiv_r.c @@ -0,0 +1,79 @@ +/* mpn_sbpi1_bdiv_r -- schoolbook Hensel division with precomputed inverse, + returning remainder. + + Contributed to the GNU project by Niels Möller and Torbjörn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. + IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS + ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2009, 2011, 2012, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +/* Computes a binary quotient of size qn = un - dn. + Output: + + Q = -U * D^{-1} mod B^qn, + + R = (U + Q * D) * B^(-qn) + + Stores the dn least significant limbs of R at {up + un - dn, dn}, + and returns the carry from the addition N + Q*D. + + D must be odd. dinv is (-D)^-1 mod B. */ + +mp_limb_t +mpn_sbpi1_bdiv_r (mp_ptr up, mp_size_t un, + mp_srcptr dp, mp_size_t dn, mp_limb_t dinv) +{ + mp_size_t i; + mp_limb_t cy; + + ASSERT (dn > 0); + ASSERT (un > dn); + ASSERT ((dp[0] & 1) != 0); + ASSERT (-(dp[0] * dinv) == 1); + + for (i = un - dn, cy = 0; i != 0; i--) + { + mp_limb_t q = dinv * up[0]; + mp_limb_t hi = mpn_addmul_1 (up, dp, dn, q); + + hi += cy; + cy = hi < cy; + hi += up[dn]; + cy += hi < up[dn]; + up[dn] = hi; + up++; + } + + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/sbpi1_div_q.c b/gmp-6.3.0/mpn/generic/sbpi1_div_q.c new file mode 100644 index 0000000..a9975eb --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sbpi1_div_q.c @@ -0,0 +1,302 @@ +/* mpn_sbpi1_div_q -- Schoolbook division using the Möller-Granlund 3/2 + division algorithm. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. 
IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2007, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_sbpi1_div_q (mp_ptr qp, + mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_limb_t dinv) +{ + mp_limb_t qh; + mp_size_t qn, i; + mp_limb_t n1, n0; + mp_limb_t d1, d0; + mp_limb_t cy, cy1; + mp_limb_t q; + mp_limb_t flag; + + mp_size_t dn_orig = dn; + mp_srcptr dp_orig = dp; + mp_ptr np_orig = np; + + ASSERT (dn > 2); + ASSERT (nn >= dn); + ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0); + + np += nn; + + qn = nn - dn; + if (qn + 1 < dn) + { + dp += dn - (qn + 1); + dn = qn + 1; + } + + qh = mpn_cmp (np - dn, dp, dn) >= 0; + if (qh != 0) + mpn_sub_n (np - dn, np - dn, dp, dn); + + qp += qn; + + dn -= 2; /* offset dn by 2 for main division loops, + saving two iterations in mpn_submul_1. */ + d1 = dp[dn + 1]; + d0 = dp[dn + 0]; + + np -= 2; + + n1 = np[1]; + + for (i = qn - (dn + 2); i >= 0; i--) + { + np--; + if (UNLIKELY (n1 == d1) && np[1] == d0) + { + q = GMP_NUMB_MASK; + mpn_submul_1 (np - dn, dp, dn + 2, q); + n1 = np[1]; /* update n1, last loop's value will now be invalid */ + } + else + { + udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv); + + cy = mpn_submul_1 (np - dn, dp, dn, q); + + cy1 = n0 < cy; + n0 = (n0 - cy) & GMP_NUMB_MASK; + cy = n1 < cy1; + n1 -= cy1; + np[0] = n0; + + if (UNLIKELY (cy != 0)) + { + n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1); + q--; + } + } + + *--qp = q; + } + + flag = ~CNST_LIMB(0); + + if (dn >= 0) + { + for (i = dn; i > 0; i--) + { + np--; + if (UNLIKELY (n1 >= (d1 & flag))) + { + q = GMP_NUMB_MASK; + cy = mpn_submul_1 (np - dn, dp, dn + 2, q); + + if (UNLIKELY (n1 != cy)) + { + if (n1 < (cy & flag)) + { + q--; + mpn_add_n (np - dn, np - dn, dp, dn + 2); + } + else + flag = 0; + } + n1 = np[1]; + } + else + { + udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv); + + cy = mpn_submul_1 (np - dn, dp, dn, q); + + cy1 = n0 < cy; + n0 = (n0 - cy) & GMP_NUMB_MASK; + cy = n1 < cy1; + n1 -= cy1; + np[0] = n0; + + if (UNLIKELY (cy != 0)) + { + n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1); + q--; + } + } + + *--qp = q; + + /* Truncate operands. 
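+	     Each pass of this loop drops the lowest divisor limb (dn--,
+	     dp++); the low limbs ignored here are compensated for after the
+	     main loops, in the "Compensate for triangularization" block
+	     below.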
*/ + dn--; + dp++; + } + + np--; + if (UNLIKELY (n1 >= (d1 & flag))) + { + q = GMP_NUMB_MASK; + cy = mpn_submul_1 (np, dp, 2, q); + + if (UNLIKELY (n1 != cy)) + { + if (n1 < (cy & flag)) + { + q--; + add_ssaaaa (np[1], np[0], np[1], np[0], dp[1], dp[0]); + } + else + flag = 0; + } + n1 = np[1]; + } + else + { + udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv); + + np[0] = n0; + np[1] = n1; + } + + *--qp = q; + } + ASSERT_ALWAYS (np[1] == n1); + np += 2; + + + dn = dn_orig; + if (UNLIKELY (n1 < (dn & flag))) + { + mp_limb_t q, x; + + /* The quotient may be too large if the remainder is small. Recompute + for above ignored operand parts, until the remainder spills. + + FIXME: The quality of this code isn't the same as the code above. + 1. We don't compute things in an optimal order, high-to-low, in order + to terminate as quickly as possible. + 2. We mess with pointers and sizes, adding and subtracting and + adjusting to get things right. It surely could be streamlined. + 3. The only termination criteria are that we determine that the + quotient needs to be adjusted, or that we have recomputed + everything. We should stop when the remainder is so large + that no additional subtracting could make it spill. + 4. If nothing else, we should not do two loops of submul_1 over the + data, instead handle both the triangularization and chopping at + once. */ + + x = n1; + + if (dn > 2) + { + /* Compensate for triangularization. */ + mp_limb_t y; + + dp = dp_orig; + if (qn + 1 < dn) + { + dp += dn - (qn + 1); + dn = qn + 1; + } + + y = np[-2]; + + for (i = dn - 3; i >= 0; i--) + { + q = qp[i]; + cy = mpn_submul_1 (np - (dn - i), dp, dn - i - 2, q); + + if (y < cy) + { + if (x == 0) + { + cy = mpn_sub_1 (qp, qp, qn, 1); + ASSERT_ALWAYS (cy == 0); + return qh - cy; + } + x--; + } + y -= cy; + } + np[-2] = y; + } + + dn = dn_orig; + if (qn + 1 < dn) + { + /* Compensate for ignored dividend and divisor tails. */ + + dp = dp_orig; + np = np_orig; + + if (qh != 0) + { + cy = mpn_sub_n (np + qn, np + qn, dp, dn - (qn + 1)); + if (cy != 0) + { + if (x == 0) + { + if (qn != 0) + cy = mpn_sub_1 (qp, qp, qn, 1); + return qh - cy; + } + x--; + } + } + + if (qn == 0) + return qh; + + for (i = dn - qn - 2; i >= 0; i--) + { + cy = mpn_submul_1 (np + i, qp, qn, dp[i]); + cy = mpn_sub_1 (np + qn + i, np + qn + i, dn - qn - i - 1, cy); + if (cy != 0) + { + if (x == 0) + { + cy = mpn_sub_1 (qp, qp, qn, 1); + return qh; + } + x--; + } + } + } + } + + return qh; +} diff --git a/gmp-6.3.0/mpn/generic/sbpi1_div_qr.c b/gmp-6.3.0/mpn/generic/sbpi1_div_qr.c new file mode 100644 index 0000000..7330a77 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sbpi1_div_qr.c @@ -0,0 +1,109 @@ +/* mpn_sbpi1_div_qr -- Schoolbook division using the Möller-Granlund 3/2 + division algorithm. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2007, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_sbpi1_div_qr (mp_ptr qp, + mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_limb_t dinv) +{ + mp_limb_t qh; + mp_size_t i; + mp_limb_t n1, n0; + mp_limb_t d1, d0; + mp_limb_t cy, cy1; + mp_limb_t q; + + ASSERT (dn > 2); + ASSERT (nn >= dn); + ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0); + + np += nn; + + qh = mpn_cmp (np - dn, dp, dn) >= 0; + if (qh != 0) + mpn_sub_n (np - dn, np - dn, dp, dn); + + qp += nn - dn; + + dn -= 2; /* offset dn by 2 for main division loops, + saving two iterations in mpn_submul_1. */ + d1 = dp[dn + 1]; + d0 = dp[dn + 0]; + + np -= 2; + + n1 = np[1]; + + for (i = nn - (dn + 2); i > 0; i--) + { + np--; + if (UNLIKELY (n1 == d1) && np[1] == d0) + { + q = GMP_NUMB_MASK; + mpn_submul_1 (np - dn, dp, dn + 2, q); + n1 = np[1]; /* update n1, last loop's value will now be invalid */ + } + else + { + udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv); + + cy = mpn_submul_1 (np - dn, dp, dn, q); + + cy1 = n0 < cy; + n0 = (n0 - cy) & GMP_NUMB_MASK; + cy = n1 < cy1; + n1 = (n1 - cy1) & GMP_NUMB_MASK; + np[0] = n0; + + if (UNLIKELY (cy != 0)) + { + n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1); + q--; + } + } + + *--qp = q; + } + np[1] = n1; + + return qh; +} diff --git a/gmp-6.3.0/mpn/generic/sbpi1_divappr_q.c b/gmp-6.3.0/mpn/generic/sbpi1_divappr_q.c new file mode 100644 index 0000000..ef7ca26 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sbpi1_divappr_q.c @@ -0,0 +1,198 @@ +/* mpn_sbpi1_divappr_q -- Schoolbook division using the Möller-Granlund 3/2 + division algorithm, returning approximate quotient. The quotient returned + is either correct, or one too large. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2007, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_sbpi1_divappr_q (mp_ptr qp, + mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_limb_t dinv) +{ + mp_limb_t qh; + mp_size_t qn, i; + mp_limb_t n1, n0; + mp_limb_t d1, d0; + mp_limb_t cy, cy1; + mp_limb_t q; + mp_limb_t flag; + + ASSERT (dn > 2); + ASSERT (nn >= dn); + ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0); + + np += nn; + + qn = nn - dn; + if (qn + 1 < dn) + { + dp += dn - (qn + 1); + dn = qn + 1; + } + + qh = mpn_cmp (np - dn, dp, dn) >= 0; + if (qh != 0) + mpn_sub_n (np - dn, np - dn, dp, dn); + + qp += qn; + + dn -= 2; /* offset dn by 2 for main division loops, + saving two iterations in mpn_submul_1. */ + d1 = dp[dn + 1]; + d0 = dp[dn + 0]; + + np -= 2; + + n1 = np[1]; + + for (i = qn - (dn + 2); i >= 0; i--) + { + np--; + if (UNLIKELY (n1 == d1) && np[1] == d0) + { + q = GMP_NUMB_MASK; + mpn_submul_1 (np - dn, dp, dn + 2, q); + n1 = np[1]; /* update n1, last loop's value will now be invalid */ + } + else + { + udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv); + + cy = mpn_submul_1 (np - dn, dp, dn, q); + + cy1 = n0 < cy; + n0 = (n0 - cy) & GMP_NUMB_MASK; + cy = n1 < cy1; + n1 -= cy1; + np[0] = n0; + + if (UNLIKELY (cy != 0)) + { + n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1); + q--; + } + } + + *--qp = q; + } + + flag = ~CNST_LIMB(0); + + if (dn >= 0) + { + for (i = dn; i > 0; i--) + { + np--; + if (UNLIKELY (n1 >= (d1 & flag))) + { + q = GMP_NUMB_MASK; + cy = mpn_submul_1 (np - dn, dp, dn + 2, q); + + if (UNLIKELY (n1 != cy)) + { + if (n1 < (cy & flag)) + { + q--; + mpn_add_n (np - dn, np - dn, dp, dn + 2); + } + else + flag = 0; + } + n1 = np[1]; + } + else + { + udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv); + + cy = mpn_submul_1 (np - dn, dp, dn, q); + + cy1 = n0 < cy; + n0 = (n0 - cy) & GMP_NUMB_MASK; + cy = n1 < cy1; + n1 -= cy1; + np[0] = n0; + + if (UNLIKELY (cy != 0)) + { + n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1); + q--; + } + } + + *--qp = q; + + /* Truncate operands. */ + dn--; + dp++; + } + + np--; + if (UNLIKELY (n1 >= (d1 & flag))) + { + q = GMP_NUMB_MASK; + cy = mpn_submul_1 (np, dp, 2, q); + + if (UNLIKELY (n1 != cy)) + { + if (n1 < (cy & flag)) + { + q--; + add_ssaaaa (np[1], np[0], np[1], np[0], dp[1], dp[0]); + } + else + flag = 0; + } + n1 = np[1]; + } + else + { + udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv); + + np[1] = n1; + np[0] = n0; + } + + *--qp = q; + } + + ASSERT_ALWAYS (np[1] == n1); + + return qh; +} diff --git a/gmp-6.3.0/mpn/generic/scan0.c b/gmp-6.3.0/mpn/generic/scan0.c new file mode 100644 index 0000000..d71832e --- /dev/null +++ b/gmp-6.3.0/mpn/generic/scan0.c @@ -0,0 +1,59 @@ +/* mpn_scan0 -- Scan from a given bit position for the next clear bit. + +Copyright 1994, 1996, 2001, 2002, 2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Argument constraints: + 1. U must sooner or later have a limb with a clear bit. + */ + +mp_bitcnt_t +mpn_scan0 (mp_srcptr up, mp_bitcnt_t starting_bit) +{ + mp_size_t starting_word; + mp_limb_t alimb; + int cnt; + mp_srcptr p; + + /* Start at the word implied by STARTING_BIT. */ + starting_word = starting_bit / GMP_NUMB_BITS; + p = up + starting_word; + alimb = *p++ ^ GMP_NUMB_MASK; + + /* Mask off any bits before STARTING_BIT in the first limb. */ + alimb &= - (mp_limb_t) 1 << (starting_bit % GMP_NUMB_BITS); + + while (alimb == 0) + alimb = *p++ ^ GMP_NUMB_MASK; + + count_trailing_zeros (cnt, alimb); + return (p - up - 1) * GMP_NUMB_BITS + cnt; +} diff --git a/gmp-6.3.0/mpn/generic/scan1.c b/gmp-6.3.0/mpn/generic/scan1.c new file mode 100644 index 0000000..09e8060 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/scan1.c @@ -0,0 +1,59 @@ +/* mpn_scan1 -- Scan from a given bit position for the next set bit. + +Copyright 1994, 1996, 2001, 2002, 2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Argument constraints: + 1. U must sooner or later have a limb != 0. + */ + +mp_bitcnt_t +mpn_scan1 (mp_srcptr up, mp_bitcnt_t starting_bit) +{ + mp_size_t starting_word; + mp_limb_t alimb; + int cnt; + mp_srcptr p; + + /* Start at the word implied by STARTING_BIT. */ + starting_word = starting_bit / GMP_NUMB_BITS; + p = up + starting_word; + alimb = *p++; + + /* Mask off any bits before STARTING_BIT in the first limb. */ + alimb &= - (mp_limb_t) 1 << (starting_bit % GMP_NUMB_BITS); + + while (alimb == 0) + alimb = *p++; + + count_trailing_zeros (cnt, alimb); + return (p - up - 1) * GMP_NUMB_BITS + cnt; +} diff --git a/gmp-6.3.0/mpn/generic/sec_aors_1.c b/gmp-6.3.0/mpn/generic/sec_aors_1.c new file mode 100644 index 0000000..6480fa1 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sec_aors_1.c @@ -0,0 +1,59 @@ +/* mpn_sec_add_1, mpn_sec_sub_1 + + Contributed to the GNU project by Niels Möller + +Copyright 2013, 2014 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#if OPERATION_sec_add_1 +#define FNAME mpn_sec_add_1 +#define FNAME_itch mpn_sec_add_1_itch +#define OP_N mpn_add_n +#endif +#if OPERATION_sec_sub_1 +#define FNAME mpn_sec_sub_1 +#define FNAME_itch mpn_sec_sub_1_itch +#define OP_N mpn_sub_n +#endif + +/* It's annoying that we need scratch space */ +mp_size_t +FNAME_itch (mp_size_t n) +{ + return n; +} + +mp_limb_t +FNAME (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_ptr scratch) +{ + scratch[0] = b; + MPN_ZERO (scratch + 1, n-1); + return OP_N (rp, ap, scratch, n); +} diff --git a/gmp-6.3.0/mpn/generic/sec_div.c b/gmp-6.3.0/mpn/generic/sec_div.c new file mode 100644 index 0000000..1f08649 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sec_div.c @@ -0,0 +1,131 @@ +/* mpn_sec_div_qr, mpn_sec_div_r -- Compute Q = floor(U / V), U = U mod V. + Side-channel silent under the assumption that the used instructions are + side-channel silent. + + Contributed to the GNU project by Torbjörn Granlund. + +Copyright 2011-2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#if OPERATION_sec_div_qr +#define FNAME mpn_sec_div_qr +#define FNAME_itch mpn_sec_div_qr_itch +#define Q(q) q, +#define RETTYPE mp_limb_t +#endif +#if OPERATION_sec_div_r +#define FNAME mpn_sec_div_r +#define FNAME_itch mpn_sec_div_r_itch +#define Q(q) +#define RETTYPE void +#endif + +mp_size_t +FNAME_itch (mp_size_t nn, mp_size_t dn) +{ +#if OPERATION_sec_div_qr +/* Needs (nn + dn + 1) + mpn_sec_pi1_div_qr's needs of (2nn' - dn + 1) for a + total of 3nn + 4 limbs at tp. Note that mpn_sec_pi1_div_qr's nn is one + greater than ours, therefore +4 and not just +2.
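+   Explicitly: (nn + dn + 1) + (2 * (nn + 1) - dn + 1) = 3 * nn + 4.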
*/ + return 3 * nn + 4; +#endif +#if OPERATION_sec_div_r +/* Needs (nn + dn + 1) + mpn_sec_pi1_div_r's needs of (dn + 1) for a total of + nn + 2dn + 2 limbs at tp. */ + return nn + 2 * dn + 2; +#endif +} + +RETTYPE +FNAME (Q(mp_ptr qp) + mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_ptr tp) +{ + mp_limb_t d1, d0; + unsigned int cnt; + mp_limb_t inv32; + + ASSERT (dn >= 1); + ASSERT (nn >= dn); + ASSERT (dp[dn - 1] != 0); + + d1 = dp[dn - 1]; + count_leading_zeros (cnt, d1); + + if (cnt != 0) + { + mp_limb_t qh, cy; + mp_ptr np2, dp2; + dp2 = tp; /* dn limbs */ + mpn_lshift (dp2, dp, dn, cnt); + + np2 = tp + dn; /* (nn + 1) limbs */ + cy = mpn_lshift (np2, np, nn, cnt); + np2[nn++] = cy; + + d0 = dp2[dn - 1]; + d0 += (~d0 != 0); + invert_limb (inv32, d0); + + /* We add nn + dn to tp here, not nn + 1 + dn, as expected. This is + since nn here will have been incremented. */ +#if OPERATION_sec_div_qr + qh = mpn_sec_pi1_div_qr (np2 + dn, np2, nn, dp2, dn, inv32, tp + nn + dn); + ASSERT (qh == 0); /* FIXME: this indicates inefficiency! */ + MPN_COPY (qp, np2 + dn, nn - dn - 1); + qh = np2[nn - 1]; +#else + mpn_sec_pi1_div_r (np2, nn, dp2, dn, inv32, tp + nn + dn); +#endif + + mpn_rshift (np, np2, dn, cnt); + +#if OPERATION_sec_div_qr + return qh; +#endif + } + else + { + /* FIXME: Consider copying np => np2 here, adding a 0-limb at the top. + That would simplify the underlying pi1 function, since then it could + assume nn > dn. */ + d0 = dp[dn - 1]; + d0 += (~d0 != 0); + invert_limb (inv32, d0); + +#if OPERATION_sec_div_qr + return mpn_sec_pi1_div_qr (qp, np, nn, dp, dn, inv32, tp); +#else + mpn_sec_pi1_div_r (np, nn, dp, dn, inv32, tp); +#endif + } +} diff --git a/gmp-6.3.0/mpn/generic/sec_invert.c b/gmp-6.3.0/mpn/generic/sec_invert.c new file mode 100644 index 0000000..07665d1 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sec_invert.c @@ -0,0 +1,177 @@ +/* mpn_sec_invert + + Contributed to the GNU project by Niels Möller + +Copyright 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#if 0 +/* Currently unused. Should be resurrected once mpn_cnd_neg is + advertised. 
*/ +static mp_size_t +mpn_cnd_neg_itch (mp_size_t n) +{ + return n; +} +#endif + +/* FIXME: Ought to return carry */ +static void +mpn_cnd_neg (int cnd, mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n, + mp_ptr scratch) +{ + mpn_lshift (scratch, ap, n, 1); + mpn_cnd_sub_n (cnd, rp, ap, scratch, n); +} + +static int +mpn_sec_eq_ui (mp_srcptr ap, mp_size_t n, mp_limb_t b) +{ + mp_limb_t d; + ASSERT (n > 0); + + d = ap[0] ^ b; + + while (--n > 0) + d |= ap[n]; + + return d == 0; +} + +mp_size_t +mpn_sec_invert_itch (mp_size_t n) +{ + return 4*n; +} + +/* Compute V <-- A^{-1} (mod M), in data-independent time. M must be + odd. Returns 1 on success, and 0 on failure (i.e., if gcd (A, m) != + 1). Inputs and outputs of size n, and no overlap allowed. The {ap, + n} area is destroyed. For arbitrary inputs, bit_size should be + 2*n*GMP_NUMB_BITS, but if A or M are known to be smaller, e.g., if + M = 2^521 - 1 and A < M, bit_size can be any bound on the sum of + the bit sizes of A and M. */ +int +mpn_sec_invert (mp_ptr vp, mp_ptr ap, mp_srcptr mp, + mp_size_t n, mp_bitcnt_t bit_size, + mp_ptr scratch) +{ + ASSERT (n > 0); + ASSERT (bit_size > 0); + ASSERT (mp[0] & 1); + ASSERT (! MPN_OVERLAP_P (ap, n, vp, n)); +#define bp (scratch + n) +#define up (scratch + 2*n) +#define m1hp (scratch + 3*n) + + /* Maintain + + a = u * orig_a (mod m) + b = v * orig_a (mod m) + + and b odd at all times. Initially, + + a = a_orig, u = 1 + b = m, v = 0 + */ + + + up[0] = 1; + mpn_zero (up+1, n - 1); + mpn_copyi (bp, mp, n); + mpn_zero (vp, n); + + ASSERT_CARRY (mpn_rshift (m1hp, mp, n, 1)); + ASSERT_NOCARRY (mpn_sec_add_1 (m1hp, m1hp, n, 1, scratch)); + + while (bit_size-- > 0) + { + mp_limb_t odd, swap, cy; + + /* Always maintain b odd. The logic of the iteration is as + follows. For a, b: + + odd = a & 1 + a -= odd * b + if (underflow from a-b) + { + b += a, assigns old a + a = B^n-a + } + + a /= 2 + + For u, v: + + if (underflow from a - b) + swap u, v + u -= odd * v + if (underflow from u - v) + u += m + + u /= 2 + if (a one bit was shifted out) + u += (m+1)/2 + + As long as a > 0, the quantity + + (bitsize of a) + (bitsize of b) + + is reduced by at least one bit per iteration, hence after (bit_size of + orig_a) + (bit_size of m) - 1 iterations we surely have a = 0. Then b + = gcd(orig_a, m) and if b = 1 then also v = orig_a^{-1} (mod m). + */ + + ASSERT (bp[0] & 1); + odd = ap[0] & 1; + + swap = mpn_cnd_sub_n (odd, ap, ap, bp, n); + mpn_cnd_add_n (swap, bp, bp, ap, n); + mpn_cnd_neg (swap, ap, ap, n, scratch); + + mpn_cnd_swap (swap, up, vp, n); + cy = mpn_cnd_sub_n (odd, up, up, vp, n); + cy -= mpn_cnd_add_n (cy, up, up, mp, n); + ASSERT (cy == 0); + + cy = mpn_rshift (ap, ap, n, 1); + ASSERT (cy == 0); + cy = mpn_rshift (up, up, n, 1); + cy = mpn_cnd_add_n (cy, up, up, m1hp, n); + ASSERT (cy == 0); + } + /* Should be all zeros, but check only extreme limbs */ + ASSERT ( (ap[0] | ap[n-1]) == 0); + /* Check if indeed gcd == 1. */ + return mpn_sec_eq_ui (bp, n, 1); +#undef bp +#undef up +#undef m1hp +} diff --git a/gmp-6.3.0/mpn/generic/sec_mul.c b/gmp-6.3.0/mpn/generic/sec_mul.c new file mode 100644 index 0000000..4bbfa61 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sec_mul.c @@ -0,0 +1,48 @@ +/* mpn_sec_mul. + + Contributed to the GNU project by Torbjörn Granlund. + +Copyright 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +void +mpn_sec_mul (mp_ptr rp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, + mp_ptr tp) +{ + mpn_mul_basecase (rp, ap, an, bp, bn); +} + +mp_size_t +mpn_sec_mul_itch (mp_size_t an, mp_size_t bn) +{ + return 0; +} diff --git a/gmp-6.3.0/mpn/generic/sec_pi1_div.c b/gmp-6.3.0/mpn/generic/sec_pi1_div.c new file mode 100644 index 0000000..29d01e7 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sec_pi1_div.c @@ -0,0 +1,172 @@ +/* mpn_sec_pi1_div_qr, mpn_sec_pi1_div_r -- Compute Q = floor(U / V), U = U + mod V. Side-channel silent under the assumption that the used instructions + are side-channel silent. + + Contributed to the GNU project by Torbjörn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011-2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* This side-channel silent division algorithm reduces the partial remainder by + GMP_NUMB_BITS/2 bits at a time, compared to GMP_NUMB_BITS for the main + division algorithm. We actually do not insist on reducing by exactly + GMP_NUMB_BITS/2, but may leave a partial remainder that is D*B^i to 3D*B^i + too large (B is the limb base, D is the divisor, and i is the induction + variable); the subsequent step will handle the extra partial remainder bits. + + With that partial remainder reduction, each step generates a quotient "half + limb". 
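+   (Concretely, the loop below estimates each half limb as
+   q = hi (nh * dinv) + nh, where nh holds the top half-limb of the current
+   partial remainder and hi () denotes the upper limb of the two-limb
+   product computed by umul_ppmm.)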
The outer loop generates two quotient half limbs, an upper (q1h) and + a lower (q0h) which are stored sparsely in separate limb arrays. These + arrays are added at the end; using separate arrays avoids data-dependent + carry propagation which could else pose a side-channel leakage problem. + + The quotient half limbs may be between -3 to 0 from the accurate value + ("accurate" being the one which corresponds to a reduction to a principal + partial remainder). Too small quotient half limbs correspond to too large + remainders, which we reduce later, as described above. + + In order to keep quotients from getting too big, corresponding to a negative + partial remainder, we use an inverse which is slightly smaller than usually. +*/ + +#if OPERATION_sec_pi1_div_qr +/* Needs (dn + 1) + (nn - dn) + (nn - dn) = 2nn - dn + 1 limbs at tp. */ +#define FNAME mpn_sec_pi1_div_qr +#define Q(q) q, +#define RETTYPE mp_limb_t +#endif +#if OPERATION_sec_pi1_div_r +/* Needs (dn + 1) limbs at tp. */ +#define FNAME mpn_sec_pi1_div_r +#define Q(q) +#define RETTYPE void +#endif + +RETTYPE +FNAME (Q(mp_ptr qp) + mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_limb_t dinv, + mp_ptr tp) +{ + mp_limb_t nh, cy, q1h, q0h, dummy, cnd; + mp_size_t i; + mp_ptr hp; +#if OPERATION_sec_pi1_div_qr + mp_limb_t qh; + mp_ptr qlp, qhp; +#endif + + ASSERT (dn >= 1); + ASSERT (nn >= dn); + ASSERT ((dp[dn - 1] & GMP_NUMB_HIGHBIT) != 0); + + if (nn == dn) + { + cy = mpn_sub_n (np, np, dp, dn); + mpn_cnd_add_n (cy, np, np, dp, dn); +#if OPERATION_sec_pi1_div_qr + return 1 - cy; +#else + return; +#endif + } + + /* Create a divisor copy shifted half a limb. */ + hp = tp; /* (dn + 1) limbs */ + hp[dn] = mpn_lshift (hp, dp, dn, GMP_NUMB_BITS / 2); + +#if OPERATION_sec_pi1_div_qr + qlp = tp + (dn + 1); /* (nn - dn) limbs */ + qhp = tp + (nn + 1); /* (nn - dn) limbs */ +#endif + + np += nn - dn; + nh = 0; + + for (i = nn - dn - 1; i >= 0; i--) + { + np--; + + nh = (nh << GMP_NUMB_BITS/2) + (np[dn] >> GMP_NUMB_BITS/2); + umul_ppmm (q1h, dummy, nh, dinv); + q1h += nh; +#if OPERATION_sec_pi1_div_qr + qhp[i] = q1h; +#endif + mpn_submul_1 (np, hp, dn + 1, q1h); + + nh = np[dn]; + umul_ppmm (q0h, dummy, nh, dinv); + q0h += nh; +#if OPERATION_sec_pi1_div_qr + qlp[i] = q0h; +#endif + nh -= mpn_submul_1 (np, dp, dn, q0h); + } + + /* 1st adjustment depends on extra high remainder limb. */ + cnd = nh != 0; /* FIXME: cmp-to-int */ +#if OPERATION_sec_pi1_div_qr + qlp[0] += cnd; +#endif + nh -= mpn_cnd_sub_n (cnd, np, np, dp, dn); + + /* 2nd adjustment depends on remainder/divisor comparison as well as whether + extra remainder limb was nullified by previous subtract. */ + cy = mpn_sub_n (np, np, dp, dn); + cy = cy - nh; +#if OPERATION_sec_pi1_div_qr + qlp[0] += 1 - cy; +#endif + mpn_cnd_add_n (cy, np, np, dp, dn); + + /* 3rd adjustment depends on remainder/divisor comparison. */ + cy = mpn_sub_n (np, np, dp, dn); +#if OPERATION_sec_pi1_div_qr + qlp[0] += 1 - cy; +#endif + mpn_cnd_add_n (cy, np, np, dp, dn); + +#if OPERATION_sec_pi1_div_qr + /* Combine quotient halves into final quotient. */ + qh = mpn_lshift (qhp, qhp, nn - dn, GMP_NUMB_BITS/2); + qh += mpn_add_n (qp, qhp, qlp, nn - dn); + + return qh; +#else + return; +#endif +} diff --git a/gmp-6.3.0/mpn/generic/sec_powm.c b/gmp-6.3.0/mpn/generic/sec_powm.c new file mode 100644 index 0000000..bba11cf --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sec_powm.c @@ -0,0 +1,430 @@ +/* mpn_sec_powm -- Compute R = U^E mod M. 
Secure variant, side-channel silent + under the assumption that the multiply instruction is side channel silent. + + Contributed to the GNU project by Torbjörn Granlund. + +Copyright 2007-2009, 2011-2014, 2018-2019, 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +/* + BASIC ALGORITHM, Compute U^E mod M, where M < B^n is odd. + + 1. T <- (B^n * U) mod M; convert to REDC form + + 2. Compute table U^0, U^1, U^2... of floor(log(E))-dependent size + + 3. While there are more bits in E + W <- power left-to-right base-k + + The article "Defeating modexp side-channel attacks with data-independent + execution traces", https://gmplib.org/~tege/modexp-silent.pdf, has details. + + + TODO: + + * Make getbits a macro, thereby allowing it to update the index operand. + That will simplify the code using getbits. (Perhaps make getbits' sibling + getbit then have similar form, for symmetry.) + + * Choose window size without looping. (Superoptimize or think(tm).) + + * REDC_1_TO_REDC_2_THRESHOLD might actually represent the cutoff between + redc_1 and redc_n. On such systems, we will switch to redc_2 causing + slowdown. +*/ + +#include "gmp-impl.h" +#include "longlong.h" + +#undef MPN_REDC_1_SEC +#if HAVE_NATIVE_mpn_sbpi1_bdiv_r +#define MPN_REDC_1_SEC(rp, up, mp, n, invm) \ + do { \ + mp_limb_t cy; \ + cy = mpn_sbpi1_bdiv_r (up, 2 * n, mp, n, invm); \ + mpn_cnd_sub_n (cy, rp, up + n, mp, n); \ + } while (0) +#else +#define MPN_REDC_1_SEC(rp, up, mp, n, invm) \ + do { \ + mp_limb_t cy; \ + cy = mpn_redc_1 (rp, up, mp, n, invm); \ + mpn_cnd_sub_n (cy, rp, rp, mp, n); \ + } while (0) +#endif + +#if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2 +#undef MPN_REDC_2_SEC +#define MPN_REDC_2_SEC(rp, up, mp, n, mip) \ + do { \ + mp_limb_t cy; \ + cy = mpn_redc_2 (rp, up, mp, n, mip); \ + mpn_cnd_sub_n (cy, rp, rp, mp, n); \ + } while (0) +#else +#define MPN_REDC_2_SEC(rp, up, mp, n, mip) /* empty */ +#undef REDC_1_TO_REDC_2_THRESHOLD +#define REDC_1_TO_REDC_2_THRESHOLD MP_SIZE_T_MAX +#endif + +/* Define our own mpn squaring function. We do this since we cannot use a + native mpn_sqr_basecase over TUNE_SQR_TOOM2_MAX, or a non-native one over + SQR_TOOM2_THRESHOLD. This is so because of fixed size stack allocations + made inside mpn_sqr_basecase. */ + +#if ! HAVE_NATIVE_mpn_sqr_basecase +/* The limit of the generic code is SQR_TOOM2_THRESHOLD. */ +#define SQR_BASECASE_LIM SQR_TOOM2_THRESHOLD +#endif + +#if HAVE_NATIVE_mpn_sqr_basecase +#ifdef TUNE_SQR_TOOM2_MAX +/* We slightly abuse TUNE_SQR_TOOM2_MAX here. 
If it is set for an assembly + mpn_sqr_basecase, it comes from SQR_TOOM2_THRESHOLD_MAX in the assembly + file. An assembly mpn_sqr_basecase that does not define it should allow + any size. */ +#define SQR_BASECASE_LIM SQR_TOOM2_THRESHOLD +#endif +#endif + +#ifdef WANT_FAT_BINARY +/* For fat builds, we use SQR_TOOM2_THRESHOLD which will expand to a read from + __gmpn_cpuvec. Perhaps any possible sqr_basecase.asm allow any size, and we + limit the use unnecessarily. We cannot tell, so play it safe. FIXME. */ +#define SQR_BASECASE_LIM SQR_TOOM2_THRESHOLD +#endif + +#ifndef SQR_BASECASE_LIM +/* If SQR_BASECASE_LIM is now not defined, use mpn_sqr_basecase for any operand + size. */ +#define SQR_BASECASE_LIM MP_SIZE_T_MAX +#endif + +#define mpn_local_sqr(rp,up,n) \ + do { \ + if (ABOVE_THRESHOLD (n, SQR_BASECASE_THRESHOLD) \ + && BELOW_THRESHOLD (n, SQR_BASECASE_LIM)) \ + mpn_sqr_basecase (rp, up, n); \ + else \ + mpn_mul_basecase(rp, up, n, up, n); \ + } while (0) + +#define getbit(p,bi) \ + ((p[(bi - 1) / GMP_NUMB_BITS] >> (bi - 1) % GMP_NUMB_BITS) & 1) + +/* FIXME: Maybe some things would get simpler if all callers ensure + that bi >= nbits. As far as I understand, with the current code bi + < nbits can happen only for the final iteration. */ +static inline mp_limb_t +getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits) +{ + int nbits_in_r; + mp_limb_t r; + mp_size_t i; + + if (bi < nbits) + { + return p[0] & (((mp_limb_t) 1 << bi) - 1); + } + else + { + bi -= nbits; /* bit index of low bit to extract */ + i = bi / GMP_NUMB_BITS; /* word index of low bit to extract */ + bi %= GMP_NUMB_BITS; /* bit index in low word */ + r = p[i] >> bi; /* extract (low) bits */ + nbits_in_r = GMP_NUMB_BITS - bi; /* number of bits now in r */ + if (nbits_in_r < nbits) /* did we get enough bits? */ + r += p[i + 1] << nbits_in_r; /* prepend bits from higher word */ + return r & (((mp_limb_t ) 1 << nbits) - 1); + } +} + +#ifndef POWM_SEC_TABLE +#if GMP_NUMB_BITS < 50 +#define POWM_SEC_TABLE 2,33,96,780,2741 +#else +#define POWM_SEC_TABLE 2,130,524,2578 +#endif +#endif + +#if TUNE_PROGRAM_BUILD +extern int win_size (mp_bitcnt_t); +#else +static inline int +win_size (mp_bitcnt_t enb) +{ + int k; + /* Find k, such that x[k-1] < enb <= x[k]. + + We require that x[k] >= k, then it follows that enb > x[k-1] >= + k-1, which implies k <= enb. + */ + static const mp_bitcnt_t x[] = {POWM_SEC_TABLE,~(mp_bitcnt_t)0}; + for (k = 0; enb > x[k++]; ) + ; + ASSERT (k <= enb); + return k; +} +#endif + +/* Convert U to REDC form, U_r = B^n * U mod M. + Uses scratch space at tp of size 2un + n + 1. 
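+   (REDC form is Montgomery form: once operands are in it, one Montgomery
+   reduction after each product keeps them there, since
+   REDC(U_r * V_r) = U*V*B^n mod M = (U*V)_r.)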
*/ +static void +redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n, mp_ptr tp) +{ + MPN_ZERO (tp, n); + MPN_COPY (tp + n, up, un); + + mpn_sec_div_r (tp, un + n, mp, n, tp + un + n); + MPN_COPY (rp, tp, n); +} + +static mp_limb_t +sec_binvert_limb (mp_limb_t n) +{ + mp_limb_t inv, t; + ASSERT ((n & 1) == 1); + /* 3 + 2 -> 5 */ + inv = n + (((n + 1) << 1) & 0x18); + + t = n * inv; +#if GMP_NUMB_BITS <= 10 + /* 5 x 2 -> 10 */ + inv = 2 * inv - inv * t; +#else /* GMP_NUMB_BITS > 10 */ + /* 5 x 2 + 2 -> 12 */ + inv = 2 * inv - inv * t + ((inv<<10)&-(t&(1<<5))); +#endif /* GMP_NUMB_BITS <= 10 */ + + if (GMP_NUMB_BITS > 12) + { + t = n * inv - 1; + if (GMP_NUMB_BITS <= 36) + { + /* 12 x 3 -> 36 */ + inv += inv * t * (t - 1); + } + else /* GMP_NUMB_BITS > 36 */ + { + mp_limb_t t2 = t * t; +#if GMP_NUMB_BITS <= 60 + /* 12 x 5 -> 60 */ + inv += inv * (t2 + 1) * (t2 - t); +#else /* GMP_NUMB_BITS > 60 */ + /* 12 x 5 + 4 -> 64 */ + inv *= (t2 + 1) * (t2 - t) + 1 - ((t<<48)&-(t&(1<<12))); + + /* 64 -> 128 -> 256 -> ... */ + for (int todo = (GMP_NUMB_BITS - 1) >> 6; todo != 0; todo >>= 1) + inv = 2 * inv - inv * inv * n; +#endif /* GMP_NUMB_BITS <= 60 */ + } + } + + ASSERT ((inv * n & GMP_NUMB_MASK) == 1); + return inv & GMP_NUMB_MASK; +} + +/* {rp, n} <-- {bp, bn} ^ {ep, en} mod {mp, n}, + where en = ceil (enb / GMP_NUMB_BITS) + Requires that {mp, n} is odd (and hence also mp[0] odd). + Uses scratch space at tp as defined by mpn_sec_powm_itch. */ +void +mpn_sec_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn, + mp_srcptr ep, mp_bitcnt_t enb, + mp_srcptr mp, mp_size_t n, mp_ptr tp) +{ + mp_limb_t ip[2], *mip; + int windowsize, this_windowsize; + mp_limb_t expbits; + mp_ptr pp, this_pp, ps; + long i; + int cnd; + + ASSERT (enb > 0); + ASSERT (n > 0); + /* The code works for bn = 0, but the defined scratch space is 2 limbs + greater than we supply, when converting 1 to redc form . */ + ASSERT (bn > 0); + ASSERT ((mp[0] & 1) != 0); + + windowsize = win_size (enb); + + mip = ip; + mip[0] = sec_binvert_limb (mp[0]); + if (ABOVE_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + { + mp_limb_t t, dummy, mip0 = mip[0]; + + umul_ppmm (t, dummy, mip0, mp[0]); + ASSERT (dummy == 1); + t += mip0 * mp[1]; /* t = (mp * mip0)[1] */ + + mip[1] = t * mip0 - 1; /* ~( - t * mip0) */ + } + mip[0] = -mip[0]; + + pp = tp; + tp += (n << windowsize); /* put tp after power table */ + + /* Compute pp[0] table entry */ + /* scratch: | n | 1 | n+2 | */ + /* | pp[0] | 1 | redcify | */ + this_pp = pp; + this_pp[n] = 1; + redcify (this_pp, this_pp + n, 1, mp, n, this_pp + n + 1); + this_pp += n; + + /* Compute pp[1] table entry. To avoid excessive scratch usage in the + degenerate situation where B >> M, we let redcify use scratch space which + will later be used by the pp table (element 2 and up). */ + /* scratch: | n | n | bn + n + 1 | */ + /* | pp[0] | pp[1] | redcify | */ + redcify (this_pp, bp, bn, mp, n, this_pp + n); + + /* Precompute powers of b and put them in the temporary area at pp. */ + /* scratch: | n | n | ... | | 2n | */ + /* | pp[0] | pp[1] | ... 
| pp[2^windowsize-1] | product | */ + ps = pp + n; /* initially B^1 */ + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + { + for (i = (1 << windowsize) - 2; i > 0; i -= 2) + { + mpn_local_sqr (tp, ps, n); + ps += n; + this_pp += n; + MPN_REDC_1_SEC (this_pp, tp, mp, n, mip[0]); + + mpn_mul_basecase (tp, this_pp, n, pp + n, n); + this_pp += n; + MPN_REDC_1_SEC (this_pp, tp, mp, n, mip[0]); + } + } + else + { + for (i = (1 << windowsize) - 2; i > 0; i -= 2) + { + mpn_local_sqr (tp, ps, n); + ps += n; + this_pp += n; + MPN_REDC_2_SEC (this_pp, tp, mp, n, mip); + + mpn_mul_basecase (tp, this_pp, n, pp + n, n); + this_pp += n; + MPN_REDC_2_SEC (this_pp, tp, mp, n, mip); + } + } + + expbits = getbits (ep, enb, windowsize); + ASSERT_ALWAYS (enb >= windowsize); + enb -= windowsize; + + mpn_sec_tabselect (rp, pp, n, 1 << windowsize, expbits); + + /* Main exponentiation loop. */ + /* scratch: | n | n | ... | | 3n-4n | */ + /* | pp[0] | pp[1] | ... | pp[2^windowsize-1] | loop scratch | */ + +#define INNERLOOP \ + while (enb != 0) \ + { \ + expbits = getbits (ep, enb, windowsize); \ + this_windowsize = windowsize; \ + if (enb < windowsize) \ + { \ + this_windowsize -= windowsize - enb; \ + enb = 0; \ + } \ + else \ + enb -= windowsize; \ + \ + do \ + { \ + mpn_local_sqr (tp, rp, n); \ + MPN_REDUCE (rp, tp, mp, n, mip); \ + this_windowsize--; \ + } \ + while (this_windowsize != 0); \ + \ + mpn_sec_tabselect (tp + 2*n, pp, n, 1 << windowsize, expbits); \ + mpn_mul_basecase (tp, rp, n, tp + 2*n, n); \ + \ + MPN_REDUCE (rp, tp, mp, n, mip); \ + } + + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + { +#undef MPN_REDUCE +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1_SEC (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + else + { +#undef MPN_REDUCE +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2_SEC (rp, tp, mp, n, mip) + INNERLOOP; + } + + MPN_COPY (tp, rp, n); + MPN_ZERO (tp + n, n); + + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + MPN_REDC_1_SEC (rp, tp, mp, n, mip[0]); + else + MPN_REDC_2_SEC (rp, tp, mp, n, mip); + + cnd = mpn_sub_n (tp, rp, mp, n); /* we need just retval */ + mpn_cnd_sub_n (!cnd, rp, rp, mp, n); +} + +mp_size_t +mpn_sec_powm_itch (mp_size_t bn, mp_bitcnt_t enb, mp_size_t n) +{ + int windowsize; + mp_size_t redcify_itch, itch; + + /* FIXME: no more _local/_basecase difference. */ + /* The top scratch usage will either be when reducing B in the 2nd redcify + call, or more typically n*2^windowsize + 3n or 4n, in the main loop. (It + is 3n or 4n depending on if we use mpn_local_sqr or a native + mpn_sqr_basecase. We assume 4n always for now.) */ + + windowsize = win_size (enb); + + /* The 2n term is due to pp[0] and pp[1] at the time of the 2nd redcify call, + the (bn + n) term is due to redcify's own usage, and the rest is due to + mpn_sec_div_r's usage when called from redcify. */ + redcify_itch = (2 * n) + (bn + n) + ((bn + n) + 2 * n + 2); + + /* The n * 2^windowsize term is due to the power table, the 4n term is due to + scratch needs of squaring/multiplication in the exponentiation loop. */ + itch = (n << windowsize) + (4 * n); + + return MAX (itch, redcify_itch); +} diff --git a/gmp-6.3.0/mpn/generic/sec_sqr.c b/gmp-6.3.0/mpn/generic/sec_sqr.c new file mode 100644 index 0000000..83fc7d9 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sec_sqr.c @@ -0,0 +1,76 @@ +/* mpn_sec_sqr. + + Contributed to the GNU project by Torbjörn Granlund. + +Copyright 2013, 2014 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#if ! HAVE_NATIVE_mpn_sqr_basecase +/* The limit of the generic code is SQR_TOOM2_THRESHOLD. */ +#define SQR_BASECASE_LIM SQR_TOOM2_THRESHOLD +#endif + +#if HAVE_NATIVE_mpn_sqr_basecase +#ifdef TUNE_SQR_TOOM2_MAX +/* We slightly abuse TUNE_SQR_TOOM2_MAX here. If it is set for an assembly + mpn_sqr_basecase, it comes from SQR_TOOM2_THRESHOLD_MAX in the assembly + file. An assembly mpn_sqr_basecase that does not define it should allow + any size. */ +#define SQR_BASECASE_LIM SQR_TOOM2_THRESHOLD +#endif +#endif + +#ifdef WANT_FAT_BINARY +/* For fat builds, we use SQR_TOOM2_THRESHOLD which will expand to a read from + __gmpn_cpuvec. Perhaps any possible sqr_basecase.asm allow any size, and we + limit the use unnecessarily. We cannot tell, so play it safe. FIXME. */ +#define SQR_BASECASE_LIM SQR_TOOM2_THRESHOLD +#endif + +void +mpn_sec_sqr (mp_ptr rp, + mp_srcptr ap, mp_size_t an, + mp_ptr tp) +{ +#ifndef SQR_BASECASE_LIM +/* If SQR_BASECASE_LIM is now not defined, use mpn_sqr_basecase for any operand + size. */ + mpn_sqr_basecase (rp, ap, an); +#else +/* Else use mpn_mul_basecase. */ + mpn_mul_basecase (rp, ap, an, ap, an); +#endif +} + +mp_size_t +mpn_sec_sqr_itch (mp_size_t an) +{ + return 0; +} diff --git a/gmp-6.3.0/mpn/generic/sec_tabselect.c b/gmp-6.3.0/mpn/generic/sec_tabselect.c new file mode 100644 index 0000000..f50bdac --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sec_tabselect.c @@ -0,0 +1,134 @@ +/* mpn_sec_tabselect. + +Copyright 2007-2009, 2011, 2013, 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include +#include "gmp-impl.h" + +#ifndef SEC_TABSELECT_METHOD +#define SEC_TABSELECT_METHOD 1 +#endif + +/* Select entry `which' from table `tab', which has nents entries, each `n' + limbs. Store the selected entry at rp. Reads entire table to avoid + side-channel information leaks. O(n*nents). */ + +#if SEC_TABSELECT_METHOD == 1 +void +mpn_sec_tabselect (volatile mp_limb_t *rp, volatile const mp_limb_t *tab, + mp_size_t n, mp_size_t nents, mp_size_t which) +{ + mp_size_t k, i; + mp_limb_t mask; + volatile const mp_limb_t *tp; + + tp = tab; + + /* Place first entry into result area. */ + for (i = 0; i < n; i++) + rp[i] = tp[i]; + + /* Conditionally replace entry in result area by entry 1...(nents-1) using + masking trickery. */ + for (k = 1; k < nents; k++) + { + /* Generate a mask using an expression which all compilers should compile + into branch-free code. The convoluted expression is designed to both + allow mp_limb_t greater and mp_limb_t smaller than mp_size_t. */ + mask = -(mp_limb_t) ((-(unsigned long) (which ^ k)) >> (BITS_PER_ULONG - 1)); + tp += n; + for (i = 0; i < n; i++) + rp[i] = (rp[i] & mask) | (tp[i] & ~mask); + } +} +#endif + +#if SEC_TABSELECT_METHOD == 2 +void +mpn_sec_tabselect (volatile mp_limb_t * restrict rp, + volatile const mp_limb_t * restrict tab, + mp_size_t n, mp_size_t nents, mp_size_t which) +{ + mp_size_t k, i; + mp_limb_t mask, r0, r1, r2, r3; + volatile const mp_limb_t * restrict tp; + + if (n & 1) + { + tp = tab; + r0 = 0; + for (k = 0; k < nents; k++) + { + mask = (mp_limb_t) ((-(unsigned long) (which ^ k)) >> (BITS_PER_ULONG - 1)) - 1; + r0 += tp[0] & mask; + tp += n; + } + rp[0] = r0; + rp += 1; + tab += 1; + } + + if (n & 2) + { + tp = tab; + r0 = r1 = 0; + for (k = 0; k < nents; k++) + { + mask = (mp_limb_t) ((-(unsigned long) (which ^ k)) >> (BITS_PER_ULONG - 1)) - 1; + r0 += tp[0] & mask; + r1 += tp[1] & mask; + tp += n; + } + rp[0] = r0; + rp[1] = r1; + rp += 2; + tab += 2; + } + + for (i = 0; i <= n - 4; i += 4) + { + tp = tab + i; + r0 = r1 = r2 = r3 = 0; + for (k = 0; k < nents; k++) + { + mask = (mp_limb_t) ((-(unsigned long) (which ^ k)) >> (BITS_PER_ULONG - 1)) - 1; + r0 += tp[0] & mask; + r1 += tp[1] & mask; + r2 += tp[2] & mask; + r3 += tp[3] & mask; + tp += n; + } + rp[0] = r0; + rp[1] = r1; + rp[2] = r2; + rp[3] = r3; + rp += 4; + } +} +#endif diff --git a/gmp-6.3.0/mpn/generic/set_str.c b/gmp-6.3.0/mpn/generic/set_str.c new file mode 100644 index 0000000..2bd584c --- /dev/null +++ b/gmp-6.3.0/mpn/generic/set_str.c @@ -0,0 +1,290 @@ +/* mpn_set_str (mp_ptr res_ptr, const char *str, size_t str_len, int base) -- + Convert a STR_LEN long base BASE byte string pointed to by STR to a limb + vector pointed to by RES_PTR. Return the number of limbs in RES_PTR. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE, EXCEPT mpn_set_str, ARE INTERNAL WITH MUTABLE + INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. + IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A + FUTURE GNU MP RELEASE. + +Copyright 1991-2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +/* TODO: + + Perhaps do not compute the highest power? + Instead, multiply twice by the 2nd highest power: + + _______ + |_______| hp + |_______| pow + _______________ + |_______________| final result + + + _______ + |_______| hp + |___| pow[-1] + ___________ + |___________| intermediate result + |___| pow[-1] + _______________ + |_______________| final result + + Generalizing that idea, perhaps we should make powtab contain successive + cubes, not squares. +*/ + +#include "gmp-impl.h" + +mp_size_t +mpn_set_str (mp_ptr rp, const unsigned char *str, size_t str_len, int base) +{ + if (POW2_P (base)) + { + /* The base is a power of 2. Read the input string from least to most + significant character/digit. */ + + const unsigned char *s; + int next_bitpos; + mp_limb_t res_digit; + mp_size_t size; + int bits_per_indigit = mp_bases[base].big_base; + + size = 0; + res_digit = 0; + next_bitpos = 0; + + for (s = str + str_len - 1; s >= str; s--) + { + int inp_digit = *s; + + res_digit |= ((mp_limb_t) inp_digit << next_bitpos) & GMP_NUMB_MASK; + next_bitpos += bits_per_indigit; + if (next_bitpos >= GMP_NUMB_BITS) + { + rp[size++] = res_digit; + next_bitpos -= GMP_NUMB_BITS; + res_digit = inp_digit >> (bits_per_indigit - next_bitpos); + } + } + + if (res_digit != 0) + rp[size++] = res_digit; + return size; + } + + if (BELOW_THRESHOLD (str_len, SET_STR_PRECOMPUTE_THRESHOLD)) + return mpn_bc_set_str (rp, str, str_len, base); + else + { + mp_ptr powtab_mem, tp; + powers_t powtab[GMP_LIMB_BITS]; + int chars_per_limb; + mp_size_t size; + mp_size_t un; + TMP_DECL; + + TMP_MARK; + + chars_per_limb = mp_bases[base].chars_per_limb; + + un = str_len / chars_per_limb + 1; /* FIXME: scalar integer division */ + + /* Allocate one large block for the powers of big_base. */ + powtab_mem = TMP_BALLOC_LIMBS (mpn_str_powtab_alloc (un)); + + size_t n_pows = mpn_compute_powtab (powtab, powtab_mem, un, base); + powers_t *pt = powtab + n_pows; + + tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un)); + size = mpn_dc_set_str (rp, str, str_len, pt, tp); + + TMP_FREE; + return size; + } +} + +mp_size_t +mpn_dc_set_str (mp_ptr rp, const unsigned char *str, size_t str_len, + const powers_t *powtab, mp_ptr tp) +{ + size_t len_lo, len_hi; + mp_limb_t cy; + mp_size_t ln, hn, n, sn; + + len_lo = powtab->digits_in_base; + + if (str_len <= len_lo) + { + if (BELOW_THRESHOLD (str_len, SET_STR_DC_THRESHOLD)) + return mpn_bc_set_str (rp, str, str_len, powtab->base); + else + return mpn_dc_set_str (rp, str, str_len, powtab - 1, tp); + } + + len_hi = str_len - len_lo; + ASSERT (len_lo >= len_hi); + + if (BELOW_THRESHOLD (len_hi, SET_STR_DC_THRESHOLD)) + hn = mpn_bc_set_str (tp, str, len_hi, powtab->base); + else + hn = mpn_dc_set_str (tp, str, len_hi, powtab - 1, rp); + + sn = powtab->shift; + + if (hn == 0) + { + /* Zero +1 limb here, to avoid reading an allocated but uninitialised + limb in mpn_incr_u below. 
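+	 (hn == 0 happens when the high part of the input string consists
+	 of zero digits only, i.e., represents the value 0.)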
*/ + MPN_ZERO (rp, powtab->n + sn + 1); + } + else + { + if (powtab->n > hn) + mpn_mul (rp + sn, powtab->p, powtab->n, tp, hn); + else + mpn_mul (rp + sn, tp, hn, powtab->p, powtab->n); + MPN_ZERO (rp, sn); + } + + str = str + str_len - len_lo; + if (BELOW_THRESHOLD (len_lo, SET_STR_DC_THRESHOLD)) + ln = mpn_bc_set_str (tp, str, len_lo, powtab->base); + else + ln = mpn_dc_set_str (tp, str, len_lo, powtab - 1, tp + powtab->n + sn + 1); + + if (ln != 0) + { + cy = mpn_add_n (rp, rp, tp, ln); + mpn_incr_u (rp + ln, cy); + } + n = hn + powtab->n + sn; + return n - (rp[n - 1] == 0); +} + +mp_size_t +mpn_bc_set_str (mp_ptr rp, const unsigned char *str, size_t str_len, int base) +{ + mp_size_t size; + size_t i; + long j; + mp_limb_t cy_limb; + + mp_limb_t big_base; + int chars_per_limb; + mp_limb_t res_digit; + + ASSERT (base >= 2); + ASSERT (base < numberof (mp_bases)); + ASSERT (str_len >= 1); + + big_base = mp_bases[base].big_base; + chars_per_limb = mp_bases[base].chars_per_limb; + + size = 0; + for (i = chars_per_limb; i < str_len; i += chars_per_limb) + { + res_digit = *str++; + if (base == 10) + { /* This is a common case. + Help the compiler to avoid multiplication. */ + for (j = MP_BASES_CHARS_PER_LIMB_10 - 1; j != 0; j--) + res_digit = res_digit * 10 + *str++; + } + else + { + for (j = chars_per_limb - 1; j != 0; j--) + res_digit = res_digit * base + *str++; + } + + if (size == 0) + { + if (res_digit != 0) + { + rp[0] = res_digit; + size = 1; + } + } + else + { +#if HAVE_NATIVE_mpn_mul_1c + cy_limb = mpn_mul_1c (rp, rp, size, big_base, res_digit); +#else + cy_limb = mpn_mul_1 (rp, rp, size, big_base); + cy_limb += mpn_add_1 (rp, rp, size, res_digit); +#endif + if (cy_limb != 0) + rp[size++] = cy_limb; + } + } + + big_base = base; + res_digit = *str++; + if (base == 10) + { /* This is a common case. + Help the compiler to avoid multiplication. */ + for (j = str_len - (i - MP_BASES_CHARS_PER_LIMB_10) - 1; j > 0; j--) + { + res_digit = res_digit * 10 + *str++; + big_base *= 10; + } + } + else + { + for (j = str_len - (i - chars_per_limb) - 1; j > 0; j--) + { + res_digit = res_digit * base + *str++; + big_base *= base; + } + } + + if (size == 0) + { + if (res_digit != 0) + { + rp[0] = res_digit; + size = 1; + } + } + else + { +#if HAVE_NATIVE_mpn_mul_1c + cy_limb = mpn_mul_1c (rp, rp, size, big_base, res_digit); +#else + cy_limb = mpn_mul_1 (rp, rp, size, big_base); + cy_limb += mpn_add_1 (rp, rp, size, res_digit); +#endif + if (cy_limb != 0) + rp[size++] = cy_limb; + } + return size; +} diff --git a/gmp-6.3.0/mpn/generic/sizeinbase.c b/gmp-6.3.0/mpn/generic/sizeinbase.c new file mode 100644 index 0000000..faee947 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sizeinbase.c @@ -0,0 +1,49 @@ +/* mpn_sizeinbase -- approximation to chars required for an mpn. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 1991, 1993-1995, 2001, 2002, 2011, 2012 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. 
+ +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* Same as mpz_sizeinbase, meaning exact for power-of-2 bases, and either + exact or 1 too big for other bases. */ + +size_t +mpn_sizeinbase (mp_srcptr xp, mp_size_t xsize, int base) +{ + size_t result; + MPN_SIZEINBASE (result, xp, xsize, base); + return result; +} diff --git a/gmp-6.3.0/mpn/generic/sqr.c b/gmp-6.3.0/mpn/generic/sqr.c new file mode 100644 index 0000000..74fbff0 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sqr.c @@ -0,0 +1,98 @@ +/* mpn_sqr -- square natural numbers. + +Copyright 1991, 1993, 1994, 1996-2003, 2005, 2008, 2009 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +void +mpn_sqr (mp_ptr p, mp_srcptr a, mp_size_t n) +{ + ASSERT (n >= 1); + ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n)); + + if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */ + mpn_mul_basecase (p, a, n, a, n); + } + else if (BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD)) + { + mpn_sqr_basecase (p, a, n); + } + else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) + { + /* Allocate workspace of fixed size on stack: fast! 
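+	 The array size is a compile-time constant; it suffices since here
+	 n < SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT (see the
+	 ASSERT below).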
*/ + mp_limb_t ws[mpn_toom2_sqr_itch (SQR_TOOM3_THRESHOLD_LIMIT-1)]; + ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT); + mpn_toom2_sqr (p, a, n, ws); + } + else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD)) + { + mp_ptr ws; + TMP_SDECL; + TMP_SMARK; + ws = TMP_SALLOC_LIMBS (mpn_toom3_sqr_itch (n)); + mpn_toom3_sqr (p, a, n, ws); + TMP_SFREE; + } + else if (BELOW_THRESHOLD (n, SQR_TOOM6_THRESHOLD)) + { + mp_ptr ws; + TMP_SDECL; + TMP_SMARK; + ws = TMP_SALLOC_LIMBS (mpn_toom4_sqr_itch (n)); + mpn_toom4_sqr (p, a, n, ws); + TMP_SFREE; + } + else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD)) + { + mp_ptr ws; + TMP_SDECL; + TMP_SMARK; + ws = TMP_SALLOC_LIMBS (mpn_toom6_sqr_itch (n)); + mpn_toom6_sqr (p, a, n, ws); + TMP_SFREE; + } + else if (BELOW_THRESHOLD (n, SQR_FFT_THRESHOLD)) + { + mp_ptr ws; + TMP_DECL; + TMP_MARK; + ws = TMP_ALLOC_LIMBS (mpn_toom8_sqr_itch (n)); + mpn_toom8_sqr (p, a, n, ws); + TMP_FREE; + } + else + { + /* The current FFT code allocates its own space. That should probably + change. */ + mpn_fft_mul (p, a, n, a, n); + } +} diff --git a/gmp-6.3.0/mpn/generic/sqr_basecase.c b/gmp-6.3.0/mpn/generic/sqr_basecase.c new file mode 100644 index 0000000..2645bad --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sqr_basecase.c @@ -0,0 +1,361 @@ +/* mpn_sqr_basecase -- Internal routine to square a natural number + of length n. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + + +Copyright 1991-1994, 1996, 1997, 2000-2005, 2008, 2010, 2011, 2017 Free +Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" +#include "longlong.h" + + +#if HAVE_NATIVE_mpn_sqr_diagonal +#define MPN_SQR_DIAGONAL(rp, up, n) \ + mpn_sqr_diagonal (rp, up, n) +#else +#define MPN_SQR_DIAGONAL(rp, up, n) \ + do { \ + mp_size_t _i; \ + for (_i = 0; _i < (n); _i++) \ + { \ + mp_limb_t ul, lpl; \ + ul = (up)[_i]; \ + umul_ppmm ((rp)[2 * _i + 1], lpl, ul, ul << GMP_NAIL_BITS); \ + (rp)[2 * _i] = lpl >> GMP_NAIL_BITS; \ + } \ + } while (0) +#endif + +#if HAVE_NATIVE_mpn_sqr_diag_addlsh1 +#define MPN_SQR_DIAG_ADDLSH1(rp, tp, up, n) \ + mpn_sqr_diag_addlsh1 (rp, tp, up, n) +#else +#if HAVE_NATIVE_mpn_addlsh1_n +#define MPN_SQR_DIAG_ADDLSH1(rp, tp, up, n) \ + do { \ + mp_limb_t cy; \ + MPN_SQR_DIAGONAL (rp, up, n); \ + cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2); \ + rp[2 * n - 1] += cy; \ + } while (0) +#else +#define MPN_SQR_DIAG_ADDLSH1(rp, tp, up, n) \ + do { \ + mp_limb_t cy; \ + MPN_SQR_DIAGONAL (rp, up, n); \ + cy = mpn_lshift (tp, tp, 2 * n - 2, 1); \ + cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2); \ + rp[2 * n - 1] += cy; \ + } while (0) +#endif +#endif + + +#undef READY_WITH_mpn_sqr_basecase + + +#if ! defined (READY_WITH_mpn_sqr_basecase) && HAVE_NATIVE_mpn_addmul_2s +void +mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n) +{ + mp_size_t i; + mp_limb_t tarr[2 * SQR_TOOM2_THRESHOLD]; + mp_ptr tp = tarr; + mp_limb_t cy; + + /* must fit 2*n limbs in tarr */ + ASSERT (n <= SQR_TOOM2_THRESHOLD); + + if ((n & 1) != 0) + { + if (n == 1) + { + mp_limb_t ul, lpl; + ul = up[0]; + umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS); + rp[0] = lpl >> GMP_NAIL_BITS; + return; + } + + MPN_ZERO (tp, n); + + for (i = 0; i <= n - 2; i += 2) + { + cy = mpn_addmul_2s (tp + 2 * i, up + i + 1, n - (i + 1), up + i); + tp[n + i] = cy; + } + } + else + { + if (n == 2) + { +#if HAVE_NATIVE_mpn_mul_2 + rp[3] = mpn_mul_2 (rp, up, 2, up); +#else + rp[0] = 0; + rp[1] = 0; + rp[3] = mpn_addmul_2 (rp, up, 2, up); +#endif + return; + } + + MPN_ZERO (tp, n); + + for (i = 0; i <= n - 4; i += 2) + { + cy = mpn_addmul_2s (tp + 2 * i, up + i + 1, n - (i + 1), up + i); + tp[n + i] = cy; + } + cy = mpn_addmul_1 (tp + 2 * n - 4, up + n - 1, 1, up[n - 2]); + tp[2 * n - 3] = cy; + } + + MPN_SQR_DIAG_ADDLSH1 (rp, tp, up, n); +} +#define READY_WITH_mpn_sqr_basecase +#endif + + +#if ! defined (READY_WITH_mpn_sqr_basecase) && HAVE_NATIVE_mpn_addmul_2 + +/* mpn_sqr_basecase using plain mpn_addmul_2. + + This is tricky, since we have to let mpn_addmul_2 make some undesirable + multiplies, u[k]*u[k], that we would like to let mpn_sqr_diagonal handle. + This forces us to conditionally add or subtract the mpn_sqr_diagonal + results. Examples of the product we form: + + n = 4 n = 5 n = 6 + u1u0 * u3u2u1 u1u0 * u4u3u2u1 u1u0 * u5u4u3u2u1 + u2 * u3 u3u2 * u4u3 u3u2 * u5u4u3 + u4 * u5 + add: u0 u2 u3 add: u0 u2 u4 add: u0 u2 u4 u5 + sub: u1 sub: u1 u3 sub: u1 u3 +*/ + +void +mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n) +{ + mp_size_t i; + mp_limb_t tarr[2 * SQR_TOOM2_THRESHOLD]; + mp_ptr tp = tarr; + mp_limb_t cy; + + /* must fit 2*n limbs in tarr */ + ASSERT (n <= SQR_TOOM2_THRESHOLD); + + if ((n & 1) != 0) + { + mp_limb_t x0, x1; + + if (n == 1) + { + mp_limb_t ul, lpl; + ul = up[0]; + umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS); + rp[0] = lpl >> GMP_NAIL_BITS; + return; + } + + /* The code below doesn't like unnormalized operands. Since such + operands are unusual, handle them with a dumb recursion. 
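+	 With up[n-1] == 0 the square is below B^(2n-2), so the two top
+	 result limbs are zero; store them and square the low n-1 limbs.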
*/ + if (up[n - 1] == 0) + { + rp[2 * n - 2] = 0; + rp[2 * n - 1] = 0; + mpn_sqr_basecase (rp, up, n - 1); + return; + } + + MPN_ZERO (tp, n); + + for (i = 0; i <= n - 2; i += 2) + { + cy = mpn_addmul_2 (tp + 2 * i, up + i + 1, n - (i + 1), up + i); + tp[n + i] = cy; + } + + MPN_SQR_DIAGONAL (rp, up, n); + + for (i = 2;; i += 4) + { + x0 = rp[i + 0]; + rp[i + 0] = (-x0) & GMP_NUMB_MASK; + x1 = rp[i + 1]; + rp[i + 1] = (-x1 - (x0 != 0)) & GMP_NUMB_MASK; + __GMPN_SUB_1 (cy, rp + i + 2, rp + i + 2, 2, (x1 | x0) != 0); + if (i + 4 >= 2 * n) + break; + mpn_incr_u (rp + i + 4, cy); + } + } + else + { + mp_limb_t x0, x1; + + if (n == 2) + { +#if HAVE_NATIVE_mpn_mul_2 + rp[3] = mpn_mul_2 (rp, up, 2, up); +#else + rp[0] = 0; + rp[1] = 0; + rp[3] = mpn_addmul_2 (rp, up, 2, up); +#endif + return; + } + + /* The code below doesn't like unnormalized operands. Since such + operands are unusual, handle them with a dumb recursion. */ + if (up[n - 1] == 0) + { + rp[2 * n - 2] = 0; + rp[2 * n - 1] = 0; + mpn_sqr_basecase (rp, up, n - 1); + return; + } + + MPN_ZERO (tp, n); + + for (i = 0; i <= n - 4; i += 2) + { + cy = mpn_addmul_2 (tp + 2 * i, up + i + 1, n - (i + 1), up + i); + tp[n + i] = cy; + } + cy = mpn_addmul_1 (tp + 2 * n - 4, up + n - 1, 1, up[n - 2]); + tp[2 * n - 3] = cy; + + MPN_SQR_DIAGONAL (rp, up, n); + + for (i = 2;; i += 4) + { + x0 = rp[i + 0]; + rp[i + 0] = (-x0) & GMP_NUMB_MASK; + x1 = rp[i + 1]; + rp[i + 1] = (-x1 - (x0 != 0)) & GMP_NUMB_MASK; + if (i + 6 >= 2 * n) + break; + __GMPN_SUB_1 (cy, rp + i + 2, rp + i + 2, 2, (x1 | x0) != 0); + mpn_incr_u (rp + i + 4, cy); + } + mpn_decr_u (rp + i + 2, (x1 | x0) != 0); + } + +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2); +#else + cy = mpn_lshift (tp, tp, 2 * n - 2, 1); + cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2); +#endif + rp[2 * n - 1] += cy; +} +#define READY_WITH_mpn_sqr_basecase +#endif + + +#if ! defined (READY_WITH_mpn_sqr_basecase) && HAVE_NATIVE_mpn_sqr_diag_addlsh1 + +/* mpn_sqr_basecase using mpn_addmul_1 and mpn_sqr_diag_addlsh1, avoiding stack + allocation. */ +void +mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n) +{ + if (n == 1) + { + mp_limb_t ul, lpl; + ul = up[0]; + umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS); + rp[0] = lpl >> GMP_NAIL_BITS; + } + else + { + mp_size_t i; + mp_ptr xp; + + rp += 1; + rp[n - 1] = mpn_mul_1 (rp, up + 1, n - 1, up[0]); + for (i = n - 2; i != 0; i--) + { + up += 1; + rp += 2; + rp[i] = mpn_addmul_1 (rp, up + 1, i, up[0]); + } + + xp = rp - 2 * n + 3; + mpn_sqr_diag_addlsh1 (xp, xp + 1, up - n + 2, n); + } +} +#define READY_WITH_mpn_sqr_basecase +#endif + + +#if ! defined (READY_WITH_mpn_sqr_basecase) + +/* Default mpn_sqr_basecase using mpn_addmul_1. */ +void +mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n) +{ + mp_size_t i; + + ASSERT (n >= 1); + ASSERT (! 
MPN_OVERLAP_P (rp, 2*n, up, n)); + + if (n == 1) + { + mp_limb_t ul, lpl; + ul = up[0]; + umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS); + rp[0] = lpl >> GMP_NAIL_BITS; + } + else + { + mp_limb_t tarr[2 * SQR_TOOM2_THRESHOLD]; + mp_ptr tp = tarr; + mp_limb_t cy; + + /* must fit 2*n limbs in tarr */ + ASSERT (n <= SQR_TOOM2_THRESHOLD); + + cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]); + tp[n - 1] = cy; + for (i = 2; i < n; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]); + tp[n + i - 2] = cy; + } + + MPN_SQR_DIAG_ADDLSH1 (rp, tp, up, n); + } +} +#define READY_WITH_mpn_sqr_basecase +#endif diff --git a/gmp-6.3.0/mpn/generic/sqrlo.c b/gmp-6.3.0/mpn/generic/sqrlo.c new file mode 100644 index 0000000..71530b6 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sqrlo.c @@ -0,0 +1,239 @@ +/* mpn_sqrlo -- squares an n-limb number and returns the low n limbs + of the result. + + Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + + THIS IS (FOR NOW) AN INTERNAL FUNCTION. IT IS ONLY SAFE TO REACH THIS + FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED + THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2004, 2005, 2009, 2010, 2012, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY +#define MAYBE_range_basecase 1 +#define MAYBE_range_toom22 1 +#else +#define MAYBE_range_basecase \ + ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM2_THRESHOLD*36/(36-11)) +#define MAYBE_range_toom22 \ + ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM3_THRESHOLD*36/(36-11) ) +#endif + +/* THINK: The DC strategy uses different constants in different Toom's + ranges. Something smoother? +*/ + +/* + Compute the least significant half of the product {xy,n}*{yp,n}, or + formally {rp,n} = {xy,n}*{yp,n} Mod (B^n). + + Above the given threshold, the Divide and Conquer strategy is used. + The operand is split in two, and a full square plus a mullo + is used to obtain the final result. The more natural strategy is to + split in two halves, but this is far from optimal when a + sub-quadratic multiplication is used. + + Mulders suggests an unbalanced split in favour of the full product, + split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2. 
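+
+  (For instance, with the Karatsuba exponent e = log(3)/log(2) and the
+  split a = 11/36 chosen below, the cost ratio k defined in the next
+  paragraph works out to (25/36)^e/(1-2*(11/36)^e) ~= 0.81: a mullo then
+  costs about 81% of a same-size full product.)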
+
+  To compute the value of a, we assume that the cost of mullo for a
+  given size ML(n) is a fraction of the cost of a full product with
+  same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
+  then we can write:
+
+  ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
+
+  Given a value for e, we want to minimise the value of k, i.e. the
+  function k=(1-a)^e/(1-2*a^e).
+
+  With e=2, the exponent for schoolbook multiplication, the minimum is
+  given by the values a=1-a=1/2.
+
+  With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
+  Mulders computes (1-a) = 0.694... and we approximate a with 11/36.
+
+  Other possible approximations follow:
+  e=log(5)/log(3) [Toom-3] -> a ~= 9/40
+  e=log(7)/log(4) [Toom-4] -> a ~= 7/39
+  e=log(11)/log(6) [Toom-6] -> a ~= 1/8
+  e=log(15)/log(8) [Toom-8] -> a ~= 1/10
+
+  The values above were obtained with the following trivial commands
+  in the gp-pari shell:
+
+fun(e,a)=(1-a)^e/(1-2*a^e)
+mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));...}
+*/
+
+static void
+mpn_dc_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n, mp_ptr tp)
+{
+  mp_size_t n1, n2;
+
+  ASSERT (n >= 2);
+  ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
+  ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
+
+  /* Divide-and-conquer */
+
+  /* We need fractional approximation of the value 0 < a <= 1/2
+     giving the minimum in the function k=(1-a)^e/(1-2*a^e).
+   */
+  if (MAYBE_range_basecase && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD*36/(36-11)))
+    n1 = n >> 1;
+  else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD*36/(36-11)))
+    n1 = n * 11 / (size_t) 36;	/* n1 ~= n*(1-.694...) */
+  else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD*40/(40-9)))
+    n1 = n * 9 / (size_t) 40;	/* n1 ~= n*(1-.775...) */
+  else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD*10/9))
+    n1 = n * 7 / (size_t) 39;	/* n1 ~= n*(1-.821...) */
+  /* n1 = n * 4 / (size_t) 31;	// n1 ~= n*(1-.871...) [TOOM66] */
+  else
+    n1 = n / (size_t) 10;	/* n1 ~= n*(1-.899...) [TOOM88] */
+
+  n2 = n - n1;
+
+  /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0 */
+
+  /* x0 ^ 2 */
+  mpn_sqr (tp, xp, n2);
+  MPN_COPY (rp, tp, n2);
+
+  /* x1 * x0 * 2^(n2 GMP_NUMB_BITS) */
+  if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
+    mpn_mul_basecase (tp + n, xp + n2, n1, xp, n1);
+  else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
+    mpn_mullo_basecase (tp + n, xp + n2, xp, n1);
+  else
+    mpn_mullo_n (tp + n, xp + n2, xp, n1);
+  /* mpn_dc_mullo_n (tp + n, xp + n2, xp, n1, tp + n); */
+#if HAVE_NATIVE_mpn_addlsh1_n
+  mpn_addlsh1_n (rp + n2, tp + n2, tp + n, n1);
+#else
+  mpn_lshift (rp + n2, tp + n, n1, 1);
+  mpn_add_n (rp + n2, rp + n2, tp + n2, n1);
+#endif
+}
+
+/* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0.  */
+#define SQR_BASECASE_ALLOC \
+  (SQRLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*SQRLO_BASECASE_THRESHOLD_LIMIT)
+
+/* FIXME: This function should accept a temporary area; dc_sqrlo
+   accepts a pointer tp and handles the case tp == rp; do the same here.
+*/
+
+void
+mpn_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n)
+{
+  ASSERT (n >= 1);
+  ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
+
+  if (BELOW_THRESHOLD (n, SQRLO_BASECASE_THRESHOLD))
+    {
+      /* FIXME: smarter criteria? */
+#if HAVE_NATIVE_mpn_mullo_basecase || ! HAVE_NATIVE_mpn_sqr_basecase
+      /* mullo computes as many products as sqr, but directly writes
+	 on the result area. */
+      mpn_mullo_basecase (rp, xp, xp, n);
+#else
+      /* Allocate workspace of fixed size on stack: fast!
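+	 SQR_BASECASE_ALLOC above gives a compile-time bound on the size.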
*/ + mp_limb_t tp[SQR_BASECASE_ALLOC]; + mpn_sqr_basecase (tp, xp, n); + MPN_COPY (rp, tp, n); +#endif + } + else if (BELOW_THRESHOLD (n, SQRLO_DC_THRESHOLD)) + { + mpn_sqrlo_basecase (rp, xp, n); + } + else + { + mp_ptr tp; + TMP_DECL; + TMP_MARK; + tp = TMP_ALLOC_LIMBS (mpn_sqrlo_itch (n)); + if (BELOW_THRESHOLD (n, SQRLO_SQR_THRESHOLD)) + { + mpn_dc_sqrlo (rp, xp, n, tp); + } + else + { + /* For really large operands, use plain mpn_mul_n but throw away upper n + limbs of result. */ +#if !TUNE_PROGRAM_BUILD && (SQRLO_SQR_THRESHOLD > SQR_FFT_THRESHOLD) + mpn_fft_mul (tp, xp, n, xp, n); +#else + mpn_sqr (tp, xp, n); +#endif + MPN_COPY (rp, tp, n); + } + TMP_FREE; + } +} diff --git a/gmp-6.3.0/mpn/generic/sqrlo_basecase.c b/gmp-6.3.0/mpn/generic/sqrlo_basecase.c new file mode 100644 index 0000000..3148609 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sqrlo_basecase.c @@ -0,0 +1,194 @@ +/* mpn_sqrlo_basecase -- Internal routine to square a natural number + of length n. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + + +Copyright 1991-1994, 1996, 1997, 2000-2005, 2008, 2010, 2011, 2015, +2016 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef SQRLO_SHORTCUT_MULTIPLICATIONS +#if HAVE_NATIVE_mpn_addmul_1 +#define SQRLO_SHORTCUT_MULTIPLICATIONS 0 +#else +#define SQRLO_SHORTCUT_MULTIPLICATIONS 1 +#endif +#endif + +#if HAVE_NATIVE_mpn_sqr_diagonal +#define MPN_SQR_DIAGONAL(rp, up, n) \ + mpn_sqr_diagonal (rp, up, n) +#else +#define MPN_SQR_DIAGONAL(rp, up, n) \ + do { \ + mp_size_t _i; \ + for (_i = 0; _i < (n); _i++) \ + { \ + mp_limb_t ul, lpl; \ + ul = (up)[_i]; \ + umul_ppmm ((rp)[2 * _i + 1], lpl, ul, ul << GMP_NAIL_BITS); \ + (rp)[2 * _i] = lpl >> GMP_NAIL_BITS; \ + } \ + } while (0) +#endif + +#define MPN_SQRLO_DIAGONAL(rp, up, n) \ + do { \ + mp_size_t nhalf; \ + nhalf = (n) >> 1; \ + MPN_SQR_DIAGONAL ((rp), (up), nhalf); \ + if (((n) & 1) != 0) \ + { \ + mp_limb_t op; \ + op = (up)[nhalf]; \ + (rp)[(n) - 1] = (op * op) & GMP_NUMB_MASK; \ + } \ + } while (0) + +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 +#define MPN_SQRLO_DIAG_ADDLSH1(rp, tp, up, n) \ + do { \ + MPN_SQRLO_DIAGONAL((rp), (up), (n)); \ + mpn_addlsh1_n_ip1 ((rp) + 1, (tp), (n) - 1); \ + } while (0) +#else +#define MPN_SQRLO_DIAG_ADDLSH1(rp, tp, up, n) \ + do { \ + MPN_SQRLO_DIAGONAL((rp), (up), (n)); \ + mpn_lshift ((tp), (tp), (n) - 1, 1); \ + mpn_add_n ((rp) + 1, (rp) + 1, (tp), (n) - 1); \ + } while (0) +#endif + +/* Avoid zero allocations when SQRLO_LO_THRESHOLD is 0 (this code not used). */ +#define SQRLO_BASECASE_ALLOC \ + (SQRLO_DC_THRESHOLD_LIMIT < 2 ? 1 : SQRLO_DC_THRESHOLD_LIMIT - 1) + +/* Default mpn_sqrlo_basecase using mpn_addmul_1. */ +#ifndef SQRLO_SPECIAL_CASES +#define SQRLO_SPECIAL_CASES 2 +#endif + +#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY +#define MAYBE_special_cases 1 +#else +#define MAYBE_special_cases \ + ((SQRLO_BASECASE_THRESHOLD <= SQRLO_SPECIAL_CASES) && (SQRLO_DC_THRESHOLD != 0)) +#endif + +void +mpn_sqrlo_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n) +{ + mp_limb_t ul; + + ASSERT (n >= 1); + ASSERT (! 
MPN_OVERLAP_P (rp, n, up, n)); + + ul = up[0]; + + if (MAYBE_special_cases && n <= SQRLO_SPECIAL_CASES) + { +#if SQRLO_SPECIAL_CASES == 1 + rp[0] = (ul * ul) & GMP_NUMB_MASK; +#else + if (n == 1) + rp[0] = (ul * ul) & GMP_NUMB_MASK; + else + { + mp_limb_t hi, lo, ul1; + umul_ppmm (hi, lo, ul, ul << GMP_NAIL_BITS); + rp[0] = lo >> GMP_NAIL_BITS; + ul1 = up[1]; +#if SQRLO_SPECIAL_CASES == 2 + rp[1] = (hi + ul * ul1 * 2) & GMP_NUMB_MASK; +#else + if (n == 2) + rp[1] = (hi + ul * ul1 * 2) & GMP_NUMB_MASK; + else + { + mp_limb_t hi1; +#if GMP_NAIL_BITS != 0 + ul <<= 1; +#endif + umul_ppmm (hi1, lo, ul1 << GMP_NAIL_BITS, ul); + hi1 += ul * up[2]; +#if GMP_NAIL_BITS == 0 + hi1 = (hi1 << 1) | (lo >> (GMP_LIMB_BITS - 1)); + add_ssaaaa(rp[2], rp[1], hi1, lo << 1, ul1 * ul1, hi); +#else + hi += lo >> GMP_NAIL_BITS; + rp[1] = hi & GMP_NUMB_MASK; + rp[2] = (hi1 + ul1 * ul1 + (hi >> GMP_NUMB_BITS)) & GMP_NUMB_MASK; +#endif + } +#endif + } +#endif + } + else + { + mp_limb_t tp[SQRLO_BASECASE_ALLOC]; + mp_size_t i; + + /* must fit n-1 limbs in tp */ + ASSERT (n <= SQRLO_DC_THRESHOLD_LIMIT); + + --n; +#if SQRLO_SHORTCUT_MULTIPLICATIONS + { + mp_limb_t cy; + + cy = ul * up[n] + mpn_mul_1 (tp, up + 1, n - 1, ul); + for (i = 1; 2 * i + 1 < n; ++i) + { + ul = up[i]; + cy += ul * up[n - i] + mpn_addmul_1 (tp + 2 * i, up + i + 1, n - 2 * i - 1, ul); + } + tp [n-1] = (cy + ((n & 1)?up[i] * up[i + 1]:0)) & GMP_NUMB_MASK; + } +#else + mpn_mul_1 (tp, up + 1, n, ul); + for (i = 1; 2 * i < n; ++i) + mpn_addmul_1 (tp + 2 * i, up + i + 1, n - 2 * i, up[i]); +#endif + + MPN_SQRLO_DIAG_ADDLSH1 (rp, tp, up, n + 1); + } +} +#undef SQRLO_SPECIAL_CASES +#undef MAYBE_special_cases +#undef SQRLO_BASECASE_ALLOC +#undef SQRLO_SHORTCUT_MULTIPLICATIONS +#undef MPN_SQR_DIAGONAL +#undef MPN_SQRLO_DIAGONAL +#undef MPN_SQRLO_DIAG_ADDLSH1 diff --git a/gmp-6.3.0/mpn/generic/sqrmod_bnm1.c b/gmp-6.3.0/mpn/generic/sqrmod_bnm1.c new file mode 100644 index 0000000..0acbe12 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sqrmod_bnm1.c @@ -0,0 +1,328 @@ +/* sqrmod_bnm1.c -- squaring mod B^n-1. + + Contributed to the GNU project by Niels Möller, Torbjorn Granlund and + Marco Bodrato. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2010, 2012, 2020, 2022 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#include "gmp-impl.h" +#include "longlong.h" + +/* Input is {ap,rn}; output is {rp,rn}, computation is + mod B^rn - 1, and values are semi-normalised; zero is represented + as either 0 or B^n - 1. Needs a scratch of 2rn limbs at tp. + tp==rp is allowed. */ +static void +mpn_bc_sqrmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_size_t rn, mp_ptr tp) +{ + mp_limb_t cy; + + ASSERT (0 < rn); + + mpn_sqr (tp, ap, rn); + cy = mpn_add_n (rp, tp, tp + rn, rn); + /* If cy == 1, then the value of rp is at most B^rn - 2, so there can + * be no overflow when adding in the carry. */ + MPN_INCR_U (rp, rn, cy); +} + + +/* Input is {ap,rn+1}; output is {rp,rn+1}, in + normalised representation, computation is mod B^rn + 1. Needs + a scratch area of 2rn limbs at tp; tp == rp is allowed. + Output is normalised. */ +static void +mpn_bc_sqrmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_size_t rn, mp_ptr tp) +{ + mp_limb_t cy; + unsigned k; + + ASSERT (0 < rn); + + if (UNLIKELY (ap[rn])) + { + *rp = 1; + MPN_FILL (rp + 1, rn, 0); + return; + } + else if (MPN_SQRMOD_BKNP1_USABLE (rn, k, MUL_FFT_MODF_THRESHOLD)) + { + mp_size_t n_k = rn / k; + TMP_DECL; + + TMP_MARK; + mpn_sqrmod_bknp1 (rp, ap, n_k, k, + TMP_ALLOC_LIMBS (mpn_sqrmod_bknp1_itch (rn))); + TMP_FREE; + return; + } + mpn_sqr (tp, ap, rn); + cy = mpn_sub_n (rp, tp, tp + rn, rn); + rp[rn] = 0; + MPN_INCR_U (rp, rn + 1, cy); +} + + +/* Computes {rp,MIN(rn,2an)} <- {ap,an}^2 Mod(B^rn-1) + * + * The result is expected to be ZERO if and only if the operand + * already is. Otherwise the class [0] Mod(B^rn-1) is represented by + * B^rn-1. + * It should not be a problem if sqrmod_bnm1 is used to + * compute the full square with an <= 2*rn, because this condition + * implies (B^an-1)^2 < (B^rn-1) . + * + * Requires rn/4 < an <= rn + * Scratch need: rn/2 + (need for recursive call OR rn + 3). 
This gives + * + * S(n) <= rn/2 + MAX (rn + 4, S(n/2)) <= 3/2 rn + 4 + */ +void +mpn_sqrmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_ptr tp) +{ + ASSERT (0 < an); + ASSERT (an <= rn); + + if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, SQRMOD_BNM1_THRESHOLD)) + { + if (UNLIKELY (an < rn)) + { + if (UNLIKELY (2*an <= rn)) + { + mpn_sqr (rp, ap, an); + } + else + { + mp_limb_t cy; + mpn_sqr (tp, ap, an); + cy = mpn_add (rp, tp, rn, tp + rn, 2*an - rn); + MPN_INCR_U (rp, rn, cy); + } + } + else + mpn_bc_sqrmod_bnm1 (rp, ap, rn, tp); + } + else + { + mp_size_t n; + mp_limb_t cy; + mp_limb_t hi; + + n = rn >> 1; + + ASSERT (2*an > n); + + /* Compute xm = a^2 mod (B^n - 1), xp = a^2 mod (B^n + 1) + and crt together as + + x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)] + */ + +#define a0 ap +#define a1 (ap + n) + +#define xp tp /* 2n + 2 */ + /* am1 maybe in {xp, n} */ +#define sp1 (tp + 2*n + 2) + /* ap1 maybe in {sp1, n + 1} */ + + { + mp_srcptr am1; + mp_size_t anm; + mp_ptr so; + + if (LIKELY (an > n)) + { + so = xp + n; + am1 = xp; + cy = mpn_add (xp, a0, n, a1, an - n); + MPN_INCR_U (xp, n, cy); + anm = n; + } + else + { + so = xp; + am1 = a0; + anm = an; + } + + mpn_sqrmod_bnm1 (rp, n, am1, anm, so); + } + + { + int k; + mp_srcptr ap1; + mp_size_t anp; + + if (LIKELY (an > n)) { + ap1 = sp1; + cy = mpn_sub (sp1, a0, n, a1, an - n); + sp1[n] = 0; + MPN_INCR_U (sp1, n + 1, cy); + anp = n + ap1[n]; + } else { + ap1 = a0; + anp = an; + } + + if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD)) + k=0; + else + { + int mask; + k = mpn_fft_best_k (n, 1); + mask = (1<<k) - 1; + while (n & mask) {k--; mask >>=1;}; + } + if (k >= FFT_FIRST_K) + xp[n] = mpn_mul_fft (xp, n, ap1, anp, ap1, anp, k); + else if (UNLIKELY (ap1 == a0)) + { + ASSERT (anp <= n); + ASSERT (2*anp > n); + mpn_sqr (xp, a0, an); + anp = 2*an - n; + cy = mpn_sub (xp, xp, n, xp + n, anp); + xp[n] = 0; + MPN_INCR_U (xp, n+1, cy); + } + else + mpn_bc_sqrmod_bnp1 (xp, ap1, n, xp); + } + + /* Here the CRT recomposition begins. + + xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1) + Division by 2 is a bitwise rotation. + + Assumes xp normalised mod (B^n+1). + + The residue class [0] is represented by [B^n-1]; except when + both inputs are ZERO. + */ + +#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc +#if HAVE_NATIVE_mpn_rsh1add_nc + cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */ + hi = cy << (GMP_NUMB_BITS - 1); + cy = 0; + /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi + overflows, i.e. a further increment will not overflow again. */ +#else /* ! _nc */ + cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */ + hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */ + cy >>= 1; + /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that + the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */ +#endif +#if GMP_NAIL_BITS == 0 + add_ssaaaa(cy, rp[n-1], cy, rp[n-1], CNST_LIMB(0), hi); +#else + cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1); + rp[n-1] ^= hi; +#endif +#else /* ! HAVE_NATIVE_mpn_rsh1add_n */ +#if HAVE_NATIVE_mpn_add_nc + cy = mpn_add_nc(rp, rp, xp, n, xp[n]); +#else /* ! _nc */ + cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */ +#endif + cy += (rp[0]&1); + mpn_rshift(rp, rp, n, 1); + ASSERT (cy <= 2); + hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */ + cy >>= 1; + /* We can have cy != 0 only if hi = 0... */ + ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0); + rp[n-1] |= hi; + /* ... rp[n-1] + cy can not overflow, the following INCR is correct.
*/ +#endif + ASSERT (cy <= 1); + /* Next increment can not overflow, read the previous comments about cy. */ + ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0)); + MPN_INCR_U(rp, n, cy); + + /* Compute the highest half: + ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n + */ + if (UNLIKELY (2*an < rn)) + { + /* Note that in this case, the only way the result can equal + zero mod B^{rn} - 1 is if the input is zero, and + then the output of both the recursive calls and this CRT + reconstruction is zero, not B^{rn} - 1. */ + cy = mpn_sub_n (rp + n, rp, xp, 2*an - n); + + /* FIXME: This subtraction of the high parts is not really + necessary, we do it to get the carry out, and for sanity + checking. */ + cy = xp[n] + mpn_sub_nc (xp + 2*an - n, rp + 2*an - n, + xp + 2*an - n, rn - 2*an, cy); + ASSERT (mpn_zero_p (xp + 2*an - n+1, rn - 1 - 2*an)); + cy = mpn_sub_1 (rp, rp, 2*an, cy); + ASSERT (cy == (xp + 2*an - n)[0]); + } + else + { + cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n); + /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO. + DECR will affect _at most_ the lowest n limbs. */ + MPN_DECR_U (rp, 2*n, cy); + } +#undef a0 +#undef a1 +#undef xp +#undef sp1 + } +} + +mp_size_t +mpn_sqrmod_bnm1_next_size (mp_size_t n) +{ + mp_size_t nh; + + if (BELOW_THRESHOLD (n, SQRMOD_BNM1_THRESHOLD)) + return n; + if (BELOW_THRESHOLD (n, 4 * (SQRMOD_BNM1_THRESHOLD - 1) + 1)) + return (n + (2-1)) & (-2); + if (BELOW_THRESHOLD (n, 8 * (SQRMOD_BNM1_THRESHOLD - 1) + 1)) + return (n + (4-1)) & (-4); + + nh = (n + 1) >> 1; + + if (BELOW_THRESHOLD (nh, SQR_FFT_MODF_THRESHOLD)) + return (n + (8-1)) & (-8); + + return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 1)); +} diff --git a/gmp-6.3.0/mpn/generic/sqrtrem.c b/gmp-6.3.0/mpn/generic/sqrtrem.c new file mode 100644 index 0000000..cc6dd9c --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sqrtrem.c @@ -0,0 +1,555 @@ +/* mpn_sqrtrem -- square root and remainder + + Contributed to the GNU project by Paul Zimmermann (most code), + Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt). + + THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH MUTABLE + INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. + IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A + FUTURE GMP RELEASE. + +Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015, 2017 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +/* See "Karatsuba Square Root", reference in gmp.texi. 
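+
+   An informal sketch of that algorithm: write the normalized input as
+   N = a3*b^3 + a2*b^2 + a1*b + a0, where b is a power of 2 and
+   a3 >= b/4.  Recursively compute (s1, r1) = SqrtRem(a3*b + a2), then
+   (q, u) = DivRem(r1*b + a1, 2*s1), and take s = s1*b + q,
+   r = u*b + a0 - q*q.  If r < 0, correct with r += 2*s - 1, s -= 1.
+   mpn_dc_sqrtrem below follows this scheme with limb-aligned splits.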
*/ + + +#include <stdio.h> +#include <stdlib.h> + +#include "gmp-impl.h" +#include "longlong.h" +#define USE_DIVAPPR_Q 1 +#define TRACE(x) + +static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */ +{ + 0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */ + 0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */ + 0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */ + 0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */ + 0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */ + 0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */ + 0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */ + 0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */ + 0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */ + 0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */ + 0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */ + 0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */ + 0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */ + 0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */ + 0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */ + 0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */ + 0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */ + 0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */ + 0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */ + 0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */ + 0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */ + 0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */ + 0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */ + 0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */ + 0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */ + 0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */ + 0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */ + 0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */ + 0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */ + 0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */ + 0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */ + 0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */ + 0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */ + 0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */ + 0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */ + 0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */ + 0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */ + 0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */ + 0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */ + 0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */ + 0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */ + 0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */ + 0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */ + 0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */ + 0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */ + 0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* 
sqrt(1/1e8)..sqrt(1/1ef) */ + 0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */ + 0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00 /* sqrt(1/1f8)..sqrt(1/1ff) */ +}; + +/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2. */ + +#if GMP_NUMB_BITS > 32 +#define MAGIC CNST_LIMB(0x10000000000) /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */ +#else +#define MAGIC CNST_LIMB(0x100000) /* 0xfee6f < MAGIC < 0x29cbc8 */ +#endif + +static mp_limb_t +mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0) +{ +#if GMP_NUMB_BITS > 32 + mp_limb_t a1; +#endif + mp_limb_t x0, t2, t, x2; + unsigned abits; + + ASSERT_ALWAYS (GMP_NAIL_BITS == 0); + ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64); + ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2); + + /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a), + since we can do the former without division. As part of the last + iteration convert from 1/sqrt(a) to sqrt(a). */ + + abits = a0 >> (GMP_LIMB_BITS - 1 - 8); /* extract bits for table lookup */ + x0 = 0x100 | invsqrttab[abits - 0x80]; /* initial 1/sqrt(a) */ + + /* x0 is now an 8 bits approximation of 1/sqrt(a0) */ + +#if GMP_NUMB_BITS > 32 + a1 = a0 >> (GMP_LIMB_BITS - 1 - 32); + t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16; + x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2)); + + /* x0 is now a 16 bits approximation of 1/sqrt(a0) */ + + t2 = x0 * (a0 >> (32-8)); + t = t2 >> 25; + t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8)); + x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15); + x0 >>= 32; +#else + t2 = x0 * (a0 >> (16-8)); + t = t2 >> 13; + t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8)); + x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7); + x0 >>= 16; +#endif + + /* x0 is now a full limb approximation of sqrt(a0) */ + + x2 = x0 * x0; + if (x2 + 2*x0 <= a0 - 1) + { + x2 += 2*x0 + 1; + x0++; + } + + *rp = a0 - x2; + return x0; +} + + +#define Prec (GMP_NUMB_BITS >> 1) +#if ! defined(SQRTREM2_INPLACE) +#define SQRTREM2_INPLACE 0 +#endif + +/* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized + return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */ +#if SQRTREM2_INPLACE +#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp) +static mp_limb_t +mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp) +{ + mp_srcptr np = rp; +#else +#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp, rp) +static mp_limb_t +mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np) +{ +#endif + mp_limb_t q, u, np0, sp0, rp0, q2; + int cc; + + ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2); + + np0 = np[0]; + sp0 = mpn_sqrtrem1 (rp, np[1]); + rp0 = rp[0]; + /* rp0 <= 2*sp0 < 2^(Prec + 1) */ + rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1)); + q = rp0 / sp0; + /* q <= 2^Prec, if q = 2^Prec, reduce the overestimate. */ + q -= q >> Prec; + /* now we have q < 2^Prec */ + u = rp0 - q * sp0; + /* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */ + sp0 = (sp0 << Prec) | q; + cc = u >> (Prec - 1); + rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1)); + /* subtract q * q from rp */ + q2 = q * q; + cc -= rp0 < q2; + rp0 -= q2; + if (cc < 0) + { + rp0 += sp0; + cc += rp0 < sp0; + --sp0; + rp0 += sp0; + cc += rp0 < sp0; + } + + rp[0] = rp0; + sp[0] = sp0; + return cc; +} + +/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n}, + and in {np, n} the low n limbs of the remainder, returns the high + limb of the remainder (which is 0 or 1). + Assumes {np, 2n} is normalized, i.e.
np[2n-1] >= B/4 + where B=2^GMP_NUMB_BITS. + Needs a scratch of n/2+1 limbs. */ +static mp_limb_t +mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch) +{ + mp_limb_t q; /* carry out of {sp, n} */ + int c, b; /* carry out of remainder */ + mp_size_t l, h; + + ASSERT (n > 1); + ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2); + + l = n / 2; + h = n - l; + if (h == 1) + q = CALL_SQRTREM2_INPLACE (sp + l, np + 2 * l); + else + q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch); + if (q != 0) + ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h)); + TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1))); + mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h); + q += scratch[l]; + c = scratch[0] & 1; + mpn_rshift (sp, scratch, l, 1); + sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK; + if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */ + return 1; /* Remainder is non-zero */ + q >>= 1; + if (c != 0) + c = mpn_add_n (np + l, np + l, sp + l, h); + TRACE(printf("sqr(,,%u)\n", (unsigned) l)); + mpn_sqr (np + n, sp, l); + b = q + mpn_sub_n (np, np, np + n, 2 * l); + c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b); + + if (c < 0) + { + q = mpn_add_1 (sp + l, sp + l, h, q); +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n + c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q; +#else + c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q; +#endif + c -= mpn_sub_1 (np, np, n, CNST_LIMB(1)); + q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1)); + } + + return c; +} + +#if USE_DIVAPPR_Q +static void +mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch) +{ + gmp_pi1_t inv; + mp_limb_t qh; + ASSERT (dn > 2); + ASSERT (nn >= dn); + ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0); + + MPN_COPY (scratch, np, nn); + invert_pi1 (inv, dp[dn-1], dp[dn-2]); + if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD)) + qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32); + else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD)) + qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv); + else + { + mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0); + TMP_DECL; + TMP_MARK; + /* Sadly, scratch is too small. */ + qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch)); + TMP_FREE; + } + qp [nn - dn] = qh; +} +#endif + +/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd}, + returns zero if the operand was a perfect square, one otherwise. + Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4 + where B=2^GMP_NUMB_BITS. + THINK: In the odd case, three more (dummy) limbs are taken into account, + when nsh is maximal, two limbs are discarded from the result of the + division. Too much? Is a single dummy limb enough? 
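+   (For reference: nsh here is half of the even bit count by which the
+   caller normalises the operand, and odd = nn & 1 from mpn_sqrtrem;
+   the single mpn_rshift at the end of this function undoes both.)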
*/ +static int +mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd) +{ + mp_limb_t q; /* carry out of {sp, n} */ + int c; /* carry out of remainder */ + mp_size_t l, h; + mp_ptr qp, tp, scratch; + TMP_DECL; + TMP_MARK; + + ASSERT (np[2 * n - 1 - odd] != 0); + ASSERT (n > 4); + ASSERT (nsh < GMP_NUMB_BITS / 2); + + l = (n - 1) / 2; + h = n - l; + ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd); + scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */ + tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */ + if (nsh != 0) + { + /* o is used to exactly set the lowest bits of the dividend, is it needed? */ + int o = l > (1 + odd); + ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh)); + } + else + MPN_COPY (tp, np + l - 1 - odd, n + h + 1); + q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch); + if (q != 0) + ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h)); + qp = tp + n + 1; /* l + 2 */ + TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1))); +#if USE_DIVAPPR_Q + mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch); +#else + mpn_div_q (qp, tp, n + 1, sp + l, h, scratch); +#endif + q += qp [l + 1]; + c = 1; + if (q > 1) + { + /* FIXME: if s!=0 we will shift later, a noop on this area. */ + MPN_FILL (sp, l, GMP_NUMB_MAX); + } + else + { + /* FIXME: if s!=0 we will shift again later, shift just once. */ + mpn_rshift (sp, qp + 1, l, 1); + sp[l - 1] |= q << (GMP_NUMB_BITS - 1); + if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */ + (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd)- nsh - 1)))) == 0) + { + mp_limb_t cy; + /* Approximation is not good enough, the extra limb(+ nsh bits) + is smaller than needed to absorb the possible error. */ + /* {qp + 1, l + 1} equals 2*{sp, l} */ + /* FIXME: use mullo or wrap-around, or directly evaluate + remainder with a single sqrmod_bnm1. */ + TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1))); + ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1)); + /* Compute the remainder of the previous mpn_div(appr)_q. */ + cy = mpn_sub_n (tp + 1, tp + 1, scratch, h); +#if USE_DIVAPPR_Q || WANT_ASSERT + MPN_DECR_U (tp + 1 + h, l, cy); +#if USE_DIVAPPR_Q + ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0); + if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0) + { + /* May happen only if div result was not exact. */ +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h); +#else + cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2)); +#endif + ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy)); + MPN_DECR_U (sp, l, 1); + } + /* Can the root be exact when a correction was needed? We + did not find an example, but it depends on divappr + internals, and we can not assume it true in general...*/ + /* else */ +#else /* WANT_ASSERT */ + ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0); +#endif +#endif + if (mpn_zero_p (tp + l + 1, h - l)) + { + TRACE(printf("sqr(,,%u)\n", (unsigned) l)); + mpn_sqr (scratch, sp, l); + c = mpn_cmp (tp + 1, scratch + l, l); + if (c == 0) + { + if (nsh != 0) + { + mpn_lshift (tp, np, l, 2 * nsh); + np = tp; + } + c = mpn_cmp (np, scratch + odd, l - odd); + } + if (c < 0) + { + MPN_DECR_U (sp, l, 1); + c = 1; + } + } + } + } + TMP_FREE; + + if ((odd | nsh) != 0) + mpn_rshift (sp, sp, n, nsh + (odd ? 
GMP_NUMB_BITS / 2 : 0)); + return c; +} + + +mp_size_t +mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn) +{ + mp_limb_t cc, high, rl; + int c; + mp_size_t rn, tn; + TMP_DECL; + + ASSERT (nn > 0); + ASSERT_MPN (np, nn); + + ASSERT (np[nn - 1] != 0); + ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn)); + ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn)); + ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn)); + + high = np[nn - 1]; + if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2))) + c = 0; + else + { + count_leading_zeros (c, high); + c -= GMP_NAIL_BITS; + + c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */ + } + if (nn == 1) { + if (c == 0) + { + sp[0] = mpn_sqrtrem1 (&rl, high); + if (rp != NULL) + rp[0] = rl; + } + else + { + cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c; + sp[0] = cc; + if (rp != NULL) + rp[0] = rl = high - cc*cc; + } + return rl != 0; + } + if (nn == 2) { + mp_limb_t tp [2]; + if (rp == NULL) rp = tp; + if (c == 0) + { +#if SQRTREM2_INPLACE + rp[1] = high; + rp[0] = np[0]; + cc = CALL_SQRTREM2_INPLACE (sp, rp); +#else + cc = mpn_sqrtrem2 (sp, rp, np); +#endif + rp[1] = cc; + return ((rp[0] | cc) != 0) + cc; + } + else + { + rl = np[0]; + rp[1] = (high << (2*c)) | (rl >> (GMP_NUMB_BITS - 2*c)); + rp[0] = rl << (2*c); + CALL_SQRTREM2_INPLACE (sp, rp); + cc = sp[0] >>= c; /* c != 0, the highest bit of the root cc is 0. */ + rp[0] = rl -= cc*cc; /* Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */ + return rl != 0; + } + } + tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */ + + if ((rp == NULL) && (nn > 8)) + return mpn_dc_sqrt (sp, np, tn, c, nn & 1); + TMP_MARK; + if (((nn & 1) | c) != 0) + { + mp_limb_t s0[1], mask; + mp_ptr tp, scratch; + TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1); + tp[0] = 0; /* needed only when 2*tn > nn, but saves a test */ + if (c != 0) + mpn_lshift (tp + (nn & 1), np, nn, 2 * c); + else + MPN_COPY (tp + (nn & 1), np, nn); + c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0; /* c now represents k */ + mask = (CNST_LIMB (1) << c) - 1; + rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch); + /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2, + thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */ + s0[0] = sp[0] & mask; /* S mod 2^k */ + rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]); /* R = R + 2*s0*S */ + cc = mpn_submul_1 (tp, s0, 1, s0[0]); + rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc; + mpn_rshift (sp, sp, tn, c); + tp[tn] = rl; + if (rp == NULL) + rp = tp; + c = c << 1; + if (c < GMP_NUMB_BITS) + tn++; + else + { + tp++; + c -= GMP_NUMB_BITS; + } + if (c != 0) + mpn_rshift (rp, tp, tn, c); + else + MPN_COPY_INCR (rp, tp, tn); + rn = tn; + } + else + { + if (rp != np) + { + if (rp == NULL) /* nn <= 8 */ + rp = TMP_SALLOC_LIMBS (nn); + MPN_COPY (rp, np, nn); + } + rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1))); + } + + MPN_NORMALIZE (rp, rn); + + TMP_FREE; + return rn; +} diff --git a/gmp-6.3.0/mpn/generic/strongfibo.c b/gmp-6.3.0/mpn/generic/strongfibo.c new file mode 100644 index 0000000..7e8d612 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/strongfibo.c @@ -0,0 +1,219 @@ +/* mpn_fib2m -- calculate Fibonacci numbers, modulo m. + +Contributed to the GNU project by Marco Bodrato. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. 
+ +Copyright 2001, 2002, 2005, 2009, 2018, 2022 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include <stdio.h> +#include "gmp-impl.h" + + +#if ! HAVE_NATIVE_mpn_rsblsh1_n && ! HAVE_NATIVE_mpn_sublsh1_n +/* Stores |{ap,n}-{bp,n}| in {rp,n}, + returns the sign of {ap,n}-{bp,n}. */ +static int +abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n) +{ + mp_limb_t x, y; + while (--n >= 0) + { + x = ap[n]; + y = bp[n]; + if (x != y) + { + ++n; + if (x > y) + { + ASSERT_NOCARRY (mpn_sub_n (rp, ap, bp, n)); + return 1; + } + else + { + ASSERT_NOCARRY (mpn_sub_n (rp, bp, ap, n)); + return -1; + } + } + rp[n] = 0; + } + return 0; +} +#endif + +/* Computes at most count terms of the sequence needed by the + Lucas-Lehmer-Riesel test, indexing backward: + L_i = L_{i+1}^2 - 2 + + The sequence is computed modulo M = {mp, mn}. + The starting point is given in L_{count+1} = {lp, mn}. + The scratch area pointed to by sp needs space for at least 3 * mn + 1 limbs. + + Returns the index i>0 if L_i = 0 (mod M) is found within the + computed count terms of the sequence. Otherwise it returns zero. + + Note: (+/-2)^2-2=2, (+/-1)^2-2=-1, 0^2-2=-2 + */ + +static mp_bitcnt_t +mpn_llriter (mp_ptr lp, mp_srcptr mp, mp_size_t mn, mp_bitcnt_t count, mp_ptr sp) +{ + do + { + mpn_sqr (sp, lp, mn); + mpn_tdiv_qr (sp + 2 * mn, lp, 0, sp, 2 * mn, mp, mn); + if (lp[0] < 5) + { + /* If L^2 % M < 5, |L^2 % M - 2| <= 2 */ + if (mn == 1 || mpn_zero_p (lp + 1, mn - 1)) + return (lp[0] == 2) ? count : 0; + else + MPN_DECR_U (lp, mn, 2); + } + else + lp[0] -= 2; + } while (--count != 0); + return 0; +} + +/* Store the Lucas number L[n] at lp (maybe), computed modulo m. lp + and scratch should have room for mn*2+1 limbs. + + Returns the size of L[n] normally. + + If F[n] is zero modulo m, or L[n] is, returns 0 and lp is + undefined. +*/ + +static mp_size_t +mpn_lucm (mp_ptr lp, mp_srcptr np, mp_size_t nn, mp_srcptr mp, mp_size_t mn, mp_ptr scratch) +{ + int neg; + mp_limb_t cy; + + ASSERT (! MPN_OVERLAP_P (lp, MAX(2*mn+1,5), scratch, MAX(2*mn+1,5))); + ASSERT (nn > 0); + + neg = mpn_fib2m (lp, scratch, np, nn, mp, mn); + + /* F[n] = +/-{lp, mn}, F[n-1] = +/-{scratch, mn} */ + if (mpn_zero_p (lp, mn)) + return 0; + + if (neg) /* One sign is opposite, use sub instead of add.
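+	     (Uses the identity L[n] = F[n] + 2*F[n-1]; e.g. n = 5 gives
+	     F[5] = 5, F[4] = 3, L[5] = 11 = 5 + 2*3.  Here one of the
+	     two stored values represents a negated quantity, so the
+	     doubled term is subtracted rather than added.)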
*/ + { +#if HAVE_NATIVE_mpn_rsblsh1_n || HAVE_NATIVE_mpn_sublsh1_n +#if HAVE_NATIVE_mpn_rsblsh1_n + cy = mpn_rsblsh1_n (lp, lp, scratch, mn); /* L[n] = +/-(2F[n-1]-(-F[n])) */ +#else + cy = mpn_sublsh1_n (lp, lp, scratch, mn); /* L[n] = -/+(F[n]-(-2F[n-1])) */ + if (cy != 0) + cy = mpn_add_n (lp, lp, mp, mn) - cy; +#endif + if (cy > 1) + cy += mpn_add_n (lp, lp, mp, mn); +#else + cy = mpn_lshift (scratch, scratch, mn, 1); /* 2F[n-1] */ + if (UNLIKELY (cy)) + cy -= mpn_sub_n (lp, scratch, lp, mn); /* L[n] = +/-(2F[n-1]-(-F[n])) */ + else + abs_sub_n (lp, lp, scratch, mn); +#endif + ASSERT (cy <= 1); + } + else + { +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (lp, lp, scratch, mn); /* L[n] = +/-(2F[n-1]+F[n])) */ +#else + cy = mpn_lshift (scratch, scratch, mn, 1); + cy+= mpn_add_n (lp, lp, scratch, mn); +#endif + ASSERT (cy <= 2); + } + while (cy || mpn_cmp (lp, mp, mn) >= 0) + cy -= mpn_sub_n (lp, lp, mp, mn); + MPN_NORMALIZE (lp, mn); + return mn; +} + +int +mpn_strongfibo (mp_srcptr mp, mp_size_t mn, mp_ptr scratch) +{ + mp_ptr lp, sp; + mp_size_t en; + mp_bitcnt_t b0; + TMP_DECL; + +#if GMP_NUMB_BITS % 4 == 0 + b0 = mpn_scan0 (mp, 0); +#else + { + mpz_t m = MPZ_ROINIT_N(mp, mn); + b0 = mpz_scan0 (m, 0); + } + if (UNLIKELY (b0 == mn * GMP_NUMB_BITS)) + { + en = 1; + scratch [0] = 1; + } + else +#endif + { + int cnt = b0 % GMP_NUMB_BITS; + en = b0 / GMP_NUMB_BITS; + if (LIKELY (cnt != 0)) + mpn_rshift (scratch, mp + en, mn - en, cnt); + else + MPN_COPY (scratch, mp + en, mn - en); + en = mn - en; + scratch [0] |= 1; + en -= scratch [en - 1] == 0; + } + TMP_MARK; + + lp = TMP_ALLOC_LIMBS (4 * mn + 6); + sp = lp + 2 * mn + 3; + en = mpn_lucm (sp, scratch, en, mp, mn, lp); + if (en != 0 && LIKELY (--b0 != 0)) + { + mpn_sqr (lp, sp, en); + lp [0] |= 2; /* V^2 + 2 */ + if (LIKELY (2 * en >= mn)) + mpn_tdiv_qr (sp, lp, 0, lp, 2 * en, mp, mn); + else + MPN_ZERO (lp + 2 * en, mn - 2 * en); + if (! mpn_zero_p (lp, mn) && LIKELY (--b0 != 0)) + b0 = mpn_llriter (lp, mp, mn, b0, lp + mn + 1); + } + TMP_FREE; + return (b0 != 0); +} diff --git a/gmp-6.3.0/mpn/generic/sub.c b/gmp-6.3.0/mpn/generic/sub.c new file mode 100644 index 0000000..df0afd6 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sub.c @@ -0,0 +1,33 @@ +/* mpn_sub - subtract mpn from mpn. + +Copyright 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define __GMP_FORCE_mpn_sub 1 + +#include "gmp-impl.h" diff --git a/gmp-6.3.0/mpn/generic/sub_1.c b/gmp-6.3.0/mpn/generic/sub_1.c new file mode 100644 index 0000000..a20f191 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sub_1.c @@ -0,0 +1,33 @@ +/* mpn_sub_1 - subtract limb from mpn. + +Copyright 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define __GMP_FORCE_mpn_sub_1 1 + +#include "gmp-impl.h" diff --git a/gmp-6.3.0/mpn/generic/sub_err1_n.c b/gmp-6.3.0/mpn/generic/sub_err1_n.c new file mode 100644 index 0000000..beca57e --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sub_err1_n.c @@ -0,0 +1,100 @@ +/* mpn_sub_err1_n -- sub_n with one error term + + Contributed by David Harvey. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* + Computes: + + (1) {rp,n} := {up,n} - {vp,n} (just like mpn_sub_n) with incoming borrow cy, + return value is borrow out. + + (2) Let c[i+1] = borrow from i-th limb subtraction (c[0] = cy). + Computes c[1]*yp[n-1] + ... + c[n]*yp[0], stores two-limb result at ep. + + Requires n >= 1. + + None of the outputs may overlap each other or any of the inputs, except + that {rp,n} may be equal to {up,n} or {vp,n}. 
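+
+  For instance, with n = 2 and cy = 0, subtracting {vp,2} = {9, 7} from
+  {up,2} = {5, 7} (small limb values, least significant limb first)
+  borrows at limb 0 (c[1] = 1) and then again at limb 1 (c[2] = 1), so
+  {ep,2} receives c[1]*yp[1] + c[2]*yp[0] = yp[1] + yp[0].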
+*/ +mp_limb_t +mpn_sub_err1_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, + mp_ptr ep, mp_srcptr yp, + mp_size_t n, mp_limb_t cy) +{ + mp_limb_t el, eh, ul, vl, yl, zl, rl, sl, cy1, cy2; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp, n)); + ASSERT (! MPN_OVERLAP_P (ep, 2, up, n)); + ASSERT (! MPN_OVERLAP_P (ep, 2, vp, n)); + ASSERT (! MPN_OVERLAP_P (ep, 2, yp, n)); + ASSERT (! MPN_OVERLAP_P (ep, 2, rp, n)); + + yp += n - 1; + el = eh = 0; + + do + { + yl = *yp--; + ul = *up++; + vl = *vp++; + + /* ordinary sub_n */ + SUBC_LIMB (cy1, sl, ul, vl); + SUBC_LIMB (cy2, rl, sl, cy); + cy = cy1 | cy2; + *rp++ = rl; + + /* update (eh:el) */ + zl = (-cy) & yl; + el += zl; + eh += el < zl; + } + while (--n); + +#if GMP_NAIL_BITS != 0 + eh = (eh << GMP_NAIL_BITS) + (el >> GMP_NUMB_BITS); + el &= GMP_NUMB_MASK; +#endif + + ep[0] = el; + ep[1] = eh; + + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/sub_err2_n.c b/gmp-6.3.0/mpn/generic/sub_err2_n.c new file mode 100644 index 0000000..1edf8d6 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sub_err2_n.c @@ -0,0 +1,116 @@ +/* mpn_sub_err2_n -- sub_n with two error terms + + Contributed by David Harvey. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* + Computes: + + (1) {rp,n} := {up,n} - {vp,n} (just like mpn_sub_n) with incoming borrow cy, + return value is borrow out. + + (2) Let c[i+1] = borrow from i-th limb subtraction (c[0] = cy). + Computes c[1]*yp1[n-1] + ... + c[n]*yp1[0], + c[1]*yp2[n-1] + ... + c[n]*yp2[0], + stores two-limb results at {ep,2} and {ep+2,2} respectively. + + Requires n >= 1. + + None of the outputs may overlap each other or any of the inputs, except + that {rp,n} may be equal to {up,n} or {vp,n}. +*/ +mp_limb_t +mpn_sub_err2_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, + mp_ptr ep, mp_srcptr yp1, mp_srcptr yp2, + mp_size_t n, mp_limb_t cy) +{ + mp_limb_t el1, eh1, el2, eh2, ul, vl, yl1, yl2, zl1, zl2, rl, sl, cy1, cy2; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp1, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp2, n)); + ASSERT (! MPN_OVERLAP_P (ep, 4, up, n)); + ASSERT (! MPN_OVERLAP_P (ep, 4, vp, n)); + ASSERT (! 
MPN_OVERLAP_P (ep, 4, yp1, n)); + ASSERT (! MPN_OVERLAP_P (ep, 4, yp2, n)); + ASSERT (! MPN_OVERLAP_P (ep, 4, rp, n)); + + yp1 += n - 1; + yp2 += n - 1; + el1 = eh1 = 0; + el2 = eh2 = 0; + + do + { + yl1 = *yp1--; + yl2 = *yp2--; + ul = *up++; + vl = *vp++; + + /* ordinary sub_n */ + SUBC_LIMB (cy1, sl, ul, vl); + SUBC_LIMB (cy2, rl, sl, cy); + cy = cy1 | cy2; + *rp++ = rl; + + /* update (eh1:el1) */ + zl1 = (-cy) & yl1; + el1 += zl1; + eh1 += el1 < zl1; + + /* update (eh2:el2) */ + zl2 = (-cy) & yl2; + el2 += zl2; + eh2 += el2 < zl2; + } + while (--n); + +#if GMP_NAIL_BITS != 0 + eh1 = (eh1 << GMP_NAIL_BITS) + (el1 >> GMP_NUMB_BITS); + el1 &= GMP_NUMB_MASK; + eh2 = (eh2 << GMP_NAIL_BITS) + (el2 >> GMP_NUMB_BITS); + el2 &= GMP_NUMB_MASK; +#endif + + ep[0] = el1; + ep[1] = eh1; + ep[2] = el2; + ep[3] = eh2; + + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/sub_err3_n.c b/gmp-6.3.0/mpn/generic/sub_err3_n.c new file mode 100644 index 0000000..2db3c63 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sub_err3_n.c @@ -0,0 +1,131 @@ +/* mpn_sub_err3_n -- sub_n with three error terms + + Contributed by David Harvey. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* + Computes: + + (1) {rp,n} := {up,n} - {vp,n} (just like mpn_sub_n) with incoming borrow cy, + return value is borrow out. + + (2) Let c[i+1] = borrow from i-th limb subtraction (c[0] = cy). + Computes c[1]*yp1[n-1] + ... + c[n]*yp1[0], + c[1]*yp2[n-1] + ... + c[n]*yp2[0], + c[1]*yp3[n-1] + ... + c[n]*yp3[0], + stores two-limb results at {ep,2}, {ep+2,2} and {ep+4,2} respectively. + + Requires n >= 1. + + None of the outputs may overlap each other or any of the inputs, except + that {rp,n} may be equal to {up,n} or {vp,n}. +*/ +mp_limb_t +mpn_sub_err3_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, + mp_ptr ep, mp_srcptr yp1, mp_srcptr yp2, mp_srcptr yp3, + mp_size_t n, mp_limb_t cy) +{ + mp_limb_t el1, eh1, el2, eh2, el3, eh3, ul, vl, yl1, yl2, yl3, zl1, zl2, zl3, rl, sl, cy1, cy2; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp1, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp2, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp3, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, up, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, vp, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, yp1, n)); + ASSERT (! 
MPN_OVERLAP_P (ep, 6, yp2, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, yp3, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, rp, n)); + + yp1 += n - 1; + yp2 += n - 1; + yp3 += n - 1; + el1 = eh1 = 0; + el2 = eh2 = 0; + el3 = eh3 = 0; + + do + { + yl1 = *yp1--; + yl2 = *yp2--; + yl3 = *yp3--; + ul = *up++; + vl = *vp++; + + /* ordinary sub_n */ + SUBC_LIMB (cy1, sl, ul, vl); + SUBC_LIMB (cy2, rl, sl, cy); + cy = cy1 | cy2; + *rp++ = rl; + + /* update (eh1:el1) */ + zl1 = (-cy) & yl1; + el1 += zl1; + eh1 += el1 < zl1; + + /* update (eh2:el2) */ + zl2 = (-cy) & yl2; + el2 += zl2; + eh2 += el2 < zl2; + + /* update (eh3:el3) */ + zl3 = (-cy) & yl3; + el3 += zl3; + eh3 += el3 < zl3; + } + while (--n); + +#if GMP_NAIL_BITS != 0 + eh1 = (eh1 << GMP_NAIL_BITS) + (el1 >> GMP_NUMB_BITS); + el1 &= GMP_NUMB_MASK; + eh2 = (eh2 << GMP_NAIL_BITS) + (el2 >> GMP_NUMB_BITS); + el2 &= GMP_NUMB_MASK; + eh3 = (eh3 << GMP_NAIL_BITS) + (el3 >> GMP_NUMB_BITS); + el3 &= GMP_NUMB_MASK; +#endif + + ep[0] = el1; + ep[1] = eh1; + ep[2] = el2; + ep[3] = eh2; + ep[4] = el3; + ep[5] = eh3; + + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/sub_n.c b/gmp-6.3.0/mpn/generic/sub_n.c new file mode 100644 index 0000000..b192c96 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sub_n.c @@ -0,0 +1,89 @@ +/* mpn_sub_n -- Subtract equal length limb vectors. + +Copyright 1992-1994, 1996, 2000, 2002, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" + + +#if GMP_NAIL_BITS == 0 + +mp_limb_t +mpn_sub_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + mp_limb_t ul, vl, sl, rl, cy, cy1, cy2; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_INCR_P (rp, up, n)); + ASSERT (MPN_SAME_OR_INCR_P (rp, vp, n)); + + cy = 0; + do + { + ul = *up++; + vl = *vp++; + sl = ul - vl; + cy1 = sl > ul; + rl = sl - cy; + cy2 = rl > sl; + cy = cy1 | cy2; + *rp++ = rl; + } + while (--n != 0); + + return cy; +} + +#endif + +#if GMP_NAIL_BITS >= 1 + +mp_limb_t +mpn_sub_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + mp_limb_t ul, vl, rl, cy; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_INCR_P (rp, up, n)); + ASSERT (MPN_SAME_OR_INCR_P (rp, vp, n)); + + cy = 0; + do + { + ul = *up++; + vl = *vp++; + rl = ul - vl - cy; + cy = rl >> (GMP_LIMB_BITS - 1); + *rp++ = rl & GMP_NUMB_MASK; + } + while (--n != 0); + + return cy; +} + +#endif diff --git a/gmp-6.3.0/mpn/generic/submul_1.c b/gmp-6.3.0/mpn/generic/submul_1.c new file mode 100644 index 0000000..4744274 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/submul_1.c @@ -0,0 +1,144 @@ +/* mpn_submul_1 -- multiply the N long limb vector pointed to by UP by VL, + subtract the N least significant limbs of the product from the limb + vector pointed to by RP. Return the most significant limb of the + product, adjusted for carry-out from the subtraction. + +Copyright 1992-1994, 1996, 2000, 2002, 2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +#if GMP_NAIL_BITS == 0 + +mp_limb_t +mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0) +{ + mp_limb_t u0, crec, c, p1, p0, r0; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + + crec = 0; + do + { + u0 = *up++; + umul_ppmm (p1, p0, u0, v0); + + r0 = *rp; + + p0 = r0 - p0; + c = r0 < p0; + + p1 = p1 + c; + + r0 = p0 - crec; /* cycle 0, 3, ... */ + c = p0 < r0; /* cycle 1, 4, ... */ + + crec = p1 + c; /* cycle 2, 5, ... 
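+				   (the cycle numbers annotate the three
+				   dependent operations that form the
+				   recurrence through crec: the loop's
+				   critical path is 3 cycles per limb)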
*/ + + *rp++ = r0; + } + while (--n != 0); + + return crec; +} + +#endif + +#if GMP_NAIL_BITS == 1 + +mp_limb_t +mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0) +{ + mp_limb_t shifted_v0, u0, r0, p0, p1, prev_p1, cl, xl, c1, c2, c3; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT_MPN (rp, n); + ASSERT_MPN (up, n); + ASSERT_LIMB (v0); + + shifted_v0 = v0 << GMP_NAIL_BITS; + cl = 0; + prev_p1 = 0; + do + { + u0 = *up++; + r0 = *rp; + umul_ppmm (p1, p0, u0, shifted_v0); + p0 >>= GMP_NAIL_BITS; + SUBC_LIMB (c1, xl, r0, prev_p1); + SUBC_LIMB (c2, xl, xl, p0); + SUBC_LIMB (c3, xl, xl, cl); + cl = c1 + c2 + c3; + *rp++ = xl; + prev_p1 = p1; + } + while (--n != 0); + + return prev_p1 + cl; +} + +#endif + +#if GMP_NAIL_BITS >= 2 + +mp_limb_t +mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0) +{ + mp_limb_t shifted_v0, u0, r0, p0, p1, prev_p1, xw, cl, xl; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT_MPN (rp, n); + ASSERT_MPN (up, n); + ASSERT_LIMB (v0); + + shifted_v0 = v0 << GMP_NAIL_BITS; + cl = 0; + prev_p1 = 0; + do + { + u0 = *up++; + r0 = *rp; + umul_ppmm (p1, p0, u0, shifted_v0); + p0 >>= GMP_NAIL_BITS; + xw = r0 - (prev_p1 + p0) + cl; + cl = (mp_limb_signed_t) xw >> GMP_NUMB_BITS; /* FIXME: non-portable */ + xl = xw & GMP_NUMB_MASK; + *rp++ = xl; + prev_p1 = p1; + } + while (--n != 0); + + return prev_p1 - cl; +} + +#endif diff --git a/gmp-6.3.0/mpn/generic/tdiv_qr.c b/gmp-6.3.0/mpn/generic/tdiv_qr.c new file mode 100644 index 0000000..92ff33c --- /dev/null +++ b/gmp-6.3.0/mpn/generic/tdiv_qr.c @@ -0,0 +1,386 @@ +/* mpn_tdiv_qr -- Divide the numerator (np,nn) by the denominator (dp,dn) and + write the nn-dn+1 quotient limbs at qp and the dn remainder limbs at rp. If + qxn is non-zero, generate that many fraction limbs and append them after the + other quotient limbs, and update the remainder accordingly. The input + operands are unaffected. + + Preconditions: + 1. The most significant limb of the divisor must be non-zero. + 2. nn >= dn, even if qxn is non-zero. (??? relax this ???) + + The time complexity of this is O(qn*qn+M(dn,qn)), where M(m,n) is the time + complexity of multiplication. + +Copyright 1997, 2000-2002, 2005, 2009, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +void +mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn, + mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn) +{ + ASSERT_ALWAYS (qxn == 0); + + ASSERT (nn >= 0); + ASSERT (dn >= 0); + ASSERT (dn == 0 || dp[dn - 1] != 0); + ASSERT (! 
MPN_OVERLAP_P (qp, nn - dn + 1 + qxn, np, nn)); + ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1 + qxn, dp, dn)); + + switch (dn) + { + case 0: + DIVIDE_BY_ZERO; + + case 1: + { + rp[0] = mpn_divrem_1 (qp, (mp_size_t) 0, np, nn, dp[0]); + return; + } + + case 2: + { + mp_ptr n2p; + mp_limb_t qhl, cy; + TMP_DECL; + TMP_MARK; + if ((dp[1] & GMP_NUMB_HIGHBIT) == 0) + { + int cnt; + mp_limb_t d2p[2]; + count_leading_zeros (cnt, dp[1]); + cnt -= GMP_NAIL_BITS; + d2p[1] = (dp[1] << cnt) | (dp[0] >> (GMP_NUMB_BITS - cnt)); + d2p[0] = (dp[0] << cnt) & GMP_NUMB_MASK; + n2p = TMP_ALLOC_LIMBS (nn + 1); + cy = mpn_lshift (n2p, np, nn, cnt); + n2p[nn] = cy; + qhl = mpn_divrem_2 (qp, 0L, n2p, nn + (cy != 0), d2p); + if (cy == 0) + qp[nn - 2] = qhl; /* always store nn-2+1 quotient limbs */ + rp[0] = (n2p[0] >> cnt) + | ((n2p[1] << (GMP_NUMB_BITS - cnt)) & GMP_NUMB_MASK); + rp[1] = (n2p[1] >> cnt); + } + else + { + n2p = TMP_ALLOC_LIMBS (nn); + MPN_COPY (n2p, np, nn); + qhl = mpn_divrem_2 (qp, 0L, n2p, nn, dp); + qp[nn - 2] = qhl; /* always store nn-2+1 quotient limbs */ + rp[0] = n2p[0]; + rp[1] = n2p[1]; + } + TMP_FREE; + return; + } + + default: + { + int adjust; + gmp_pi1_t dinv; + TMP_DECL; + TMP_MARK; + adjust = np[nn - 1] >= dp[dn - 1]; /* conservative tests for quotient size */ + if (nn + adjust >= 2 * dn) + { + mp_ptr n2p, d2p; + mp_limb_t cy; + int cnt; + + qp[nn - dn] = 0; /* zero high quotient limb */ + if ((dp[dn - 1] & GMP_NUMB_HIGHBIT) == 0) /* normalize divisor */ + { + count_leading_zeros (cnt, dp[dn - 1]); + cnt -= GMP_NAIL_BITS; + d2p = TMP_ALLOC_LIMBS (dn); + mpn_lshift (d2p, dp, dn, cnt); + n2p = TMP_ALLOC_LIMBS (nn + 1); + cy = mpn_lshift (n2p, np, nn, cnt); + n2p[nn] = cy; + nn += adjust; + } + else + { + cnt = 0; + d2p = (mp_ptr) dp; + n2p = TMP_ALLOC_LIMBS (nn + 1); + MPN_COPY (n2p, np, nn); + n2p[nn] = 0; + nn += adjust; + } + + invert_pi1 (dinv, d2p[dn - 1], d2p[dn - 2]); + if (BELOW_THRESHOLD (dn, DC_DIV_QR_THRESHOLD)) + mpn_sbpi1_div_qr (qp, n2p, nn, d2p, dn, dinv.inv32); + else if (BELOW_THRESHOLD (dn, MUPI_DIV_QR_THRESHOLD) || /* fast condition */ + BELOW_THRESHOLD (nn, 2 * MU_DIV_QR_THRESHOLD) || /* fast condition */ + (double) (2 * (MU_DIV_QR_THRESHOLD - MUPI_DIV_QR_THRESHOLD)) * dn /* slow... */ + + (double) MUPI_DIV_QR_THRESHOLD * nn > (double) dn * nn) /* ...condition */ + mpn_dcpi1_div_qr (qp, n2p, nn, d2p, dn, &dinv); + else + { + mp_size_t itch = mpn_mu_div_qr_itch (nn, dn, 0); + mp_ptr scratch = TMP_ALLOC_LIMBS (itch); + mpn_mu_div_qr (qp, rp, n2p, nn, d2p, dn, scratch); + n2p = rp; + } + + if (cnt != 0) + mpn_rshift (rp, n2p, dn, cnt); + else + MPN_COPY (rp, n2p, dn); + TMP_FREE; + return; + } + + /* When we come here, the numerator/partial remainder is less + than twice the size of the denominator. */ + + { + /* Problem: + + Divide a numerator N with nn limbs by a denominator D with dn + limbs forming a quotient of qn=nn-dn+1 limbs. When qn is small + compared to dn, conventional division algorithms perform poorly. + We want an algorithm that has an expected running time that is + dependent only on qn. + + Algorithm (very informally stated): + + 1) Divide the 2 x qn most significant limbs from the numerator + by the qn most significant limbs from the denominator. Call + the result qest. This is either the correct quotient, but + might be 1 or 2 too large. Compute the remainder from the + division. (This step is implemented by an mpn_divrem call.) 
+ + 2) Is the most significant limb from the remainder < p, where p + is the product of the most significant limb from the quotient + and the next(d)? (Next(d) denotes the next ignored limb from + the denominator.) If it is, decrement qest, and adjust the + remainder accordingly. + + 3) Is the remainder >= qest? If it is, qest is the desired + quotient. The algorithm terminates. + + 4) Subtract qest x next(d) from the remainder. If there is + borrow out, decrement qest, and adjust the remainder + accordingly. + + 5) Skip one word from the denominator (i.e., let next(d) denote + the next less significant limb. */ + + mp_size_t qn; + mp_ptr n2p, d2p; + mp_ptr tp; + mp_limb_t cy; + mp_size_t in, rn; + mp_limb_t quotient_too_large; + unsigned int cnt; + + qn = nn - dn; + qp[qn] = 0; /* zero high quotient limb */ + qn += adjust; /* qn cannot become bigger */ + + if (qn == 0) + { + MPN_COPY (rp, np, dn); + TMP_FREE; + return; + } + + in = dn - qn; /* (at least partially) ignored # of limbs in ops */ + /* Normalize denominator by shifting it to the left such that its + most significant bit is set. Then shift the numerator the same + amount, to mathematically preserve quotient. */ + if ((dp[dn - 1] & GMP_NUMB_HIGHBIT) == 0) + { + count_leading_zeros (cnt, dp[dn - 1]); + cnt -= GMP_NAIL_BITS; + + d2p = TMP_ALLOC_LIMBS (qn); + mpn_lshift (d2p, dp + in, qn, cnt); + d2p[0] |= dp[in - 1] >> (GMP_NUMB_BITS - cnt); + + n2p = TMP_ALLOC_LIMBS (2 * qn + 1); + cy = mpn_lshift (n2p, np + nn - 2 * qn, 2 * qn, cnt); + if (adjust) + { + n2p[2 * qn] = cy; + n2p++; + } + else + { + n2p[0] |= np[nn - 2 * qn - 1] >> (GMP_NUMB_BITS - cnt); + } + } + else + { + cnt = 0; + d2p = (mp_ptr) dp + in; + + n2p = TMP_ALLOC_LIMBS (2 * qn + 1); + MPN_COPY (n2p, np + nn - 2 * qn, 2 * qn); + if (adjust) + { + n2p[2 * qn] = 0; + n2p++; + } + } + + /* Get an approximate quotient using the extracted operands. */ + if (qn == 1) + { + mp_limb_t q0, r0; + udiv_qrnnd (q0, r0, n2p[1], n2p[0] << GMP_NAIL_BITS, d2p[0] << GMP_NAIL_BITS); + n2p[0] = r0 >> GMP_NAIL_BITS; + qp[0] = q0; + } + else if (qn == 2) + mpn_divrem_2 (qp, 0L, n2p, 4L, d2p); /* FIXME: obsolete function */ + else + { + invert_pi1 (dinv, d2p[qn - 1], d2p[qn - 2]); + if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD)) + mpn_sbpi1_div_qr (qp, n2p, 2 * qn, d2p, qn, dinv.inv32); + else if (BELOW_THRESHOLD (qn, MU_DIV_QR_THRESHOLD)) + mpn_dcpi1_div_qr (qp, n2p, 2 * qn, d2p, qn, &dinv); + else + { + mp_size_t itch = mpn_mu_div_qr_itch (2 * qn, qn, 0); + mp_ptr scratch = TMP_ALLOC_LIMBS (itch); + mp_ptr r2p = rp; + if (np == r2p) /* If N and R share space, put ... */ + r2p += nn - qn; /* intermediate remainder at N's upper end. */ + mpn_mu_div_qr (qp, r2p, n2p, 2 * qn, d2p, qn, scratch); + MPN_COPY (n2p, r2p, qn); + } + } + + rn = qn; + /* Multiply the first ignored divisor limb by the most significant + quotient limb. If that product is > the partial remainder's + most significant limb, we know the quotient is too large. This + test quickly catches most cases where the quotient is too large; + it catches all cases where the quotient is 2 too large. 
*/ + { + mp_limb_t dl, x; + mp_limb_t h, dummy; + + if (in - 2 < 0) + dl = 0; + else + dl = dp[in - 2]; + +#if GMP_NAIL_BITS == 0 + x = (dp[in - 1] << cnt) | ((dl >> 1) >> ((~cnt) % GMP_LIMB_BITS)); +#else + x = (dp[in - 1] << cnt) & GMP_NUMB_MASK; + if (cnt != 0) + x |= dl >> (GMP_NUMB_BITS - cnt); +#endif + umul_ppmm (h, dummy, x, qp[qn - 1] << GMP_NAIL_BITS); + + if (n2p[qn - 1] < h) + { + mp_limb_t cy; + + mpn_decr_u (qp, (mp_limb_t) 1); + cy = mpn_add_n (n2p, n2p, d2p, qn); + if (cy) + { + /* The partial remainder is safely large. */ + n2p[qn] = cy; + ++rn; + } + } + } + + quotient_too_large = 0; + if (cnt != 0) + { + mp_limb_t cy1, cy2; + + /* Append partially used numerator limb to partial remainder. */ + cy1 = mpn_lshift (n2p, n2p, rn, GMP_NUMB_BITS - cnt); + n2p[0] |= np[in - 1] & (GMP_NUMB_MASK >> cnt); + + /* Update partial remainder with partially used divisor limb. */ + cy2 = mpn_submul_1 (n2p, qp, qn, dp[in - 1] & (GMP_NUMB_MASK >> cnt)); + if (qn != rn) + { + ASSERT_ALWAYS (n2p[qn] >= cy2); + n2p[qn] -= cy2; + } + else + { + n2p[qn] = cy1 - cy2; /* & GMP_NUMB_MASK; */ + + quotient_too_large = (cy1 < cy2); + ++rn; + } + --in; + } + /* True: partial remainder now is neutral, i.e., it is not shifted up. */ + + tp = TMP_ALLOC_LIMBS (dn); + + if (in < qn) + { + if (in == 0) + { + MPN_COPY (rp, n2p, rn); + ASSERT_ALWAYS (rn == dn); + goto foo; + } + mpn_mul (tp, qp, qn, dp, in); + } + else + mpn_mul (tp, dp, in, qp, qn); + + cy = mpn_sub (n2p, n2p, rn, tp + in, qn); + MPN_COPY (rp + in, n2p, dn - in); + quotient_too_large |= cy; + cy = mpn_sub_n (rp, np, tp, in); + cy = mpn_sub_1 (rp + in, rp + in, rn, cy); + quotient_too_large |= cy; + foo: + if (quotient_too_large) + { + mpn_decr_u (qp, (mp_limb_t) 1); + mpn_add_n (rp, rp, dp, dn); + } + } + TMP_FREE; + return; + } + } +} diff --git a/gmp-6.3.0/mpn/generic/toom22_mul.c b/gmp-6.3.0/mpn/generic/toom22_mul.c new file mode 100644 index 0000000..da56014 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom22_mul.c @@ -0,0 +1,222 @@ +/* mpn_toom22_mul -- Multiply {ap,an} and {bp,bn} where an >= bn. Or more + accurately, bn <= an < 2bn. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006-2010, 2012, 2014, 2018, 2020 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#include "gmp-impl.h" + +/* Evaluate in: -1, 0, +inf + + <-s--><--n--> + ____ ______ + |_a1_|___a0_| + |b1_|___b0_| + <-t-><--n--> + + v0 = a0 * b0 # A(0)*B(0) + vm1 = (a0- a1)*(b0- b1) # A(-1)*B(-1) + vinf= a1 * b1 # A(inf)*B(inf) +*/ + +#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY +#define MAYBE_mul_toom22 1 +#else +#define MAYBE_mul_toom22 \ + (MUL_TOOM33_THRESHOLD >= 2 * MUL_TOOM22_THRESHOLD) +#endif + +#define TOOM22_MUL_N_REC(p, a, b, n, ws) \ + do { \ + if (! MAYBE_mul_toom22 \ + || BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) \ + mpn_mul_basecase (p, a, n, b, n); \ + else \ + mpn_toom22_mul (p, a, n, b, n, ws); \ + } while (0) + +/* Normally, this calls mul_basecase or toom22_mul. But when the fraction + MUL_TOOM33_THRESHOLD / MUL_TOOM22_THRESHOLD is large, an initially small + relative unbalance will become a larger and larger relative unbalance with + each recursion (the difference s-t will be invariant over recursive calls). + Therefore, we need to call toom32_mul. FIXME: Suppress depending on + MUL_TOOM33_THRESHOLD / MUL_TOOM22_THRESHOLD and on MUL_TOOM22_THRESHOLD. */ +#define TOOM22_MUL_REC(p, a, an, b, bn, ws) \ + do { \ + if (! MAYBE_mul_toom22 \ + || BELOW_THRESHOLD (bn, MUL_TOOM22_THRESHOLD)) \ + mpn_mul_basecase (p, a, an, b, bn); \ + else if (4 * an < 5 * bn) \ + mpn_toom22_mul (p, a, an, b, bn, ws); \ + else \ + mpn_toom32_mul (p, a, an, b, bn, ws); \ + } while (0) + +void +mpn_toom22_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, + mp_ptr scratch) +{ + const int __gmpn_cpuvec_initialized = 1; + mp_size_t n, s, t; + int vm1_neg; + mp_limb_t cy, cy2; + mp_ptr asm1; + mp_ptr bsm1; + +#define a0 ap +#define a1 (ap + n) +#define b0 bp +#define b1 (bp + n) + + s = an >> 1; + n = an - s; + t = bn - n; + + ASSERT (an >= bn); + + ASSERT (0 < s && s <= n && (n - s) == (an & 1)); + ASSERT (0 < t && t <= s); + + asm1 = pp; + bsm1 = pp + n; + + vm1_neg = 0; + + /* Compute asm1. */ + if ((an & 1) == 0) /* s == n */ + { + if (mpn_cmp (a0, a1, n) < 0) + { + mpn_sub_n (asm1, a1, a0, n); + vm1_neg = 1; + } + else + { + mpn_sub_n (asm1, a0, a1, n); + } + } + else /* n - s == 1 */ + { + if (a0[s] == 0 && mpn_cmp (a0, a1, s) < 0) + { + mpn_sub_n (asm1, a1, a0, s); + asm1[s] = 0; + vm1_neg = 1; + } + else + { + asm1[s] = a0[s] - mpn_sub_n (asm1, a0, a1, s); + } + } + + /* Compute bsm1.
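+ Analogous to asm1 above: bsm1 = |b0 - b1| (padded to n limbs when t < n), with the operands compared first so the subtraction never borrows; a sign change merely toggles vm1_neg.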
*/ + if (t == n) + { + if (mpn_cmp (b0, b1, n) < 0) + { + mpn_sub_n (bsm1, b1, b0, n); + vm1_neg ^= 1; + } + else + { + mpn_sub_n (bsm1, b0, b1, n); + } + } + else + { + if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0) + { + mpn_sub_n (bsm1, b1, b0, t); + MPN_ZERO (bsm1 + t, n - t); + vm1_neg ^= 1; + } + else + { + mpn_sub (bsm1, b0, n, b1, t); + } + } + +#define v0 pp /* 2n */ +#define vinf (pp + 2 * n) /* s+t */ +#define vm1 scratch /* 2n */ +#define scratch_out scratch + 2 * n + + /* vm1, 2n limbs */ + TOOM22_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out); + + if (s > t) TOOM22_MUL_REC (vinf, a1, s, b1, t, scratch_out); + else TOOM22_MUL_N_REC (vinf, a1, b1, s, scratch_out); + + /* v0, 2n limbs */ + TOOM22_MUL_N_REC (v0, ap, bp, n, scratch_out); + + /* H(v0) + L(vinf) */ + cy = mpn_add_n (pp + 2 * n, v0 + n, vinf, n); + + /* L(v0) + (H(v0) + L(vinf)) */ + cy2 = cy + mpn_add_n (pp + n, pp + 2 * n, v0, n); + + /* (H(v0) + L(vinf)) + H(vinf) */ + cy += mpn_add (pp + 2 * n, pp + 2 * n, n, vinf + n, s + t - n); + + if (vm1_neg) + cy += mpn_add_n (pp + n, pp + n, vm1, 2 * n); + else { + cy -= mpn_sub_n (pp + n, pp + n, vm1, 2 * n); + if (UNLIKELY (cy + 1 == 0)) { /* cy is negative */ + /* The total contribution of v0+vinf-vm1 can not be negative. */ +#if WANT_ASSERT + /* The borrow in cy stops the propagation of the carry cy2, */ + ASSERT (cy2 == 1); + cy += mpn_add_1 (pp + 2 * n, pp + 2 * n, n, cy2); + ASSERT (cy == 0); +#else + /* we simply fill the area with zeros. */ + MPN_FILL (pp + 2 * n, n, 0); + /* ASSERT (s + t == n || mpn_zero_p (pp + 3 * n, s + t - n)); */ +#endif + return; + } + } + + ASSERT (cy <= 2); + ASSERT (cy2 <= 2); + + MPN_INCR_U (pp + 2 * n, s + t, cy2); + /* if s+t==n, cy is zero, but we should not access pp[3*n] at all. */ + MPN_INCR_U (pp + 3 * n, s + t - n, cy); +} diff --git a/gmp-6.3.0/mpn/generic/toom2_sqr.c b/gmp-6.3.0/mpn/generic/toom2_sqr.c new file mode 100644 index 0000000..db7a846 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom2_sqr.c @@ -0,0 +1,155 @@ +/* mpn_toom2_sqr -- Square {ap,an}. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006-2010, 2012, 2014, 2018, 2020 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#include "gmp-impl.h" + +/* Evaluate in: -1, 0, +inf + + <-s--><--n--> + ____ ______ + |_a1_|___a0_| + + v0 = a0 ^2 # A(0)^2 + vm1 = (a0- a1)^2 # A(-1)^2 + vinf= a1 ^2 # A(inf)^2 +*/ + +#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY +#define MAYBE_sqr_toom2 1 +#else +#define MAYBE_sqr_toom2 \ + (SQR_TOOM3_THRESHOLD >= 2 * SQR_TOOM2_THRESHOLD) +#endif + +#define TOOM2_SQR_REC(p, a, n, ws) \ + do { \ + if (! MAYBE_sqr_toom2 \ + || BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD)) \ + mpn_sqr_basecase (p, a, n); \ + else \ + mpn_toom2_sqr (p, a, n, ws); \ + } while (0) + +void +mpn_toom2_sqr (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_ptr scratch) +{ + const int __gmpn_cpuvec_initialized = 1; + mp_size_t n, s; + mp_limb_t cy, cy2; + mp_ptr asm1; + +#define a0 ap +#define a1 (ap + n) + + s = an >> 1; + n = an - s; + + ASSERT (0 < s && s <= n && (n - s) == (an & 1)); + + asm1 = pp; + + /* Compute asm1. */ + if ((an & 1) == 0) /* s == n */ + { + if (mpn_cmp (a0, a1, n) < 0) + { + mpn_sub_n (asm1, a1, a0, n); + } + else + { + mpn_sub_n (asm1, a0, a1, n); + } + } + else /* n - s == 1 */ + { + if (a0[s] == 0 && mpn_cmp (a0, a1, s) < 0) + { + mpn_sub_n (asm1, a1, a0, s); + asm1[s] = 0; + } + else + { + asm1[s] = a0[s] - mpn_sub_n (asm1, a0, a1, s); + } + } + +#define v0 pp /* 2n */ +#define vinf (pp + 2 * n) /* s+s */ +#define vm1 scratch /* 2n */ +#define scratch_out scratch + 2 * n + + /* vm1, 2n limbs */ + TOOM2_SQR_REC (vm1, asm1, n, scratch_out); + + /* vinf, s+s limbs */ + TOOM2_SQR_REC (vinf, a1, s, scratch_out); + + /* v0, 2n limbs */ + TOOM2_SQR_REC (v0, ap, n, scratch_out); + + /* H(v0) + L(vinf) */ + cy = mpn_add_n (pp + 2 * n, v0 + n, vinf, n); + + /* L(v0) + H(v0) */ + cy2 = cy + mpn_add_n (pp + n, pp + 2 * n, v0, n); + + /* L(vinf) + H(vinf) */ + cy += mpn_add (pp + 2 * n, pp + 2 * n, n, vinf + n, s + s - n); + + cy -= mpn_sub_n (pp + n, pp + n, vm1, 2 * n); + + ASSERT (cy + 1 <= 3); + ASSERT (cy2 <= 2); + + if (LIKELY (cy <= 2)) { + MPN_INCR_U (pp + 2 * n, s + s, cy2); + MPN_INCR_U (pp + 3 * n, s + s - n, cy); + } else { /* cy is negative */ + /* The total contribution of v0+vinf-vm1 can not be negative. */ +#if WANT_ASSERT + /* The borrow in cy stops the propagation of the carry cy2, */ + ASSERT (cy2 == 1); + cy += mpn_add_1 (pp + 2 * n, pp + 2 * n, n, cy2); + ASSERT (cy == 0); +#else + /* we simply fill the area with zeros. */ + MPN_FILL (pp + 2 * n, n, 0); +#endif + } +} diff --git a/gmp-6.3.0/mpn/generic/toom32_mul.c b/gmp-6.3.0/mpn/generic/toom32_mul.c new file mode 100644 index 0000000..1139d17 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom32_mul.c @@ -0,0 +1,320 @@ +/* mpn_toom32_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 1.5 + times as large as bn. Or more accurately, bn < an < 3bn. + + Contributed to the GNU project by Torbjorn Granlund. + Improvements by Marco Bodrato and Niels Möller. + + The idea of applying Toom to unbalanced multiplication is due to Marco + Bodrato and Alberto Zanoni. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006-2010, 2020, 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Evaluate in: -1, 0, +1, +inf + + <-s-><--n--><--n--> + ___ ______ ______ + |a2_|___a1_|___a0_| + |_b1_|___b0_| + <-t--><--n--> + + v0 = a0 * b0 # A(0)*B(0) + v1 = (a0+ a1+ a2)*(b0+ b1) # A(1)*B(1) ah <= 2 bh <= 1 + vm1 = (a0- a1+ a2)*(b0- b1) # A(-1)*B(-1) |ah| <= 1 bh = 0 + vinf= a2 * b1 # A(inf)*B(inf) +*/ + +#define TOOM32_MUL_N_REC(p, a, b, n, ws) \ + do { \ + mpn_mul_n (p, a, b, n); \ + } while (0) + +void +mpn_toom32_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, + mp_ptr scratch) +{ + mp_size_t n, s, t; + int vm1_neg; + mp_limb_t cy; + mp_limb_signed_t hi; + mp_limb_t ap1_hi, bp1_hi; + +#define a0 ap +#define a1 (ap + n) +#define a2 (ap + 2 * n) +#define b0 bp +#define b1 (bp + n) + + /* Required, to ensure that s + t >= n. */ + ASSERT (bn + 2 <= an && an + 6 <= 3*bn); + + n = 2 * an >= 3 * bn ? (an + 2) / (size_t) 3 : (bn + 1) >> 1; + + s = an - 2 * n; + t = bn - n; + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + ASSERT (s + t >= n); + + /* Product area of size an + bn = 3*n + s + t >= 4*n + 2. */ +#define ap1 (pp) /* n, most significant limb in ap1_hi */ +#define bp1 (pp + n) /* n, most significant bit in bp1_hi */ +#define am1 (pp + 2*n) /* n, most significant bit in hi */ +#define bm1 (pp + 3*n) /* n */ +#define v1 (scratch) /* 2n + 1 */ +#define vm1 (pp) /* 2n + 1 */ +#define scratch_out (scratch + 2*n + 1) /* Currently unused. */ + + /* Scratch need: 2*n + 1 + scratch for the recursive multiplications. */ + + /* FIXME: Keep v1[2*n] and vm1[2*n] in scalar variables? */ + + /* Compute ap1 = a0 + a1 + a2, am1 = a0 - a1 + a2 */ + ap1_hi = mpn_add (ap1, a0, n, a2, s); +#if HAVE_NATIVE_mpn_add_n_sub_n + if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0) + { + ap1_hi = mpn_add_n_sub_n (ap1, am1, a1, ap1, n) >> 1; + hi = 0; + vm1_neg = 1; + } + else + { + cy = mpn_add_n_sub_n (ap1, am1, ap1, a1, n); + hi = ap1_hi - (cy & 1); + ap1_hi += (cy >> 1); + vm1_neg = 0; + } +#else + if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0) + { + ASSERT_NOCARRY (mpn_sub_n (am1, a1, ap1, n)); + hi = 0; + vm1_neg = 1; + } + else + { + hi = ap1_hi - mpn_sub_n (am1, ap1, a1, n); + vm1_neg = 0; + } + ap1_hi += mpn_add_n (ap1, ap1, a1, n); +#endif + + /* Compute bp1 = b0 + b1 and bm1 = b0 - b1. 
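+ As for ap1/am1 above, bm1 is formed as a non-negative difference (operands swapped when b0 < b1, toggling vm1_neg), and the top limb of b0 + b1 is kept in bp1_hi.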
*/ + if (t == n) + { +#if HAVE_NATIVE_mpn_add_n_sub_n + if (mpn_cmp (b0, b1, n) < 0) + { + cy = mpn_add_n_sub_n (bp1, bm1, b1, b0, n); + vm1_neg ^= 1; + } + else + { + cy = mpn_add_n_sub_n (bp1, bm1, b0, b1, n); + } + bp1_hi = cy >> 1; +#else + bp1_hi = mpn_add_n (bp1, b0, b1, n); + + if (mpn_cmp (b0, b1, n) < 0) + { + ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, n)); + vm1_neg ^= 1; + } + else + { + ASSERT_NOCARRY (mpn_sub_n (bm1, b0, b1, n)); + } +#endif + } + else + { + /* FIXME: Should still use mpn_add_n_sub_n for the main part. */ + bp1_hi = mpn_add (bp1, b0, n, b1, t); + + if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0) + { + ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, t)); + MPN_ZERO (bm1 + t, n - t); + vm1_neg ^= 1; + } + else + { + ASSERT_NOCARRY (mpn_sub (bm1, b0, n, b1, t)); + } + } + + TOOM32_MUL_N_REC (v1, ap1, bp1, n, scratch_out); + if (ap1_hi == 1) + { + cy = mpn_add_n (v1 + n, v1 + n, bp1, n); + } + else if (ap1_hi > 1) /* ap1_hi == 2 */ + { +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 + cy = mpn_addlsh1_n_ip1 (v1 + n, bp1, n); +#else + cy = mpn_addmul_1 (v1 + n, bp1, n, CNST_LIMB(2)); +#endif + } + else + cy = 0; + if (bp1_hi != 0) + cy += ap1_hi + mpn_add_n (v1 + n, v1 + n, ap1, n); + v1[2 * n] = cy; + + TOOM32_MUL_N_REC (vm1, am1, bm1, n, scratch_out); + if (hi) + hi = mpn_add_n (vm1+n, vm1+n, bm1, n); + + vm1[2*n] = hi; + + /* v1 <-- (v1 + vm1) / 2 = x0 + x2 */ + if (vm1_neg) + { +#if HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (v1, v1, vm1, 2*n+1); +#else + mpn_sub_n (v1, v1, vm1, 2*n+1); + ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1)); +#endif + } + else + { +#if HAVE_NATIVE_mpn_rsh1add_n + mpn_rsh1add_n (v1, v1, vm1, 2*n+1); +#else + mpn_add_n (v1, v1, vm1, 2*n+1); + ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1)); +#endif + } + + /* We get x1 + x3 = (x0 + x2) - (x0 - x1 + x2 - x3), and hence + + y = x1 + x3 + (x0 + x2) * B + = (x0 + x2) * B + (x0 + x2) - vm1. + + y is 3*n + 1 limbs, y = y0 + y1 B + y2 B^2. We store them as + follows: y0 at scratch, y1 at pp + 2*n, and y2 at scratch + n + (already in place, except for carry propagation). + + We thus add + + B^3 B^2 B 1 + | | | | + +-----+----+ + + | x0 + x2 | + +----+-----+----+ + + | x0 + x2 | + +----------+ + - | vm1 | + --+----++----+----+- + | y2 | y1 | y0 | + +-----+----+----+ + + Since we store y0 at the same location as the low half of x0 + x2, we + need to do the middle sum first. */ + + hi = vm1[2*n]; + cy = mpn_add_n (pp + 2*n, v1, v1 + n, n); + MPN_INCR_U (v1 + n, n + 1, cy + v1[2*n]); + + /* FIXME: Can we get rid of this second vm1_neg conditional by + swapping the location of +1 and -1 values? */ + if (vm1_neg) + { + cy = mpn_add_n (v1, v1, vm1, n); + hi += mpn_add_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy); + MPN_INCR_U (v1 + n, n+1, hi); + } + else + { + cy = mpn_sub_n (v1, v1, vm1, n); + hi += mpn_sub_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy); + MPN_DECR_U (v1 + n, n+1, hi); + } + + TOOM32_MUL_N_REC (pp, a0, b0, n, scratch_out); + /* vinf, s+t limbs. Use mpn_mul for now, to handle unbalanced operands */ + if (s > t) mpn_mul (pp+3*n, a2, s, b1, t); + else mpn_mul (pp+3*n, b1, t, a2, s); + + /* Remaining interpolation. 
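+ + (Notation below: L x and H x denote the low and high n-limb halves of a 2n-limb value x.)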
+ + y * B + x0 + x3 B^3 - x0 B^2 - x3 B + = (x1 + x3) B + (x0 + x2) B^2 + x0 + x3 B^3 - x0 B^2 - x3 B + = y0 B + y1 B^2 + y2 B^3 + Lx0 + H x0 B + + L x3 B^3 + H x3 B^4 - Lx0 B^2 - H x0 B^3 - L x3 B - H x3 B^2 + = L x0 + (y0 + H x0 - L x3) B + (y1 - L x0 - H x3) B^2 + + (y2 - (H x0 - L x3)) B^3 + H x3 B^4 + + B^4 B^3 B^2 B 1 + | | | | | | + +-------+ +---------+---------+ + | Hx3 | | Hx0-Lx3 | Lx0 | + +------+----------+---------+---------+---------+ + | y2 | y1 | y0 | + ++---------+---------+---------+ + -| Hx0-Lx3 | - Lx0 | + +---------+---------+ + | - Hx3 | + +--------+ + + We must take into account the carry from Hx0 - Lx3. + */ + + cy = mpn_sub_n (pp + n, pp + n, pp+3*n, n); + hi = scratch[2*n] + cy; + + cy = mpn_sub_nc (pp + 2*n, pp + 2*n, pp, n, cy); + hi -= mpn_sub_nc (pp + 3*n, scratch + n, pp + n, n, cy); + + hi += mpn_add (pp + n, pp + n, 3*n, scratch, n); + + /* FIXME: Is support for s + t == n needed? */ + if (LIKELY (s + t > n)) + { + hi -= mpn_sub (pp + 2*n, pp + 2*n, 2*n, pp + 4*n, s+t-n); + + ASSERT (hi >= 0); /* contribution of the middle terms >= 0 */ + MPN_INCR_U (pp + 4*n, s+t-n, hi); + } + else + ASSERT (hi == 0); +} diff --git a/gmp-6.3.0/mpn/generic/toom33_mul.c b/gmp-6.3.0/mpn/generic/toom33_mul.c new file mode 100644 index 0000000..54f055f --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom33_mul.c @@ -0,0 +1,316 @@ +/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in + size. Or more accurately, bn <= an < (3/2)bn. + + Contributed to the GNU project by Torbjorn Granlund. + Additional improvements by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006-2008, 2010, 2012, 2015, 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/.
*/ + + +#include "gmp-impl.h" + +/* Evaluate in: -1, 0, +1, +2, +inf + + <-s--><--n--><--n--> + ____ ______ ______ + |_a2_|___a1_|___a0_| + |b2_|___b1_|___b0_| + <-t-><--n--><--n--> + + v0 = a0 * b0 # A(0)*B(0) + v1 = (a0+ a1+ a2)*(b0+ b1+ b2) # A(1)*B(1) ah <= 2 bh <= 2 + vm1 = (a0- a1+ a2)*(b0- b1+ b2) # A(-1)*B(-1) |ah| <= 1 bh <= 1 + v2 = (a0+2a1+4a2)*(b0+2b1+4b2) # A(2)*B(2) ah <= 6 bh <= 6 + vinf= a2 * b2 # A(inf)*B(inf) +*/ + +#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY +#define MAYBE_mul_basecase 1 +#define MAYBE_mul_toom33 1 +#else +#define MAYBE_mul_basecase \ + (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD) +#define MAYBE_mul_toom33 \ + (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD) +#endif + +/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced + multiplication at the infinity point. We may have + MAYBE_mul_basecase == 0, and still get s just below + MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get + s == 1 and mpn_toom22_mul will crash. +*/ + +#define TOOM33_MUL_N_REC(p, a, b, n, ws) \ + do { \ + if (MAYBE_mul_basecase \ + && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) \ + mpn_mul_basecase (p, a, n, b, n); \ + else if (! MAYBE_mul_toom33 \ + || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) \ + mpn_toom22_mul (p, a, n, b, n, ws); \ + else \ + mpn_toom33_mul (p, a, n, b, n, ws); \ + } while (0) + +void +mpn_toom33_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, + mp_ptr scratch) +{ + const int __gmpn_cpuvec_initialized = 1; + mp_size_t n, s, t; + int vm1_neg; + mp_limb_t cy, vinf0; + mp_ptr gp; + mp_ptr as1, asm1, as2; + mp_ptr bs1, bsm1, bs2; + +#define a0 ap +#define a1 (ap + n) +#define a2 (ap + 2*n) +#define b0 bp +#define b1 (bp + n) +#define b2 (bp + 2*n) + + n = (an + 2) / (size_t) 3; + + s = an - 2 * n; + t = bn - 2 * n; + + ASSERT (an >= bn); + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + + as1 = scratch + 4 * n + 4; + asm1 = scratch + 2 * n + 2; + as2 = pp + n + 1; + + bs1 = pp; + bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */ + bs2 = pp + 2 * n + 2; + + gp = scratch; + + vm1_neg = 0; + + /* Compute as1 and asm1. */ + cy = mpn_add (gp, a0, n, a2, s); +#if HAVE_NATIVE_mpn_add_n_sub_n + if (cy == 0 && mpn_cmp (gp, a1, n) < 0) + { + cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n); + as1[n] = cy >> 1; + asm1[n] = 0; + vm1_neg = 1; + } + else + { + mp_limb_t cy2; + cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n); + as1[n] = cy + (cy2 >> 1); + asm1[n] = cy - (cy2 & 1); + } +#else + as1[n] = cy + mpn_add_n (as1, gp, a1, n); + if (cy == 0 && mpn_cmp (gp, a1, n) < 0) + { + mpn_sub_n (asm1, a1, gp, n); + asm1[n] = 0; + vm1_neg = 1; + } + else + { + cy -= mpn_sub_n (asm1, gp, a1, n); + asm1[n] = cy; + } +#endif + + /* Compute as2. */ +#if HAVE_NATIVE_mpn_rsblsh1_n + cy = mpn_add_n (as2, a2, as1, s); + if (s != n) + cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy); + cy += as1[n]; + cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n); +#else +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (as2, a1, a2, s); + if (s != n) + cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy); + cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n); +#else + cy = mpn_add_n (as2, a2, as1, s); + if (s != n) + cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy); + cy += as1[n]; + cy = 2 * cy + mpn_lshift (as2, as2, n, 1); + cy -= mpn_sub_n (as2, as2, a0, n); +#endif +#endif + as2[n] = cy; + + /* Compute bs1 and bsm1. 
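+ Mirrors the as1/asm1 computation above: bs1 = b0 + b1 + b2 and bsm1 = |b0 - b1 + b2|, with any sign change folded into vm1_neg.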
*/ + cy = mpn_add (gp, b0, n, b2, t); +#if HAVE_NATIVE_mpn_add_n_sub_n + if (cy == 0 && mpn_cmp (gp, b1, n) < 0) + { + cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n); + bs1[n] = cy >> 1; + bsm1[n] = 0; + vm1_neg ^= 1; + } + else + { + mp_limb_t cy2; + cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n); + bs1[n] = cy + (cy2 >> 1); + bsm1[n] = cy - (cy2 & 1); + } +#else + bs1[n] = cy + mpn_add_n (bs1, gp, b1, n); + if (cy == 0 && mpn_cmp (gp, b1, n) < 0) + { + mpn_sub_n (bsm1, b1, gp, n); + bsm1[n] = 0; + vm1_neg ^= 1; + } + else + { + cy -= mpn_sub_n (bsm1, gp, b1, n); + bsm1[n] = cy; + } +#endif + + /* Compute bs2. */ +#if HAVE_NATIVE_mpn_rsblsh1_n + cy = mpn_add_n (bs2, b2, bs1, t); + if (t != n) + cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy); + cy += bs1[n]; + cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n); +#else +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (bs2, b1, b2, t); + if (t != n) + cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy); + cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n); +#else + cy = mpn_add_n (bs2, bs1, b2, t); + if (t != n) + cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy); + cy += bs1[n]; + cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1); + cy -= mpn_sub_n (bs2, bs2, b0, n); +#endif +#endif + bs2[n] = cy; + + ASSERT (as1[n] <= 2); + ASSERT (bs1[n] <= 2); + ASSERT (asm1[n] <= 1); + ASSERT (bsm1[n] <= 1); + ASSERT (as2[n] <= 6); + ASSERT (bs2[n] <= 6); + +#define v0 pp /* 2n */ +#define v1 (pp + 2 * n) /* 2n+1 */ +#define vinf (pp + 4 * n) /* s+t */ +#define vm1 scratch /* 2n+1 */ +#define v2 (scratch + 2 * n + 1) /* 2n+2 */ +#define scratch_out (scratch + 5 * n + 5) + + /* vm1, 2n+1 limbs */ +#ifdef SMALLER_RECURSION + TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out); + cy = 0; + if (asm1[n] != 0) + cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n); + if (bsm1[n] != 0) + cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n); + vm1[2 * n] = cy; +#else + vm1[2 * n] = 0; + TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + (bsm1[n] | asm1[n]), scratch_out); +#endif + + TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out); /* v2, 2n+1 limbs */ + + /* vinf, s+t limbs */ + if (s > t) mpn_mul (vinf, a2, s, b2, t); + else TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out); + + vinf0 = vinf[0]; /* v1 overlaps with this */ + +#ifdef SMALLER_RECURSION + /* v1, 2n+1 limbs */ + TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out); + if (as1[n] == 1) + { + cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n); + } + else if (as1[n] != 0) + { +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 + cy = 2 * bs1[n] + mpn_addlsh1_n_ip1 (v1 + n, bs1, n); +#else + cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2)); +#endif + } + else + cy = 0; + if (bs1[n] == 1) + { + cy += mpn_add_n (v1 + n, v1 + n, as1, n); + } + else if (bs1[n] != 0) + { +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 + cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n); +#else + cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2)); +#endif + } + v1[2 * n] = cy; +#else + cy = vinf[1]; + TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out); + vinf[1] = cy; +#endif + + TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out); /* v0, 2n limbs */ + + mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0); +} diff --git a/gmp-6.3.0/mpn/generic/toom3_sqr.c b/gmp-6.3.0/mpn/generic/toom3_sqr.c new file mode 100644 index 0000000..297a27f --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom3_sqr.c @@ -0,0 +1,221 @@ +/* mpn_toom3_sqr -- Square {ap,an}. + + Contributed to the GNU project by Torbjorn Granlund. + Additional improvements by Marco Bodrato. 
+ + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006-2010, 2012, 2015, 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Evaluate in: -1, 0, +1, +2, +inf + + <-s--><--n--><--n--> + ____ ______ ______ + |_a2_|___a1_|___a0_| + + v0 = a0 ^2 # A(0)^2 + v1 = (a0+ a1+ a2)^2 # A(1)^2 ah <= 2 + vm1 = (a0- a1+ a2)^2 # A(-1)^2 |ah| <= 1 + v2 = (a0+2a1+4a2)^2 # A(2)^2 ah <= 6 + vinf= a2 ^2 # A(inf)^2 +*/ + +#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY +#define MAYBE_sqr_basecase 1 +#define MAYBE_sqr_toom3 1 +#else +#define MAYBE_sqr_basecase \ + (SQR_TOOM3_THRESHOLD < 3 * SQR_TOOM2_THRESHOLD) +#define MAYBE_sqr_toom3 \ + (SQR_TOOM4_THRESHOLD >= 3 * SQR_TOOM3_THRESHOLD) +#endif + +#define TOOM3_SQR_REC(p, a, n, ws) \ + do { \ + if (MAYBE_sqr_basecase \ + && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD)) \ + mpn_sqr_basecase (p, a, n); \ + else if (! MAYBE_sqr_toom3 \ + || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) \ + mpn_toom2_sqr (p, a, n, ws); \ + else \ + mpn_toom3_sqr (p, a, n, ws); \ + } while (0) + +void +mpn_toom3_sqr (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_ptr scratch) +{ + const int __gmpn_cpuvec_initialized = 1; + mp_size_t n, s; + mp_limb_t cy, vinf0; + mp_ptr gp; + mp_ptr as1, asm1, as2; + +#define a0 ap +#define a1 (ap + n) +#define a2 (ap + 2*n) + + n = (an + 2) / (size_t) 3; + + s = an - 2 * n; + + ASSERT (0 < s && s <= n); + + as1 = scratch + 4 * n + 4; + asm1 = scratch + 2 * n + 2; + as2 = pp + n + 1; + + gp = scratch; + + /* Compute as1 and asm1. */ + cy = mpn_add (gp, a0, n, a2, s); +#if HAVE_NATIVE_mpn_add_n_sub_n + if (cy == 0 && mpn_cmp (gp, a1, n) < 0) + { + cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n); + as1[n] = cy >> 1; + asm1[n] = 0; + } + else + { + mp_limb_t cy2; + cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n); + as1[n] = cy + (cy2 >> 1); + asm1[n] = cy - (cy2 & 1); + } +#else + as1[n] = cy + mpn_add_n (as1, gp, a1, n); + if (cy == 0 && mpn_cmp (gp, a1, n) < 0) + { + mpn_sub_n (asm1, a1, gp, n); + asm1[n] = 0; + } + else + { + cy -= mpn_sub_n (asm1, gp, a1, n); + asm1[n] = cy; + } +#endif + + /* Compute as2. 
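+ as2 = A(2) = a0 + 2*a1 + 4*a2. Each branch below evaluates it either by Horner-style doubling, as a0 + 2*(a1 + 2*a2), or as 2*(as1 + a2) - a0, reusing as1 = a0 + a1 + a2.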
*/ +#if HAVE_NATIVE_mpn_rsblsh1_n + cy = mpn_add_n (as2, a2, as1, s); + if (s != n) + cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy); + cy += as1[n]; + cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n); +#else +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (as2, a1, a2, s); + if (s != n) + cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy); + cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n); +#else + cy = mpn_add_n (as2, a2, as1, s); + if (s != n) + cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy); + cy += as1[n]; + cy = 2 * cy + mpn_lshift (as2, as2, n, 1); + cy -= mpn_sub_n (as2, as2, a0, n); +#endif +#endif + as2[n] = cy; + + ASSERT (as1[n] <= 2); + ASSERT (asm1[n] <= 1); + +#define v0 pp /* 2n */ +#define v1 (pp + 2 * n) /* 2n+1 */ +#define vinf (pp + 4 * n) /* s+s */ +#define vm1 scratch /* 2n+1 */ +#define v2 (scratch + 2 * n + 1) /* 2n+2 */ +#define scratch_out (scratch + 5 * n + 5) + + /* vm1, 2n+1 limbs */ +#ifdef SMALLER_RECURSION + TOOM3_SQR_REC (vm1, asm1, n, scratch_out); + cy = asm1[n]; + if (cy != 0) + { +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 + cy += mpn_addlsh1_n_ip1 (vm1 + n, asm1, n); +#else + cy += mpn_addmul_1 (vm1 + n, asm1, n, CNST_LIMB(2)); +#endif + } + vm1[2 * n] = cy; +#else + vm1[2 * n] = 0; + TOOM3_SQR_REC (vm1, asm1, n + asm1[n], scratch_out); +#endif + + TOOM3_SQR_REC (v2, as2, n + 1, scratch_out); /* v2, 2n+1 limbs */ + + TOOM3_SQR_REC (vinf, a2, s, scratch_out); /* vinf, s+s limbs */ + + vinf0 = vinf[0]; /* v1 overlaps with this */ + +#ifdef SMALLER_RECURSION + /* v1, 2n+1 limbs */ + TOOM3_SQR_REC (v1, as1, n, scratch_out); + cy = as1[n]; + if (cy == 1) + { +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 + cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n); +#else + cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2)); +#endif + } + else if (cy != 0) + { +#if HAVE_NATIVE_mpn_addlsh2_n_ip1 + cy = 4 + mpn_addlsh2_n_ip1 (v1 + n, as1, n); +#else + cy = 4 + mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(4)); +#endif + } + v1[2 * n] = cy; +#else + cy = vinf[1]; + TOOM3_SQR_REC (v1, as1, n + 1, scratch_out); + vinf[1] = cy; +#endif + + TOOM3_SQR_REC (v0, ap, n, scratch_out); /* v0, 2n limbs */ + + mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + s, 0, vinf0); +} diff --git a/gmp-6.3.0/mpn/generic/toom42_mul.c b/gmp-6.3.0/mpn/generic/toom42_mul.c new file mode 100644 index 0000000..e84ce65 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom42_mul.c @@ -0,0 +1,234 @@ +/* mpn_toom42_mul -- Multiply {ap,an} and {bp,bn} where an is nominally twice + as large as bn. Or more accurately, (3/2)bn < an < 4bn. + + Contributed to the GNU project by Torbjorn Granlund. + Additional improvements by Marco Bodrato. + + The idea of applying toom to unbalanced multiplication is due to Marco + Bodrato and Alberto Zanoni. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006-2008, 2012, 2014 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Evaluate in: -1, 0, +1, +2, +inf + + <-s-><--n--><--n--><--n--> + ___ ______ ______ ______ + |a3_|___a2_|___a1_|___a0_| + |_b1_|___b0_| + <-t--><--n--> + + v0 = a0 * b0 # A(0)*B(0) + v1 = (a0+ a1+ a2+ a3)*(b0+ b1) # A(1)*B(1) ah <= 3 bh <= 1 + vm1 = (a0- a1+ a2- a3)*(b0- b1) # A(-1)*B(-1) |ah| <= 1 bh = 0 + v2 = (a0+2a1+4a2+8a3)*(b0+2b1) # A(2)*B(2) ah <= 14 bh <= 2 + vinf= a3 * b1 # A(inf)*B(inf) +*/ + +#define TOOM42_MUL_N_REC(p, a, b, n, ws) \ + do { \ + mpn_mul_n (p, a, b, n); \ + } while (0) + +void +mpn_toom42_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, + mp_ptr scratch) +{ + mp_size_t n, s, t; + int vm1_neg; + mp_limb_t cy, vinf0; + mp_ptr a0_a2; + mp_ptr as1, asm1, as2; + mp_ptr bs1, bsm1, bs2; + mp_ptr tmp; + TMP_DECL; + +#define a0 ap +#define a1 (ap + n) +#define a2 (ap + 2*n) +#define a3 (ap + 3*n) +#define b0 bp +#define b1 (bp + n) + + n = an >= 2 * bn ? (an + 3) >> 2 : (bn + 1) >> 1; + + s = an - 3 * n; + t = bn - n; + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + + TMP_MARK; + + tmp = TMP_ALLOC_LIMBS (6 * n + 5); + as1 = tmp; tmp += n + 1; + asm1 = tmp; tmp += n + 1; + as2 = tmp; tmp += n + 1; + bs1 = tmp; tmp += n + 1; + bsm1 = tmp; tmp += n; + bs2 = tmp; tmp += n + 1; + + a0_a2 = pp; + + /* Compute as1 and asm1. */ + vm1_neg = mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0_a2) & 1; + + /* Compute as2. */ +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (as2, a2, a3, s); + if (s != n) + cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy); + cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n); + cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n); +#else + cy = mpn_lshift (as2, a3, s, 1); + cy += mpn_add_n (as2, a2, as2, s); + if (s != n) + cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy); + cy = 2 * cy + mpn_lshift (as2, as2, n, 1); + cy += mpn_add_n (as2, a1, as2, n); + cy = 2 * cy + mpn_lshift (as2, as2, n, 1); + cy += mpn_add_n (as2, a0, as2, n); +#endif + as2[n] = cy; + + /* Compute bs1 and bsm1. */ + if (t == n) + { +#if HAVE_NATIVE_mpn_add_n_sub_n + if (mpn_cmp (b0, b1, n) < 0) + { + cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n); + vm1_neg ^= 1; + } + else + { + cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n); + } + bs1[n] = cy >> 1; +#else + bs1[n] = mpn_add_n (bs1, b0, b1, n); + + if (mpn_cmp (b0, b1, n) < 0) + { + mpn_sub_n (bsm1, b1, b0, n); + vm1_neg ^= 1; + } + else + { + mpn_sub_n (bsm1, b0, b1, n); + } +#endif + } + else + { + bs1[n] = mpn_add (bs1, b0, n, b1, t); + + if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0) + { + mpn_sub_n (bsm1, b1, b0, t); + MPN_ZERO (bsm1 + t, n - t); + vm1_neg ^= 1; + } + else + { + mpn_sub (bsm1, b0, n, b1, t); + } + } + + /* Compute bs2, recycling bs1. 
bs2=bs1+b1 */ + mpn_add (bs2, bs1, n + 1, b1, t); + + ASSERT (as1[n] <= 3); + ASSERT (bs1[n] <= 1); + ASSERT (asm1[n] <= 1); +/*ASSERT (bsm1[n] == 0);*/ + ASSERT (as2[n] <= 14); + ASSERT (bs2[n] <= 2); + +#define v0 pp /* 2n */ +#define v1 (pp + 2 * n) /* 2n+1 */ +#define vinf (pp + 4 * n) /* s+t */ +#define vm1 scratch /* 2n+1 */ +#define v2 (scratch + 2 * n + 1) /* 2n+2 */ +#define scratch_out scratch + 4 * n + 4 /* Currently unused. */ + + /* vm1, 2n+1 limbs */ + TOOM42_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out); + cy = 0; + if (asm1[n] != 0) + cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n); + vm1[2 * n] = cy; + + TOOM42_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out); /* v2, 2n+1 limbs */ + + /* vinf, s+t limbs */ + if (s > t) mpn_mul (vinf, a3, s, b1, t); + else mpn_mul (vinf, b1, t, a3, s); + + vinf0 = vinf[0]; /* v1 overlaps with this */ + + /* v1, 2n+1 limbs */ + TOOM42_MUL_N_REC (v1, as1, bs1, n, scratch_out); + if (as1[n] == 1) + { + cy = mpn_add_n (v1 + n, v1 + n, bs1, n); + } + else if (as1[n] == 2) + { +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 + cy = mpn_addlsh1_n_ip1 (v1 + n, bs1, n); +#else + cy = mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2)); +#endif + } + else if (as1[n] == 3) + { + cy = mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(3)); + } + else + cy = 0; + if (bs1[n] != 0) + cy += as1[n] + mpn_add_n (v1 + n, v1 + n, as1, n); + v1[2 * n] = cy; + + TOOM42_MUL_N_REC (v0, ap, bp, n, scratch_out); /* v0, 2n limbs */ + + mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0); + + TMP_FREE; +} diff --git a/gmp-6.3.0/mpn/generic/toom42_mulmid.c b/gmp-6.3.0/mpn/generic/toom42_mulmid.c new file mode 100644 index 0000000..f581b10 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom42_mulmid.c @@ -0,0 +1,237 @@ +/* mpn_toom42_mulmid -- toom42 middle product + + Contributed by David Harvey. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + + + +/* + Middle product of {ap,2n-1} and {bp,n}, output written to {rp,n+2}. + + Neither ap nor bp may overlap rp. + + Must have n >= 4. + + Amount of scratch space required is given by mpn_toom42_mulmid_itch(). + + FIXME: this code assumes that n is small compared to GMP_NUMB_MAX. The exact + requirements should be clarified. 
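+ + (Added clarification, stating the intended definition: the middle product computed here is the sum of ap[i]*bp[j]*B^(i+j-n+1) over all i, j with n-1 <= i+j <= 2n-2, i.e. the middle n diagonals of the full product, with two extra output limbs absorbing the accumulated carries.)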
+*/ +void +mpn_toom42_mulmid (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, + mp_ptr scratch) +{ + mp_limb_t cy, e[12], zh, zl; + mp_size_t m; + int neg; + + ASSERT (n >= 4); + ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1)); + ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n)); + + ap += n & 1; /* handle odd row and diagonal later */ + m = n / 2; + + /* (e0h:e0l) etc are correction terms, in 2's complement */ +#define e0l (e[0]) +#define e0h (e[1]) +#define e1l (e[2]) +#define e1h (e[3]) +#define e2l (e[4]) +#define e2h (e[5]) +#define e3l (e[6]) +#define e3h (e[7]) +#define e4l (e[8]) +#define e4h (e[9]) +#define e5l (e[10]) +#define e5h (e[11]) + +#define s (scratch + 2) +#define t (rp + m + 2) +#define p0 rp +#define p1 scratch +#define p2 (rp + m) +#define next_scratch (scratch + 3*m + 1) + + /* + rp scratch + |---------|-----------| |---------|---------|----------| + 0 m 2m+2 0 m 2m 3m+1 + <----p2----> <-------------s-------------> + <----p0----><---t----> <----p1----> + */ + + /* compute {s,3m-1} = {a,3m-1} + {a+m,3m-1} and error terms e0, e1, e2, e3 */ + cy = mpn_add_err1_n (s, ap, ap + m, &e0l, bp + m, m - 1, 0); + cy = mpn_add_err2_n (s + m - 1, ap + m - 1, ap + 2*m - 1, &e1l, + bp + m, bp, m, cy); + mpn_add_err1_n (s + 2*m - 1, ap + 2*m - 1, ap + 3*m - 1, &e3l, bp, m, cy); + + /* compute t = (-1)^neg * ({b,m} - {b+m,m}) and error terms e4, e5 */ + if (mpn_cmp (bp + m, bp, m) < 0) + { + ASSERT_NOCARRY (mpn_sub_err2_n (t, bp, bp + m, &e4l, + ap + m - 1, ap + 2*m - 1, m, 0)); + neg = 1; + } + else + { + ASSERT_NOCARRY (mpn_sub_err2_n (t, bp + m, bp, &e4l, + ap + m - 1, ap + 2*m - 1, m, 0)); + neg = 0; + } + + /* recursive middle products. The picture is: + + b[2m-1] A A A B B B - - - - - + ... - A A A B B B - - - - + b[m] - - A A A B B B - - - + b[m-1] - - - C C C D D D - - + ... - - - - C C C D D D - + b[0] - - - - - C C C D D D + a[0] ... a[m] ... a[2m] ... a[4m-2] + */ + + if (m < MULMID_TOOM42_THRESHOLD) + { + /* A + B */ + mpn_mulmid_basecase (p0, s, 2*m - 1, bp + m, m); + /* accumulate high limbs of p0 into e1 */ + ADDC_LIMB (cy, e1l, e1l, p0[m]); + e1h += p0[m + 1] + cy; + /* (-1)^neg * (B - C) (overwrites first m limbs of s) */ + mpn_mulmid_basecase (p1, ap + m, 2*m - 1, t, m); + /* C + D (overwrites t) */ + mpn_mulmid_basecase (p2, s + m, 2*m - 1, bp, m); + } + else + { + /* as above, but use toom42 instead */ + mpn_toom42_mulmid (p0, s, bp + m, m, next_scratch); + ADDC_LIMB (cy, e1l, e1l, p0[m]); + e1h += p0[m + 1] + cy; + mpn_toom42_mulmid (p1, ap + m, t, m, next_scratch); + mpn_toom42_mulmid (p2, s + m, bp, m, next_scratch); + } + + /* apply error terms */ + + /* -e0 at rp[0] */ + SUBC_LIMB (cy, rp[0], rp[0], e0l); + SUBC_LIMB (cy, rp[1], rp[1], e0h + cy); + if (UNLIKELY (cy)) + { + cy = (m > 2) ? 
mpn_sub_1 (rp + 2, rp + 2, m - 2, 1) : 1; + SUBC_LIMB (cy, e1l, e1l, cy); + e1h -= cy; + } + + /* z = e1 - e2 + high(p0) */ + SUBC_LIMB (cy, zl, e1l, e2l); + zh = e1h - e2h - cy; + + /* z at rp[m] */ + ADDC_LIMB (cy, rp[m], rp[m], zl); + zh = (zh + cy) & GMP_NUMB_MASK; + ADDC_LIMB (cy, rp[m + 1], rp[m + 1], zh); + cy -= (zh >> (GMP_NUMB_BITS - 1)); + if (UNLIKELY (cy)) + { + if (cy == 1) + mpn_add_1 (rp + m + 2, rp + m + 2, m, 1); + else /* cy == -1 */ + mpn_sub_1 (rp + m + 2, rp + m + 2, m, 1); + } + + /* e3 at rp[2*m] */ + ADDC_LIMB (cy, rp[2*m], rp[2*m], e3l); + rp[2*m + 1] = (rp[2*m + 1] + e3h + cy) & GMP_NUMB_MASK; + + /* e4 at p1[0] */ + ADDC_LIMB (cy, p1[0], p1[0], e4l); + ADDC_LIMB (cy, p1[1], p1[1], e4h + cy); + if (UNLIKELY (cy)) + mpn_add_1 (p1 + 2, p1 + 2, m, 1); + + /* -e5 at p1[m] */ + SUBC_LIMB (cy, p1[m], p1[m], e5l); + p1[m + 1] = (p1[m + 1] - e5h - cy) & GMP_NUMB_MASK; + + /* adjustment if p1 ends up negative */ + cy = (p1[m + 1] >> (GMP_NUMB_BITS - 1)); + + /* add (-1)^neg * (p1 - B^m * p1) to output */ + if (neg) + { + mpn_sub_1 (rp + m + 2, rp + m + 2, m, cy); + mpn_add (rp, rp, 2*m + 2, p1, m + 2); /* A + C */ + mpn_sub_n (rp + m, rp + m, p1, m + 2); /* B + D */ + } + else + { + mpn_add_1 (rp + m + 2, rp + m + 2, m, cy); + mpn_sub (rp, rp, 2*m + 2, p1, m + 2); /* A + C */ + mpn_add_n (rp + m, rp + m, p1, m + 2); /* B + D */ + } + + /* odd row and diagonal */ + if (n & 1) + { + /* + Products marked E are already done. We need to do products marked O. + + OOOOO---- + -EEEEO--- + --EEEEO-- + ---EEEEO- + ----EEEEO + */ + + /* first row of O's */ + cy = mpn_addmul_1 (rp, ap - 1, n, bp[n - 1]); + ADDC_LIMB (rp[n + 1], rp[n], rp[n], cy); + + /* O's on diagonal */ + /* FIXME: should probably define an interface "mpn_mulmid_diag_1" + that can handle the sum below. Currently we're relying on + mulmid_basecase being pretty fast for a diagonal sum like this, + which is true at least for the K8 asm version, but surely false + for the generic version. */ + mpn_mulmid_basecase (e, ap + n - 1, n - 1, bp, n - 1); + mpn_add_n (rp + n - 1, rp + n - 1, e, 3); + } +} diff --git a/gmp-6.3.0/mpn/generic/toom43_mul.c b/gmp-6.3.0/mpn/generic/toom43_mul.c new file mode 100644 index 0000000..34acd25 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom43_mul.c @@ -0,0 +1,238 @@ +/* mpn_toom43_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 4/3 + times as large as bn. Or more accurately, bn < an < 2 bn. + + Contributed to the GNU project by Marco Bodrato. + + The idea of applying toom to unbalanced multiplication is due to Marco + Bodrato and Alberto Zanoni. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2020 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Evaluate in: -2, -1, 0, +1, +2, +inf + + <-s-><--n--><--n--><--n--> + ___ ______ ______ ______ + |a3_|___a2_|___a1_|___a0_| + |_b2_|___b1_|___b0_| + <-t--><--n--><--n--> + + v0 = a0 * b0 # A(0)*B(0) + v1 = (a0+ a1+ a2+ a3)*(b0+ b1+ b2) # A(1)*B(1) ah <= 3 bh <= 2 + vm1 = (a0- a1+ a2- a3)*(b0- b1+ b2) # A(-1)*B(-1) |ah| <= 1 |bh|<= 1 + v2 = (a0+2a1+4a2+8a3)*(b0+2b1+4b2) # A(2)*B(2) ah <= 14 bh <= 6 + vm2 = (a0-2a1+4a2-8a3)*(b0-2b1+4b2) # A(-2)*B(-2) |ah| <= 9 |bh|<= 4 + vinf= a3 * b2 # A(inf)*B(inf) +*/ + +void +mpn_toom43_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, mp_ptr scratch) +{ + mp_size_t n, s, t; + enum toom6_flags flags; + mp_limb_t cy; + +#define a0 ap +#define a1 (ap + n) +#define a2 (ap + 2 * n) +#define a3 (ap + 3 * n) +#define b0 bp +#define b1 (bp + n) +#define b2 (bp + 2 * n) + + n = 1 + (3 * an >= 4 * bn ? (an - 1) >> 2 : (bn - 1) / (size_t) 3); + + s = an - 3 * n; + t = bn - 2 * n; + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + + /* This is true whenever an >= 25 or bn >= 19, I think. It + guarantees that we can fit 5 values of size n+1 in the product + area. */ + ASSERT (s+t >= 5); + +#define v0 pp /* 2n */ +#define vm1 (scratch) /* 2n+1 */ +#define v1 (pp + 2*n) /* 2n+1 */ +#define vm2 (scratch + 2 * n + 1) /* 2n+1 */ +#define v2 (scratch + 4 * n + 2) /* 2n+1 */ +#define vinf (pp + 5 * n) /* s+t */ +#define bs1 pp /* n+1 */ +#define bsm1 (scratch + 2 * n + 2) /* n+1 */ +#define asm1 (scratch + 3 * n + 3) /* n+1 */ +#define asm2 (scratch + 4 * n + 4) /* n+1 */ +#define bsm2 (pp + n + 1) /* n+1 */ +#define bs2 (pp + 2 * n + 2) /* n+1 */ +#define as2 (pp + 3 * n + 3) /* n+1 */ +#define as1 (pp + 4 * n + 4) /* n+1 */ + + /* Total scratch need is 6 * n + 3 + 1; we allocate one extra + limb, because products will overwrite 2n+2 limbs. */ + +#define a0a2 scratch +#define b0b2 scratch +#define a1a3 asm1 +#define b1d bsm1 + + /* Compute as2 and asm2. */ + flags = (enum toom6_flags) (toom6_vm2_neg & mpn_toom_eval_dgr3_pm2 (as2, asm2, ap, n, s, a1a3)); + + /* Compute bs2 and bsm2. */ + b1d[n] = mpn_lshift (b1d, b1, n, 1); /* 2b1 */ +#if HAVE_NATIVE_mpn_addlsh2_n + cy = mpn_addlsh2_n (b0b2, b0, b2, t); /* 4b2 + b0 */ +#else + cy = mpn_lshift (b0b2, b2, t, 2); /* 4b2 */ + cy += mpn_add_n (b0b2, b0b2, b0, t); /* 4b2 + b0 */ +#endif + if (t != n) + cy = mpn_add_1 (b0b2 + t, b0 + t, n - t, cy); + b0b2[n] = cy; + +#if HAVE_NATIVE_mpn_add_n_sub_n + if (mpn_cmp (b0b2, b1d, n+1) < 0) + { + mpn_add_n_sub_n (bs2, bsm2, b1d, b0b2, n+1); + flags = (enum toom6_flags) (flags ^ toom6_vm2_neg); + } + else + { + mpn_add_n_sub_n (bs2, bsm2, b0b2, b1d, n+1); + } +#else + mpn_add_n (bs2, b0b2, b1d, n+1); + if (mpn_cmp (b0b2, b1d, n+1) < 0) + { + mpn_sub_n (bsm2, b1d, b0b2, n+1); + flags = (enum toom6_flags) (flags ^ toom6_vm2_neg); + } + else + { + mpn_sub_n (bsm2, b0b2, b1d, n+1); + } +#endif + + /* Compute as1 and asm1. */ + flags = (enum toom6_flags) (flags ^ (toom6_vm1_neg & mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0a2))); + + /* Compute bs1 and bsm1.
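+ bs1 = B(1) = b0 + b1 + b2 and bsm1 = |B(-1)| = |b0 - b1 + b2|; a negative B(-1) is recorded by toggling toom6_vm1_neg in flags.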
*/ + bsm1[n] = mpn_add (bsm1, b0, n, b2, t); +#if HAVE_NATIVE_mpn_add_n_sub_n + if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0) + { + cy = mpn_add_n_sub_n (bs1, bsm1, b1, bsm1, n); + bs1[n] = cy >> 1; + flags = (enum toom6_flags) (flags ^ toom6_vm1_neg); + } + else + { + cy = mpn_add_n_sub_n (bs1, bsm1, bsm1, b1, n); + bs1[n] = bsm1[n] + (cy >> 1); + bsm1[n]-= cy & 1; + } +#else + bs1[n] = bsm1[n] + mpn_add_n (bs1, bsm1, b1, n); + if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0) + { + mpn_sub_n (bsm1, b1, bsm1, n); + flags = (enum toom6_flags) (flags ^ toom6_vm1_neg); + } + else + { + bsm1[n] -= mpn_sub_n (bsm1, bsm1, b1, n); + } +#endif + + ASSERT (as1[n] <= 3); + ASSERT (bs1[n] <= 2); + ASSERT (asm1[n] <= 1); + ASSERT (bsm1[n] <= 1); + ASSERT (as2[n] <=14); + ASSERT (bs2[n] <= 6); + ASSERT (asm2[n] <= 9); + ASSERT (bsm2[n] <= 4); + + /* vm1, 2n+1 limbs */ + vm1[2*n] = 0; + mpn_mul_n (vm1, asm1, bsm1, n + (asm1[n] | bsm1[n])); /* W4 */ + + /* vm2, 2n+1 limbs */ + mpn_mul_n (vm2, asm2, bsm2, n+1); /* W2 */ + + /* v2, 2n+1 limbs */ + mpn_mul_n (v2, as2, bs2, n+1); /* W1 */ + + /* v1, 2n+1 limbs */ + mpn_mul_n (v1, as1, bs1, n+1); /* W3 */ + + /* vinf, s+t limbs */ /* W0 */ + if (s > t) mpn_mul (vinf, a3, s, b2, t); + else mpn_mul (vinf, b2, t, a3, s); + + /* v0, 2n limbs */ + mpn_mul_n (v0, ap, bp, n); /* W5 */ + + mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s); + +#undef v0 +#undef vm1 +#undef v1 +#undef vm2 +#undef v2 +#undef vinf +#undef bs1 +#undef bs2 +#undef bsm1 +#undef bsm2 +#undef asm1 +#undef asm2 +/* #undef as1 */ +/* #undef as2 */ +#undef a0a2 +#undef b0b2 +#undef a1a3 +#undef b1d +#undef a0 +#undef a1 +#undef a2 +#undef a3 +#undef b0 +#undef b1 +#undef b2 +} diff --git a/gmp-6.3.0/mpn/generic/toom44_mul.c b/gmp-6.3.0/mpn/generic/toom44_mul.c new file mode 100644 index 0000000..a361899 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom44_mul.c @@ -0,0 +1,239 @@ +/* mpn_toom44_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in + size. Or more accurately, bn <= an < (4/3)bn. + + Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006-2008, 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#include "gmp-impl.h" + +/* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf + + <-s--><--n--><--n--><--n--> + ____ ______ ______ ______ + |_a3_|___a2_|___a1_|___a0_| + |b3_|___b2_|___b1_|___b0_| + <-t-><--n--><--n--><--n--> + + v0 = a0 * b0 # A(0)*B(0) + v1 = ( a0+ a1+ a2+ a3)*( b0+ b1+ b2+ b3) # A(1)*B(1) ah <= 3 bh <= 3 + vm1 = ( a0- a1+ a2- a3)*( b0- b1+ b2- b3) # A(-1)*B(-1) |ah| <= 1 |bh| <= 1 + v2 = ( a0+2a1+4a2+8a3)*( b0+2b1+4b2+8b3) # A(2)*B(2) ah <= 14 bh <= 14 + vm2 = ( a0-2a1+4a2-8a3)*( b0-2b1+4b2-8b3) # A(-2)*B(-2) |ah| <= 9 |bh| <= 9 + vh = (8a0+4a1+2a2+ a3)*(8b0+4b1+2b2+ b3) # A(1/2)*B(1/2) ah <= 14 bh <= 14 + vinf= a3 * b3 # A(inf)*B(inf) +*/ + +#if TUNE_PROGRAM_BUILD +#define MAYBE_mul_basecase 1 +#define MAYBE_mul_toom22 1 +#define MAYBE_mul_toom44 1 +#else +#define MAYBE_mul_basecase \ + (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM22_THRESHOLD) +#define MAYBE_mul_toom22 \ + (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM33_THRESHOLD) +#define MAYBE_mul_toom44 \ + (MUL_TOOM6H_THRESHOLD >= 4 * MUL_TOOM44_THRESHOLD) +#endif + +#define TOOM44_MUL_N_REC(p, a, b, n, ws) \ + do { \ + if (MAYBE_mul_basecase \ + && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) \ + mpn_mul_basecase (p, a, n, b, n); \ + else if (MAYBE_mul_toom22 \ + && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) \ + mpn_toom22_mul (p, a, n, b, n, ws); \ + else if (! MAYBE_mul_toom44 \ + || BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD)) \ + mpn_toom33_mul (p, a, n, b, n, ws); \ + else \ + mpn_toom44_mul (p, a, n, b, n, ws); \ + } while (0) + +/* Use of scratch space. In the product area, we store + + ___________________ + |vinf|____|_v1_|_v0_| + s+t 2n-1 2n+1 2n + + The other recursive products, vm1, v2, vm2, vh are stored in the + scratch area. When computing them, we use the product area for + intermediate values. + + Next, we compute v1. We can store the intermediate factors at v0 + and at vh + 2n + 2. + + Finally, for v0 and vinf, factors are parts of the input operands, + and we need scratch space only for the recursive multiplication. + + In all, if S(an) is the scratch need, the needed space is bounded by + + S(an) <= 4 (2*ceil(an/4) + 1) + 1 + S(ceil(an/4) + 1) + + which should give S(n) = 8 n/3 + c log(n) for some constant c. +*/ + +void +mpn_toom44_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, + mp_ptr scratch) +{ + mp_size_t n, s, t; + mp_limb_t cy; + enum toom7_flags flags; + +#define a0 ap +#define a1 (ap + n) +#define a2 (ap + 2*n) +#define a3 (ap + 3*n) +#define b0 bp +#define b1 (bp + n) +#define b2 (bp + 2*n) +#define b3 (bp + 3*n) + + ASSERT (an >= bn); + + n = (an + 3) >> 2; + + s = an - 3 * n; + t = bn - 3 * n; + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + ASSERT (s >= t); + + /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrites the + * following limb, so these must be computed in order, and we need a + * one limb gap to tp. */ +#define v0 pp /* 2n */ +#define v1 (pp + 2 * n) /* 2n+1 */ +#define vinf (pp + 6 * n) /* s+t */ +#define v2 scratch /* 2n+1 */ +#define vm2 (scratch + 2 * n + 1) /* 2n+1 */ +#define vh (scratch + 4 * n + 2) /* 2n+1 */ +#define vm1 (scratch + 6 * n + 3) /* 2n+1 */ +#define tp (scratch + 8*n + 5) + + /* apx and bpx must not overlap with v1 */ +#define apx pp /* n+1 */ +#define amx (pp + n + 1) /* n+1 */ +#define bmx (pp + 2*n + 2) /* n+1 */ +#define bpx (pp + 4*n + 2) /* n+1 */ + + /* Total scratch need: 8*n + 5 + scratch for recursive calls. This + gives roughly 32 n/3 + log term.
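+ + (Added derivation: unrolling S(n) = 8*n + 5 + S(n/4 + 1) gives the geometric series 8*n * (1 + 1/4 + 1/16 + ...) = 32*n/3, plus an O(log n) term from the constants.)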
*/ + + /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3. */ + flags = (enum toom7_flags) (toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp)); + + /* Compute bpx = b0 + 2 b1 + 4 b2 + 8 b3 and bmx = b0 - 2 b1 + 4 b2 - 8 b3. */ + flags = (enum toom7_flags) (flags ^ (toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (bpx, bmx, bp, n, t, tp))); + + TOOM44_MUL_N_REC (v2, apx, bpx, n + 1, tp); /* v2, 2n+1 limbs */ + TOOM44_MUL_N_REC (vm2, amx, bmx, n + 1, tp); /* vm2, 2n+1 limbs */ + + /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = (((2*a0 + a1) * 2 + a2) * 2 + a3 */ +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (apx, a1, a0, n); + cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n); + if (s < n) + { + mp_limb_t cy2; + cy2 = mpn_addlsh1_n (apx, a3, apx, s); + apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1); + MPN_INCR_U (apx + s, n+1-s, cy2); + } + else + apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n); +#else + cy = mpn_lshift (apx, a0, n, 1); + cy += mpn_add_n (apx, apx, a1, n); + cy = 2*cy + mpn_lshift (apx, apx, n, 1); + cy += mpn_add_n (apx, apx, a2, n); + cy = 2*cy + mpn_lshift (apx, apx, n, 1); + apx[n] = cy + mpn_add (apx, apx, n, a3, s); +#endif + + /* Compute bpx = 8 b0 + 4 b1 + 2 b2 + b3 = (((2*b0 + b1) * 2 + b2) * 2 + b3 */ +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (bpx, b1, b0, n); + cy = 2*cy + mpn_addlsh1_n (bpx, b2, bpx, n); + if (t < n) + { + mp_limb_t cy2; + cy2 = mpn_addlsh1_n (bpx, b3, bpx, t); + bpx[n] = 2*cy + mpn_lshift (bpx + t, bpx + t, n - t, 1); + MPN_INCR_U (bpx + t, n+1-t, cy2); + } + else + bpx[n] = 2*cy + mpn_addlsh1_n (bpx, b3, bpx, n); +#else + cy = mpn_lshift (bpx, b0, n, 1); + cy += mpn_add_n (bpx, bpx, b1, n); + cy = 2*cy + mpn_lshift (bpx, bpx, n, 1); + cy += mpn_add_n (bpx, bpx, b2, n); + cy = 2*cy + mpn_lshift (bpx, bpx, n, 1); + bpx[n] = cy + mpn_add (bpx, bpx, n, b3, t); +#endif + + ASSERT (apx[n] < 15); + ASSERT (bpx[n] < 15); + + TOOM44_MUL_N_REC (vh, apx, bpx, n + 1, tp); /* vh, 2n+1 limbs */ + + /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3. */ + flags = (enum toom7_flags) (flags | (toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp))); + + /* Compute bpx = b0 + b1 + b2 + b3 and bmx = b0 - b1 + b2 - b3. */ + flags = (enum toom7_flags) (flags ^ (toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (bpx, bmx, bp, n, t, tp))); + + ASSERT (amx[n] <= 1); + ASSERT (bmx[n] <= 1); + + vm1 [2 * n] = 0; + TOOM44_MUL_N_REC (vm1, amx, bmx, n + (bmx[n] | amx[n]), tp); /* vm1, 2n+1 limbs */ + /* Clobbers amx, bmx. */ + TOOM44_MUL_N_REC (v1, apx, bpx, n + 1, tp); /* v1, 2n+1 limbs */ + + TOOM44_MUL_N_REC (v0, a0, b0, n, tp); + if (s > t) + mpn_mul (vinf, a3, s, b3, t); + else + TOOM44_MUL_N_REC (vinf, a3, b3, s, tp); /* vinf, s+t limbs */ + + mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t, tp); +} diff --git a/gmp-6.3.0/mpn/generic/toom4_sqr.c b/gmp-6.3.0/mpn/generic/toom4_sqr.c new file mode 100644 index 0000000..fd59d1c --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom4_sqr.c @@ -0,0 +1,164 @@ +/* mpn_toom4_sqr -- Square {ap,an}. + + Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006-2010, 2013, 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+/* Evaluate in: -2, -1, 0, +1/2, +1, +2, +inf
+
+  <-s--><--n--><--n--><--n-->
+   ____ ______ ______ ______
+  |_a3_|___a2_|___a1_|___a0_|
+
+  v0  =   a0 ^2              #  A(0)^2
+  v1  = ( a0+ a1+ a2+ a3)^2  #  A(1)^2     ah <= 3
+  vm1 = ( a0- a1+ a2- a3)^2  #  A(-1)^2   |ah| <= 1
+  v2  = ( a0+2a1+4a2+8a3)^2  #  A(2)^2     ah <= 14
+  vm2 = ( a0-2a1+4a2-8a3)^2  #  A(-2)^2    -9<=ah<=4
+  vh  = (8a0+4a1+2a2+ a3)^2  #  A(1/2)^2   ah <= 14
+  vinf=   a3 ^2              #  A(inf)^2
+*/
+
+#if TUNE_PROGRAM_BUILD
+#define MAYBE_sqr_basecase 1
+#define MAYBE_sqr_toom2 1
+#define MAYBE_sqr_toom4 1
+#else
+#define MAYBE_sqr_basecase \
+  (SQR_TOOM4_THRESHOLD < 4 * SQR_TOOM2_THRESHOLD)
+#define MAYBE_sqr_toom2 \
+  (SQR_TOOM4_THRESHOLD < 4 * SQR_TOOM3_THRESHOLD)
+#define MAYBE_sqr_toom4 \
+  (SQR_TOOM6_THRESHOLD >= 4 * SQR_TOOM4_THRESHOLD)
+#endif
+
+#define TOOM4_SQR_REC(p, a, n, ws) \
+  do { \
+    if (MAYBE_sqr_basecase \
+        && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD)) \
+      mpn_sqr_basecase (p, a, n); \
+    else if (MAYBE_sqr_toom2 \
+             && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) \
+      mpn_toom2_sqr (p, a, n, ws); \
+    else if (! MAYBE_sqr_toom4 \
+             || BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD)) \
+      mpn_toom3_sqr (p, a, n, ws); \
+    else \
+      mpn_toom4_sqr (p, a, n, ws); \
+  } while (0)
+
+void
+mpn_toom4_sqr (mp_ptr pp,
+               mp_srcptr ap, mp_size_t an,
+               mp_ptr scratch)
+{
+  mp_size_t n, s;
+  mp_limb_t cy;
+
+#define a0  ap
+#define a1  (ap + n)
+#define a2  (ap + 2*n)
+#define a3  (ap + 3*n)
+
+  n = (an + 3) >> 2;
+
+  s = an - 3 * n;
+
+  ASSERT (0 < s && s <= n);
+
+  /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrite the
+   * following limb, so these must be computed in order, and we need a
+   * one limb gap to tp. */
+#define v0    pp                                /* 2n */
+#define v1    (pp + 2 * n)                      /* 2n+1 */
+#define vinf  (pp + 6 * n)                      /* 2s */
+#define v2    scratch                           /* 2n+1 */
+#define vm2   (scratch + 2 * n + 1)             /* 2n+1 */
+#define vh    (scratch + 4 * n + 2)             /* 2n+1 */
+#define vm1   (scratch + 6 * n + 3)             /* 2n+1 */
+#define tp (scratch + 8*n + 5)
+
+  /* No overlap with v1 */
+#define apx   pp                                /* n+1 */
+#define amx   (pp + 4*n + 2)                    /* n+1 */
+
+  /* Total scratch need: 8*n + 5 + scratch for recursive calls.  This
+     gives roughly 32 n/3 + log term. */
+
+  /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3.
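+     Both are computed by mpn_toom_eval_dgr3_pm2.  Unlike in
+     mpn_toom44_mul, the returned sign can be discarded here: squaring
+     makes the sign of A(-2) irrelevant, since vm2 = A(-2)^2 = |A(-2)|^2.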
*/
+  mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp);
+
+  TOOM4_SQR_REC (v2, apx, n + 1, tp);   /* v2, 2n+1 limbs */
+  TOOM4_SQR_REC (vm2, amx, n + 1, tp);  /* vm2, 2n+1 limbs */
+
+  /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = (((2*a0 + a1) * 2 + a2) * 2 + a3) */
+#if HAVE_NATIVE_mpn_addlsh1_n
+  cy = mpn_addlsh1_n (apx, a1, a0, n);
+  cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
+  if (s < n)
+    {
+      mp_limb_t cy2;
+      cy2 = mpn_addlsh1_n (apx, a3, apx, s);
+      apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
+      MPN_INCR_U (apx + s, n+1-s, cy2);
+    }
+  else
+    apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
+#else
+  cy = mpn_lshift (apx, a0, n, 1);
+  cy += mpn_add_n (apx, apx, a1, n);
+  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
+  cy += mpn_add_n (apx, apx, a2, n);
+  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
+  apx[n] = cy + mpn_add (apx, apx, n, a3, s);
+#endif
+
+  ASSERT (apx[n] < 15);
+
+  TOOM4_SQR_REC (vh, apx, n + 1, tp);   /* vh, 2n+1 limbs */
+
+  /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3.  */
+  mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp);
+
+  TOOM4_SQR_REC (v1, apx, n + 1, tp);   /* v1, 2n+1 limbs */
+  vm1 [2 * n] = 0;
+  TOOM4_SQR_REC (vm1, amx, n + amx[n], tp);     /* vm1, 2n+1 limbs */
+
+  TOOM4_SQR_REC (v0, a0, n, tp);
+  TOOM4_SQR_REC (vinf, a3, s, tp);      /* vinf, 2s limbs */
+
+  mpn_toom_interpolate_7pts (pp, n, (enum toom7_flags) 0, vm2, vm1, v2, vh, 2*s, tp);
+}
diff --git a/gmp-6.3.0/mpn/generic/toom52_mul.c b/gmp-6.3.0/mpn/generic/toom52_mul.c
new file mode 100644
index 0000000..974059b
--- /dev/null
+++ b/gmp-6.3.0/mpn/generic/toom52_mul.c
@@ -0,0 +1,256 @@
+/* mpn_toom52_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 5/2
+   times as large as bn.  Or more accurately, 2 bn < an < 5 bn.
+
+   Contributed to the GNU project by Marco Bodrato.
+
+   The idea of applying toom to unbalanced multiplication is due to Marco
+   Bodrato and Alberto Zanoni.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
*/
+
+
+#include "gmp-impl.h"
+
+/* Evaluate in: -2, -1, 0, +1, +2, +inf
+
+  <-s-><--n--><--n--><--n--><--n-->
+   ___ ______ ______ ______ ______
+  |a4_|___a3_|___a2_|___a1_|___a0_|
+                      |b1|___b0_|
+                      <--n-->
+
+  v0  =  a0 * b0                        #  A(0)*B(0)
+  v1  = (a0+ a1+ a2+ a3+ a4)*(b0+ b1)   #  A(1)*B(1)     ah <= 4   bh <= 1
+  vm1 = (a0- a1+ a2- a3+ a4)*(b0- b1)   #  A(-1)*B(-1)  |ah| <= 2   bh  = 0
+  v2  = (a0+2a1+4a2+8a3+16a4)*(b0+2b1)  #  A(2)*B(2)     ah <= 30  bh <= 2
+  vm2 = (a0-2a1+4a2-8a3+16a4)*(b0-2b1)  #  A(-2)*B(-2)  |ah| <= 20 |bh|<= 1
+  vinf=  a4 * b1                        #  A(inf)*B(inf)
+
+  Some slight optimizations in evaluation are taken from the paper:
+  "Towards Optimal Toom-Cook Multiplication for Univariate and
+  Multivariate Polynomials in Characteristic 2 and 0."
+*/
+
+void
+mpn_toom52_mul (mp_ptr pp,
+                mp_srcptr ap, mp_size_t an,
+                mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
+{
+  mp_size_t n, s, t;
+  enum toom6_flags flags;
+
+#define a0  ap
+#define a1  (ap + n)
+#define a2  (ap + 2 * n)
+#define a3  (ap + 3 * n)
+#define a4  (ap + 4 * n)
+#define b0  bp
+#define b1  (bp + n)
+
+  n = 1 + (2 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) >> 1);
+
+  s = an - 4 * n;
+  t = bn - n;
+
+  ASSERT (0 < s && s <= n);
+  ASSERT (0 < t && t <= n);
+
+  /* Ensures that 5 values of n+1 limbs each fit in the product area.
+     Borderline cases are an = 32, bn = 8, n = 7, and an = 36, bn = 9,
+     n = 8. */
+  ASSERT (s+t >= 5);
+
+#define v0    pp                                /* 2n */
+#define vm1   (scratch)                         /* 2n+1 */
+#define v1    (pp + 2 * n)                      /* 2n+1 */
+#define vm2   (scratch + 2 * n + 1)             /* 2n+1 */
+#define v2    (scratch + 4 * n + 2)             /* 2n+1 */
+#define vinf  (pp + 5 * n)                      /* s+t */
+#define bs1    pp                               /* n+1 */
+#define bsm1  (scratch + 2 * n + 2)             /* n */
+#define asm1  (scratch + 3 * n + 3)             /* n+1 */
+#define asm2  (scratch + 4 * n + 4)             /* n+1 */
+#define bsm2  (pp + n + 1)                      /* n+1 */
+#define bs2   (pp + 2 * n + 2)                  /* n+1 */
+#define as2   (pp + 3 * n + 3)                  /* n+1 */
+#define as1   (pp + 4 * n + 4)                  /* n+1 */
+
+  /* Scratch need is 6 * n + 3 + 1. We need one extra limb, because
+     products will overwrite 2n+2 limbs. */
+
+#define a0a2  scratch
+#define a1a3  asm1
+
+  /* Compute as2 and asm2. */
+  flags = (enum toom6_flags) (toom6_vm2_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, a1a3));
+
+  /* Compute bs1 and bsm1. */
+  if (t == n)
+    {
+#if HAVE_NATIVE_mpn_add_n_sub_n
+      mp_limb_t cy;
+
+      if (mpn_cmp (b0, b1, n) < 0)
+        {
+          cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
+          flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
+        }
+      else
+        {
+          cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
+        }
+      bs1[n] = cy >> 1;
+#else
+      bs1[n] = mpn_add_n (bs1, b0, b1, n);
+      if (mpn_cmp (b0, b1, n) < 0)
+        {
+          mpn_sub_n (bsm1, b1, b0, n);
+          flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
+        }
+      else
+        {
+          mpn_sub_n (bsm1, b0, b1, n);
+        }
+#endif
+    }
+  else
+    {
+      bs1[n] = mpn_add (bs1, b0, n, b1, t);
+      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
+        {
+          mpn_sub_n (bsm1, b1, b0, t);
+          MPN_ZERO (bsm1 + t, n - t);
+          flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
+        }
+      else
+        {
+          mpn_sub (bsm1, b0, n, b1, t);
+        }
+    }
+
+  /* Compute bs2 and bsm2, recycling bs1 and bsm1.
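+     Since bs1 = b0 + b1 = B(1) and bsm1 = |b0 - b1| = |B(-1)| (sign
+     recorded in toom6_vm1_neg), one further addition or subtraction of
+     b1 yields B(2) and |B(-2)|: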
bs2=bs1+b1; bsm2=bsm1-b1 */ + mpn_add (bs2, bs1, n+1, b1, t); + if (flags & toom6_vm1_neg) + { + bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t); + flags = (enum toom6_flags) (flags ^ toom6_vm2_neg); + } + else + { + bsm2[n] = 0; + if (t == n) + { + if (mpn_cmp (bsm1, b1, n) < 0) + { + mpn_sub_n (bsm2, b1, bsm1, n); + flags = (enum toom6_flags) (flags ^ toom6_vm2_neg); + } + else + { + mpn_sub_n (bsm2, bsm1, b1, n); + } + } + else + { + if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0) + { + mpn_sub_n (bsm2, b1, bsm1, t); + MPN_ZERO (bsm2 + t, n - t); + flags = (enum toom6_flags) (flags ^ toom6_vm2_neg); + } + else + { + mpn_sub (bsm2, bsm1, n, b1, t); + } + } + } + + /* Compute as1 and asm1. */ + flags = (enum toom6_flags) (flags ^ (toom6_vm1_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, a0a2))); + + ASSERT (as1[n] <= 4); + ASSERT (bs1[n] <= 1); + ASSERT (asm1[n] <= 2); +/* ASSERT (bsm1[n] <= 1); */ + ASSERT (as2[n] <=30); + ASSERT (bs2[n] <= 2); + ASSERT (asm2[n] <= 20); + ASSERT (bsm2[n] <= 1); + + /* vm1, 2n+1 limbs */ + mpn_mul (vm1, asm1, n+1, bsm1, n); /* W4 */ + + /* vm2, 2n+1 limbs */ + mpn_mul_n (vm2, asm2, bsm2, n+1); /* W2 */ + + /* v2, 2n+1 limbs */ + mpn_mul_n (v2, as2, bs2, n+1); /* W1 */ + + /* v1, 2n+1 limbs */ + mpn_mul_n (v1, as1, bs1, n+1); /* W3 */ + + /* vinf, s+t limbs */ /* W0 */ + if (s > t) mpn_mul (vinf, a4, s, b1, t); + else mpn_mul (vinf, b1, t, a4, s); + + /* v0, 2n limbs */ + mpn_mul_n (v0, ap, bp, n); /* W5 */ + + mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s); + +#undef v0 +#undef vm1 +#undef v1 +#undef vm2 +#undef v2 +#undef vinf +#undef bs1 +#undef bs2 +#undef bsm1 +#undef bsm2 +#undef asm1 +#undef asm2 +#undef as1 +#undef as2 +#undef a0a2 +#undef b0b2 +#undef a1a3 +#undef a0 +#undef a1 +#undef a2 +#undef a3 +#undef b0 +#undef b1 +#undef b2 + +} diff --git a/gmp-6.3.0/mpn/generic/toom53_mul.c b/gmp-6.3.0/mpn/generic/toom53_mul.c new file mode 100644 index 0000000..c934297 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom53_mul.c @@ -0,0 +1,331 @@ +/* mpn_toom53_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 5/3 + times as large as bn. Or more accurately, (4/3)bn < an < (5/2)bn. + + Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + + The idea of applying toom to unbalanced multiplication is due to Marco + Bodrato and Alberto Zanoni. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006-2008, 2012, 2014, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf + + <-s-><--n--><--n--><--n--><--n--> + ___ ______ ______ ______ ______ + |a4_|___a3_|___a2_|___a1_|___a0_| + |__b2|___b1_|___b0_| + <-t--><--n--><--n--> + + v0 = a0 * b0 # A(0)*B(0) + v1 = ( a0+ a1+ a2+ a3+ a4)*( b0+ b1+ b2) # A(1)*B(1) ah <= 4 bh <= 2 + vm1 = ( a0- a1+ a2- a3+ a4)*( b0- b1+ b2) # A(-1)*B(-1) |ah| <= 2 bh <= 1 + v2 = ( a0+2a1+4a2+8a3+16a4)*( b0+2b1+4b2) # A(2)*B(2) ah <= 30 bh <= 6 + vm2 = ( a0-2a1+4a2-8a3+16a4)*( b0-2b1+4b2) # A(2)*B(2) -9<=ah<=20 -1<=bh<=4 + vh = (16a0+8a1+4a2+2a3+ a4)*(4b0+2b1+ b2) # A(1/2)*B(1/2) ah <= 30 bh <= 6 + vinf= a4 * b2 # A(inf)*B(inf) +*/ + +void +mpn_toom53_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, + mp_ptr scratch) +{ + mp_size_t n, s, t; + mp_limb_t cy; + mp_ptr gp; + mp_ptr as1, asm1, as2, asm2, ash; + mp_ptr bs1, bsm1, bs2, bsm2, bsh; + mp_ptr tmp; + enum toom7_flags flags; + TMP_DECL; + +#define a0 ap +#define a1 (ap + n) +#define a2 (ap + 2*n) +#define a3 (ap + 3*n) +#define a4 (ap + 4*n) +#define b0 bp +#define b1 (bp + n) +#define b2 (bp + 2*n) + + n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3); + + s = an - 4 * n; + t = bn - 2 * n; + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + + TMP_MARK; + + tmp = TMP_ALLOC_LIMBS (10 * (n + 1)); + as1 = tmp; tmp += n + 1; + asm1 = tmp; tmp += n + 1; + as2 = tmp; tmp += n + 1; + asm2 = tmp; tmp += n + 1; + ash = tmp; tmp += n + 1; + bs1 = tmp; tmp += n + 1; + bsm1 = tmp; tmp += n + 1; + bs2 = tmp; tmp += n + 1; + bsm2 = tmp; tmp += n + 1; + bsh = tmp; tmp += n + 1; + + gp = pp; + + /* Compute as1 and asm1. */ + flags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, gp)); + + /* Compute as2 and asm2. */ + flags = (enum toom7_flags) (flags | (toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, gp))); + + /* Compute ash = 16 a0 + 8 a1 + 4 a2 + 2 a3 + a4 + = 2*(2*(2*(2*a0 + a1) + a2) + a3) + a4 */ +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (ash, a1, a0, n); + cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n); + cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n); + if (s < n) + { + mp_limb_t cy2; + cy2 = mpn_addlsh1_n (ash, a4, ash, s); + ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1); + MPN_INCR_U (ash + s, n+1-s, cy2); + } + else + ash[n] = 2*cy + mpn_addlsh1_n (ash, a4, ash, n); +#else + cy = mpn_lshift (ash, a0, n, 1); + cy += mpn_add_n (ash, ash, a1, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + cy += mpn_add_n (ash, ash, a2, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + cy += mpn_add_n (ash, ash, a3, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + ash[n] = cy + mpn_add (ash, ash, n, a4, s); +#endif + + /* Compute bs1 and bsm1. 
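+     That is, bs1 = b0 + b1 + b2 = B(1) and bsm1 = |b0 - b1 + b2| = |B(-1)|;
+     a negative B(-1) is recorded by toggling toom7_w3_neg.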
*/ + bs1[n] = mpn_add (bs1, b0, n, b2, t); /* b0 + b2 */ +#if HAVE_NATIVE_mpn_add_n_sub_n + if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0) + { + bs1[n] = mpn_add_n_sub_n (bs1, bsm1, b1, bs1, n) >> 1; + bsm1[n] = 0; + flags = (enum toom7_flags) (flags ^ toom7_w3_neg); + } + else + { + cy = mpn_add_n_sub_n (bs1, bsm1, bs1, b1, n); + bsm1[n] = bs1[n] - (cy & 1); + bs1[n] += (cy >> 1); + } +#else + if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0) + { + mpn_sub_n (bsm1, b1, bs1, n); + bsm1[n] = 0; + flags = (enum toom7_flags) (flags ^ toom7_w3_neg); + } + else + { + bsm1[n] = bs1[n] - mpn_sub_n (bsm1, bs1, b1, n); + } + bs1[n] += mpn_add_n (bs1, bs1, b1, n); /* b0+b1+b2 */ +#endif + + /* Compute bs2 and bsm2. */ +#if HAVE_NATIVE_mpn_addlsh_n || HAVE_NATIVE_mpn_addlsh2_n +#if HAVE_NATIVE_mpn_addlsh2_n + cy = mpn_addlsh2_n (bs2, b0, b2, t); +#else /* HAVE_NATIVE_mpn_addlsh_n */ + cy = mpn_addlsh_n (bs2, b0, b2, t, 2); +#endif + if (t < n) + cy = mpn_add_1 (bs2 + t, b0 + t, n - t, cy); + bs2[n] = cy; +#else + cy = mpn_lshift (gp, b2, t, 2); + bs2[n] = mpn_add (bs2, b0, n, gp, t); + MPN_INCR_U (bs2 + t, n+1-t, cy); +#endif + + gp[n] = mpn_lshift (gp, b1, n, 1); + +#if HAVE_NATIVE_mpn_add_n_sub_n + if (mpn_cmp (bs2, gp, n+1) < 0) + { + ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, gp, bs2, n+1)); + flags = (enum toom7_flags) (flags ^ toom7_w1_neg); + } + else + { + ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, bs2, gp, n+1)); + } +#else + if (mpn_cmp (bs2, gp, n+1) < 0) + { + ASSERT_NOCARRY (mpn_sub_n (bsm2, gp, bs2, n+1)); + flags = (enum toom7_flags) (flags ^ toom7_w1_neg); + } + else + { + ASSERT_NOCARRY (mpn_sub_n (bsm2, bs2, gp, n+1)); + } + mpn_add_n (bs2, bs2, gp, n+1); +#endif + + /* Compute bsh = 4 b0 + 2 b1 + b2 = 2*(2*b0 + b1)+b2. */ +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (bsh, b1, b0, n); + if (t < n) + { + mp_limb_t cy2; + cy2 = mpn_addlsh1_n (bsh, b2, bsh, t); + bsh[n] = 2*cy + mpn_lshift (bsh + t, bsh + t, n - t, 1); + MPN_INCR_U (bsh + t, n+1-t, cy2); + } + else + bsh[n] = 2*cy + mpn_addlsh1_n (bsh, b2, bsh, n); +#else + cy = mpn_lshift (bsh, b0, n, 1); + cy += mpn_add_n (bsh, bsh, b1, n); + cy = 2*cy + mpn_lshift (bsh, bsh, n, 1); + bsh[n] = cy + mpn_add (bsh, bsh, n, b2, t); +#endif + + ASSERT (as1[n] <= 4); + ASSERT (bs1[n] <= 2); + ASSERT (asm1[n] <= 2); + ASSERT (bsm1[n] <= 1); + ASSERT (as2[n] <= 30); + ASSERT (bs2[n] <= 6); + ASSERT (asm2[n] <= 20); + ASSERT (bsm2[n] <= 4); + ASSERT (ash[n] <= 30); + ASSERT (bsh[n] <= 6); + +#define v0 pp /* 2n */ +#define v1 (pp + 2 * n) /* 2n+1 */ +#define vinf (pp + 6 * n) /* s+t */ +#define v2 scratch /* 2n+1 */ +#define vm2 (scratch + 2 * n + 1) /* 2n+1 */ +#define vh (scratch + 4 * n + 2) /* 2n+1 */ +#define vm1 (scratch + 6 * n + 3) /* 2n+1 */ +#define scratch_out (scratch + 8 * n + 4) /* 2n+1 */ + /* Total scratch need: 10*n+5 */ + + /* Must be in allocation order, as they overwrite one limb beyond + * 2n+1. 
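+   * (Each product below is an (n+1)x(n+1) mpn_mul_n, which writes 2n+2
+   * limbs into a region of only 2n+1 limbs; the extra limb spills into
+   * the start of the following region, so each later product must be
+   * computed after the spill into its area has happened.)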
*/ + mpn_mul_n (v2, as2, bs2, n + 1); /* v2, 2n+1 limbs */ + mpn_mul_n (vm2, asm2, bsm2, n + 1); /* vm2, 2n+1 limbs */ + mpn_mul_n (vh, ash, bsh, n + 1); /* vh, 2n+1 limbs */ + + /* vm1, 2n+1 limbs */ +#ifdef SMALLER_RECURSION + mpn_mul_n (vm1, asm1, bsm1, n); + if (asm1[n] == 1) + { + cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n); + } + else if (asm1[n] == 2) + { +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 + cy = 2 * bsm1[n] + mpn_addlsh1_n_ip1 (vm1 + n, bsm1, n); +#else + cy = 2 * bsm1[n] + mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2)); +#endif + } + else + cy = 0; + if (bsm1[n] != 0) + cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n); + vm1[2 * n] = cy; +#else /* SMALLER_RECURSION */ + vm1[2 * n] = 0; + mpn_mul_n (vm1, asm1, bsm1, n + ((asm1[n] | bsm1[n]) != 0)); +#endif /* SMALLER_RECURSION */ + + /* v1, 2n+1 limbs */ +#ifdef SMALLER_RECURSION + mpn_mul_n (v1, as1, bs1, n); + if (as1[n] == 1) + { + cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n); + } + else if (as1[n] == 2) + { +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 + cy = 2 * bs1[n] + mpn_addlsh1_n_ip1 (v1 + n, bs1, n); +#else + cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2)); +#endif + } + else if (as1[n] != 0) + { + cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]); + } + else + cy = 0; + if (bs1[n] == 1) + { + cy += mpn_add_n (v1 + n, v1 + n, as1, n); + } + else if (bs1[n] == 2) + { +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 + cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n); +#else + cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2)); +#endif + } + v1[2 * n] = cy; +#else /* SMALLER_RECURSION */ + v1[2 * n] = 0; + mpn_mul_n (v1, as1, bs1, n + ((as1[n] | bs1[n]) != 0)); +#endif /* SMALLER_RECURSION */ + + mpn_mul_n (v0, a0, b0, n); /* v0, 2n limbs */ + + /* vinf, s+t limbs */ + if (s > t) mpn_mul (vinf, a4, s, b2, t); + else mpn_mul (vinf, b2, t, a4, s); + + mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t, + scratch_out); + + TMP_FREE; +} diff --git a/gmp-6.3.0/mpn/generic/toom54_mul.c b/gmp-6.3.0/mpn/generic/toom54_mul.c new file mode 100644 index 0000000..343b02e --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom54_mul.c @@ -0,0 +1,142 @@ +/* Implementation of the algorithm for Toom-Cook 4.5-way. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" + + +/* Toom-4.5, the splitting 5x4 unbalanced version. + Evaluate in: infinity, +4, -4, +2, -2, +1, -1, 0. + + <--s-><--n--><--n--><--n--><--n--> + ____ ______ ______ ______ ______ + |_a4_|__a3__|__a2__|__a1__|__a0__| + |b3_|__b2__|__b1__|__b0__| + <-t-><--n--><--n--><--n--> + +*/ +#define TOOM_54_MUL_N_REC(p, a, b, n, ws) \ + do { mpn_mul_n (p, a, b, n); \ + } while (0) + +#define TOOM_54_MUL_REC(p, a, na, b, nb, ws) \ + do { mpn_mul (p, a, na, b, nb); \ + } while (0) + +void +mpn_toom54_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, mp_ptr scratch) +{ + mp_size_t n, s, t; + int sign; + + /***************************** decomposition *******************************/ +#define a4 (ap + 4 * n) +#define b3 (bp + 3 * n) + + ASSERT (an >= bn); + n = 1 + (4 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 4); + + s = an - 4 * n; + t = bn - 3 * n; + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + /* Required by mpn_toom_interpolate_8pts. */ + ASSERT ( s + t >= n ); + ASSERT ( s + t > 4); + ASSERT ( n > 2); + +#define r8 pp /* 2n */ +#define r7 scratch /* 3n+1 */ +#define r5 (pp + 3*n) /* 3n+1 */ +#define v0 (pp + 3*n) /* n+1 */ +#define v1 (pp + 4*n+1) /* n+1 */ +#define v2 (pp + 5*n+2) /* n+1 */ +#define v3 (pp + 6*n+3) /* n+1 */ +#define r3 (scratch + 3 * n + 1) /* 3n+1 */ +#define r1 (pp + 7*n) /* s+t <= 2*n */ +#define ws (scratch + 6 * n + 2) /* ??? */ + + /* Alloc also 3n+1 limbs for ws... mpn_toom_interpolate_8pts may + need all of them, when DO_mpn_sublsh_n usea a scratch */ + /********************** evaluation and recursive calls *********************/ + /* $\pm4$ */ + sign = mpn_toom_eval_pm2exp (v2, v0, 4, ap, n, s, 2, pp) + ^ mpn_toom_eval_pm2exp (v3, v1, 3, bp, n, t, 2, pp); + TOOM_54_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-4)*B(-4) */ + TOOM_54_MUL_N_REC(r3, v2, v3, n + 1, ws); /* A(+4)*B(+4) */ + mpn_toom_couple_handling (r3, 2*n+1, pp, sign, n, 2, 4); + + /* $\pm1$ */ + sign = mpn_toom_eval_pm1 (v2, v0, 4, ap, n, s, pp) + ^ mpn_toom_eval_dgr3_pm1 (v3, v1, bp, n, t, pp); + TOOM_54_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-1)*B(-1) */ + TOOM_54_MUL_N_REC(r7, v2, v3, n + 1, ws); /* A(1)*B(1) */ + mpn_toom_couple_handling (r7, 2*n+1, pp, sign, n, 0, 0); + + /* $\pm2$ */ + sign = mpn_toom_eval_pm2 (v2, v0, 4, ap, n, s, pp) + ^ mpn_toom_eval_dgr3_pm2 (v3, v1, bp, n, t, pp); + TOOM_54_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-2)*B(-2) */ + TOOM_54_MUL_N_REC(r5, v2, v3, n + 1, ws); /* A(+2)*B(+2) */ + mpn_toom_couple_handling (r5, 2*n+1, pp, sign, n, 1, 2); + + /* A(0)*B(0) */ + TOOM_54_MUL_N_REC(pp, ap, bp, n, ws); + + /* Infinity */ + if (s > t) { + TOOM_54_MUL_REC(r1, a4, s, b3, t, ws); + } else { + TOOM_54_MUL_REC(r1, b3, t, a4, s, ws); + }; + + mpn_toom_interpolate_8pts (pp, n, r3, r7, s + t, ws); + +#undef a4 +#undef b3 +#undef r1 +#undef r3 +#undef r5 +#undef v0 +#undef v1 +#undef v2 +#undef v3 +#undef r7 +#undef r8 +#undef ws +} diff --git a/gmp-6.3.0/mpn/generic/toom62_mul.c b/gmp-6.3.0/mpn/generic/toom62_mul.c new file mode 100644 index 0000000..d971cc0 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom62_mul.c @@ -0,0 +1,310 @@ +/* mpn_toom62_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 3 times + as large as bn. Or more accurately, (5/2)bn < an < 6bn. + + Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + + The idea of applying toom to unbalanced multiplication is due to Marco + Bodrato and Alberto Zanoni. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. 
IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006-2008, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Evaluate in: + 0, +1, -1, +2, -2, 1/2, +inf + + <-s-><--n--><--n--><--n--><--n--><--n--> + ___ ______ ______ ______ ______ ______ + |a5_|___a4_|___a3_|___a2_|___a1_|___a0_| + |_b1_|___b0_| + <-t--><--n--> + + v0 = a0 * b0 # A(0)*B(0) + v1 = ( a0+ a1+ a2+ a3+ a4+ a5)*( b0+ b1) # A(1)*B(1) ah <= 5 bh <= 1 + vm1 = ( a0- a1+ a2- a3+ a4- a5)*( b0- b1) # A(-1)*B(-1) |ah| <= 2 bh = 0 + v2 = ( a0+ 2a1+4a2+8a3+16a4+32a5)*( b0+2b1) # A(2)*B(2) ah <= 62 bh <= 2 + vm2 = ( a0- 2a1+4a2-8a3+16a4-32a5)*( b0-2b1) # A(-2)*B(-2) -41<=ah<=20 -1<=bh<=0 + vh = (32a0+16a1+8a2+4a3+ 2a4+ a5)*(2b0+ b1) # A(1/2)*B(1/2) ah <= 62 bh <= 2 + vinf= a5 * b1 # A(inf)*B(inf) +*/ + +void +mpn_toom62_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, + mp_ptr scratch) +{ + mp_size_t n, s, t; + mp_limb_t cy; + mp_ptr as1, asm1, as2, asm2, ash; + mp_ptr bs1, bsm1, bs2, bsm2, bsh; + mp_ptr gp; + enum toom7_flags aflags, bflags; + TMP_DECL; + +#define a0 ap +#define a1 (ap + n) +#define a2 (ap + 2*n) +#define a3 (ap + 3*n) +#define a4 (ap + 4*n) +#define a5 (ap + 5*n) +#define b0 bp +#define b1 (bp + n) + + n = 1 + (an >= 3 * bn ? (an - 1) / (size_t) 6 : (bn - 1) >> 1); + + s = an - 5 * n; + t = bn - n; + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + + TMP_MARK; + + as1 = TMP_SALLOC_LIMBS (n + 1); + asm1 = TMP_SALLOC_LIMBS (n + 1); + as2 = TMP_SALLOC_LIMBS (n + 1); + asm2 = TMP_SALLOC_LIMBS (n + 1); + ash = TMP_SALLOC_LIMBS (n + 1); + + bs1 = TMP_SALLOC_LIMBS (n + 1); + bsm1 = TMP_SALLOC_LIMBS (n); + bs2 = TMP_SALLOC_LIMBS (n + 1); + bsm2 = TMP_SALLOC_LIMBS (n + 1); + bsh = TMP_SALLOC_LIMBS (n + 1); + + gp = pp; + + /* Compute as1 and asm1. */ + aflags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 5, ap, n, s, gp)); + + /* Compute as2 and asm2. 
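+     (as2 = A(2) and asm2 = |A(-2)|, evaluated by mpn_toom_eval_pm2 on the
+     degree-5 split of A; a negative A(-2) is folded into the flags as
+     toom7_w1_neg.)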
*/ + aflags = (enum toom7_flags) (aflags | (toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 5, ap, n, s, gp))); + + /* Compute ash = 32 a0 + 16 a1 + 8 a2 + 4 a3 + 2 a4 + a5 + = 2*(2*(2*(2*(2*a0 + a1) + a2) + a3) + a4) + a5 */ + +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (ash, a1, a0, n); + cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n); + cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n); + cy = 2*cy + mpn_addlsh1_n (ash, a4, ash, n); + if (s < n) + { + mp_limb_t cy2; + cy2 = mpn_addlsh1_n (ash, a5, ash, s); + ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1); + MPN_INCR_U (ash + s, n+1-s, cy2); + } + else + ash[n] = 2*cy + mpn_addlsh1_n (ash, a5, ash, n); +#else + cy = mpn_lshift (ash, a0, n, 1); + cy += mpn_add_n (ash, ash, a1, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + cy += mpn_add_n (ash, ash, a2, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + cy += mpn_add_n (ash, ash, a3, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + cy += mpn_add_n (ash, ash, a4, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + ash[n] = cy + mpn_add (ash, ash, n, a5, s); +#endif + + /* Compute bs1 and bsm1. */ + if (t == n) + { +#if HAVE_NATIVE_mpn_add_n_sub_n + if (mpn_cmp (b0, b1, n) < 0) + { + cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n); + bflags = toom7_w3_neg; + } + else + { + cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n); + bflags = (enum toom7_flags) 0; + } + bs1[n] = cy >> 1; +#else + bs1[n] = mpn_add_n (bs1, b0, b1, n); + if (mpn_cmp (b0, b1, n) < 0) + { + mpn_sub_n (bsm1, b1, b0, n); + bflags = toom7_w3_neg; + } + else + { + mpn_sub_n (bsm1, b0, b1, n); + bflags = (enum toom7_flags) 0; + } +#endif + } + else + { + bs1[n] = mpn_add (bs1, b0, n, b1, t); + if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0) + { + mpn_sub_n (bsm1, b1, b0, t); + MPN_ZERO (bsm1 + t, n - t); + bflags = toom7_w3_neg; + } + else + { + mpn_sub (bsm1, b0, n, b1, t); + bflags = (enum toom7_flags) 0; + } + } + + /* Compute bs2 and bsm2. Recycling bs1 and bsm1; bs2=bs1+b1, bsm2 = + bsm1 - b1 */ + mpn_add (bs2, bs1, n + 1, b1, t); + if (bflags & toom7_w3_neg) + { + bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t); + bflags = (enum toom7_flags) (bflags | toom7_w1_neg); + } + else + { + /* FIXME: Simplify this logic? */ + if (t < n) + { + if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0) + { + ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, t)); + MPN_ZERO (bsm2 + t, n + 1 - t); + bflags = (enum toom7_flags) (bflags | toom7_w1_neg); + } + else + { + ASSERT_NOCARRY (mpn_sub (bsm2, bsm1, n, b1, t)); + bsm2[n] = 0; + } + } + else + { + if (mpn_cmp (bsm1, b1, n) < 0) + { + ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, n)); + bflags = (enum toom7_flags) (bflags | toom7_w1_neg); + } + else + { + ASSERT_NOCARRY (mpn_sub_n (bsm2, bsm1, b1, n)); + } + bsm2[n] = 0; + } + } + + /* Compute bsh, recycling bs1. bsh=bs1+b0; */ + bsh[n] = bs1[n] + mpn_add_n (bsh, bs1, b0, n); + + ASSERT (as1[n] <= 5); + ASSERT (bs1[n] <= 1); + ASSERT (asm1[n] <= 2); + ASSERT (as2[n] <= 62); + ASSERT (bs2[n] <= 2); + ASSERT (asm2[n] <= 41); + ASSERT (bsm2[n] <= 1); + ASSERT (ash[n] <= 62); + ASSERT (bsh[n] <= 2); + +#define v0 pp /* 2n */ +#define v1 (pp + 2 * n) /* 2n+1 */ +#define vinf (pp + 6 * n) /* s+t */ +#define v2 scratch /* 2n+1 */ +#define vm2 (scratch + 2 * n + 1) /* 2n+1 */ +#define vh (scratch + 4 * n + 2) /* 2n+1 */ +#define vm1 (scratch + 6 * n + 3) /* 2n+1 */ +#define scratch_out (scratch + 8 * n + 4) /* 2n+1 */ + /* Total scratch need: 10*n+5 */ + + /* Must be in allocation order, as they overwrite one limb beyond + * 2n+1. 
*/ + mpn_mul_n (v2, as2, bs2, n + 1); /* v2, 2n+1 limbs */ + mpn_mul_n (vm2, asm2, bsm2, n + 1); /* vm2, 2n+1 limbs */ + mpn_mul_n (vh, ash, bsh, n + 1); /* vh, 2n+1 limbs */ + + /* vm1, 2n+1 limbs */ + mpn_mul_n (vm1, asm1, bsm1, n); + cy = 0; + if (asm1[n] == 1) + { + cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n); + } + else if (asm1[n] == 2) + { +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n); +#else + cy = mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2)); +#endif + } + vm1[2 * n] = cy; + + /* v1, 2n+1 limbs */ + mpn_mul_n (v1, as1, bs1, n); + if (as1[n] == 1) + { + cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n); + } + else if (as1[n] == 2) + { +#if HAVE_NATIVE_mpn_addlsh1_n + cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n); +#else + cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2)); +#endif + } + else if (as1[n] != 0) + { + cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]); + } + else + cy = 0; + if (bs1[n] != 0) + cy += mpn_add_n (v1 + n, v1 + n, as1, n); + v1[2 * n] = cy; + + mpn_mul_n (v0, a0, b0, n); /* v0, 2n limbs */ + + /* vinf, s+t limbs */ + if (s > t) mpn_mul (vinf, a5, s, b1, t); + else mpn_mul (vinf, b1, t, a5, s); + + mpn_toom_interpolate_7pts (pp, n, (enum toom7_flags) (aflags ^ bflags), + vm2, vm1, v2, vh, s + t, scratch_out); + + TMP_FREE; +} diff --git a/gmp-6.3.0/mpn/generic/toom63_mul.c b/gmp-6.3.0/mpn/generic/toom63_mul.c new file mode 100644 index 0000000..181996d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom63_mul.c @@ -0,0 +1,231 @@ +/* Implementation of the algorithm for Toom-Cook 4.5-way. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Stores |{ap,n}-{bp,n}| in {rp,n}, returns the sign. 
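+   The return value is 0 when {ap,n} >= {bp,n} and ~0 otherwise, so the
+   caller can fold it into a running sign mask with XOR.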
*/
+static int
+abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
+{
+  mp_limb_t  x, y;
+  while (--n >= 0)
+    {
+      x = ap[n];
+      y = bp[n];
+      if (x != y)
+        {
+          n++;
+          if (x > y)
+            {
+              mpn_sub_n (rp, ap, bp, n);
+              return 0;
+            }
+          else
+            {
+              mpn_sub_n (rp, bp, ap, n);
+              return ~0;
+            }
+        }
+      rp[n] = 0;
+    }
+  return 0;
+}
+
+static int
+abs_sub_add_n (mp_ptr rm, mp_ptr rp, mp_srcptr rs, mp_size_t n) {
+  int result;
+  result = abs_sub_n (rm, rp, rs, n);
+  ASSERT_NOCARRY(mpn_add_n (rp, rp, rs, n));
+  return result;
+}
+
+
+/* Toom-4.5, the splitting 6x3 unbalanced version.
+   Evaluate in: infinity, +4, -4, +2, -2, +1, -1, 0.
+
+   <--s-><--n--><--n--><--n--><--n--><--n-->
+    ____ ______ ______ ______ ______ ______
+   |_a5_|__a4__|__a3__|__a2__|__a1__|__a0__|
+                        |b2_|__b1__|__b0__|
+                        <-t-><--n--><--n-->
+
+*/
+#define TOOM_63_MUL_N_REC(p, a, b, n, ws) \
+  do { mpn_mul_n (p, a, b, n); \
+  } while (0)
+
+#define TOOM_63_MUL_REC(p, a, na, b, nb, ws) \
+  do { mpn_mul (p, a, na, b, nb); \
+  } while (0)
+
+void
+mpn_toom63_mul (mp_ptr pp,
+                mp_srcptr ap, mp_size_t an,
+                mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
+{
+  mp_size_t n, s, t;
+  mp_limb_t cy;
+  int sign;
+
+  /***************************** decomposition *******************************/
+#define a5  (ap + 5 * n)
+#define b0  (bp + 0 * n)
+#define b1  (bp + 1 * n)
+#define b2  (bp + 2 * n)
+
+  ASSERT (an >= bn);
+  n = 1 + (an >= 2 * bn ? (an - 1) / (size_t) 6 : (bn - 1) / (size_t) 3);
+
+  s = an - 5 * n;
+  t = bn - 2 * n;
+
+  ASSERT (0 < s && s <= n);
+  ASSERT (0 < t && t <= n);
+  /* WARNING! it assumes s+t>=n */
+  ASSERT ( s + t >= n );
+  ASSERT ( s + t > 4);
+  /* WARNING! it assumes n > 2 (see the assertion below) */
+  ASSERT ( n > 2);
+
+#define   r8    pp                              /* 2n   */
+#define   r7    scratch                         /* 3n+1 */
+#define   r5    (pp + 3*n)                      /* 3n+1 */
+#define   v0    (pp + 3*n)                      /* n+1 */
+#define   v1    (pp + 4*n+1)                    /* n+1 */
+#define   v2    (pp + 5*n+2)                    /* n+1 */
+#define   v3    (pp + 6*n+3)                    /* n+1 */
+#define   r3    (scratch + 3 * n + 1)           /* 3n+1 */
+#define   r1    (pp + 7*n)                      /* s+t <= 2*n */
+#define   ws    (scratch + 6 * n + 2)           /* ??? */
+
+  /* Alloc also 3n+1 limbs for ws... mpn_toom_interpolate_8pts may
+     need all of them, when DO_mpn_sublsh_n uses a scratch  */
+/*   if (scratch == NULL) scratch = TMP_SALLOC_LIMBS (9 * n + 3); */
+
+  /********************** evaluation and recursive calls *********************/
+  /* $\pm4$ */
+  sign = mpn_toom_eval_pm2exp (v2, v0, 5, ap, n, s, 2, pp);
+  pp[n] = mpn_lshift (pp, b1, n, 2);    /* 4b1 */
+  /* FIXME: use addlsh */
+  v3[t] = mpn_lshift (v3, b2, t, 4);    /* 16b2 */
+  if ( n == t )
+    v3[n]+= mpn_add_n (v3, v3, b0, n);  /* 16b2+b0 */
+  else
+    v3[n] = mpn_add (v3, b0, n, v3, t+1);       /* 16b2+b0 */
+  sign ^= abs_sub_add_n (v1, v3, pp, n + 1);
+  TOOM_63_MUL_N_REC(pp, v0, v1, n + 1, ws);     /* A(-4)*B(-4) */
+  TOOM_63_MUL_N_REC(r3, v2, v3, n + 1, ws);     /* A(+4)*B(+4) */
+  mpn_toom_couple_handling (r3, 2*n+1, pp, sign, n, 2, 4);
+
+  /* $\pm1$ */
+  sign = mpn_toom_eval_pm1 (v2, v0, 5, ap, n, s, pp);
+  /* Compute bs1 and bsm1.
Code taken from toom33 */ + cy = mpn_add (ws, b0, n, b2, t); +#if HAVE_NATIVE_mpn_add_n_sub_n + if (cy == 0 && mpn_cmp (ws, b1, n) < 0) + { + cy = mpn_add_n_sub_n (v3, v1, b1, ws, n); + v3[n] = cy >> 1; + v1[n] = 0; + sign = ~sign; + } + else + { + mp_limb_t cy2; + cy2 = mpn_add_n_sub_n (v3, v1, ws, b1, n); + v3[n] = cy + (cy2 >> 1); + v1[n] = cy - (cy2 & 1); + } +#else + v3[n] = cy + mpn_add_n (v3, ws, b1, n); + if (cy == 0 && mpn_cmp (ws, b1, n) < 0) + { + mpn_sub_n (v1, b1, ws, n); + v1[n] = 0; + sign = ~sign; + } + else + { + cy -= mpn_sub_n (v1, ws, b1, n); + v1[n] = cy; + } +#endif + TOOM_63_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-1)*B(-1) */ + TOOM_63_MUL_N_REC(r7, v2, v3, n + 1, ws); /* A(1)*B(1) */ + mpn_toom_couple_handling (r7, 2*n+1, pp, sign, n, 0, 0); + + /* $\pm2$ */ + sign = mpn_toom_eval_pm2 (v2, v0, 5, ap, n, s, pp); + pp[n] = mpn_lshift (pp, b1, n, 1); /* 2b1 */ + /* FIXME: use addlsh or addlsh2 */ + v3[t] = mpn_lshift (v3, b2, t, 2);/* 4b2 */ + if ( n == t ) + v3[n]+= mpn_add_n (v3, v3, b0, n); /* 4b2+b0 */ + else + v3[n] = mpn_add (v3, b0, n, v3, t+1); /* 4b2+b0 */ + sign ^= abs_sub_add_n (v1, v3, pp, n + 1); + TOOM_63_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-2)*B(-2) */ + TOOM_63_MUL_N_REC(r5, v2, v3, n + 1, ws); /* A(+2)*B(+2) */ + mpn_toom_couple_handling (r5, 2*n+1, pp, sign, n, 1, 2); + + /* A(0)*B(0) */ + TOOM_63_MUL_N_REC(pp, ap, bp, n, ws); + + /* Infinity */ + if (s > t) { + TOOM_63_MUL_REC(r1, a5, s, b2, t, ws); + } else { + TOOM_63_MUL_REC(r1, b2, t, a5, s, ws); + }; + + mpn_toom_interpolate_8pts (pp, n, r3, r7, s + t, ws); + +#undef a5 +#undef b0 +#undef b1 +#undef b2 +#undef r1 +#undef r3 +#undef r5 +#undef v0 +#undef v1 +#undef v2 +#undef v3 +#undef r7 +#undef r8 +#undef ws +} diff --git a/gmp-6.3.0/mpn/generic/toom6_sqr.c b/gmp-6.3.0/mpn/generic/toom6_sqr.c new file mode 100644 index 0000000..336eef9 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom6_sqr.c @@ -0,0 +1,181 @@ +/* Implementation of the squaring algorithm with Toom-Cook 6.5-way. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + + +#if GMP_NUMB_BITS < 21 +#error Not implemented. 
+#endif + + +#if TUNE_PROGRAM_BUILD +#define MAYBE_sqr_basecase 1 +#define MAYBE_sqr_above_basecase 1 +#define MAYBE_sqr_toom2 1 +#define MAYBE_sqr_above_toom2 1 +#define MAYBE_sqr_toom3 1 +#define MAYBE_sqr_above_toom3 1 +#define MAYBE_sqr_above_toom4 1 +#else +#ifdef SQR_TOOM8_THRESHOLD +#define SQR_TOOM6_MAX ((SQR_TOOM8_THRESHOLD+6*2-1+5)/6) +#else +#define SQR_TOOM6_MAX \ + ((SQR_FFT_THRESHOLD <= MP_SIZE_T_MAX - (6*2-1+5)) ? \ + ((SQR_FFT_THRESHOLD+6*2-1+5)/6) \ + : MP_SIZE_T_MAX ) +#endif +#define MAYBE_sqr_basecase \ + (SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM2_THRESHOLD) +#define MAYBE_sqr_above_basecase \ + (SQR_TOOM6_MAX >= SQR_TOOM2_THRESHOLD) +#define MAYBE_sqr_toom2 \ + (SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM3_THRESHOLD) +#define MAYBE_sqr_above_toom2 \ + (SQR_TOOM6_MAX >= SQR_TOOM3_THRESHOLD) +#define MAYBE_sqr_toom3 \ + (SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM4_THRESHOLD) +#define MAYBE_sqr_above_toom3 \ + (SQR_TOOM6_MAX >= SQR_TOOM4_THRESHOLD) +#define MAYBE_sqr_above_toom4 \ + (SQR_TOOM6_MAX >= SQR_TOOM6_THRESHOLD) +#endif + +#define TOOM6_SQR_REC(p, a, n, ws) \ + do { \ + if (MAYBE_sqr_basecase && ( !MAYBE_sqr_above_basecase \ + || BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))) \ + mpn_sqr_basecase (p, a, n); \ + else if (MAYBE_sqr_toom2 && ( !MAYBE_sqr_above_toom2 \ + || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))) \ + mpn_toom2_sqr (p, a, n, ws); \ + else if (MAYBE_sqr_toom3 && ( !MAYBE_sqr_above_toom3 \ + || BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))) \ + mpn_toom3_sqr (p, a, n, ws); \ + else if (! MAYBE_sqr_above_toom4 \ + || BELOW_THRESHOLD (n, SQR_TOOM6_THRESHOLD)) \ + mpn_toom4_sqr (p, a, n, ws); \ + else \ + mpn_toom6_sqr (p, a, n, ws); \ + } while (0) + +void +mpn_toom6_sqr (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_ptr scratch) +{ + mp_size_t n, s; + + /***************************** decomposition *******************************/ + + ASSERT( an >= 18 ); + + n = 1 + (an - 1) / (size_t) 6; + + s = an - 5 * n; + + ASSERT (0 < s && s <= n); + +#define r4 (pp + 3 * n) /* 3n+1 */ +#define r2 (pp + 7 * n) /* 3n+1 */ +#define r0 (pp +11 * n) /* s+t <= 2*n */ +#define r5 (scratch) /* 3n+1 */ +#define r3 (scratch + 3 * n + 1) /* 3n+1 */ +#define r1 (scratch + 6 * n + 2) /* 3n+1 */ +#define v0 (pp + 7 * n) /* n+1 */ +#define v2 (pp + 9 * n+2) /* n+1 */ +#define wse (scratch + 9 * n + 3) /* 3n+1 */ + + /* Alloc also 3n+1 limbs for ws... toom_interpolate_12pts may + need all of them, when DO_mpn_sublsh_n usea a scratch */ +/* if (scratch== NULL) */ +/* scratch = TMP_SALLOC_LIMBS (12 * n + 6); */ + + /********************** evaluation and recursive calls *********************/ + /* $\pm1/2$ */ + mpn_toom_eval_pm2rexp (v2, v0, 5, ap, n, s, 1, pp); + TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-1/2)*B(-1/2)*2^. */ + TOOM6_SQR_REC(r5, v2, n + 1, wse); /* A(+1/2)*B(+1/2)*2^. */ + mpn_toom_couple_handling (r5, 2 * n + 1, pp, 0, n, 1, 0); + + /* $\pm1$ */ + mpn_toom_eval_pm1 (v2, v0, 5, ap, n, s, pp); + TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-1)*B(-1) */ + TOOM6_SQR_REC(r3, v2, n + 1, wse); /* A(1)*B(1) */ + mpn_toom_couple_handling (r3, 2 * n + 1, pp, 0, n, 0, 0); + + /* $\pm4$ */ + mpn_toom_eval_pm2exp (v2, v0, 5, ap, n, s, 2, pp); + TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-4)*B(-4) */ + TOOM6_SQR_REC(r1, v2, n + 1, wse); /* A(+4)*B(+4) */ + mpn_toom_couple_handling (r1, 2 * n + 1, pp, 0, n, 2, 4); + + /* $\pm1/4$ */ + mpn_toom_eval_pm2rexp (v2, v0, 5, ap, n, s, 2, pp); + TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-1/4)*B(-1/4)*4^. */ + TOOM6_SQR_REC(r4, v2, n + 1, wse); /* A(+1/4)*B(+1/4)*4^. 
*/ + mpn_toom_couple_handling (r4, 2 * n + 1, pp, 0, n, 2, 0); + + /* $\pm2$ */ + mpn_toom_eval_pm2 (v2, v0, 5, ap, n, s, pp); + TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-2)*B(-2) */ + TOOM6_SQR_REC(r2, v2, n + 1, wse); /* A(+2)*B(+2) */ + mpn_toom_couple_handling (r2, 2 * n + 1, pp, 0, n, 1, 2); + +#undef v0 +#undef v2 + + /* A(0)*B(0) */ + TOOM6_SQR_REC(pp, ap, n, wse); + + mpn_toom_interpolate_12pts (pp, r1, r3, r5, n, 2 * s, 0, wse); + +#undef r0 +#undef r1 +#undef r2 +#undef r3 +#undef r4 +#undef r5 + +} +#undef TOOM6_SQR_REC +#undef MAYBE_sqr_basecase +#undef MAYBE_sqr_above_basecase +#undef MAYBE_sqr_toom2 +#undef MAYBE_sqr_above_toom2 +#undef MAYBE_sqr_toom3 +#undef MAYBE_sqr_above_toom3 +#undef MAYBE_sqr_above_toom4 diff --git a/gmp-6.3.0/mpn/generic/toom6h_mul.c b/gmp-6.3.0/mpn/generic/toom6h_mul.c new file mode 100644 index 0000000..637f2a5 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom6h_mul.c @@ -0,0 +1,262 @@ +/* Implementation of the multiplication algorithm for Toom-Cook 6.5-way. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2010, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + + +#if GMP_NUMB_BITS < 21 +#error Not implemented. +#endif + +#if TUNE_PROGRAM_BUILD +#define MAYBE_mul_basecase 1 +#define MAYBE_mul_toom22 1 +#define MAYBE_mul_toom33 1 +#define MAYBE_mul_toom6h 1 +#else +#define MAYBE_mul_basecase \ + (MUL_TOOM6H_THRESHOLD < 6 * MUL_TOOM22_THRESHOLD) +#define MAYBE_mul_toom22 \ + (MUL_TOOM6H_THRESHOLD < 6 * MUL_TOOM33_THRESHOLD) +#define MAYBE_mul_toom33 \ + (MUL_TOOM6H_THRESHOLD < 6 * MUL_TOOM44_THRESHOLD) +#define MAYBE_mul_toom6h \ + (MUL_FFT_THRESHOLD >= 6 * MUL_TOOM6H_THRESHOLD) +#endif + +#define TOOM6H_MUL_N_REC(p, a, b, f, p2, a2, b2, n, ws) \ + do { \ + if (MAYBE_mul_basecase \ + && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) { \ + mpn_mul_basecase (p, a, n, b, n); \ + if (f) \ + mpn_mul_basecase (p2, a2, n, b2, n); \ + } else if (MAYBE_mul_toom22 \ + && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) { \ + mpn_toom22_mul (p, a, n, b, n, ws); \ + if (f) \ + mpn_toom22_mul (p2, a2, n, b2, n, ws); \ + } else if (MAYBE_mul_toom33 \ + && BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD)) { \ + mpn_toom33_mul (p, a, n, b, n, ws); \ + if (f) \ + mpn_toom33_mul (p2, a2, n, b2, n, ws); \ + } else if (! 
MAYBE_mul_toom6h \
+               || BELOW_THRESHOLD (n, MUL_TOOM6H_THRESHOLD)) { \
+      mpn_toom44_mul (p, a, n, b, n, ws); \
+      if (f) \
+        mpn_toom44_mul (p2, a2, n, b2, n, ws); \
+    } else { \
+      mpn_toom6h_mul (p, a, n, b, n, ws); \
+      if (f) \
+        mpn_toom6h_mul (p2, a2, n, b2, n, ws); \
+    } \
+  } while (0)
+
+#define TOOM6H_MUL_REC(p, a, na, b, nb, ws) \
+  do { mpn_mul (p, a, na, b, nb); \
+  } while (0)
+
+/* Toom-6.5, compute the product {pp,an+bn} <- {ap,an} * {bp,bn}
+   With: an >= bn >= 46, an*6 < bn * 17.
+   It _may_ work with bn<=46 and bn*17 < an*6 < bn*18
+
+   Evaluate in: infinity, +4, -4, +2, -2, +1, -1, +1/2, -1/2, +1/4, -1/4, 0.
+*/
+/* Estimate on needed scratch:
+   S(n) <= (n+5)\6*10+4+MAX(S((n+5)\6),1+2*(n+5)\6),
+   since n>42; S(n) <= ceil(log(n)/log(6))*(10+4)+n*12\6 < n*2 + lg2(n)*6
+ */
+
+void
+mpn_toom6h_mul   (mp_ptr pp,
+                  mp_srcptr ap, mp_size_t an,
+                  mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
+{
+  mp_size_t n, s, t;
+  int p, q, half;
+  int sign;
+
+  /***************************** decomposition *******************************/
+
+  ASSERT (an >= bn);
+  /* Cannot handle too much imbalance */
+  ASSERT (bn >= 42);
+  /* Cannot handle too much imbalance */
+  ASSERT ((an*3 < bn * 8) || (bn >= 46 && an * 6 < bn * 17));
+
+  /* Limit num/den is a rational number between
+     (12/11)^(log(4)/log(2*4-1)) and (12/11)^(log(6)/log(2*6-1))             */
+#define LIMIT_numerator (18)
+#define LIMIT_denominat (17)
+
+  if (LIKELY (an * LIMIT_denominat < LIMIT_numerator * bn)) /* is 6*... < 6*... */
+    {
+      n = 1 + (an - 1) / (size_t) 6;
+      p = q = 5;
+      half = 0;
+
+      s = an - 5 * n;
+      t = bn - 5 * n;
+    }
+  else {
+    if (an * 5 * LIMIT_numerator < LIMIT_denominat * 7 * bn)
+      { p = 7; q = 6; }
+    else if (an * 5 * LIMIT_denominat < LIMIT_numerator * 7 * bn)
+      { p = 7; q = 5; }
+    else if (an * LIMIT_numerator < LIMIT_denominat * 2 * bn)  /* is 4*... < 8*... */
+      { p = 8; q = 5; }
+    else if (an * LIMIT_denominat < LIMIT_numerator * 2 * bn)  /* is 4*... < 8*... */
+      { p = 8; q = 4; }
+    else
+      { p = 9; q = 4; }
+
+    half = (p ^ q) & 1;
+    n = 1 + (q * an >= p * bn ? (an - 1) / (size_t) p : (bn - 1) / (size_t) q);
+    p--; q--;
+
+    s = an - p * n;
+    t = bn - q * n;
+
+    /* With LIMIT = 16/15, the following recovery is needed only if bn<=73*/
+    if (half) { /* Recover from badly chosen splitting */
+      if (UNLIKELY (s<1)) {p--; s+=n; half=0;}
+      else if (UNLIKELY (t<1)) {q--; t+=n; half=0;}
+    }
+  }
+#undef LIMIT_numerator
+#undef LIMIT_denominat
+
+  ASSERT (0 < s && s <= n);
+  ASSERT (0 < t && t <= n);
+  ASSERT (half || s + t > 3);
+  ASSERT (n > 2);
+
+#define   r4    (pp + 3 * n)                    /* 3n+1 */
+#define   r2    (pp + 7 * n)                    /* 3n+1 */
+#define   r0    (pp +11 * n)                    /* s+t <= 2*n */
+#define   r5    (scratch)                       /* 3n+1 */
+#define   r3    (scratch + 3 * n + 1)           /* 3n+1 */
+#define   r1    (scratch + 6 * n + 2)           /* 3n+1 */
+#define   v0    (pp + 7 * n)                    /* n+1 */
+#define   v1    (pp + 8 * n+1)                  /* n+1 */
+#define   v2    (pp + 9 * n+2)                  /* n+1 */
+#define   v3    (scratch + 9 * n + 3)           /* n+1 */
+#define   wsi   (scratch + 9 * n + 3)           /* 3n+1 */
+#define   wse   (scratch +10 * n + 4)           /* 2n+1 */
+
+  /* Alloc also 3n+1 limbs for wsi...
toom_interpolate_12pts may + need all of them */ +/* if (scratch == NULL) */ +/* scratch = TMP_SALLOC_LIMBS(mpn_toom6_sqr_itch(n * 6)); */ + ASSERT (12 * n + 6 <= mpn_toom6h_mul_itch(an,bn)); + ASSERT (12 * n + 6 <= mpn_toom6_sqr_itch(n * 6)); + + /********************** evaluation and recursive calls *********************/ + /* $\pm1/2$ */ + sign = mpn_toom_eval_pm2rexp (v2, v0, p, ap, n, s, 1, pp) ^ + mpn_toom_eval_pm2rexp (v3, v1, q, bp, n, t, 1, pp); + /* A(-1/2)*B(-1/2)*2^. */ /* A(+1/2)*B(+1/2)*2^. */ + TOOM6H_MUL_N_REC(pp, v0, v1, 2, r5, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r5, 2 * n + 1, pp, sign, n, 1+half , half); + + /* $\pm1$ */ + sign = mpn_toom_eval_pm1 (v2, v0, p, ap, n, s, pp); + if (UNLIKELY (q == 3)) + sign ^= mpn_toom_eval_dgr3_pm1 (v3, v1, bp, n, t, pp); + else + sign ^= mpn_toom_eval_pm1 (v3, v1, q, bp, n, t, pp); + /* A(-1)*B(-1) */ /* A(1)*B(1) */ + TOOM6H_MUL_N_REC(pp, v0, v1, 2, r3, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r3, 2 * n + 1, pp, sign, n, 0, 0); + + /* $\pm4$ */ + sign = mpn_toom_eval_pm2exp (v2, v0, p, ap, n, s, 2, pp) ^ + mpn_toom_eval_pm2exp (v3, v1, q, bp, n, t, 2, pp); + /* A(-4)*B(-4) */ + TOOM6H_MUL_N_REC(pp, v0, v1, 2, r1, v2, v3, n + 1, wse); /* A(+4)*B(+4) */ + mpn_toom_couple_handling (r1, 2 * n + 1, pp, sign, n, 2, 4); + + /* $\pm1/4$ */ + sign = mpn_toom_eval_pm2rexp (v2, v0, p, ap, n, s, 2, pp) ^ + mpn_toom_eval_pm2rexp (v3, v1, q, bp, n, t, 2, pp); + /* A(-1/4)*B(-1/4)*4^. */ /* A(+1/4)*B(+1/4)*4^. */ + TOOM6H_MUL_N_REC(pp, v0, v1, 2, r4, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r4, 2 * n + 1, pp, sign, n, 2*(1+half), 2*(half)); + + /* $\pm2$ */ + sign = mpn_toom_eval_pm2 (v2, v0, p, ap, n, s, pp) ^ + mpn_toom_eval_pm2 (v3, v1, q, bp, n, t, pp); + /* A(-2)*B(-2) */ /* A(+2)*B(+2) */ + TOOM6H_MUL_N_REC(pp, v0, v1, 2, r2, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r2, 2 * n + 1, pp, sign, n, 1, 2); + +#undef v0 +#undef v1 +#undef v2 +#undef v3 +#undef wse + + /* A(0)*B(0) */ + TOOM6H_MUL_N_REC(pp, ap, bp, 0, pp, ap, bp, n, wsi); + + /* Infinity */ + if (UNLIKELY (half != 0)) { + if (s > t) { + TOOM6H_MUL_REC(r0, ap + p * n, s, bp + q * n, t, wsi); + } else { + TOOM6H_MUL_REC(r0, bp + q * n, t, ap + p * n, s, wsi); + }; + }; + + mpn_toom_interpolate_12pts (pp, r1, r3, r5, n, s+t, half, wsi); + +#undef r0 +#undef r1 +#undef r2 +#undef r3 +#undef r4 +#undef r5 +#undef wsi +} + +#undef TOOM6H_MUL_N_REC +#undef TOOM6H_MUL_REC +#undef MAYBE_mul_basecase +#undef MAYBE_mul_toom22 +#undef MAYBE_mul_toom33 +#undef MAYBE_mul_toom6h diff --git a/gmp-6.3.0/mpn/generic/toom8_sqr.c b/gmp-6.3.0/mpn/generic/toom8_sqr.c new file mode 100644 index 0000000..03e5c64 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom8_sqr.c @@ -0,0 +1,225 @@ +/* Implementation of the squaring algorithm with Toom-Cook 8.5-way. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +#if GMP_NUMB_BITS < 29 +#error Not implemented. +#endif + +#if GMP_NUMB_BITS < 43 +#define BIT_CORRECTION 1 +#define CORRECTION_BITS GMP_NUMB_BITS +#else +#define BIT_CORRECTION 0 +#define CORRECTION_BITS 0 +#endif + +#ifndef SQR_TOOM8_THRESHOLD +#define SQR_TOOM8_THRESHOLD MUL_TOOM8H_THRESHOLD +#endif + +#ifndef SQR_TOOM6_THRESHOLD +#define SQR_TOOM6_THRESHOLD MUL_TOOM6H_THRESHOLD +#endif + +#if TUNE_PROGRAM_BUILD +#define MAYBE_sqr_basecase 1 +#define MAYBE_sqr_above_basecase 1 +#define MAYBE_sqr_toom2 1 +#define MAYBE_sqr_above_toom2 1 +#define MAYBE_sqr_toom3 1 +#define MAYBE_sqr_above_toom3 1 +#define MAYBE_sqr_toom4 1 +#define MAYBE_sqr_above_toom4 1 +#define MAYBE_sqr_above_toom6 1 +#else +#define SQR_TOOM8_MAX \ + ((SQR_FFT_THRESHOLD <= MP_SIZE_T_MAX - (8*2-1+7)) ? \ + ((SQR_FFT_THRESHOLD+8*2-1+7)/8) \ + : MP_SIZE_T_MAX ) +#define MAYBE_sqr_basecase \ + (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM2_THRESHOLD) +#define MAYBE_sqr_above_basecase \ + (SQR_TOOM8_MAX >= SQR_TOOM2_THRESHOLD) +#define MAYBE_sqr_toom2 \ + (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM3_THRESHOLD) +#define MAYBE_sqr_above_toom2 \ + (SQR_TOOM8_MAX >= SQR_TOOM3_THRESHOLD) +#define MAYBE_sqr_toom3 \ + (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM4_THRESHOLD) +#define MAYBE_sqr_above_toom3 \ + (SQR_TOOM8_MAX >= SQR_TOOM4_THRESHOLD) +#define MAYBE_sqr_toom4 \ + (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM6_THRESHOLD) +#define MAYBE_sqr_above_toom4 \ + (SQR_TOOM8_MAX >= SQR_TOOM6_THRESHOLD) +#define MAYBE_sqr_above_toom6 \ + (SQR_TOOM8_MAX >= SQR_TOOM8_THRESHOLD) +#endif + +#define TOOM8_SQR_REC(p, a, f, p2, a2, n, ws) \ + do { \ + if (MAYBE_sqr_basecase && ( !MAYBE_sqr_above_basecase \ + || BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))) { \ + mpn_sqr_basecase (p, a, n); \ + if (f) mpn_sqr_basecase (p2, a2, n); \ + } else if (MAYBE_sqr_toom2 && ( !MAYBE_sqr_above_toom2 \ + || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))) { \ + mpn_toom2_sqr (p, a, n, ws); \ + if (f) mpn_toom2_sqr (p2, a2, n, ws); \ + } else if (MAYBE_sqr_toom3 && ( !MAYBE_sqr_above_toom3 \ + || BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))) { \ + mpn_toom3_sqr (p, a, n, ws); \ + if (f) mpn_toom3_sqr (p2, a2, n, ws); \ + } else if (MAYBE_sqr_toom4 && ( !MAYBE_sqr_above_toom4 \ + || BELOW_THRESHOLD (n, SQR_TOOM6_THRESHOLD))) { \ + mpn_toom4_sqr (p, a, n, ws); \ + if (f) mpn_toom4_sqr (p2, a2, n, ws); \ + } else if (! 
MAYBE_sqr_above_toom6 \
+	     || BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD)) { \
+      mpn_toom6_sqr (p, a, n, ws); \
+      if (f) mpn_toom6_sqr (p2, a2, n, ws); \
+    } else { \
+      mpn_toom8_sqr (p, a, n, ws); \
+      if (f) mpn_toom8_sqr (p2, a2, n, ws); \
+    } \
+  } while (0)
+
+void
+mpn_toom8_sqr (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_ptr scratch)
+{
+  mp_size_t n, s;
+
+  /***************************** decomposition *******************************/
+
+  ASSERT ( an >= 40 );
+
+  n = 1 + ((an - 1)>>3);
+
+  s = an - 7 * n;
+
+  ASSERT (0 < s && s <= n);
+  ASSERT ( s + s > 3 );
+
+#define r6    (pp + 3 * n)		/* 3n+1 */
+#define r4    (pp + 7 * n)		/* 3n+1 */
+#define r2    (pp +11 * n)		/* 3n+1 */
+#define r0    (pp +15 * n)		/* s+t <= 2*n */
+#define r7    (scratch)			/* 3n+1 */
+#define r5    (scratch + 3 * n + 1)	/* 3n+1 */
+#define r3    (scratch + 6 * n + 2)	/* 3n+1 */
+#define r1    (scratch + 9 * n + 3)	/* 3n+1 */
+#define v0    (pp +11 * n)		/* n+1 */
+#define v2    (pp +13 * n+2)		/* n+1 */
+#define wse   (scratch +12 * n + 4)	/* 3n+1 */
+
+  /* Alloc also 3n+1 limbs for ws... toom_interpolate_16pts may
+     need all of them, when DO_mpn_sublsh_n uses a scratch  */
+/*   if (scratch == NULL) */
+/*     scratch = TMP_SALLOC_LIMBS (30 * n + 6); */
+
+  /********************** evaluation and recursive calls *********************/
+  /* $\pm1/8$ */
+  mpn_toom_eval_pm2rexp (v2, v0, 7, ap, n, s, 3, pp);
+  /* A(-1/8)*B(-1/8)*8^. */ /* A(+1/8)*B(+1/8)*8^. */
+  TOOM8_SQR_REC(pp, v0, 2, r7, v2, n + 1, wse);
+  mpn_toom_couple_handling (r7, 2 * n + 1 + BIT_CORRECTION, pp, 0, n, 3, 0);
+
+  /* $\pm1/4$ */
+  mpn_toom_eval_pm2rexp (v2, v0, 7, ap, n, s, 2, pp);
+  /* A(-1/4)*B(-1/4)*4^. */ /* A(+1/4)*B(+1/4)*4^. */
+  TOOM8_SQR_REC(pp, v0, 2, r5, v2, n + 1, wse);
+  mpn_toom_couple_handling (r5, 2 * n + 1, pp, 0, n, 2, 0);
+
+  /* $\pm2$ */
+  mpn_toom_eval_pm2 (v2, v0, 7, ap, n, s, pp);
+  /* A(-2)*B(-2) */ /* A(+2)*B(+2) */
+  TOOM8_SQR_REC(pp, v0, 2, r3, v2, n + 1, wse);
+  mpn_toom_couple_handling (r3, 2 * n + 1, pp, 0, n, 1, 2);
+
+  /* $\pm8$ */
+  mpn_toom_eval_pm2exp (v2, v0, 7, ap, n, s, 3, pp);
+  /* A(-8)*B(-8) */ /* A(+8)*B(+8) */
+  TOOM8_SQR_REC(pp, v0, 2, r1, v2, n + 1, wse);
+  mpn_toom_couple_handling (r1, 2 * n + 1 + BIT_CORRECTION, pp, 0, n, 3, 6);
+
+  /* $\pm1/2$ */
+  mpn_toom_eval_pm2rexp (v2, v0, 7, ap, n, s, 1, pp);
+  /* A(-1/2)*B(-1/2)*2^. */ /* A(+1/2)*B(+1/2)*2^.
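+     (Editorial clarification, not part of the GMP sources: the trailing
+     "2^." stands for the power-of-two scaling left implicit by
+     mpn_toom_eval_pm2rexp.  With q = 7 and shift = 1 that function
+     returns 2^7*A(+-1/2), so each recursive square below equals
+     2^14*A(+-1/2)^2 -- an integer; mpn_toom_couple_handling removes the
+     scaling again.)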
*/ + TOOM8_SQR_REC(pp, v0, 2, r6, v2, n + 1, wse); + mpn_toom_couple_handling (r6, 2 * n + 1, pp, 0, n, 1, 0); + + /* $\pm1$ */ + mpn_toom_eval_pm1 (v2, v0, 7, ap, n, s, pp); + /* A(-1)*B(-1) */ /* A(1)*B(1) */ + TOOM8_SQR_REC(pp, v0, 2, r4, v2, n + 1, wse); + mpn_toom_couple_handling (r4, 2 * n + 1, pp, 0, n, 0, 0); + + /* $\pm4$ */ + mpn_toom_eval_pm2exp (v2, v0, 7, ap, n, s, 2, pp); + /* A(-4)*B(-4) */ /* A(+4)*B(+4) */ + TOOM8_SQR_REC(pp, v0, 2, r2, v2, n + 1, wse); + mpn_toom_couple_handling (r2, 2 * n + 1, pp, 0, n, 2, 4); + +#undef v0 +#undef v2 + + /* A(0)*B(0) */ + TOOM8_SQR_REC(pp, ap, 0, pp, ap, n, wse); + + mpn_toom_interpolate_16pts (pp, r1, r3, r5, r7, n, 2 * s, 0, wse); + +#undef r0 +#undef r1 +#undef r2 +#undef r3 +#undef r4 +#undef r5 +#undef r6 +#undef wse + +} + +#undef TOOM8_SQR_REC +#undef MAYBE_sqr_basecase +#undef MAYBE_sqr_above_basecase +#undef MAYBE_sqr_toom2 +#undef MAYBE_sqr_above_toom2 +#undef MAYBE_sqr_toom3 +#undef MAYBE_sqr_above_toom3 +#undef MAYBE_sqr_above_toom4 diff --git a/gmp-6.3.0/mpn/generic/toom8h_mul.c b/gmp-6.3.0/mpn/generic/toom8h_mul.c new file mode 100644 index 0000000..5ba259a --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom8h_mul.c @@ -0,0 +1,305 @@ +/* Implementation of the multiplication algorithm for Toom-Cook 8.5-way. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2010, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + + +#if GMP_NUMB_BITS < 29 +#error Not implemented. 
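+/* (Editorial clarification, not part of the GMP sources: the +-8
+   evaluations are scaled by up to 8^14 = 2^42, and the interpolation in
+   mpn_toom_interpolate_16pts subtracts r0 shifted left by 42 bits.  A
+   single mpn shift must be smaller than GMP_NUMB_BITS, so limbs narrower
+   than 43 bits take the two-step route selected by the BIT_CORRECTION
+   and CORRECTION_BITS macros defined below.) */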
+#endif
+
+#if GMP_NUMB_BITS < 43
+#define BIT_CORRECTION 1
+#define CORRECTION_BITS GMP_NUMB_BITS
+#else
+#define BIT_CORRECTION 0
+#define CORRECTION_BITS 0
+#endif
+
+
+#if TUNE_PROGRAM_BUILD
+#define MAYBE_mul_basecase 1
+#define MAYBE_mul_toom22 1
+#define MAYBE_mul_toom33 1
+#define MAYBE_mul_toom44 1
+#define MAYBE_mul_toom8h 1
+#else
+#define MAYBE_mul_basecase \
+  (MUL_TOOM8H_THRESHOLD < 8 * MUL_TOOM22_THRESHOLD)
+#define MAYBE_mul_toom22 \
+  (MUL_TOOM8H_THRESHOLD < 8 * MUL_TOOM33_THRESHOLD)
+#define MAYBE_mul_toom33 \
+  (MUL_TOOM8H_THRESHOLD < 8 * MUL_TOOM44_THRESHOLD)
+#define MAYBE_mul_toom44 \
+  (MUL_TOOM8H_THRESHOLD < 8 * MUL_TOOM6H_THRESHOLD)
+#define MAYBE_mul_toom8h \
+  (MUL_FFT_THRESHOLD >= 8 * MUL_TOOM8H_THRESHOLD)
+#endif
+
+#define TOOM8H_MUL_N_REC(p, a, b, f, p2, a2, b2, n, ws) \
+  do { \
+    if (MAYBE_mul_basecase \
+	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) { \
+      mpn_mul_basecase (p, a, n, b, n); \
+      if (f) mpn_mul_basecase (p2, a2, n, b2, n); \
+    } else if (MAYBE_mul_toom22 \
+	&& BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) { \
+      mpn_toom22_mul (p, a, n, b, n, ws); \
+      if (f) mpn_toom22_mul (p2, a2, n, b2, n, ws); \
+    } else if (MAYBE_mul_toom33 \
+	&& BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD)) { \
+      mpn_toom33_mul (p, a, n, b, n, ws); \
+      if (f) mpn_toom33_mul (p2, a2, n, b2, n, ws); \
+    } else if (MAYBE_mul_toom44 \
+	&& BELOW_THRESHOLD (n, MUL_TOOM6H_THRESHOLD)) { \
+      mpn_toom44_mul (p, a, n, b, n, ws); \
+      if (f) mpn_toom44_mul (p2, a2, n, b2, n, ws); \
+    } else if (! MAYBE_mul_toom8h \
+	|| BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD)) { \
+      mpn_toom6h_mul (p, a, n, b, n, ws); \
+      if (f) mpn_toom6h_mul (p2, a2, n, b2, n, ws); \
+    } else { \
+      mpn_toom8h_mul (p, a, n, b, n, ws); \
+      if (f) mpn_toom8h_mul (p2, a2, n, b2, n, ws); \
+    } \
+  } while (0)
+
+#define TOOM8H_MUL_REC(p, a, na, b, nb, ws) \
+  do { mpn_mul (p, a, na, b, nb); } while (0)
+
+/* Toom-8.5, compute the product {pp,an+bn} <- {ap,an} * {bp,bn}
+   With: an >= bn >= 86, an*5 < bn * 11.
+   It _may_ work with bn<=?? and bn*?? < an*? < bn*??
+
+   Evaluate in: infinity, +8,-8,+4,-4,+2,-2,+1,-1,+1/2,-1/2,+1/4,-1/4,+1/8,-1/8, 0.
+*/
+/* Estimate on needed scratch:
+   S(n) <= (n+7)\8*13+5+MAX(S((n+7)\8),1+2*(n+7)\8),
+   since n>80; S(n) <= ceil(log(n/10)/log(8))*(13+5)+n*15\8 < n*15\8 + lg2(n)*6
+ */
+
+void
+mpn_toom8h_mul (mp_ptr pp,
+		mp_srcptr ap, mp_size_t an,
+		mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
+{
+  mp_size_t n, s, t;
+  int p, q, half;
+  int sign;
+
+  /***************************** decomposition *******************************/
+
+  ASSERT (an >= bn);
+  /* Can not handle too small operands */
+  ASSERT (bn >= 86);
+  /* Can not handle too much unbalancement */
+  ASSERT (an <= bn*4);
+  ASSERT (GMP_NUMB_BITS > 11*3 || an*4 <= bn*11);
+  ASSERT (GMP_NUMB_BITS > 10*3 || an*1 <= bn* 2);
+  ASSERT (GMP_NUMB_BITS >  9*3 || an*2 <= bn* 3);
+
+  /* Limit num/den is a rational number between
+     (16/15)^(log(6)/log(2*6-1)) and (16/15)^(log(8)/log(2*8-1)) */
+#define LIMIT_numerator (21)
+#define LIMIT_denominat (20)
+
+  if (LIKELY (an == bn) || an * (LIMIT_denominat>>1) < LIMIT_numerator * (bn>>1) ) /* is 8*... < 8*... */
+    {
+      half = 0;
+      n = 1 + ((an - 1)>>3);
+      p = q = 7;
+      s = an - 7 * n;
+      t = bn - 7 * n;
+    }
+  else
+    {
+      if (an * 13 < 16 * bn) /* (an*7*LIMIT_numerator<LIMIT_denominat*9*bn) */
+	{ p = 9; q = 8; }
+      else if (GMP_NUMB_BITS <= 9*3 ||
+	       an *(LIMIT_denominat>>1) < (LIMIT_numerator/7*9) * (bn>>1))
+	{ p = 9; q = 7; }
+      else if (an * 10 < 33 * (bn>>1)) /* (an*3*LIMIT_numerator<LIMIT_denominat*5*bn) */
+	{ p =10; q = 7; }
+      else if (GMP_NUMB_BITS <= 10*3 ||
+	       an * (LIMIT_denominat/5) < (LIMIT_numerator/3) * bn)
+	{ p =10; q = 6; }
+      else if (an * 6 < 13 * bn) /* (an*5*LIMIT_numerator<LIMIT_denominat*11*bn) */
+	{ p =11; q = 6; }
+      else if (GMP_NUMB_BITS <= 11*3 ||
+	       an * 4 < 9 * bn)
+	{ p =11; q = 5; }
+      else if (an *(LIMIT_numerator/3) < LIMIT_denominat * bn) /* is 4*... < 8*... */
+	{ p =12; q = 5; }
+      else if (GMP_NUMB_BITS <= 12*3 ||
+	       an * 9 < 28 * bn) /* is 4*... < 7*... */
+	{ p =12; q = 4; }
+      else
+	{ p =13; q = 4; }
+
+      half = (p ^ q) & 1;
+      n = 1 + (q * an >= p * bn ?
(an - 1) / (size_t) p : (bn - 1) / (size_t) q); + p--; q--; + + s = an - p * n; + t = bn - q * n; + + if(half) { /* Recover from badly chosen splitting */ + if (UNLIKELY (s<1)) {p--; s+=n; half=0;} + else if (UNLIKELY (t<1)) {q--; t+=n; half=0;} + } + } +#undef LIMIT_numerator +#undef LIMIT_denominat + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + ASSERT (half || s + t > 3); + ASSERT (n > 2); + +#define r6 (pp + 3 * n) /* 3n+1 */ +#define r4 (pp + 7 * n) /* 3n+1 */ +#define r2 (pp +11 * n) /* 3n+1 */ +#define r0 (pp +15 * n) /* s+t <= 2*n */ +#define r7 (scratch) /* 3n+1 */ +#define r5 (scratch + 3 * n + 1) /* 3n+1 */ +#define r3 (scratch + 6 * n + 2) /* 3n+1 */ +#define r1 (scratch + 9 * n + 3) /* 3n+1 */ +#define v0 (pp +11 * n) /* n+1 */ +#define v1 (pp +12 * n+1) /* n+1 */ +#define v2 (pp +13 * n+2) /* n+1 */ +#define v3 (scratch +12 * n + 4) /* n+1 */ +#define wsi (scratch +12 * n + 4) /* 3n+1 */ +#define wse (scratch +13 * n + 5) /* 2n+1 */ + + /* Alloc also 3n+1 limbs for wsi... toom_interpolate_16pts may + need all of them */ +/* if (scratch == NULL) */ +/* scratch = TMP_SALLOC_LIMBS(mpn_toom8_sqr_itch(n * 8)); */ + ASSERT (15 * n + 6 <= mpn_toom8h_mul_itch (an, bn)); + ASSERT (15 * n + 6 <= mpn_toom8_sqr_itch (n * 8)); + + /********************** evaluation and recursive calls *********************/ + + /* $\pm1/8$ */ + sign = mpn_toom_eval_pm2rexp (v2, v0, p, ap, n, s, 3, pp) ^ + mpn_toom_eval_pm2rexp (v3, v1, q, bp, n, t, 3, pp); + /* A(-1/8)*B(-1/8)*8^. */ /* A(+1/8)*B(+1/8)*8^. */ + TOOM8H_MUL_N_REC(pp, v0, v1, 2, r7, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r7, 2 * n + 1 + BIT_CORRECTION, pp, sign, n, 3*(1+half), 3*(half)); + + /* $\pm1/4$ */ + sign = mpn_toom_eval_pm2rexp (v2, v0, p, ap, n, s, 2, pp) ^ + mpn_toom_eval_pm2rexp (v3, v1, q, bp, n, t, 2, pp); + /* A(-1/4)*B(-1/4)*4^. */ /* A(+1/4)*B(+1/4)*4^. */ + TOOM8H_MUL_N_REC(pp, v0, v1, 2, r5, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r5, 2 * n + 1, pp, sign, n, 2*(1+half), 2*(half)); + + /* $\pm2$ */ + sign = mpn_toom_eval_pm2 (v2, v0, p, ap, n, s, pp) ^ + mpn_toom_eval_pm2 (v3, v1, q, bp, n, t, pp); + /* A(-2)*B(-2) */ /* A(+2)*B(+2) */ + TOOM8H_MUL_N_REC(pp, v0, v1, 2, r3, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r3, 2 * n + 1, pp, sign, n, 1, 2); + + /* $\pm8$ */ + sign = mpn_toom_eval_pm2exp (v2, v0, p, ap, n, s, 3, pp) ^ + mpn_toom_eval_pm2exp (v3, v1, q, bp, n, t, 3, pp); + /* A(-8)*B(-8) */ /* A(+8)*B(+8) */ + TOOM8H_MUL_N_REC(pp, v0, v1, 2, r1, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r1, 2 * n + 1 + BIT_CORRECTION, pp, sign, n, 3, 6); + + /* $\pm1/2$ */ + sign = mpn_toom_eval_pm2rexp (v2, v0, p, ap, n, s, 1, pp) ^ + mpn_toom_eval_pm2rexp (v3, v1, q, bp, n, t, 1, pp); + /* A(-1/2)*B(-1/2)*2^. */ /* A(+1/2)*B(+1/2)*2^. 
*/ + TOOM8H_MUL_N_REC(pp, v0, v1, 2, r6, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r6, 2 * n + 1, pp, sign, n, 1+half, half); + + /* $\pm1$ */ + sign = mpn_toom_eval_pm1 (v2, v0, p, ap, n, s, pp); + if (GMP_NUMB_BITS > 12*3 && UNLIKELY (q == 3)) + sign ^= mpn_toom_eval_dgr3_pm1 (v3, v1, bp, n, t, pp); + else + sign ^= mpn_toom_eval_pm1 (v3, v1, q, bp, n, t, pp); + /* A(-1)*B(-1) */ /* A(1)*B(1) */ + TOOM8H_MUL_N_REC(pp, v0, v1, 2, r4, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r4, 2 * n + 1, pp, sign, n, 0, 0); + + /* $\pm4$ */ + sign = mpn_toom_eval_pm2exp (v2, v0, p, ap, n, s, 2, pp) ^ + mpn_toom_eval_pm2exp (v3, v1, q, bp, n, t, 2, pp); + /* A(-4)*B(-4) */ /* A(+4)*B(+4) */ + TOOM8H_MUL_N_REC(pp, v0, v1, 2, r2, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r2, 2 * n + 1, pp, sign, n, 2, 4); + +#undef v0 +#undef v1 +#undef v2 +#undef v3 +#undef wse + + /* A(0)*B(0) */ + TOOM8H_MUL_N_REC(pp, ap, bp, 0, pp, ap, bp, n, wsi); + + /* Infinity */ + if (UNLIKELY (half != 0)) { + if (s > t) { + TOOM8H_MUL_REC(r0, ap + p * n, s, bp + q * n, t, wsi); + } else { + TOOM8H_MUL_REC(r0, bp + q * n, t, ap + p * n, s, wsi); + }; + }; + + mpn_toom_interpolate_16pts (pp, r1, r3, r5, r7, n, s+t, half, wsi); + +#undef r0 +#undef r1 +#undef r2 +#undef r3 +#undef r4 +#undef r5 +#undef r6 +#undef wsi +} + +#undef TOOM8H_MUL_N_REC +#undef TOOM8H_MUL_REC +#undef MAYBE_mul_basecase +#undef MAYBE_mul_toom22 +#undef MAYBE_mul_toom33 +#undef MAYBE_mul_toom44 +#undef MAYBE_mul_toom8h diff --git a/gmp-6.3.0/mpn/generic/toom_couple_handling.c b/gmp-6.3.0/mpn/generic/toom_couple_handling.c new file mode 100644 index 0000000..cd253f7 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_couple_handling.c @@ -0,0 +1,80 @@ +/* Helper function for high degree Toom-Cook algorithms. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Gets {pp,n} and (sign?-1:1)*{np,n}. 
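+   (Editorial clarification, not part of the GMP sources: at the call
+   sites {pp,n} holds the evaluation in +v and {np,n} the evaluation in
+   -v, so the sum and difference formed below are twice the even and odd
+   parts of the evaluated product.)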
Computes at once:
+     {pp,n} <- ({pp,n}+{np,n})/2^{ps+1}
+     {np,n} <- ({pp,n}-{np,n})/2^{ns+1}
+   Finally recompose them obtaining:
+     {pp,n+off} <- {pp,n}+{np,n}*2^{off*GMP_NUMB_BITS}
+*/
+void
+mpn_toom_couple_handling (mp_ptr pp, mp_size_t n, mp_ptr np,
+			  int nsign, mp_size_t off, int ps, int ns)
+{
+  if (nsign) {
+#ifdef HAVE_NATIVE_mpn_rsh1sub_n
+    mpn_rsh1sub_n (np, pp, np, n);
+#else
+    mpn_sub_n (np, pp, np, n);
+    mpn_rshift (np, np, n, 1);
+#endif
+  } else {
+#ifdef HAVE_NATIVE_mpn_rsh1add_n
+    mpn_rsh1add_n (np, pp, np, n);
+#else
+    mpn_add_n (np, pp, np, n);
+    mpn_rshift (np, np, n, 1);
+#endif
+  }
+
+#ifdef HAVE_NATIVE_mpn_rsh1sub_n
+  if (ps == 1)
+    mpn_rsh1sub_n (pp, pp, np, n);
+  else
+#endif
+  {
+    mpn_sub_n (pp, pp, np, n);
+    if (ps > 0)
+      mpn_rshift (pp, pp, n, ps);
+  }
+  if (ns > 0)
+    mpn_rshift (np, np, n, ns);
+  pp[n] = mpn_add_n (pp+off, pp+off, np, n-off);
+  ASSERT_NOCARRY (mpn_add_1(pp+n, np+n-off, off, pp[n]) );
+}
diff --git a/gmp-6.3.0/mpn/generic/toom_eval_dgr3_pm1.c b/gmp-6.3.0/mpn/generic/toom_eval_dgr3_pm1.c
new file mode 100644
index 0000000..5f491b6
--- /dev/null
+++ b/gmp-6.3.0/mpn/generic/toom_eval_dgr3_pm1.c
@@ -0,0 +1,72 @@
+/* mpn_toom_eval_dgr3_pm1 -- Evaluate a degree 3 polynomial in +1 and -1
+
+   Contributed to the GNU project by Niels Möller
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+int
+mpn_toom_eval_dgr3_pm1 (mp_ptr xp1, mp_ptr xm1,
+			mp_srcptr xp, mp_size_t n, mp_size_t x3n, mp_ptr tp)
+{
+  int neg;
+
+  ASSERT (x3n > 0);
+  ASSERT (x3n <= n);
+
+  xp1[n] = mpn_add_n (xp1, xp, xp + 2*n, n);
+  tp[n] = mpn_add (tp, xp + n, n, xp + 3*n, x3n);
+
+  neg = (mpn_cmp (xp1, tp, n + 1) < 0) ?
~0 : 0; + +#if HAVE_NATIVE_mpn_add_n_sub_n + if (neg) + mpn_add_n_sub_n (xp1, xm1, tp, xp1, n + 1); + else + mpn_add_n_sub_n (xp1, xm1, xp1, tp, n + 1); +#else + if (neg) + mpn_sub_n (xm1, tp, xp1, n + 1); + else + mpn_sub_n (xm1, xp1, tp, n + 1); + + mpn_add_n (xp1, xp1, tp, n + 1); +#endif + + ASSERT (xp1[n] <= 3); + ASSERT (xm1[n] <= 1); + + return neg; +} diff --git a/gmp-6.3.0/mpn/generic/toom_eval_dgr3_pm2.c b/gmp-6.3.0/mpn/generic/toom_eval_dgr3_pm2.c new file mode 100644 index 0000000..55e6b89 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_eval_dgr3_pm2.c @@ -0,0 +1,97 @@ +/* mpn_toom_eval_dgr3_pm2 -- Evaluate a degree 3 polynomial in +2 and -2 + + Contributed to the GNU project by Niels Möller + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Needs n+1 limbs of temporary storage. */ +int +mpn_toom_eval_dgr3_pm2 (mp_ptr xp2, mp_ptr xm2, + mp_srcptr xp, mp_size_t n, mp_size_t x3n, mp_ptr tp) +{ + mp_limb_t cy; + int neg; + + ASSERT (x3n > 0); + ASSERT (x3n <= n); + + /* (x0 + 4 * x2) +/- (2 x1 + 8 x_3) */ +#if HAVE_NATIVE_mpn_addlsh_n || HAVE_NATIVE_mpn_addlsh2_n +#if HAVE_NATIVE_mpn_addlsh2_n + xp2[n] = mpn_addlsh2_n (xp2, xp, xp + 2*n, n); + + cy = mpn_addlsh2_n (tp, xp + n, xp + 3*n, x3n); +#else /* HAVE_NATIVE_mpn_addlsh_n */ + xp2[n] = mpn_addlsh_n (xp2, xp, xp + 2*n, n, 2); + + cy = mpn_addlsh_n (tp, xp + n, xp + 3*n, x3n, 2); +#endif + if (x3n < n) + cy = mpn_add_1 (tp + x3n, xp + n + x3n, n - x3n, cy); + tp[n] = cy; +#else + cy = mpn_lshift (tp, xp + 2*n, n, 2); + xp2[n] = cy + mpn_add_n (xp2, tp, xp, n); + + tp[x3n] = mpn_lshift (tp, xp + 3*n, x3n, 2); + if (x3n < n) + tp[n] = mpn_add (tp, xp + n, n, tp, x3n + 1); + else + tp[n] += mpn_add_n (tp, xp + n, tp, n); +#endif + mpn_lshift (tp, tp, n+1, 1); + + neg = (mpn_cmp (xp2, tp, n + 1) < 0) ? 
~0 : 0; + +#if HAVE_NATIVE_mpn_add_n_sub_n + if (neg) + mpn_add_n_sub_n (xp2, xm2, tp, xp2, n + 1); + else + mpn_add_n_sub_n (xp2, xm2, xp2, tp, n + 1); +#else + if (neg) + mpn_sub_n (xm2, tp, xp2, n + 1); + else + mpn_sub_n (xm2, xp2, tp, n + 1); + + mpn_add_n (xp2, xp2, tp, n + 1); +#endif + + ASSERT (xp2[n] < 15); + ASSERT (xm2[n] < 10); + + return neg; +} diff --git a/gmp-6.3.0/mpn/generic/toom_eval_pm1.c b/gmp-6.3.0/mpn/generic/toom_eval_pm1.c new file mode 100644 index 0000000..a8cfa93 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_eval_pm1.c @@ -0,0 +1,89 @@ +/* mpn_toom_eval_pm1 -- Evaluate a polynomial in +1 and -1 + + Contributed to the GNU project by Niels Möller + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Evaluates a polynomial of degree k > 3, in the points +1 and -1. */ +int +mpn_toom_eval_pm1 (mp_ptr xp1, mp_ptr xm1, unsigned k, + mp_srcptr xp, mp_size_t n, mp_size_t hn, mp_ptr tp) +{ + unsigned i; + int neg; + + ASSERT (k >= 4); + + ASSERT (hn > 0); + ASSERT (hn <= n); + + /* The degree k is also the number of full-size coefficients, so + * that last coefficient, of size hn, starts at xp + k*n. */ + + xp1[n] = mpn_add_n (xp1, xp, xp + 2*n, n); + for (i = 4; i < k; i += 2) + ASSERT_NOCARRY (mpn_add (xp1, xp1, n+1, xp+i*n, n)); + + tp[n] = mpn_add_n (tp, xp + n, xp + 3*n, n); + for (i = 5; i < k; i += 2) + ASSERT_NOCARRY (mpn_add (tp, tp, n+1, xp+i*n, n)); + + if (k & 1) + ASSERT_NOCARRY (mpn_add (tp, tp, n+1, xp+k*n, hn)); + else + ASSERT_NOCARRY (mpn_add (xp1, xp1, n+1, xp+k*n, hn)); + + neg = (mpn_cmp (xp1, tp, n + 1) < 0) ? 
~0 : 0; + +#if HAVE_NATIVE_mpn_add_n_sub_n + if (neg) + mpn_add_n_sub_n (xp1, xm1, tp, xp1, n + 1); + else + mpn_add_n_sub_n (xp1, xm1, xp1, tp, n + 1); +#else + if (neg) + mpn_sub_n (xm1, tp, xp1, n + 1); + else + mpn_sub_n (xm1, xp1, tp, n + 1); + + mpn_add_n (xp1, xp1, tp, n + 1); +#endif + + ASSERT (xp1[n] <= k); + ASSERT (xm1[n] <= k/2 + 1); + + return neg; +} diff --git a/gmp-6.3.0/mpn/generic/toom_eval_pm2.c b/gmp-6.3.0/mpn/generic/toom_eval_pm2.c new file mode 100644 index 0000000..be682c7 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_eval_pm2.c @@ -0,0 +1,130 @@ +/* mpn_toom_eval_pm2 -- Evaluate a polynomial in +2 and -2 + + Contributed to the GNU project by Niels Möller and Marco Bodrato + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* DO_addlsh2(d,a,b,n,cy) computes cy,{d,n} <- {a,n} + 4*(cy,{b,n}), it + can be used as DO_addlsh2(d,a,d,n,d[n]), for accumulation on {d,n+1}. */ +#if HAVE_NATIVE_mpn_addlsh2_n +#define DO_addlsh2(d, a, b, n, cy) \ +do { \ + (cy) <<= 2; \ + (cy) += mpn_addlsh2_n(d, a, b, n); \ +} while (0) +#else +#if HAVE_NATIVE_mpn_addlsh_n +#define DO_addlsh2(d, a, b, n, cy) \ +do { \ + (cy) <<= 2; \ + (cy) += mpn_addlsh_n(d, a, b, n, 2); \ +} while (0) +#else +/* The following is not a general substitute for addlsh2. + It is correct if d == b, but it is not if d == a. */ +#define DO_addlsh2(d, a, b, n, cy) \ +do { \ + (cy) <<= 2; \ + (cy) += mpn_lshift(d, b, n, 2); \ + (cy) += mpn_add_n(d, d, a, n); \ +} while (0) +#endif +#endif + +/* Evaluates a polynomial of degree 2 < k < GMP_NUMB_BITS, in the + points +2 and -2. */ +int +mpn_toom_eval_pm2 (mp_ptr xp2, mp_ptr xm2, unsigned k, + mp_srcptr xp, mp_size_t n, mp_size_t hn, mp_ptr tp) +{ + int i; + int neg; + mp_limb_t cy; + + ASSERT (k >= 3); + ASSERT (k < GMP_NUMB_BITS); + + ASSERT (hn > 0); + ASSERT (hn <= n); + + /* The degree k is also the number of full-size coefficients, so + * that last coefficient, of size hn, starts at xp + k*n. 
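+   * (Worked instance, not part of the GMP sources: for k = 4 the code
+   * below accumulates xp2 = x0 + 4*x2 + 16*x4 and tp = 2*(x1 + 4*x3),
+   * so that xp2 + tp = f(2) and xp2 - tp = f(-2), up to the final sign
+   * bookkeeping through neg.)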
*/ + + cy = 0; + DO_addlsh2 (xp2, xp + (k-2) * n, xp + k * n, hn, cy); + if (hn != n) + cy = mpn_add_1 (xp2 + hn, xp + (k-2) * n + hn, n - hn, cy); + for (i = k - 4; i >= 0; i -= 2) + DO_addlsh2 (xp2, xp + i * n, xp2, n, cy); + xp2[n] = cy; + + k--; + + cy = 0; + DO_addlsh2 (tp, xp + (k-2) * n, xp + k * n, n, cy); + for (i = k - 4; i >= 0; i -= 2) + DO_addlsh2 (tp, xp + i * n, tp, n, cy); + tp[n] = cy; + + if (k & 1) + ASSERT_NOCARRY(mpn_lshift (tp , tp , n + 1, 1)); + else + ASSERT_NOCARRY(mpn_lshift (xp2, xp2, n + 1, 1)); + + neg = (mpn_cmp (xp2, tp, n + 1) < 0) ? ~0 : 0; + +#if HAVE_NATIVE_mpn_add_n_sub_n + if (neg) + mpn_add_n_sub_n (xp2, xm2, tp, xp2, n + 1); + else + mpn_add_n_sub_n (xp2, xm2, xp2, tp, n + 1); +#else /* !HAVE_NATIVE_mpn_add_n_sub_n */ + if (neg) + mpn_sub_n (xm2, tp, xp2, n + 1); + else + mpn_sub_n (xm2, xp2, tp, n + 1); + + mpn_add_n (xp2, xp2, tp, n + 1); +#endif /* !HAVE_NATIVE_mpn_add_n_sub_n */ + + ASSERT (xp2[n] < (1<<(k+2))-1); + ASSERT (xm2[n] < ((1<<(k+3))-1 - (1^k&1))/3); + + neg ^= ((k & 1) - 1); + + return neg; +} + +#undef DO_addlsh2 diff --git a/gmp-6.3.0/mpn/generic/toom_eval_pm2exp.c b/gmp-6.3.0/mpn/generic/toom_eval_pm2exp.c new file mode 100644 index 0000000..c3c4651 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_eval_pm2exp.c @@ -0,0 +1,127 @@ +/* mpn_toom_eval_pm2exp -- Evaluate a polynomial in +2^k and -2^k + + Contributed to the GNU project by Niels Möller + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Evaluates a polynomial of degree k > 2, in the points +2^shift and -2^shift. */ +int +mpn_toom_eval_pm2exp (mp_ptr xp2, mp_ptr xm2, unsigned k, + mp_srcptr xp, mp_size_t n, mp_size_t hn, unsigned shift, + mp_ptr tp) +{ + unsigned i; + int neg; +#if HAVE_NATIVE_mpn_addlsh_n + mp_limb_t cy; +#endif + + ASSERT (k >= 3); + ASSERT (shift*k < GMP_NUMB_BITS); + + ASSERT (hn > 0); + ASSERT (hn <= n); + + /* The degree k is also the number of full-size coefficients, so + * that last coefficient, of size hn, starts at xp + k*n. 
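+   * (Editorial clarification, not part of the GMP sources: evaluation at
+   * +-2^shift shifts coefficient x_i left by i*shift bits, whence the
+   * requirement shift*k < GMP_NUMB_BITS asserted above -- the largest
+   * shift must fit in a single limb operation.)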
*/
+
+#if HAVE_NATIVE_mpn_addlsh_n
+  xp2[n] = mpn_addlsh_n (xp2, xp, xp + 2*n, n, 2*shift);
+  for (i = 4; i < k; i += 2)
+    xp2[n] += mpn_addlsh_n (xp2, xp2, xp + i*n, n, i*shift);
+
+  tp[n] = mpn_lshift (tp, xp+n, n, shift);
+  for (i = 3; i < k; i += 2)
+    tp[n] += mpn_addlsh_n (tp, tp, xp+i*n, n, i*shift);
+
+  if (k & 1)
+    {
+      cy = mpn_addlsh_n (tp, tp, xp+k*n, hn, k*shift);
+      MPN_INCR_U (tp + hn, n+1 - hn, cy);
+    }
+  else
+    {
+      cy = mpn_addlsh_n (xp2, xp2, xp+k*n, hn, k*shift);
+      MPN_INCR_U (xp2 + hn, n+1 - hn, cy);
+    }
+
+#else /* !HAVE_NATIVE_mpn_addlsh_n */
+  xp2[n] = mpn_lshift (tp, xp+2*n, n, 2*shift);
+  xp2[n] += mpn_add_n (xp2, xp, tp, n);
+  for (i = 4; i < k; i += 2)
+    {
+      xp2[n] += mpn_lshift (tp, xp + i*n, n, i*shift);
+      xp2[n] += mpn_add_n (xp2, xp2, tp, n);
+    }
+
+  tp[n] = mpn_lshift (tp, xp+n, n, shift);
+  for (i = 3; i < k; i += 2)
+    {
+      tp[n] += mpn_lshift (xm2, xp + i*n, n, i*shift);
+      tp[n] += mpn_add_n (tp, tp, xm2, n);
+    }
+
+  xm2[hn] = mpn_lshift (xm2, xp + k*n, hn, k*shift);
+  if (k & 1)
+    mpn_add (tp, tp, n+1, xm2, hn+1);
+  else
+    mpn_add (xp2, xp2, n+1, xm2, hn+1);
+#endif /* !HAVE_NATIVE_mpn_addlsh_n */
+
+  neg = (mpn_cmp (xp2, tp, n + 1) < 0) ? ~0 : 0;
+
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  if (neg)
+    mpn_add_n_sub_n (xp2, xm2, tp, xp2, n + 1);
+  else
+    mpn_add_n_sub_n (xp2, xm2, xp2, tp, n + 1);
+#else /* !HAVE_NATIVE_mpn_add_n_sub_n */
+  if (neg)
+    mpn_sub_n (xm2, tp, xp2, n + 1);
+  else
+    mpn_sub_n (xm2, xp2, tp, n + 1);
+
+  mpn_add_n (xp2, xp2, tp, n + 1);
+#endif /* !HAVE_NATIVE_mpn_add_n_sub_n */
+
+  /* FIXME: the following asserts are useless if (k+1)*shift >= GMP_LIMB_BITS */
+  ASSERT ((k+1)*shift >= GMP_LIMB_BITS ||
+	  xp2[n] < ((CNST_LIMB(1)<<((k+1)*shift))-1)/((CNST_LIMB(1)<<shift)-1));
+  ASSERT ((k+2)*shift >= GMP_LIMB_BITS ||
+	  xm2[n] < ((CNST_LIMB(1)<<((k+2)*shift))-((k&1)?(CNST_LIMB(1)<<shift):1))/((CNST_LIMB(1)<<(2*shift))-1));
+
+  return neg;
+}
diff --git a/gmp-6.3.0/mpn/generic/toom_eval_pm2rexp.c b/gmp-6.3.0/mpn/generic/toom_eval_pm2rexp.c
new file mode 100644
--- /dev/null
+++ b/gmp-6.3.0/mpn/generic/toom_eval_pm2rexp.c
+/* mpn_toom_eval_pm2rexp -- Evaluate a polynomial in +2^-k and -2^-k
+
+   Contributed to the GNU project by Marco Bodrato
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+#if HAVE_NATIVE_mpn_addlsh_n
+#define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
+#else
+static mp_limb_t
+DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
+{
+#if USE_MUL_1 && 0
+  return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
+#else
+  mp_limb_t __cy;
+  __cy = mpn_lshift(ws,src,n,s);
+  return __cy + mpn_add_n(dst,dst,ws,n);
+#endif
+}
+#endif
+
+/* Evaluates a polynomial of degree k >= 3.
*/
+int
+mpn_toom_eval_pm2rexp (mp_ptr rp, mp_ptr rm,
+		       unsigned int q, mp_srcptr ap, mp_size_t n, mp_size_t t,
+		       unsigned int s, mp_ptr ws)
+{
+  unsigned int i;
+  int neg;
+  /* {ap,q*n+t} -> {rp,n+1} {rm,n+1} , with {ws, n+1}*/
+  ASSERT (n >= t);
+  ASSERT (s != 0); /* or _eval_pm1 should be used */
+  ASSERT (q > 1);
+  ASSERT (s*q < GMP_NUMB_BITS);
+  rp[n] = mpn_lshift(rp, ap, n, s*q);
+  ws[n] = mpn_lshift(ws, ap+n, n, s*(q-1));
+  if( (q & 1) != 0) {
+    ASSERT_NOCARRY(mpn_add(ws,ws,n+1,ap+n*q,t));
+    rp[n] += DO_mpn_addlsh_n(rp, ap+n*(q-1), n, s, rm);
+  } else {
+    ASSERT_NOCARRY(mpn_add(rp,rp,n+1,ap+n*q,t));
+  }
+  for(i=2; i<q-1; i++)
+    {
+      rp[n] += DO_mpn_addlsh_n(rp, ap+n*i, n, s*(q-i), rm);
+      i++;
+      ws[n] += DO_mpn_addlsh_n(ws, ap+n*i, n, s*(q-i), rm);
+    };
+
+  neg = (mpn_cmp (rp, ws, n + 1) < 0) ? ~0 : 0;
+
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  if (neg)
+    mpn_add_n_sub_n (rp, rm, ws, rp, n + 1);
+  else
+    mpn_add_n_sub_n (rp, rm, rp, ws, n + 1);
+#else
+  if (neg)
+    mpn_sub_n (rm, ws, rp, n + 1);
+  else
+    mpn_sub_n (rm, rp, ws, n + 1);
+
+  mpn_add_n (rp, rp, ws, n + 1);
+#endif
+
+  return neg;
+}
diff --git a/gmp-6.3.0/mpn/generic/toom_interpolate_12pts.c b/gmp-6.3.0/mpn/generic/toom_interpolate_12pts.c
new file mode 100644
--- /dev/null
+++ b/gmp-6.3.0/mpn/generic/toom_interpolate_12pts.c
+/* Interpolation for the algorithm Toom-Cook 6.5-way.
+
+   Contributed to the GNU project by Marco Bodrato.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009, 2010, 2012, 2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+/* FIXME: tuneup should decide the best variant */
+#ifndef AORSMUL_FASTER_AORS_AORSLSH
+#define AORSMUL_FASTER_AORS_AORSLSH 1
+#endif
+#ifndef AORSMUL_FASTER_AORS_2AORSLSH
+#define AORSMUL_FASTER_AORS_2AORSLSH 1
+#endif
+#ifndef AORSMUL_FASTER_2AORSLSH
+#define AORSMUL_FASTER_2AORSLSH 1
+#endif
+#ifndef AORSMUL_FASTER_3AORSLSH
+#define AORSMUL_FASTER_3AORSLSH 1
+#endif
+
+#if HAVE_NATIVE_mpn_sublsh_n
+#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
+#else
+static mp_limb_t
+DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
+{
+#if USE_MUL_1 && 0
+  return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
+#else
+  mp_limb_t __cy;
+  __cy = mpn_lshift(ws,src,n,s);
+  return __cy + mpn_sub_n(dst,dst,ws,n);
+#endif
+}
+#endif
+
+#if HAVE_NATIVE_mpn_addlsh_n
+#define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
+#else
+#if !defined (AORSMUL_FASTER_2AORSLSH) && !defined (AORSMUL_FASTER_AORS_2AORSLSH)
+static mp_limb_t
+DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
+{
+#if USE_MUL_1 && 0
+  return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
+#else
+  mp_limb_t __cy;
+  __cy = mpn_lshift(ws,src,n,s);
+  return __cy + mpn_add_n(dst,dst,ws,n);
+#endif
+}
+#endif
+#endif
+
+#if HAVE_NATIVE_mpn_subrsh
+#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
+#else
+/* FIXME: This is not a correct definition, it assumes no carry */
+#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) \
+do { \
+  mp_limb_t __cy; \
+  MPN_DECR_U (dst, nd, src[0] >> s); \
+  __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws); \
+  MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy); \
+} while (0)
+#endif
+
+
+#define BINVERT_9 \
+  ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
+
+#define BINVERT_255 \
+  (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
+
+ /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */
+#if GMP_LIMB_BITS == 32
+#define BINVERT_2835  (GMP_NUMB_MASK & CNST_LIMB(0x53E3771B))
+#define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0x9F314C35))
+#else
+#if GMP_LIMB_BITS == 64
+#define BINVERT_2835  (GMP_NUMB_MASK & CNST_LIMB(0x938CC70553E3771B))
+#define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0xE7B40D449F314C35))
+#endif
+#endif
+
+#ifndef mpn_divexact_by255
+#if GMP_NUMB_BITS % 8 == 0
+#define mpn_divexact_by255(dst,src,size) \
+  (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
+#else
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
+#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
+#else
+#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
+#endif
+#endif
+#endif
+
+#ifndef mpn_divexact_by9x4
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
+#define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
+#else
+#define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
+#endif
+#endif
+
+#ifndef mpn_divexact_by42525
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
+#define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
+#else
+#define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
+#endif
+#endif
+
+#ifndef mpn_divexact_by2835x4
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
+#define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
+#else
+#define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
+#endif
+#endif
+
+/* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
+   points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely,
+   we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
+   degree 11 (or 10), given the 12 (rsp. 11) values:
+
+     r0 = limit at infinity of f(x) / x^11,
+     r1 = f(4),f(-4),
+     r2 = f(2),f(-2),
+     r3 = f(1),f(-1),
+     r4 = f(1/4),f(-1/4),
+     r5 = f(1/2),f(-1/2),
+     r6 = f(0).
+
+   All couples of the form f(n),f(-n) must be already mixed with
+   toom_couple_handling(f(n),...,f(-n),...)
+
+   The result is stored in {pp, spt + 7*n (or 6*n)}.
+   At entry, r6 is stored at {pp, 2n},
+   r4 is stored at {pp + 3n, 3n + 1}.
+   r2 is stored at {pp + 7n, 3n + 1}.
+   r0 is stored at {pp +11n, spt}.
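+   (Editorial clarification, not part of the GMP sources: spt stands for
+   s+t, the length of the value at infinity -- the product of the two
+   leading blocks -- and is at most 2n limbs.)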
+ + The other values are 3n+1 limbs each (with most significant limbs small). + + Negative intermediate results are stored two-complemented. + Inputs are destroyed. +*/ + +void +mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5, + mp_size_t n, mp_size_t spt, int half, mp_ptr wsi) +{ + mp_limb_t cy; + mp_size_t n3; + mp_size_t n3p1; + n3 = 3 * n; + n3p1 = n3 + 1; + +#define r4 (pp + n3) /* 3n+1 */ +#define r2 (pp + 7 * n) /* 3n+1 */ +#define r0 (pp +11 * n) /* s+t <= 2*n */ + + /******************************* interpolation *****************************/ + if (half != 0) { + cy = mpn_sub_n (r3, r3, r0, spt); + MPN_DECR_U (r3 + spt, n3p1 - spt, cy); + + cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi); + MPN_DECR_U (r2 + spt, n3p1 - spt, cy); + DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi); + + cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi); + MPN_DECR_U (r1 + spt, n3p1 - spt, cy); + DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi); + }; + + r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi); + DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi); + +#if HAVE_NATIVE_mpn_add_n_sub_n + mpn_add_n_sub_n (r1, r4, r4, r1, n3p1); +#else + ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1)); + mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */ + MP_PTR_SWAP(r1, wsi); +#endif + + r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi); + DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi); + +#if HAVE_NATIVE_mpn_add_n_sub_n + mpn_add_n_sub_n (r2, r5, r5, r2, n3p1); +#else + mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */ + ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1)); + MP_PTR_SWAP(r5, wsi); +#endif + + r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n); + +#if AORSMUL_FASTER_AORS_AORSLSH + mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */ +#else + mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */ + DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */ +#endif + /* A division by 2835x4 follows. Warning: the operand can be negative! 
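+     (Editorial clarification, not part of the GMP sources:
+     mpn_divexact_by2835x4 ends with a right shift by two bits, which
+     zero-fills the top of the most significant limb; for a negative
+     two's-complement operand the masking right after the division puts
+     the sign bits back.  Note 2835*4 = 2^2 * 3^4 * 5 * 7, and the
+     interpolation guarantees the operand is an exact multiple.)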
*/ + mpn_divexact_by2835x4(r4, r4, n3p1); + if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0) + r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2)); + +#if AORSMUL_FASTER_2AORSLSH + mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */ +#else + DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */ + DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */ +#endif + mpn_divexact_by255(r5, r5, n3p1); + + ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi)); + +#if AORSMUL_FASTER_3AORSLSH + ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100)); +#else + ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi)); + ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi)); + ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi)); +#endif + ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi)); + mpn_divexact_by42525(r1, r1, n3p1); + +#if AORSMUL_FASTER_AORS_2AORSLSH + ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225)); +#else + ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1)); + ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi)); + ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi)); +#endif + mpn_divexact_by9x4(r2, r2, n3p1); + + ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1)); + +#ifdef HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (r4, r2, r4, n3p1); + r4 [n3p1 - 1] &= GMP_NUMB_MASK >> 1; +#else + mpn_sub_n (r4, r2, r4, n3p1); + ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1)); +#endif + ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1)); + +#ifdef HAVE_NATIVE_mpn_rsh1add_n + mpn_rsh1add_n (r5, r5, r1, n3p1); + r5 [n3p1 - 1] &= GMP_NUMB_MASK >> 1; +#else + mpn_add_n (r5, r5, r1, n3p1); + ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1)); +#endif + + /* last interpolation steps... */ + ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1)); + ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1)); + /* ... 
could be mixed with recomposition + ||H-r5|M-r5|L-r5| ||H-r1|M-r1|L-r1| + */ + + /***************************** recomposition *******************************/ + /* + pp[] prior to operations: + |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp + + summation scheme for remaining operations: + |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp + |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp + ||H r1|M r1|L r1| ||H r3|M r3|L r3| ||H_r5|M_r5|L_r5| + */ + + cy = mpn_add_n (pp + n, pp + n, r5, n); + cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy); +#if HAVE_NATIVE_mpn_add_nc + cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy); +#else + MPN_INCR_U (r5 + 2 * n, n + 1, cy); + cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n); +#endif + MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy); + + pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n); + cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]); +#if HAVE_NATIVE_mpn_add_nc + cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy); +#else + MPN_INCR_U (r3 + 2 * n, n + 1, cy); + cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n); +#endif + MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy); + + pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n); + if (half) { + cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]); +#if HAVE_NATIVE_mpn_add_nc + if (LIKELY (spt > n)) { + cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy); + MPN_INCR_U (pp + 4 * n3, spt - n, cy); + } else { + ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy)); + } +#else + MPN_INCR_U (r1 + 2 * n, n + 1, cy); + if (LIKELY (spt > n)) { + cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n); + MPN_INCR_U (pp + 4 * n3, spt - n, cy); + } else { + ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt)); + } +#endif + } else { + ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n])); + } + +#undef r0 +#undef r2 +#undef r4 +} diff --git a/gmp-6.3.0/mpn/generic/toom_interpolate_16pts.c b/gmp-6.3.0/mpn/generic/toom_interpolate_16pts.c new file mode 100644 index 0000000..c1457be --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_interpolate_16pts.c @@ -0,0 +1,545 @@ +/* Interpolation for the algorithm Toom-Cook 8.5-way. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2010, 2012, 2015, 2020 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/. */
+
+
+#include "gmp-impl.h"
+
+
+#if GMP_NUMB_BITS < 29
+#error Not implemented: Both sublsh_n(,,,28) should be corrected; r2 and r5 need one more LIMB.
+#endif
+
+#if GMP_NUMB_BITS < 28
+#error Not implemented: divexact_by188513325 and _by182712915 will not work.
+#endif
+
+
+/* FIXME: tuneup should decide the best variant */
+#ifndef AORSMUL_FASTER_AORS_AORSLSH
+#define AORSMUL_FASTER_AORS_AORSLSH 1
+#endif
+#ifndef AORSMUL_FASTER_AORS_2AORSLSH
+#define AORSMUL_FASTER_AORS_2AORSLSH 1
+#endif
+#ifndef AORSMUL_FASTER_2AORSLSH
+#define AORSMUL_FASTER_2AORSLSH 1
+#endif
+#ifndef AORSMUL_FASTER_3AORSLSH
+#define AORSMUL_FASTER_3AORSLSH 1
+#endif
+
+
+#if HAVE_NATIVE_mpn_sublsh_n
+#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
+#else
+static mp_limb_t
+DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
+{
+#if USE_MUL_1 && 0
+  return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
+#else
+  mp_limb_t __cy;
+  __cy = mpn_lshift(ws,src,n,s);
+  return __cy + mpn_sub_n(dst,dst,ws,n);
+#endif
+}
+#endif
+
+#if HAVE_NATIVE_mpn_addlsh_n
+#define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
+#else
+#if !defined (AORSMUL_FASTER_2AORSLSH) && !defined (AORSMUL_FASTER_AORS_2AORSLSH)
+static mp_limb_t
+DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
+{
+#if USE_MUL_1 && 0
+  return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
+#else
+  mp_limb_t __cy;
+  __cy = mpn_lshift(ws,src,n,s);
+  return __cy + mpn_add_n(dst,dst,ws,n);
+#endif
+}
+#endif
+#endif
+
+#if HAVE_NATIVE_mpn_subrsh
+#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
+#else
+/* FIXME: This is not a correct definition, it assumes no carry */
+#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) \
+do { \
+  mp_limb_t __cy; \
+  MPN_DECR_U (dst, nd, src[0] >> s); \
+  __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws); \
+  MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy); \
+} while (0)
+#endif
+
+
+#if GMP_NUMB_BITS < 43
+#define BIT_CORRECTION 1
+#define CORRECTION_BITS GMP_NUMB_BITS
+#else
+#define BIT_CORRECTION 0
+#define CORRECTION_BITS 0
+#endif
+
+#define BINVERT_9 \
+  ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
+
+#define BINVERT_255 \
+  (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
+
+ /* FIXME: find some more general expressions for inverses */
+#if GMP_LIMB_BITS == 32
+#define BINVERT_2835  (GMP_NUMB_MASK & CNST_LIMB(0x53E3771B))
+#define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0x9F314C35))
+#define BINVERT_182712915 (GMP_NUMB_MASK & CNST_LIMB(0x550659DB))
+#define BINVERT_188513325 (GMP_NUMB_MASK & CNST_LIMB(0xFBC333A5))
+#define BINVERT_255x182712915L (GMP_NUMB_MASK & CNST_LIMB(0x6FC4CB25))
+#define BINVERT_255x188513325L (GMP_NUMB_MASK & CNST_LIMB(0x6864275B))
+#if GMP_NAIL_BITS == 0
+#define BINVERT_255x182712915H CNST_LIMB(0x1B649A07)
+#define BINVERT_255x188513325H CNST_LIMB(0x06DB993A)
+#else /* GMP_NAIL_BITS != 0 */
+#define BINVERT_255x182712915H \
+  (GMP_NUMB_MASK & CNST_LIMB((0x1B649A07<<GMP_NAIL_BITS)|(0x6FC4CB25>>GMP_NUMB_BITS)))
+#define BINVERT_255x188513325H \
+  (GMP_NUMB_MASK & CNST_LIMB((0x06DB993A<<GMP_NAIL_BITS)|(0x6864275B>>GMP_NUMB_BITS)))
+#endif
+#else
+#if GMP_LIMB_BITS == 64
+#define BINVERT_2835  (GMP_NUMB_MASK & CNST_LIMB(0x938CC70553E3771B))
+#define BINVERT_42525 (GMP_NUMB_MASK &
CNST_LIMB(0xE7B40D449F314C35)) +#define BINVERT_255x182712915 (GMP_NUMB_MASK & CNST_LIMB(0x1B649A076FC4CB25)) +#define BINVERT_255x188513325 (GMP_NUMB_MASK & CNST_LIMB(0x06DB993A6864275B)) +#endif +#endif + +#ifndef mpn_divexact_by255 +#if GMP_NUMB_BITS % 8 == 0 +#define mpn_divexact_by255(dst,src,size) \ + (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255))) +#else +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0) +#else +#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255)) +#endif +#endif +#endif + +#ifndef mpn_divexact_by255x4 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define mpn_divexact_by255x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,2) +#else +#define mpn_divexact_by255x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255)<<2) +#endif +#endif + +#ifndef mpn_divexact_by9x16 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define mpn_divexact_by9x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,4) +#else +#define mpn_divexact_by9x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<4) +#endif +#endif + +#ifndef mpn_divexact_by42525x16 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525) +#define mpn_divexact_by42525x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,4) +#else +#define mpn_divexact_by42525x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525)<<4) +#endif +#endif + +#ifndef mpn_divexact_by2835x64 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835) +#define mpn_divexact_by2835x64(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,6) +#else +#define mpn_divexact_by2835x64(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<6) +#endif +#endif + +#ifndef mpn_divexact_by255x182712915 +#if GMP_NUMB_BITS < 36 +#if HAVE_NATIVE_mpn_bdiv_q_2_pi2 && defined(BINVERT_255x182712915H) +/* FIXME: use mpn_bdiv_q_2_pi2 */ +#endif +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_182712915) +#define mpn_divexact_by255x182712915(dst,src,size) \ + do { \ + mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(182712915),BINVERT_182712915,0); \ + mpn_divexact_by255(dst,dst,size); \ + } while(0) +#else +#define mpn_divexact_by255x182712915(dst,src,size) \ + do { \ + mpn_divexact_1(dst,src,size,CNST_LIMB(182712915)); \ + mpn_divexact_by255(dst,dst,size); \ + } while(0) +#endif +#else /* GMP_NUMB_BITS > 35 */ +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x182712915) +#define mpn_divexact_by255x182712915(dst,src,size) \ + mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(182712915),BINVERT_255x182712915,0) +#else +#define mpn_divexact_by255x182712915(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(182712915)) +#endif +#endif /* GMP_NUMB_BITS >?< 36 */ +#endif + +#ifndef mpn_divexact_by255x188513325 +#if GMP_NUMB_BITS < 36 +#if HAVE_NATIVE_mpn_bdiv_q_1_pi2 && defined(BINVERT_255x188513325H) +/* FIXME: use mpn_bdiv_q_1_pi2 */ +#endif +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_188513325) +#define mpn_divexact_by255x188513325(dst,src,size) \ + do { \ + mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(188513325),BINVERT_188513325,0); \ + mpn_divexact_by255(dst,dst,size); \ + } while(0) +#else +#define mpn_divexact_by255x188513325(dst,src,size) \ + do { \ + mpn_divexact_1(dst,src,size,CNST_LIMB(188513325)); \ + mpn_divexact_by255(dst,dst,size); \ + } while(0) +#endif +#else /* GMP_NUMB_BITS > 35 */ +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 
&& defined(BINVERT_255x188513325) +#define mpn_divexact_by255x188513325(dst,src,size) \ + mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(188513325),BINVERT_255x188513325,0) +#else +#define mpn_divexact_by255x188513325(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(188513325)) +#endif +#endif /* GMP_NUMB_BITS >?< 36 */ +#endif + +/* Interpolation for Toom-8.5 (or Toom-8), using the evaluation + points: infinity(8.5 only), +-8, +-4, +-2, +-1, +-1/4, +-1/2, + +-1/8, 0. More precisely, we want to compute + f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 15 (or + 14), given the 16 (rsp. 15) values: + + r0 = limit at infinity of f(x) / x^15, + r1 = f(8),f(-8), + r2 = f(4),f(-4), + r3 = f(2),f(-2), + r4 = f(1),f(-1), + r5 = f(1/4),f(-1/4), + r6 = f(1/2),f(-1/2), + r7 = f(1/8),f(-1/8), + r8 = f(0). + + All couples of the form f(n),f(-n) must be already mixed with + toom_couple_handling(f(n),...,f(-n),...) + + The result is stored in {pp, spt + 7*n (or 8*n)}. + At entry, r8 is stored at {pp, 2n}, + r6 is stored at {pp + 3n, 3n + 1}. + r4 is stored at {pp + 7n, 3n + 1}. + r2 is stored at {pp +11n, 3n + 1}. + r0 is stored at {pp +15n, spt}. + + The other values are 3n+1 limbs each (with most significant limbs small). + + Negative intermediate results are stored two-complemented. + Inputs are destroyed. +*/ + +void +mpn_toom_interpolate_16pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5, mp_ptr r7, + mp_size_t n, mp_size_t spt, int half, mp_ptr wsi) +{ + mp_limb_t cy; + mp_size_t n3; + mp_size_t n3p1; + n3 = 3 * n; + n3p1 = n3 + 1; + +#define r6 (pp + n3) /* 3n+1 */ +#define r4 (pp + 7 * n) /* 3n+1 */ +#define r2 (pp +11 * n) /* 3n+1 */ +#define r0 (pp +15 * n) /* s+t <= 2*n */ + + ASSERT( spt <= 2 * n ); + /******************************* interpolation *****************************/ + if( half != 0) { + cy = mpn_sub_n (r4, r4, r0, spt); + MPN_DECR_U (r4 + spt, n3p1 - spt, cy); + + cy = DO_mpn_sublsh_n (r3, r0, spt, 14, wsi); + MPN_DECR_U (r3 + spt, n3p1 - spt, cy); + DO_mpn_subrsh(r6, n3p1, r0, spt, 2, wsi); + + cy = DO_mpn_sublsh_n (r2, r0, spt, 28, wsi); + MPN_DECR_U (r2 + spt, n3p1 - spt, cy); + DO_mpn_subrsh(r5, n3p1, r0, spt, 4, wsi); + + cy = DO_mpn_sublsh_n (r1 + BIT_CORRECTION, r0, spt, 42 - CORRECTION_BITS, wsi); +#if BIT_CORRECTION + cy = mpn_sub_1 (r1 + spt + BIT_CORRECTION, r1 + spt + BIT_CORRECTION, + n3p1 - spt - BIT_CORRECTION, cy); + ASSERT (BIT_CORRECTION > 0 || cy == 0); + /* FIXME: assumes r7[n3p1] is writable (it is if r5 follows). */ + cy = r7[n3p1]; + r7[n3p1] = 0x80; +#else + MPN_DECR_U (r1 + spt + BIT_CORRECTION, n3p1 - spt - BIT_CORRECTION, cy); +#endif + DO_mpn_subrsh(r7, n3p1 + BIT_CORRECTION, r0, spt, 6, wsi); +#if BIT_CORRECTION + /* FIXME: assumes r7[n3p1] is writable. 
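+     (Editorial clarification, not part of the GMP sources: the 0x80
+     written above is an arbitrary nonzero sentinel.  It gives MPN_DECR_U
+     room to absorb a borrow without running past the buffer, and the
+     ASSERT below checks that no borrow actually consumed it before the
+     saved limb is put back.)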
*/ + ASSERT ( BIT_CORRECTION > 0 || r7[n3p1] == 0x80 ); + r7[n3p1] = cy; +#endif + }; + + r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 28, wsi); + DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 4, wsi); + +#if HAVE_NATIVE_mpn_add_n_sub_n + mpn_add_n_sub_n (r2, r5, r5, r2, n3p1); +#else + mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */ + ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1)); + MP_PTR_SWAP(r5, wsi); +#endif + + r6[n3] -= DO_mpn_sublsh_n (r6 + n, pp, 2 * n, 14, wsi); + DO_mpn_subrsh(r3 + n, 2 * n + 1, pp, 2 * n, 2, wsi); + +#if HAVE_NATIVE_mpn_add_n_sub_n + mpn_add_n_sub_n (r3, r6, r6, r3, n3p1); +#else + ASSERT_NOCARRY(mpn_add_n (wsi, r3, r6, n3p1)); + mpn_sub_n (r6, r6, r3, n3p1); /* can be negative */ + MP_PTR_SWAP(r3, wsi); +#endif + + cy = DO_mpn_sublsh_n (r7 + n + BIT_CORRECTION, pp, 2 * n, 42 - CORRECTION_BITS, wsi); +#if BIT_CORRECTION + MPN_DECR_U (r1 + n, 2 * n + 1, pp[0] >> 6); + cy = DO_mpn_sublsh_n (r1 + n, pp + 1, 2 * n - 1, GMP_NUMB_BITS - 6, wsi); + cy = mpn_sub_1(r1 + 3 * n - 1, r1 + 3 * n - 1, 2, cy); + ASSERT ( BIT_CORRECTION > 0 || cy != 0 ); +#else + r7[n3] -= cy; + DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 6, wsi); +#endif + +#if HAVE_NATIVE_mpn_add_n_sub_n + mpn_add_n_sub_n (r1, r7, r7, r1, n3p1); +#else + mpn_sub_n (wsi, r7, r1, n3p1); /* can be negative */ + mpn_add_n (r1, r1, r7, n3p1); /* if BIT_CORRECTION != 0, can give a carry. */ + MP_PTR_SWAP(r7, wsi); +#endif + + r4[n3] -= mpn_sub_n (r4+n, r4+n, pp, 2 * n); + +#if AORSMUL_FASTER_2AORSLSH + mpn_submul_1 (r5, r6, n3p1, 1028); /* can be negative */ +#else + DO_mpn_sublsh_n (r5, r6, n3p1, 2, wsi); /* can be negative */ + DO_mpn_sublsh_n (r5, r6, n3p1,10, wsi); /* can be negative */ +#endif + + mpn_submul_1 (r7, r5, n3p1, 1300); /* can be negative */ +#if AORSMUL_FASTER_3AORSLSH + mpn_submul_1 (r7, r6, n3p1, 1052688); /* can be negative */ +#else + DO_mpn_sublsh_n (r7, r6, n3p1, 4, wsi); /* can be negative */ + DO_mpn_sublsh_n (r7, r6, n3p1,12, wsi); /* can be negative */ + DO_mpn_sublsh_n (r7, r6, n3p1,20, wsi); /* can be negative */ +#endif + mpn_divexact_by255x188513325(r7, r7, n3p1); + + mpn_submul_1 (r5, r7, n3p1, 12567555); /* can be negative */ + /* A division by 2835x64 follows. Warning: the operand can be negative! */ + mpn_divexact_by2835x64(r5, r5, n3p1); + if ((r5[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-7))) != 0) + r5[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-6)); + +#if AORSMUL_FASTER_AORS_AORSLSH + mpn_submul_1 (r6, r7, n3p1, 4095); /* can be negative */ +#else + mpn_add_n (r6, r6, r7, n3p1); /* can give a carry */ + DO_mpn_sublsh_n (r6, r7, n3p1, 12, wsi); /* can be negative */ +#endif +#if AORSMUL_FASTER_2AORSLSH + mpn_addmul_1 (r6, r5, n3p1, 240); /* can be negative */ +#else + DO_mpn_addlsh_n (r6, r5, n3p1, 8, wsi); /* can give a carry */ + DO_mpn_sublsh_n (r6, r5, n3p1, 4, wsi); /* can be negative */ +#endif + /* A division by 255x4 follows. Warning: the operand can be negative! 
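+     (Editorial clarification, not part of the GMP sources: same
+     sign-extension fix-up as after the division by 2835x64 above -- the
+     bits zero-filled by the final right shift are restored by the
+     masking below when the quotient is negative.)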
*/ + mpn_divexact_by255x4(r6, r6, n3p1); + if ((r6[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0) + r6[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2)); + + ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r4, n3p1, 7, wsi)); + + ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r4, n3p1, 13, wsi)); + ASSERT_NOCARRY(mpn_submul_1 (r2, r3, n3p1, 400)); + + /* If GMP_NUMB_BITS < 42 next operations on r1 can give a carry!*/ + DO_mpn_sublsh_n (r1, r4, n3p1, 19, wsi); + mpn_submul_1 (r1, r2, n3p1, 1428); + mpn_submul_1 (r1, r3, n3p1, 112896); + mpn_divexact_by255x182712915(r1, r1, n3p1); + + ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 15181425)); + mpn_divexact_by42525x16(r2, r2, n3p1); + +#if AORSMUL_FASTER_AORS_2AORSLSH + ASSERT_NOCARRY(mpn_submul_1 (r3, r1, n3p1, 3969)); +#else + ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1)); + ASSERT_NOCARRY(DO_mpn_addlsh_n (r3, r1, n3p1, 7, wsi)); + ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r1, n3p1, 12, wsi)); +#endif + ASSERT_NOCARRY(mpn_submul_1 (r3, r2, n3p1, 900)); + mpn_divexact_by9x16(r3, r3, n3p1); + + ASSERT_NOCARRY(mpn_sub_n (r4, r4, r1, n3p1)); + ASSERT_NOCARRY(mpn_sub_n (r4, r4, r3, n3p1)); + ASSERT_NOCARRY(mpn_sub_n (r4, r4, r2, n3p1)); + +#ifdef HAVE_NATIVE_mpn_rsh1add_n + mpn_rsh1add_n (r6, r2, r6, n3p1); + r6 [n3p1 - 1] &= GMP_NUMB_MASK >> 1; +#else + mpn_add_n (r6, r2, r6, n3p1); + ASSERT_NOCARRY(mpn_rshift(r6, r6, n3p1, 1)); +#endif + ASSERT_NOCARRY(mpn_sub_n (r2, r2, r6, n3p1)); + +#ifdef HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (r5, r3, r5, n3p1); + r5 [n3p1 - 1] &= GMP_NUMB_MASK >> 1; +#else + mpn_sub_n (r5, r3, r5, n3p1); + ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1)); +#endif + ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, n3p1)); + +#ifdef HAVE_NATIVE_mpn_rsh1add_n + mpn_rsh1add_n (r7, r1, r7, n3p1); + r7 [n3p1 - 1] &= GMP_NUMB_MASK >> 1; +#else + mpn_add_n (r7, r1, r7, n3p1); + ASSERT_NOCARRY(mpn_rshift(r7, r7, n3p1, 1)); +#endif + ASSERT_NOCARRY(mpn_sub_n (r1, r1, r7, n3p1)); + + /* last interpolation steps... */ + /* ... 
could be mixed with recomposition + ||H-r7|M-r7|L-r7| ||H-r5|M-r5|L-r5| + */ + + /***************************** recomposition *******************************/ + /* + pp[] prior to operations: + |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp + + summation scheme for remaining operations: + |__16|n_15|n_14|n_13|n_12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp + |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp + ||H r1|M r1|L r1| ||H r3|M r3|L r3| ||H_r5|M_r5|L_r5| ||H r7|M r7|L r7| + */ + + cy = mpn_add_n (pp + n, pp + n, r7, n); + cy = mpn_add_1 (pp + 2 * n, r7 + n, n, cy); +#if HAVE_NATIVE_mpn_add_nc + cy = r7[n3] + mpn_add_nc(pp + n3, pp + n3, r7 + 2 * n, n, cy); +#else + MPN_INCR_U (r7 + 2 * n, n + 1, cy); + cy = r7[n3] + mpn_add_n (pp + n3, pp + n3, r7 + 2 * n, n); +#endif + MPN_INCR_U (pp + 4 * n, 2 * n + 1, cy); + + pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r5, n); + cy = mpn_add_1 (pp + 2 * n3, r5 + n, n, pp[2 * n3]); +#if HAVE_NATIVE_mpn_add_nc + cy = r5[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r5 + 2 * n, n, cy); +#else + MPN_INCR_U (r5 + 2 * n, n + 1, cy); + cy = r5[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r5 + 2 * n, n); +#endif + MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy); + + pp[10 * n]+= mpn_add_n (pp + 9 * n, pp + 9 * n, r3, n); + cy = mpn_add_1 (pp + 10 * n, r3 + n, n, pp[10 * n]); +#if HAVE_NATIVE_mpn_add_nc + cy = r3[n3] + mpn_add_nc(pp +11 * n, pp +11 * n, r3 + 2 * n, n, cy); +#else + MPN_INCR_U (r3 + 2 * n, n + 1, cy); + cy = r3[n3] + mpn_add_n (pp +11 * n, pp +11 * n, r3 + 2 * n, n); +#endif + MPN_INCR_U (pp +12 * n, 2 * n + 1, cy); + + pp[14 * n]+=mpn_add_n (pp +13 * n, pp +13 * n, r1, n); + if ( half ) { + cy = mpn_add_1 (pp + 14 * n, r1 + n, n, pp[14 * n]); +#if HAVE_NATIVE_mpn_add_nc + if(LIKELY(spt > n)) { + cy = r1[n3] + mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, n, cy); + MPN_INCR_U (pp + 16 * n, spt - n, cy); + } else { + ASSERT_NOCARRY(mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt, cy)); + } +#else + MPN_INCR_U (r1 + 2 * n, n + 1, cy); + if(LIKELY(spt > n)) { + cy = r1[n3] + mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, n); + MPN_INCR_U (pp + 16 * n, spt - n, cy); + } else { + ASSERT_NOCARRY(mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt)); + } +#endif + } else { + ASSERT_NOCARRY(mpn_add_1 (pp + 14 * n, r1 + n, spt, pp[14 * n])); + } + +#undef r0 +#undef r2 +#undef r4 +#undef r6 +} diff --git a/gmp-6.3.0/mpn/generic/toom_interpolate_5pts.c b/gmp-6.3.0/mpn/generic/toom_interpolate_5pts.c new file mode 100644 index 0000000..466ab85 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_interpolate_5pts.c @@ -0,0 +1,198 @@ +/* mpn_toom_interpolate_5pts -- Interpolate for toom3, 33, 42. + + Contributed to the GNU project by Robert Harley. + Improvements by Paul Zimmermann and Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2000-2003, 2005-2007, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +void +mpn_toom_interpolate_5pts (mp_ptr c, mp_ptr v2, mp_ptr vm1, + mp_size_t k, mp_size_t twor, int sa, + mp_limb_t vinf0) +{ + mp_limb_t cy, saved; + mp_size_t twok; + mp_size_t kk1; + mp_ptr c1, v1, c3, vinf; + + twok = k + k; + kk1 = twok + 1; + + c1 = c + k; + v1 = c1 + k; + c3 = v1 + k; + vinf = c3 + k; + +#define v0 (c) + /* (1) v2 <- v2-vm1 < v2+|vm1|, (16 8 4 2 1) - (1 -1 1 -1 1) = + thus 0 <= v2 < 50*B^(2k) < 2^6*B^(2k) (15 9 3 3 0) + */ + if (sa) + ASSERT_NOCARRY (mpn_add_n (v2, v2, vm1, kk1)); + else + ASSERT_NOCARRY (mpn_sub_n (v2, v2, vm1, kk1)); + + /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r} + v0 v1 hi(vinf) |vm1| v2-vm1 EMPTY */ + + ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1)); /* v2 <- v2 / 3 */ + /* (5 3 1 1 0)*/ + + /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r} + v0 v1 hi(vinf) |vm1| (v2-vm1)/3 EMPTY */ + + /* (2) vm1 <- tm1 := (v1 - vm1) / 2 [(1 1 1 1 1) - (1 -1 1 -1 1)] / 2 = + tm1 >= 0 (0 1 0 1 0) + No carry comes out from {v1, kk1} +/- {vm1, kk1}, + and the division by two is exact. + If (sa!=0) the sign of vm1 is negative */ + if (sa) + { +#ifdef HAVE_NATIVE_mpn_rsh1add_n + mpn_rsh1add_n (vm1, v1, vm1, kk1); +#else + ASSERT_NOCARRY (mpn_add_n (vm1, v1, vm1, kk1)); + ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1)); +#endif + } + else + { +#ifdef HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (vm1, v1, vm1, kk1); +#else + ASSERT_NOCARRY (mpn_sub_n (vm1, v1, vm1, kk1)); + ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1)); +#endif + } + + /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r} + v0 v1 hi(vinf) tm1 (v2-vm1)/3 EMPTY */ + + /* (3) v1 <- t1 := v1 - v0 (1 1 1 1 1) - (0 0 0 0 1) = (1 1 1 1 0) + t1 >= 0 + */ + vinf[0] -= mpn_sub_n (v1, v1, c, twok); + + /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r} + v0 v1-v0 hi(vinf) tm1 (v2-vm1)/3 EMPTY */ + + /* (4) v2 <- t2 := ((v2-vm1)/3-t1)/2 = (v2-vm1-3*t1)/6 + t2 >= 0 [(5 3 1 1 0) - (1 1 1 1 0)]/2 = (2 1 0 0 0) + */ +#ifdef HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (v2, v2, v1, kk1); +#else + ASSERT_NOCARRY (mpn_sub_n (v2, v2, v1, kk1)); + ASSERT_NOCARRY (mpn_rshift (v2, v2, kk1, 1)); +#endif + + /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r} + v0 v1-v0 hi(vinf) tm1 (v2-vm1-3t1)/6 EMPTY */ + + /* (5) v1 <- t1-tm1 (1 1 1 1 0) - (0 1 0 1 0) = (1 0 1 0 0) + result is v1 >= 0 + */ + ASSERT_NOCARRY (mpn_sub_n (v1, v1, vm1, kk1)); + + /* We do not need to read the value in vm1, so we add it in {c+k, ...} */ + cy = mpn_add_n (c1, c1, vm1, kk1); + MPN_INCR_U (c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */ + /* Memory allocated for vm1 is now free, it can be recycled ...*/ + + /* (6) v2 <- v2 - 2*vinf, (2 1 0 0 0) - 2*(1 0 0 0 0) = (0 1 0 0 0) + result is v2 >= 0 */ + saved = vinf[0]; /* Remember v1's highest byte (will be overwritten). 
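+				   (Here "byte" really means the whole limb
+				   vinf[0], i.e. v1's top limb.)
+
+				   A quick sanity check of steps (1)-(5), using the
+				   toy polynomial f(x) = 3x^4 + x^3 + 2x^2 + x + 5,
+				   which gives v0=5, v1=12, vm1=8, v2=71, vinf=3,
+				   with sa=0: (1) (71-8)/3 = 21 = (5 3 1 1 0);
+				   (2) tm1 = (12-8)/2 = 2; (3) t1 = 12-5 = 7;
+				   (4) t2 = (21-7)/2 = 7; (5) 7-2 = 5.  Steps (6)-(8)
+				   then peel off c3=1, c2=2, c1=1, matching the
+				   matrix shown below.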
*/ + vinf[0] = vinf0; /* Set the right value for vinf0 */ +#ifdef HAVE_NATIVE_mpn_sublsh1_n_ip1 + cy = mpn_sublsh1_n_ip1 (v2, vinf, twor); +#else + /* Overwrite unused vm1 */ + cy = mpn_lshift (vm1, vinf, twor, 1); + cy += mpn_sub_n (v2, v2, vm1, twor); +#endif + MPN_DECR_U (v2 + twor, kk1 - twor, cy); + + /* Current matrix is + [1 0 0 0 0; vinf + 0 1 0 0 0; v2 + 1 0 1 0 0; v1 + 0 1 0 1 0; vm1 + 0 0 0 0 1] v0 + Some values already are in-place (we added vm1 in the correct position) + | vinf| v1 | v0 | + | vm1 | + One still is in a separated area + | +v2 | + We have to compute v1-=vinf; vm1 -= v2, + |-vinf| + | -v2 | + Carefully reordering operations we can avoid to compute twice the sum + of the high half of v2 plus the low half of vinf. + */ + + /* Add the high half of t2 in {vinf} */ + if ( LIKELY(twor > k + 1) ) { /* This is the expected flow */ + cy = mpn_add_n (vinf, vinf, v2 + k, k + 1); + MPN_INCR_U (c3 + kk1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */ + } else { /* triggered only by very unbalanced cases like + (k+k+(k-2))x(k+k+1) , should be handled by toom32 */ + ASSERT_NOCARRY (mpn_add_n (vinf, vinf, v2 + k, twor)); + } + /* (7) v1 <- v1 - vinf, (1 0 1 0 0) - (1 0 0 0 0) = (0 0 1 0 0) + result is >= 0 */ + /* Side effect: we also subtracted (high half) vm1 -= v2 */ + cy = mpn_sub_n (v1, v1, vinf, twor); /* vinf is at most twor long. */ + vinf0 = vinf[0]; /* Save again the right value for vinf0 */ + vinf[0] = saved; + MPN_DECR_U (v1 + twor, kk1 - twor, cy); /* Treat the last bytes. */ + + /* (8) vm1 <- vm1-v2 (0 1 0 1 0) - (0 1 0 0 0) = (0 0 0 1 0) + Operate only on the low half. + */ + cy = mpn_sub_n (c1, c1, v2, k); + MPN_DECR_U (v1, kk1, cy); + + /********************* Beginning the final phase **********************/ + + /* Most of the recomposition was done */ + + /* add t2 in {c+3k, ...}, but only the low half */ + cy = mpn_add_n (c3, c3, v2, k); + vinf[0] += cy; + ASSERT(vinf[0] >= cy); /* No carry */ + MPN_INCR_U (vinf, twor, vinf0); /* Add vinf0, propagate carry. */ + +#undef v0 +} diff --git a/gmp-6.3.0/mpn/generic/toom_interpolate_6pts.c b/gmp-6.3.0/mpn/generic/toom_interpolate_6pts.c new file mode 100644 index 0000000..eb23661 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_interpolate_6pts.c @@ -0,0 +1,241 @@ +/* mpn_toom_interpolate_6pts -- Interpolate for toom43, 52 + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2010, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#define BINVERT_3 MODLIMB_INVERSE_3 + +/* For odd divisors, mpn_divexact_1 works fine with two's complement. */ +#ifndef mpn_divexact_by3 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0) +#else +#define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3) +#endif +#endif + +/* Interpolation for Toom-3.5, using the evaluation points: infinity, + 1, -1, 2, -2. More precisely, we want to compute + f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 5, given the + six values + + w5 = f(0), + w4 = f(-1), + w3 = f(1) + w2 = f(-2), + w1 = f(2), + w0 = limit at infinity of f(x) / x^5, + + The result is stored in {pp, 5*n + w0n}. At entry, w5 is stored at + {pp, 2n}, w3 is stored at {pp + 2n, 2n+1}, and w0 is stored at + {pp + 5n, w0n}. The other values are 2n + 1 limbs each (with most + significant limbs small). f(-1) and f(-2) may be negative, signs + determined by the flag bits. All intermediate results are positive. + Inputs are destroyed. + + Interpolation sequence was taken from the paper: "Integer and + Polynomial Multiplication: Towards Optimal Toom-Cook Matrices". + Some slight variations were introduced: adaptation to "gmp + instruction set", and a final saving of an operation by interlacing + interpolation and recomposition phases. +*/ + +void +mpn_toom_interpolate_6pts (mp_ptr pp, mp_size_t n, enum toom6_flags flags, + mp_ptr w4, mp_ptr w2, mp_ptr w1, + mp_size_t w0n) +{ + mp_limb_t cy; + /* cy6 can be stored in w1[2*n], cy4 in w4[0], embankment in w2[0] */ + mp_limb_t cy4, cy6, embankment; + + ASSERT( n > 0 ); + ASSERT( 2*n >= w0n && w0n > 0 ); + +#define w5 pp /* 2n */ +#define w3 (pp + 2 * n) /* 2n+1 */ +#define w0 (pp + 5 * n) /* w0n */ + + /* Interpolate with sequence: + W2 =(W1 - W2)>>2 + W1 =(W1 - W5)>>1 + W1 =(W1 - W2)>>1 + W4 =(W3 - W4)>>1 + W2 =(W2 - W4)/3 + W3 = W3 - W4 - W5 + W1 =(W1 - W3)/3 + // Last steps are mixed with recomposition... 
+ W2 = W2 - W0<<2 + W4 = W4 - W2 + W3 = W3 - W1 + W2 = W2 - W0 + */ + + /* W2 =(W1 - W2)>>2 */ + if (flags & toom6_vm2_neg) + mpn_add_n (w2, w1, w2, 2 * n + 1); + else + mpn_sub_n (w2, w1, w2, 2 * n + 1); + mpn_rshift (w2, w2, 2 * n + 1, 2); + + /* W1 =(W1 - W5)>>1 */ + w1[2*n] -= mpn_sub_n (w1, w1, w5, 2*n); + mpn_rshift (w1, w1, 2 * n + 1, 1); + + /* W1 =(W1 - W2)>>1 */ +#if HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (w1, w1, w2, 2 * n + 1); +#else + mpn_sub_n (w1, w1, w2, 2 * n + 1); + mpn_rshift (w1, w1, 2 * n + 1, 1); +#endif + + /* W4 =(W3 - W4)>>1 */ + if (flags & toom6_vm1_neg) + { +#if HAVE_NATIVE_mpn_rsh1add_n + mpn_rsh1add_n (w4, w3, w4, 2 * n + 1); +#else + mpn_add_n (w4, w3, w4, 2 * n + 1); + mpn_rshift (w4, w4, 2 * n + 1, 1); +#endif + } + else + { +#if HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (w4, w3, w4, 2 * n + 1); +#else + mpn_sub_n (w4, w3, w4, 2 * n + 1); + mpn_rshift (w4, w4, 2 * n + 1, 1); +#endif + } + + /* W2 =(W2 - W4)/3 */ + mpn_sub_n (w2, w2, w4, 2 * n + 1); + mpn_divexact_by3 (w2, w2, 2 * n + 1); + + /* W3 = W3 - W4 - W5 */ + mpn_sub_n (w3, w3, w4, 2 * n + 1); + w3[2 * n] -= mpn_sub_n (w3, w3, w5, 2 * n); + + /* W1 =(W1 - W3)/3 */ + mpn_sub_n (w1, w1, w3, 2 * n + 1); + mpn_divexact_by3 (w1, w1, 2 * n + 1); + + /* + [1 0 0 0 0 0; + 0 1 0 0 0 0; + 1 0 1 0 0 0; + 0 1 0 1 0 0; + 1 0 1 0 1 0; + 0 0 0 0 0 1] + + pp[] prior to operations: + |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__| + + summation scheme for remaining operations: + |______________5|n_____4|n_____3|n_____2|n______|n______|pp + |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__| + || H w4 | L w4 | + || H w2 | L w2 | + || H w1 | L w1 | + ||-H w1 |-L w1 | + |-H w0 |-L w0 ||-H w2 |-L w2 | + */ + cy = mpn_add_n (pp + n, pp + n, w4, 2 * n + 1); + MPN_INCR_U (pp + 3 * n + 1, n, cy); + + /* W2 -= W0<<2 */ +#if HAVE_NATIVE_mpn_sublsh_n || HAVE_NATIVE_mpn_sublsh2_n_ip1 +#if HAVE_NATIVE_mpn_sublsh2_n_ip1 + cy = mpn_sublsh2_n_ip1 (w2, w0, w0n); +#else + cy = mpn_sublsh_n (w2, w2, w0, w0n, 2); +#endif +#else + /* {W4,2*n+1} is now free and can be overwritten. */ + cy = mpn_lshift(w4, w0, w0n, 2); + cy+= mpn_sub_n(w2, w2, w4, w0n); +#endif + MPN_DECR_U (w2 + w0n, 2 * n + 1 - w0n, cy); + + /* W4L = W4L - W2L */ + cy = mpn_sub_n (pp + n, pp + n, w2, n); + MPN_DECR_U (w3, 2 * n + 1, cy); + + /* W3H = W3H + W2L */ + cy4 = w3[2 * n] + mpn_add_n (pp + 3 * n, pp + 3 * n, w2, n); + /* W1L + W2H */ + cy = w2[2 * n] + mpn_add_n (pp + 4 * n, w1, w2 + n, n); + MPN_INCR_U (w1 + n, n + 1, cy); + + /* W0 = W0 + W1H */ + if (LIKELY (w0n > n)) + cy6 = w1[2 * n] + mpn_add_n (w0, w0, w1 + n, n); + else + cy6 = mpn_add_n (w0, w0, w1 + n, w0n); + + /* + summation scheme for the next operation: + |...____5|n_____4|n_____3|n_____2|n______|n______|pp + |...w0___|_w1_w2_|_H w3__|_L w3__|_H w5__|_L w5__| + ...-w0___|-w1_w2 | + */ + /* if(LIKELY(w0n>n)) the two operands below DO overlap! 
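+     With w0n > n the length n + w0n exceeds 2*n, so the source
+     {pp + 4n, ...} reaches into the destination {pp + 2n, ...}.  This
+     is presumably harmless: the subtraction proceeds from the low
+     limbs upward and the destination lies wholly below the source, so
+     any source limb a write clobbers has already been read.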
*/ + cy = mpn_sub_n (pp + 2 * n, pp + 2 * n, pp + 4 * n, n + w0n); + + /* embankment is a "dirty trick" to avoid carry/borrow propagation + beyond allocated memory */ + embankment = w0[w0n - 1] - 1; + w0[w0n - 1] = 1; + if (LIKELY (w0n > n)) { + if (cy4 > cy6) + MPN_INCR_U (pp + 4 * n, w0n + n, cy4 - cy6); + else + MPN_DECR_U (pp + 4 * n, w0n + n, cy6 - cy4); + MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy); + MPN_INCR_U (w0 + n, w0n - n, cy6); + } else { + MPN_INCR_U (pp + 4 * n, w0n + n, cy4); + MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy + cy6); + } + w0[w0n - 1] += embankment; + +#undef w5 +#undef w3 +#undef w0 + +} diff --git a/gmp-6.3.0/mpn/generic/toom_interpolate_7pts.c b/gmp-6.3.0/mpn/generic/toom_interpolate_7pts.c new file mode 100644 index 0000000..167c45b --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_interpolate_7pts.c @@ -0,0 +1,274 @@ +/* mpn_toom_interpolate_7pts -- Interpolate for toom44, 53, 62. + + Contributed to the GNU project by Niels Möller. + Improvements by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006, 2007, 2009, 2014, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#define BINVERT_3 MODLIMB_INVERSE_3 + +#define BINVERT_9 \ + ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39) + +#define BINVERT_15 \ + ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15) + +/* For the various mpn_divexact_byN here, fall back to using either + mpn_pi1_bdiv_q_1 or mpn_divexact_1. The former has less overhead and is + many faster if it is native. For now, since mpn_divexact_1 is native on + several platforms where mpn_pi1_bdiv_q_1 does not yet exist, do not use + mpn_pi1_bdiv_q_1 unconditionally. FIXME. */ + +/* For odd divisors, mpn_divexact_1 works fine with two's complement. 
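+   The trick is that exact division becomes multiplication by the
+   divisor's inverse modulo B = 2^GMP_NUMB_BITS, and residues mod B
+   carry no sign.  A minimal sketch of how such an inverse is found
+   (hypothetical stand-alone code, assuming 64-bit limbs; GMP's
+   binvert_limb plays this role): */
+#if 0	/* illustrative only, not compiled */
+#include <stdint.h>
+static uint64_t
+binvert64 (uint64_t d)		/* d odd: returns d^-1 mod 2^64 */
+{
+  uint64_t inv = d;		/* d*d == 1 (mod 8), so 3 bits correct */
+  inv *= 2 - d * inv;		/* Newton/Hensel step doubles them: 6 bits */
+  inv *= 2 - d * inv;		/* 12 bits */
+  inv *= 2 - d * inv;		/* 24 bits */
+  inv *= 2 - d * inv;		/* 48 bits */
+  inv *= 2 - d * inv;		/* 96 >= 64 bits */
+  return inv;
+}
+/* If n == q*d exactly, then q == n * binvert64 (d) (mod 2^64); e.g.
+   binvert64 (3) == 0xAAAAAAAAAAAAAAAB and 51 * binvert64 (3) == 17. */
+#endif
+/*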
*/ +#ifndef mpn_divexact_by3 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0) +#else +#define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3) +#endif +#endif + +#ifndef mpn_divexact_by9 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define mpn_divexact_by9(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,9,BINVERT_9,0) +#else +#define mpn_divexact_by9(dst,src,size) mpn_divexact_1(dst,src,size,9) +#endif +#endif + +#ifndef mpn_divexact_by15 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define mpn_divexact_by15(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,15,BINVERT_15,0) +#else +#define mpn_divexact_by15(dst,src,size) mpn_divexact_1(dst,src,size,15) +#endif +#endif + +/* Interpolation for toom4, using the evaluation points 0, infinity, + 1, -1, 2, -2, 1/2. More precisely, we want to compute + f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 6, given the + seven values + + w0 = f(0), + w1 = f(-2), + w2 = f(1), + w3 = f(-1), + w4 = f(2) + w5 = 64 * f(1/2) + w6 = limit at infinity of f(x) / x^6, + + The result is 6*n + w6n limbs. At entry, w0 is stored at {rp, 2n }, + w2 is stored at { rp + 2n, 2n+1 }, and w6 is stored at { rp + 6n, + w6n }. The other values are 2n + 1 limbs each (with most + significant limbs small). f(-1) and f(-1/2) may be negative, signs + determined by the flag bits. Inputs are destroyed. + + Needs (2*n + 1) limbs of temporary storage. +*/ + +void +mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags, + mp_ptr w1, mp_ptr w3, mp_ptr w4, mp_ptr w5, + mp_size_t w6n, mp_ptr tp) +{ + mp_size_t m; + mp_limb_t cy; + + m = 2*n + 1; +#define w0 rp +#define w2 (rp + 2*n) +#define w6 (rp + 6*n) + + ASSERT (w6n > 0); + ASSERT (w6n <= 2*n); + + /* Using formulas similar to Marco Bodrato's + + W5 = W5 + W4 + W1 =(W4 - W1)/2 + W4 = W4 - W0 + W4 =(W4 - W1)/4 - W6*16 + W3 =(W2 - W3)/2 + W2 = W2 - W3 + + W5 = W5 - W2*65 May be negative. + W2 = W2 - W6 - W0 + W5 =(W5 + W2*45)/2 Now >= 0 again. + W4 =(W4 - W2)/3 + W2 = W2 - W4 + + W1 = W5 - W1 May be negative. + W5 =(W5 - W3*8)/9 + W3 = W3 - W5 + W1 =(W1/15 + W5)/2 Now >= 0 again. + W5 = W5 - W1 + + where W0 = f(0), W1 = f(-2), W2 = f(1), W3 = f(-1), + W4 = f(2), W5 = f(1/2), W6 = f(oo), + + Note that most intermediate results are positive; the ones that + may be negative are represented in two's complement. We must + never shift right a value that may be negative, since that would + invalidate the sign bit. On the other hand, divexact by odd + numbers work fine with two's complement. 
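+
+   (mpn_rshift fills the vacated high bits with zeros, so a negative
+   two's complement value would silently turn into a large positive
+   one; the divexact calls instead multiply by an odd inverse mod B^m,
+   for which the sign needs no special handling.)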
+ */ + + mpn_add_n (w5, w5, w4, m); + if (flags & toom7_w1_neg) + { +#ifdef HAVE_NATIVE_mpn_rsh1add_n + mpn_rsh1add_n (w1, w1, w4, m); +#else + mpn_add_n (w1, w1, w4, m); ASSERT (!(w1[0] & 1)); + mpn_rshift (w1, w1, m, 1); +#endif + } + else + { +#ifdef HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (w1, w4, w1, m); +#else + mpn_sub_n (w1, w4, w1, m); ASSERT (!(w1[0] & 1)); + mpn_rshift (w1, w1, m, 1); +#endif + } + mpn_sub (w4, w4, m, w0, 2*n); + mpn_sub_n (w4, w4, w1, m); ASSERT (!(w4[0] & 3)); + mpn_rshift (w4, w4, m, 2); /* w4>=0 */ + + tp[w6n] = mpn_lshift (tp, w6, w6n, 4); + mpn_sub (w4, w4, m, tp, w6n+1); + + if (flags & toom7_w3_neg) + { +#ifdef HAVE_NATIVE_mpn_rsh1add_n + mpn_rsh1add_n (w3, w3, w2, m); +#else + mpn_add_n (w3, w3, w2, m); ASSERT (!(w3[0] & 1)); + mpn_rshift (w3, w3, m, 1); +#endif + } + else + { +#ifdef HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (w3, w2, w3, m); +#else + mpn_sub_n (w3, w2, w3, m); ASSERT (!(w3[0] & 1)); + mpn_rshift (w3, w3, m, 1); +#endif + } + + mpn_sub_n (w2, w2, w3, m); + + mpn_submul_1 (w5, w2, m, 65); + mpn_sub (w2, w2, m, w6, w6n); + mpn_sub (w2, w2, m, w0, 2*n); + + mpn_addmul_1 (w5, w2, m, 45); ASSERT (!(w5[0] & 1)); + mpn_rshift (w5, w5, m, 1); + mpn_sub_n (w4, w4, w2, m); + + mpn_divexact_by3 (w4, w4, m); + mpn_sub_n (w2, w2, w4, m); + + mpn_sub_n (w1, w5, w1, m); + mpn_lshift (tp, w3, m, 3); + mpn_sub_n (w5, w5, tp, m); + mpn_divexact_by9 (w5, w5, m); + mpn_sub_n (w3, w3, w5, m); + + mpn_divexact_by15 (w1, w1, m); +#ifdef HAVE_NATIVE_mpn_rsh1add_n + mpn_rsh1add_n (w1, w1, w5, m); + w1[m - 1] &= GMP_NUMB_MASK >> 1; +#else + mpn_add_n (w1, w1, w5, m); ASSERT (!(w1[0] & 1)); + mpn_rshift (w1, w1, m, 1); /* w1>=0 now */ +#endif + + mpn_sub_n (w5, w5, w1, m); + + /* These bounds are valid for the 4x4 polynomial product of toom44, + * and they are conservative for toom53 and toom62. */ + ASSERT (w1[2*n] < 2); + ASSERT (w2[2*n] < 3); + ASSERT (w3[2*n] < 4); + ASSERT (w4[2*n] < 3); + ASSERT (w5[2*n] < 2); + + /* Addition chain. Note carries and the 2n'th limbs that need to be + * added in. + * + * Special care is needed for w2[2n] and the corresponding carry, + * since the "simple" way of adding it all together would overwrite + * the limb at wp[2*n] and rp[4*n] (same location) with the sum of + * the high half of w3 and the low half of w4. 
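+   * (The code below therefore folds w2[2n], together with that carry,
+   * into an MPN_INCR_U on the high half of w3, rather than storing it
+   * directly at rp[4*n].)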
+ * + * 7 6 5 4 3 2 1 0 + * | | | | | | | | | + * ||w3 (2n+1)| + * ||w4 (2n+1)| + * ||w5 (2n+1)| ||w1 (2n+1)| + * + | w6 (w6n)| ||w2 (2n+1)| w0 (2n) | (share storage with r) + * ----------------------------------------------- + * r | | | | | | | | | + * c7 c6 c5 c4 c3 Carries to propagate + */ + + cy = mpn_add_n (rp + n, rp + n, w1, m); + MPN_INCR_U (w2 + n + 1, n , cy); + cy = mpn_add_n (rp + 3*n, rp + 3*n, w3, n); + MPN_INCR_U (w3 + n, n + 1, w2[2*n] + cy); + cy = mpn_add_n (rp + 4*n, w3 + n, w4, n); + MPN_INCR_U (w4 + n, n + 1, w3[2*n] + cy); + cy = mpn_add_n (rp + 5*n, w4 + n, w5, n); + MPN_INCR_U (w5 + n, n + 1, w4[2*n] + cy); + if (w6n > n + 1) + { + cy = mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, n + 1); + MPN_INCR_U (rp + 7*n + 1, w6n - n - 1, cy); + } + else + { + ASSERT_NOCARRY (mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, w6n)); +#if WANT_ASSERT + { + mp_size_t i; + for (i = w6n; i <= n; i++) + ASSERT (w5[n + i] == 0); + } +#endif + } +} diff --git a/gmp-6.3.0/mpn/generic/toom_interpolate_8pts.c b/gmp-6.3.0/mpn/generic/toom_interpolate_8pts.c new file mode 100644 index 0000000..5e65fab --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_interpolate_8pts.c @@ -0,0 +1,211 @@ +/* mpn_toom_interpolate_8pts -- Interpolate for toom54, 63, 72. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2011, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" + +#define BINVERT_3 MODLIMB_INVERSE_3 + +#define BINVERT_15 \ + ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15) + +#define BINVERT_45 ((BINVERT_15 * BINVERT_3) & GMP_NUMB_MASK) + +#ifndef mpn_divexact_by3 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0) +#else +#define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3) +#endif +#endif + +#ifndef mpn_divexact_by45 +#if GMP_NUMB_BITS % 12 == 0 +#define mpn_divexact_by45(dst,src,size) \ + (63 & 19 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 45))) +#else +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define mpn_divexact_by45(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,45,BINVERT_45,0) +#else +#define mpn_divexact_by45(dst,src,size) mpn_divexact_1(dst,src,size,45) +#endif +#endif +#endif + +#if HAVE_NATIVE_mpn_sublsh2_n_ip1 +#define DO_mpn_sublsh2_n(dst,src,n,ws) mpn_sublsh2_n_ip1(dst,src,n) +#else +#define DO_mpn_sublsh2_n(dst,src,n,ws) DO_mpn_sublsh_n(dst,src,n,2,ws) +#endif + +#if HAVE_NATIVE_mpn_sublsh_n +#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n (dst,dst,src,n,s) +#else +static mp_limb_t +DO_mpn_sublsh_n (mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws) +{ +#if USE_MUL_1 && 0 + return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s)); +#else + mp_limb_t __cy; + __cy = mpn_lshift (ws,src,n,s); + return __cy + mpn_sub_n (dst,dst,ws,n); +#endif +} +#endif + + +#if HAVE_NATIVE_mpn_subrsh +#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh (dst,nd,src,ns,s) +#else +/* This is not a correct definition, it assumes no carry */ +#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) \ +do { \ + mp_limb_t __cy; \ + MPN_DECR_U (dst, nd, src[0] >> s); \ + __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws); \ + MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy); \ +} while (0) +#endif + +/* Interpolation for Toom-4.5 (or Toom-4), using the evaluation + points: infinity(4.5 only), 4, -4, 2, -2, 1, -1, 0. More precisely, + we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of + degree 7 (or 6), given the 8 (rsp. 7) values: + + r1 = limit at infinity of f(x) / x^7, + r2 = f(4), + r3 = f(-4), + r4 = f(2), + r5 = f(-2), + r6 = f(1), + r7 = f(-1), + r8 = f(0). + + All couples of the form f(n),f(-n) must be already mixed with + toom_couple_handling(f(n),...,f(-n),...) + + The result is stored in {pp, spt + 7*n (or 6*n)}. + At entry, r8 is stored at {pp, 2n}, + r5 is stored at {pp + 3n, 3n + 1}. + + The other values are 2n+... limbs each (with most significant limbs small). + + All intermediate results are positive. + Inputs are destroyed. 
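+
+   ("Mixed" means, roughly, that each pair f(k), f(-k) has been
+   replaced by its half sum and half difference, which separate the
+   even- and odd-degree parts of f; toom_couple_handling is also
+   expected to strip the powers of two picked up at the points 4
+   and 2.)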
+*/ + +void +mpn_toom_interpolate_8pts (mp_ptr pp, mp_size_t n, + mp_ptr r3, mp_ptr r7, + mp_size_t spt, mp_ptr ws) +{ + mp_limb_signed_t cy; + mp_ptr r5, r1; + r5 = (pp + 3 * n); /* 3n+1 */ + r1 = (pp + 7 * n); /* spt */ + + /******************************* interpolation *****************************/ + + DO_mpn_subrsh(r3+n, 2 * n + 1, pp, 2 * n, 4, ws); + cy = DO_mpn_sublsh_n (r3, r1, spt, 12, ws); + MPN_DECR_U (r3 + spt, 3 * n + 1 - spt, cy); + + DO_mpn_subrsh(r5+n, 2 * n + 1, pp, 2 * n, 2, ws); + cy = DO_mpn_sublsh_n (r5, r1, spt, 6, ws); + MPN_DECR_U (r5 + spt, 3 * n + 1 - spt, cy); + + r7[3*n] -= mpn_sub_n (r7+n, r7+n, pp, 2 * n); + cy = mpn_sub_n (r7, r7, r1, spt); + MPN_DECR_U (r7 + spt, 3 * n + 1 - spt, cy); + + ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1)); + ASSERT_NOCARRY(mpn_rshift(r3, r3, 3 * n + 1, 2)); + + ASSERT_NOCARRY(mpn_sub_n (r5, r5, r7, 3 * n + 1)); + + ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1)); + + mpn_divexact_by45 (r3, r3, 3 * n + 1); + + ASSERT_NOCARRY(mpn_divexact_by3 (r5, r5, 3 * n + 1)); + + ASSERT_NOCARRY(DO_mpn_sublsh2_n (r5, r3, 3 * n + 1, ws)); + + /* last interpolation steps... */ + /* ... are mixed with recomposition */ + + /***************************** recomposition *******************************/ + /* + pp[] prior to operations: + |_H r1|_L r1|____||_H r5|_M_r5|_L r5|_____|_H r8|_L r8|pp + + summation scheme for remaining operations: + |____8|n___7|n___6|n___5|n___4|n___3|n___2|n____|n____|pp + |_H r1|_L r1|____||_H*r5|_M r5|_L r5|_____|_H_r8|_L r8|pp + ||_H r3|_M r3|_L*r3| + ||_H_r7|_M_r7|_L_r7| + ||-H r3|-M r3|-L*r3| + ||-H*r5|-M_r5|-L_r5| + */ + + cy = mpn_add_n (pp + n, pp + n, r7, n); /* Hr8+Lr7-Lr5 */ + cy-= mpn_sub_n (pp + n, pp + n, r5, n); + if (cy > 0) { + MPN_INCR_U (r7 + n, 2*n + 1, 1); + cy = 0; + } + + cy = mpn_sub_nc (pp + 2*n, r7 + n, r5 + n, n, -cy); /* Mr7-Mr5 */ + MPN_DECR_U (r7 + 2*n, n + 1, cy); + + cy = mpn_add_n (pp + 3*n, r5, r7+ 2*n, n+1); /* Hr7+Lr5 */ + r5[3*n]+= mpn_add_n (r5 + 2*n, r5 + 2*n, r3, n); /* Hr5+Lr3 */ + cy-= mpn_sub_n (pp + 3*n, pp + 3*n, r5 + 2*n, n+1); /* Hr7-Hr5+Lr5-Lr3 */ + if (UNLIKELY(0 > cy)) + MPN_DECR_U (r5 + n + 1, 2*n, 1); + else + MPN_INCR_U (r5 + n + 1, 2*n, cy); + + ASSERT_NOCARRY(mpn_sub_n(pp + 4*n, r5 + n, r3 + n, 2*n +1)); /* Mr5-Mr3,Hr5-Hr3 */ + + cy = mpn_add_1 (pp + 6*n, r3 + n, n, pp[6*n]); + MPN_INCR_U (r3 + 2*n, n + 1, cy); + cy = mpn_add_n (pp + 7*n, pp + 7*n, r3 + 2*n, n); + if (LIKELY(spt != n)) + MPN_INCR_U (pp + 8*n, spt - n, cy + r3[3*n]); + else + ASSERT (r3[3*n] + cy == 0); +} diff --git a/gmp-6.3.0/mpn/generic/trialdiv.c b/gmp-6.3.0/mpn/generic/trialdiv.c new file mode 100644 index 0000000..65e089f --- /dev/null +++ b/gmp-6.3.0/mpn/generic/trialdiv.c @@ -0,0 +1,131 @@ +/* mpn_trialdiv -- find small factors of an mpn number using trial division. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2010, 2012, 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* + This function finds the first (smallest) factor represented in + trialdivtab.h. It does not stop the factoring effort just because it has + reached some sensible limit, such as the square root of the input number. + + The caller can limit the factoring effort by passing NPRIMES. The function + will then divide until that limit, or perhaps a few primes more. A position + which only mpn_trialdiv can make sense of is returned in the WHERE + parameter. It can be used for restarting the factoring effort; the first + call should pass 0 here. + + Input: 1. A non-negative number T = {tp,tn} + 2. NPRIMES as described above, + 3. *WHERE as described above. + Output: 1. *WHERE updated as described above. + 2. Return value is non-zero if we found a factor, else zero + To get the actual prime factor, compute the mod B inverse + of the return value. +*/ + +#include "gmp-impl.h" + +struct gmp_primes_dtab { + mp_limb_t binv; + mp_limb_t lim; +}; + +struct gmp_primes_ptab { + mp_limb_t ppp; /* primes, multiplied together */ + mp_limb_t cps[7]; /* ppp values pre-computed for mpn_mod_1s_4p */ + gmp_uint_least32_t idx:24; /* index of first primes in dtab */ + gmp_uint_least32_t np :8; /* number of primes related to this entry */ +}; + + +static const struct gmp_primes_dtab gmp_primes_dtab[] = +{ +#define WANT_dtab +#define P(p,inv,lim) {inv,lim} +#include "trialdivtab.h" +#undef WANT_dtab +#undef P + {0,0} +}; + +static const struct gmp_primes_ptab gmp_primes_ptab[] = +{ +#define WANT_ptab +#include "trialdivtab.h" +#undef WANT_ptab +}; + +#define PTAB_LINES (sizeof (gmp_primes_ptab) / sizeof (gmp_primes_ptab[0])) + +/* FIXME: We could optimize out one of the outer loop conditions if we + had a final ptab entry with a huge np field. */ +mp_limb_t +mpn_trialdiv (mp_srcptr tp, mp_size_t tn, mp_size_t nprimes, int *where) +{ + mp_limb_t ppp; + const mp_limb_t *cps; + const struct gmp_primes_dtab *dp; + long i, j, idx, np; + mp_limb_t r, q; + + ASSERT (tn >= 1); + + for (i = *where; i < PTAB_LINES; i++) + { + ppp = gmp_primes_ptab[i].ppp; + cps = gmp_primes_ptab[i].cps; + + r = mpn_mod_1s_4p (tp, tn, ppp << cps[1], cps); + + idx = gmp_primes_ptab[i].idx; + np = gmp_primes_ptab[i].np; + + /* Check divisibility by individual primes. */ + dp = &gmp_primes_dtab[idx] + np; + for (j = -np; j < 0; j++) + { + q = r * dp[j].binv; + if (q <= dp[j].lim) + { + *where = i; + return dp[j].binv; + } + } + + nprimes -= np; + if (nprimes <= 0) + return 0; + } + return 0; +} diff --git a/gmp-6.3.0/mpn/generic/udiv_w_sdiv.c b/gmp-6.3.0/mpn/generic/udiv_w_sdiv.c new file mode 100644 index 0000000..7907135 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/udiv_w_sdiv.c @@ -0,0 +1,141 @@ +/* mpn_udiv_w_sdiv -- implement udiv_qrnnd on machines with only signed + division. + + Contributed by Peter L. Montgomery. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. 
IT IS ONLY SAFE + TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS + ALMOST GUARANTEED THAT THIS FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE + GNU MP RELEASE. + + +Copyright 1992, 1994, 1996, 2000, 2011, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_udiv_w_sdiv (mp_limb_t *rp, mp_limb_t a1, mp_limb_t a0, mp_limb_t d) +{ + mp_limb_t q, r; + mp_limb_t c0, c1, b1; + + ASSERT (d != 0); + ASSERT (a1 < d); + + if ((mp_limb_signed_t) d >= 0) + { + if (a1 < d - a1 - (a0 >> (GMP_LIMB_BITS - 1))) + { + /* dividend, divisor, and quotient are nonnegative */ + sdiv_qrnnd (q, r, a1, a0, d); + } + else + { + /* Compute c1*2^32 + c0 = a1*2^32 + a0 - 2^31*d */ + sub_ddmmss (c1, c0, a1, a0, d >> 1, d << (GMP_LIMB_BITS - 1)); + /* Divide (c1*2^32 + c0) by d */ + sdiv_qrnnd (q, r, c1, c0, d); + /* Add 2^31 to quotient */ + q += (mp_limb_t) 1 << (GMP_LIMB_BITS - 1); + } + } + else + { + b1 = d >> 1; /* d/2, between 2^30 and 2^31 - 1 */ + c1 = a1 >> 1; /* A/2 */ + c0 = (a1 << (GMP_LIMB_BITS - 1)) + (a0 >> 1); + + if (a1 < b1) /* A < 2^32*b1, so A/2 < 2^31*b1 */ + { + sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */ + + r = 2*r + (a0 & 1); /* Remainder from A/(2*b1) */ + if ((d & 1) != 0) + { + if (r >= q) + r = r - q; + else if (q - r <= d) + { + r = r - q + d; + q--; + } + else + { + r = r - q + 2*d; + q -= 2; + } + } + } + else if (c1 < b1) /* So 2^31 <= (A/2)/b1 < 2^32 */ + { + c1 = (b1 - 1) - c1; + c0 = ~c0; /* logical NOT */ + + sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */ + + q = ~q; /* (A/2)/b1 */ + r = (b1 - 1) - r; + + r = 2*r + (a0 & 1); /* A/(2*b1) */ + + if ((d & 1) != 0) + { + if (r >= q) + r = r - q; + else if (q - r <= d) + { + r = r - q + d; + q--; + } + else + { + r = r - q + 2*d; + q -= 2; + } + } + } + else /* Implies c1 = b1 */ + { /* Hence a1 = d - 1 = 2*b1 - 1 */ + if (a0 >= -d) + { + q = -CNST_LIMB(1); + r = a0 + d; + } + else + { + q = -CNST_LIMB(2); + r = a0 + 2*d; + } + } + } + + *rp = r; + return q; +} diff --git a/gmp-6.3.0/mpn/generic/zero.c b/gmp-6.3.0/mpn/generic/zero.c new file mode 100644 index 0000000..1a05453 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/zero.c @@ -0,0 +1,41 @@ +/* mpn_zero + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +void +mpn_zero (mp_ptr rp, mp_size_t n) +{ + mp_size_t i; + + rp += n; + for (i = -n; i != 0; i++) + rp[i] = 0; +} diff --git a/gmp-6.3.0/mpn/generic/zero_p.c b/gmp-6.3.0/mpn/generic/zero_p.c new file mode 100644 index 0000000..c92f9b8 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/zero_p.c @@ -0,0 +1,33 @@ +/* mpn_zero_p (x,xsize) -- Return 1 if X is zero, 0 if it is non-zero. + +Copyright 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define __GMP_FORCE_mpn_zero_p 1 + +#include "gmp-impl.h" diff --git a/gmp-6.3.0/mpn/get_d.c b/gmp-6.3.0/mpn/get_d.c new file mode 120000 index 0000000..5db509e --- /dev/null +++ b/gmp-6.3.0/mpn/get_d.c @@ -0,0 +1 @@ +../mpn/generic/get_d.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/get_str.c b/gmp-6.3.0/mpn/get_str.c new file mode 120000 index 0000000..3c7d15e --- /dev/null +++ b/gmp-6.3.0/mpn/get_str.c @@ -0,0 +1 @@ +../mpn/generic/get_str.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/hamdist.asm b/gmp-6.3.0/mpn/hamdist.asm new file mode 120000 index 0000000..eedabc6 --- /dev/null +++ b/gmp-6.3.0/mpn/hamdist.asm @@ -0,0 +1 @@ +../mpn/x86/p6/p3mmx/popham.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/hgcd.c b/gmp-6.3.0/mpn/hgcd.c new file mode 120000 index 0000000..575f459 --- /dev/null +++ b/gmp-6.3.0/mpn/hgcd.c @@ -0,0 +1 @@ +../mpn/generic/hgcd.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/hgcd2.c b/gmp-6.3.0/mpn/hgcd2.c new file mode 120000 index 0000000..02e007e --- /dev/null +++ b/gmp-6.3.0/mpn/hgcd2.c @@ -0,0 +1 @@ +../mpn/generic/hgcd2.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/hgcd2_jacobi.c b/gmp-6.3.0/mpn/hgcd2_jacobi.c new file mode 120000 index 0000000..4b6dfca --- /dev/null +++ b/gmp-6.3.0/mpn/hgcd2_jacobi.c @@ -0,0 +1 @@ +../mpn/generic/hgcd2_jacobi.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/hgcd_appr.c b/gmp-6.3.0/mpn/hgcd_appr.c new file mode 120000 index 0000000..02afc66 --- /dev/null +++ b/gmp-6.3.0/mpn/hgcd_appr.c @@ -0,0 +1 @@ +../mpn/generic/hgcd_appr.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/hgcd_jacobi.c b/gmp-6.3.0/mpn/hgcd_jacobi.c new file mode 120000 index 0000000..c9dadcf --- /dev/null +++ b/gmp-6.3.0/mpn/hgcd_jacobi.c @@ -0,0 +1 @@ +../mpn/generic/hgcd_jacobi.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/hgcd_matrix.c b/gmp-6.3.0/mpn/hgcd_matrix.c new file mode 120000 index 0000000..2e422f1 --- /dev/null +++ b/gmp-6.3.0/mpn/hgcd_matrix.c @@ -0,0 +1 @@ +../mpn/generic/hgcd_matrix.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/hgcd_reduce.c b/gmp-6.3.0/mpn/hgcd_reduce.c new file mode 120000 index 0000000..3a34d29 --- /dev/null +++ b/gmp-6.3.0/mpn/hgcd_reduce.c @@ -0,0 +1 @@ +../mpn/generic/hgcd_reduce.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/hgcd_step.c b/gmp-6.3.0/mpn/hgcd_step.c new file mode 120000 index 0000000..36b9af3 --- /dev/null +++ b/gmp-6.3.0/mpn/hgcd_step.c @@ -0,0 +1 @@ +../mpn/generic/hgcd_step.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/ia64/README b/gmp-6.3.0/mpn/ia64/README new file mode 100644 index 0000000..45c2d63 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/README @@ -0,0 +1,281 @@ +Copyright 2000-2005 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + IA-64 MPN SUBROUTINES + + +This directory contains mpn functions for the IA-64 architecture. + + +CODE ORGANIZATION + + mpn/ia64 itanium-2, and generic ia64 + +The code here has been optimized primarily for Itanium 2. Very few Itanium 1 +chips were ever sold, and Itanium 2 is more powerful, so the latter is what +we concentrate on. + + + +CHIP NOTES + +The IA-64 ISA keeps instructions three and three in 128 bit bundles. +Programmers/compilers need to put explicit breaks `;;' when there are WAW or +RAW dependencies, with some notable exceptions. Such "breaks" are typically +at the end of a bundle, but can be put between operations within some bundle +types too. + +The Itanium 1 and Itanium 2 implementations can under ideal conditions +execute two bundles per cycle. The Itanium 1 allows 4 of these instructions +to do integer operations, while the Itanium 2 allows all 6 to be integer +operations. + +Taken cloop branches seem to insert a bubble into the pipeline most of the +time on Itanium 1. + +Loads to the fp registers bypass the L1 cache and thus get extremely long +latencies, 9 cycles on the Itanium 1 and 6 cycles on the Itanium 2. + +The software pipeline stuff using br.ctop instruction causes delays, since +many issue slots are taken up by instructions with zero predicates, and +since many extra instructions are needed to set things up. These features +are clearly designed for code density, not speed. + +Misc pipeline limitations (Itanium 1): +* The getf.sig instruction can only execute in M0. +* At most four integer instructions/cycle. +* Nops take up resources like any plain instructions. + +Misc pipeline limitations (Itanium 2): +* The getf.sig instruction can only execute in M0. +* Nops take up resources like any plain instructions. + + +ASSEMBLY SYNTAX + +.align pads with nops in a text segment, but gas 2.14 and earlier +incorrectly byte-swaps its nop bundle in big endian mode (eg. hpux), making +it come out as break instructions. We use the ALIGN() macro in +mpn/ia64/ia64-defs.m4 when it might be executed across. That macro +suppresses any .align if the problem is detected by configure. Lack of +alignment might hurt performance but will at least be correct. + +foo:: to create a global symbol is not accepted by gas. Use separate +".global foo" and "foo:" instead. + +.global is the standard global directive. gas accepts .globl, but hpux "as" +doesn't. + +.proc / .endp generates the appropriate .type and .size information for ELF, +so the latter directives don't need to be given explicitly. + +.pred.rel "mutex"... is standard for annotating predicate register +relationships. gas also accepts .pred.rel.mutex, but hpux "as" doesn't. + +.pred directives can't be put on a line with a label, like +".Lfoo: .pred ...", the HP assembler on HP-UX 11.23 rejects that. +gas is happy with it, and past versions of HP had seemed ok. + +// is the standard comment sequence, but we prefer "C" since it inhibits m4 +macro expansion. See comments in ia64-defs.m4. 
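+
+A tiny example pulling the above points together (hypothetical symbol
+names, nothing from the sources):
+
+	.global	foo		C separate directive; gas rejects "foo::"
+foo:
+	.pred.rel "mutex", p6, p7 C on its own line, to keep hpux "as" happy
+	C "C" comments also inhibit m4 macro expansion, unlike //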
+ + +REGISTER USAGE + +Special: + r0: constant 0 + r1: global pointer (gp) + r8: return value + r12: stack pointer (sp) + r13: thread pointer (tp) +Caller-saves: r8-r11 r14-r31 f6-f15 f32-f127 +Caller-saves but rotating: r32- + + +================================================================ +mpn_add_n, mpn_sub_n: + +The current code runs at 1.25 c/l on Itanium 2. + +================================================================ +mpn_mul_1: + +The current code runs at 2 c/l on Itanium 2. + +Using a blocked approach, working off of 4 separate places in the operands, +one could make use of the xma accumulation, and approach 1 c/l. + + ldf8 [up] + xma.l + xma.hu + stf8 [wrp] + +================================================================ +mpn_addmul_1: + +The current code runs at 2 c/l on Itanium 2. + +It seems possible to use a blocked approach, as with mpn_mul_1. We should +read rp[] to integer registers, allowing for just one getf.sig per cycle. + + ld8 [rp] + ldf8 [up] + xma.l + xma.hu + getf.sig + add+add+cmp+cmp + st8 [wrp] + +These 10 instructions can be scheduled to approach 1.667 cycles, and with +the 4 cycle latency of xma, this means we need at least 3 blocks. Using +ldfp8 we could approach 1.583 c/l. + +================================================================ +mpn_submul_1: + +The current code runs at 2.25 c/l on Itanium 2. Getting to 2 c/l requires +ldfp8 with all alignment headache that implies. + +================================================================ +mpn_addmul_N + +For best speed, we need to give up using mpn_addmul_2 as the main multiply +building block, and instead take multiple v limbs per loop. For the Itanium +1, we need to take about 8 limbs at a time for full speed. For the Itanium +2, something like mpn_addmul_4 should be enough. + +The add+cmp+cmp+add we use on the other codes is optimal for shortening +recurrencies (1 cycle) but the sequence takes up 4 execution slots. When +recurrency depth is not critical, a more standard 3-cycle add+cmp+add is +better. + +/* First load the 8 values from v */ + ldfp8 v0, v1 = [r35], 16;; + ldfp8 v2, v3 = [r35], 16;; + ldfp8 v4, v5 = [r35], 16;; + ldfp8 v6, v7 = [r35], 16;; + +/* In the inner loop, get a new U limb and store a result limb. */ + mov lc = un +Loop: ldf8 u0 = [r33], 8 + ld8 r0 = [r32] + xma.l lp0 = v0, u0, hp0 + xma.hu hp0 = v0, u0, hp0 + xma.l lp1 = v1, u0, hp1 + xma.hu hp1 = v1, u0, hp1 + xma.l lp2 = v2, u0, hp2 + xma.hu hp2 = v2, u0, hp2 + xma.l lp3 = v3, u0, hp3 + xma.hu hp3 = v3, u0, hp3 + xma.l lp4 = v4, u0, hp4 + xma.hu hp4 = v4, u0, hp4 + xma.l lp5 = v5, u0, hp5 + xma.hu hp5 = v5, u0, hp5 + xma.l lp6 = v6, u0, hp6 + xma.hu hp6 = v6, u0, hp6 + xma.l lp7 = v7, u0, hp7 + xma.hu hp7 = v7, u0, hp7 + getf.sig l0 = lp0 + getf.sig l1 = lp1 + getf.sig l2 = lp2 + getf.sig l3 = lp3 + getf.sig l4 = lp4 + getf.sig l5 = lp5 + getf.sig l6 = lp6 + add+cmp+add xx, l0, r0 + add+cmp+add acc0, acc1, l1 + add+cmp+add acc1, acc2, l2 + add+cmp+add acc2, acc3, l3 + add+cmp+add acc3, acc4, l4 + add+cmp+add acc4, acc5, l5 + add+cmp+add acc5, acc6, l6 + getf.sig acc6 = lp7 + st8 [r32] = xx, 8 + br.cloop Loop + + 49 insn at max 6 insn/cycle: 8.167 cycles/limb8 + 11 memops at max 2 memops/cycle: 5.5 cycles/limb8 + 16 fpops at max 2 fpops/cycle: 8 cycles/limb8 + 21 intops at max 4 intops/cycle: 5.25 cycles/limb8 + 11+21 memops+intops at max 4/cycle 8 cycles/limb8 + +================================================================ +mpn_lshift, mpn_rshift + +The current code runs at 1 cycle/limb on Itanium 2. 
+ +Using 63 separate loops, we could use the double-word shrp instruction. +That instruction has a plain single-cycle latency. We need 63 loops since +this instruction only accept immediate count. That would lead to a somewhat +silly code size, but the speed would be 0.75 c/l on Itanium 2 (by using shrp +each cycle plus shl/shr going down I1 for a further limb every second +cycle). + +================================================================ +mpn_copyi, mpn_copyd + +The current code runs at 0.5 c/l on Itanium 2. But that is just for L1 +cache hit. The 4-way unrolled loop takes just 2 cycles, and thus load-use +scheduling isn't great. It might be best to actually use modulo scheduled +loops, since that will allow us to do better load-use scheduling without too +much unrolling. + +Depending on size or operand alignment, we get 1 c/l or 0.5 c/l on Itanium +2, according to tune/speed. Cache bank conflicts? + + + +REFERENCES + +Intel Itanium Architecture Software Developer's Manual, volumes 1 to 3, +Intel document 245317-004, 245318-004, 245319-004 October 2002. Volume 1 +includes an Itanium optimization guide. + +Intel Itanium Processor-specific Application Binary Interface (ABI), Intel +document 245370-003, May 2001. Describes C type sizes, dynamic linking, +etc. + +Intel Itanium Architecture Assembly Language Reference Guide, Intel document +248801-004, 2000-2002. Describes assembly instruction syntax and other +directives. + +Itanium Software Conventions and Runtime Architecture Guide, Intel document +245358-003, May 2001. Describes calling conventions, including stack +unwinding requirements. + +Intel Itanium Processor Reference Manual for Software Optimization, Intel +document 245473-003, November 2001. + +Intel Itanium-2 Processor Reference Manual for Software Development and +Optimization, Intel document 251110-003, May 2004. + +All the above documents can be found online at + + http://developer.intel.com/design/itanium/manuals.htm diff --git a/gmp-6.3.0/mpn/ia64/add_n_sub_n.asm b/gmp-6.3.0/mpn/ia64/add_n_sub_n.asm new file mode 100644 index 0000000..c15afaa --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/add_n_sub_n.asm @@ -0,0 +1,307 @@ +dnl IA-64 mpn_add_n_sub_n -- mpn parallel addition and subtraction. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: ? 
+C Itanium 2: 2.25 + +C INPUT PARAMETERS +define(`sp', `r32') +define(`dp', `r33') +define(`up', `r34') +define(`vp', `r35') +define(`n', `r36') + +C Some useful aliases for registers we use +define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19') +define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23') +define(`s0',`r24') define(`s1',`r25') define(`s2',`r26') define(`s3',`r27') +define(`d0',`r28') define(`d1',`r29') define(`d2',`r30') define(`d3',`r31') +define(`up0',`up') +define(`up1',`r14') +define(`vp0',`vp') +define(`vp1',`r15') + + +ASM_START() +PROLOGUE(mpn_add_n_sub_n) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32',` + addp4 sp = 0, sp C M I + addp4 dp = 0, dp C M I + nop.i 0 + addp4 up = 0, up C M I + addp4 vp = 0, vp C M I + zxt4 n = n C I + ;; +') + + and r9 = 3, n C M I + mov.i r2 = ar.lc C I0 + add up1 = 8, up0 C M I + add vp1 = 8, vp0 C M I + add r8 = -2, n C M I + add r10 = 256, up C M I + ;; + shr.u r8 = r8, 2 C I0 + cmp.eq p10, p0 = 0, r9 C M I + cmp.eq p11, p0 = 2, r9 C M I + cmp.eq p12, p0 = 3, r9 C M I + add r11 = 256, vp C M I + ;; + mov.i ar.lc = r8 C I0 + (p10) br L(b0) C B + (p11) br L(b2) C B + (p12) br L(b3) C B + +L(b1): ld8 u3 = [up0], 8 C M01 + add up1 = 8, up1 C M I + cmpltu p14, p15 = 4, n C M I + ld8 v3 = [vp0], 8 C M01 + add vp1 = 8, vp1 C M I + ;; + add s3 = u3, v3 C M I + sub d3 = u3, v3 C M I + mov r8 = 0 C M I + ;; + cmpltu p9, p0 = s3, v3 C carry from add3 M I + cmpltu p13, p0 = u3, v3 C borrow from sub3 M I + (p15) br L(cj1) C B + st8 [sp] = s3, 8 C M23 + st8 [dp] = d3, 8 C M23 + br L(c0) C B + +L(b0): cmp.ne p9, p0 = r0, r0 C M I + cmp.ne p13, p0 = r0, r0 C M I +L(c0): ld8 u0 = [up0], 16 C M01 + ld8 u1 = [up1], 16 C M01 + ;; + ld8 v0 = [vp0], 16 C M01 + ld8 v1 = [vp1], 16 C M01 + ;; + ld8 u2 = [up0], 16 C M01 + ld8 u3 = [up1], 16 C M01 + ;; + ld8 v2 = [vp0], 16 C M01 + ld8 v3 = [vp1], 16 C M01 + ;; + add s0 = u0, v0 C M I + add s1 = u1, v1 C M I + sub d0 = u0, v0 C M I + sub d1 = u1, v1 C M I + ;; + cmpltu p6, p0 = s0, v0 C carry from add0 M I + cmpltu p7, p0 = s1, v1 C carry from add1 M I + cmpltu p10, p0 = u0, v0 C borrow from sub0 M I + cmpltu p11, p0 = u1, v1 C borrow from sub1 M I + ;; + nop 0 C + br.cloop.dptk L(top) C B + br L(end) C B + +L(b3): ld8 u1 = [up0], 8 C M01 + add up1 = 8, up1 C M I + ld8 v1 = [vp0], 8 C M01 + ;; + add vp1 = 8, vp1 C M I + add s1 = u1, v1 C M I + sub d1 = u1, v1 C M I + ;; + cmpltu p7, p0 = s1, v1 C carry from add1 M I + cmpltu p11, p0 = u1, v1 C borrow from sub1 M I + ;; + st8 [sp] = s1, 8 C M23 + st8 [dp] = d1, 8 C M23 + br L(c2) C B + + ALIGN(32) +L(b2): cmp.ne p7, p0 = r0, r0 C M I + cmp.ne p11, p0 = r0, r0 C M I + nop 0 +L(c2): ld8 u2 = [up0], 16 C M01 + ld8 u3 = [up1], 16 C M01 + cmpltu p14, p0 = 4, n C M I + ;; + ld8 v2 = [vp0], 16 C M01 + ld8 v3 = [vp1], 16 C M01 + (p14) br L(gt4) C B + ;; + add s2 = u2, v2 C M I + add s3 = u3, v3 C M I + sub d2 = u2, v2 C M I + sub d3 = u3, v3 C M I + ;; + cmpltu p8, p0 = s2, v2 C carry from add0 M I + cmpltu p9, p0 = s3, v3 C carry from add3 M I + cmpltu p12, p0 = u2, v2 C borrow from sub2 M I + cmpltu p13, p0 = u3, v3 C borrow from sub3 M I + br L(cj2) C B + ;; +L(gt4): ld8 u0 = [up0], 16 C M01 + ld8 u1 = [up1], 16 C M01 + ;; + ld8 v0 = [vp0], 16 C M01 + ld8 v1 = [vp1], 16 C M01 + ;; + add s2 = u2, v2 C M I + add s3 = u3, v3 C M I + sub d2 = u2, v2 C M I + sub d3 = u3, v3 C M I + ;; + cmpltu p8, p0 = s2, v2 C carry from add0 M I + cmpltu p9, p0 = s3, v3 C carry from add1 M I + cmpltu p12, p0 = u2, v2 C borrow from sub0 
M I + cmpltu p13, p0 = u3, v3 C borrow from sub1 M I + br.cloop.dptk L(mid) C B + + ALIGN(32) +L(top): + ld8 u0 = [up0], 16 C M01 + ld8 u1 = [up1], 16 C M01 + (p9) cmpeqor p6, p0 = -1, s0 C M I + (p9) add s0 = 1, s0 C M I + (p13) cmpeqor p10, p0 = 0, d0 C M I + (p13) add d0 = -1, d0 C M I + ;; + ld8 v0 = [vp0], 16 C M01 + ld8 v1 = [vp1], 16 C M01 + (p6) cmpeqor p7, p0 = -1, s1 C M I + (p6) add s1 = 1, s1 C M I + (p10) cmpeqor p11, p0 = 0, d1 C M I + (p10) add d1 = -1, d1 C M I + ;; + st8 [sp] = s0, 8 C M23 + st8 [dp] = d0, 8 C M23 + add s2 = u2, v2 C M I + add s3 = u3, v3 C M I + sub d2 = u2, v2 C M I + sub d3 = u3, v3 C M I + ;; + st8 [sp] = s1, 8 C M23 + st8 [dp] = d1, 8 C M23 + cmpltu p8, p0 = s2, v2 C carry from add2 M I + cmpltu p9, p0 = s3, v3 C carry from add3 M I + cmpltu p12, p0 = u2, v2 C borrow from sub2 M I + cmpltu p13, p0 = u3, v3 C borrow from sub3 M I + ;; +L(mid): + ld8 u2 = [up0], 16 C M01 + ld8 u3 = [up1], 16 C M01 + (p7) cmpeqor p8, p0 = -1, s2 C M I + (p7) add s2 = 1, s2 C M I + (p11) cmpeqor p12, p0 = 0, d2 C M I + (p11) add d2 = -1, d2 C M I + ;; + ld8 v2 = [vp0], 16 C M01 + ld8 v3 = [vp1], 16 C M01 + (p8) cmpeqor p9, p0 = -1, s3 C M I + (p8) add s3 = 1, s3 C M I + (p12) cmpeqor p13, p0 = 0, d3 C M I + (p12) add d3 = -1, d3 C M I + ;; + st8 [sp] = s2, 8 C M23 + st8 [dp] = d2, 8 C M23 + add s0 = u0, v0 C M I + add s1 = u1, v1 C M I + sub d0 = u0, v0 C M I + sub d1 = u1, v1 C M I + ;; + st8 [sp] = s3, 8 C M23 + st8 [dp] = d3, 8 C M23 + cmpltu p6, p0 = s0, v0 C carry from add0 M I + cmpltu p7, p0 = s1, v1 C carry from add1 M I + cmpltu p10, p0 = u0, v0 C borrow from sub0 M I + cmpltu p11, p0 = u1, v1 C borrow from sub1 M I + ;; + lfetch [r10], 32 C M? + lfetch [r11], 32 C M? + br.cloop.dptk L(top) C B + ;; + +L(end): + nop 0 + nop 0 + (p9) cmpeqor p6, p0 = -1, s0 C M I + (p9) add s0 = 1, s0 C M I + (p13) cmpeqor p10, p0 = 0, d0 C M I + (p13) add d0 = -1, d0 C M I + ;; + nop 0 + nop 0 + (p6) cmpeqor p7, p0 = -1, s1 C M I + (p6) add s1 = 1, s1 C M I + (p10) cmpeqor p11, p0 = 0, d1 C M I + (p10) add d1 = -1, d1 C M I + ;; + st8 [sp] = s0, 8 C M23 + st8 [dp] = d0, 8 C M23 + add s2 = u2, v2 C M I + add s3 = u3, v3 C M I + sub d2 = u2, v2 C M I + sub d3 = u3, v3 C M I + ;; + st8 [sp] = s1, 8 C M23 + st8 [dp] = d1, 8 C M23 + cmpltu p8, p0 = s2, v2 C carry from add2 M I + cmpltu p9, p0 = s3, v3 C carry from add3 M I + cmpltu p12, p0 = u2, v2 C borrow from sub2 M I + cmpltu p13, p0 = u3, v3 C borrow from sub3 M I + ;; +L(cj2): + (p7) cmpeqor p8, p0 = -1, s2 C M I + (p7) add s2 = 1, s2 C M I + (p11) cmpeqor p12, p0 = 0, d2 C M I + (p11) add d2 = -1, d2 C M I + mov r8 = 0 C M I + nop 0 + ;; + st8 [sp] = s2, 8 C M23 + st8 [dp] = d2, 8 C M23 + (p8) cmpeqor p9, p0 = -1, s3 C M I + (p8) add s3 = 1, s3 C M I + (p12) cmpeqor p13, p0 = 0, d3 C M I + (p12) add d3 = -1, d3 C M I + ;; +L(cj1): + (p9) mov r8 = 2 C M I + ;; + mov.i ar.lc = r2 C I0 + (p13) add r8 = 1, r8 C M I + st8 [sp] = s3 C M23 + st8 [dp] = d3 C M23 + br.ret.sptk.many b0 C B +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/addmul_1.asm b/gmp-6.3.0/mpn/ia64/addmul_1.asm new file mode 100644 index 0000000..ffa3297 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/addmul_1.asm @@ -0,0 +1,602 @@ +dnl IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the +dnl result to a second limb vector. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2000-2005, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 3.0 +C Itanium 2: 2.0 + +C TODO +C * Further optimize feed-in and wind-down code, both for speed and code size. +C * Handle low limb input and results specially, using a common stf8 in the +C epilogue. +C * Use 1 c/l carry propagation scheme in wind-down code. +C * Use extra pointer registers for `up' and rp to speed up feed-in loads. +C * Work out final differences with mul_1.asm. That function is 300 bytes +C smaller than this due to better loop scheduling and thus simpler feed-in +C code. + +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`n', `r34') +define(`vl', `r35') + +ASM_START() +PROLOGUE(mpn_addmul_1) + .prologue + .save ar.lc, r2 + .body + +ifdef(`HAVE_ABI_32', +` addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + zxt4 n = n C I + ;; +') +{.mmi + adds r15 = -1, n C M I + mov r20 = rp C M I + mov.i r2 = ar.lc C I0 +} +{.mmi + ldf8 f7 = [up], 8 C M + ldf8 f8 = [rp], 8 C M + and r14 = 3, n C M I + ;; +} +{.mmi + setf.sig f6 = vl C M2 M3 + cmp.eq p10, p0 = 0, r14 C M I + shr.u r31 = r15, 2 C I0 +} +{.mmi + cmp.eq p11, p0 = 2, r14 C M I + cmp.eq p12, p0 = 3, r14 C M I + nop.i 0 C I + ;; +} +{.mii + cmp.ne p6, p7 = r0, r0 C M I + mov.i ar.lc = r31 C I0 + cmp.ne p8, p9 = r0, r0 C M I +} +{.bbb + (p10) br.dptk .Lb00 C B + (p11) br.dptk .Lb10 C B + (p12) br.dptk .Lb11 C B + ;; +} + +.Lb01: br.cloop.dptk .grt1 C B + + xma.l f39 = f7, f6, f8 C F + xma.hu f43 = f7, f6, f8 C F + ;; + getf.sig r8 = f43 C M2 + stf8 [r20] = f39 C M2 M3 + mov.i ar.lc = r2 C I0 + br.ret.sptk.many b0 C B + +.grt1: + ldf8 f32 = [up], 8 + ldf8 f44 = [rp], 8 + ;; + ldf8 f33 = [up], 8 + ldf8 f45 = [rp], 8 + ;; + ldf8 f34 = [up], 8 + xma.l f39 = f7, f6, f8 + ldf8 f46 = [rp], 8 + xma.hu f43 = f7, f6, f8 + ;; + ldf8 f35 = [up], 8 + ldf8 f47 = [rp], 8 + br.cloop.dptk .grt5 + + xma.l f36 = f32, f6, f44 + xma.hu f40 = f32, f6, f44 + ;; + stf8 [r20] = f39, 8 + xma.l f37 = f33, f6, f45 + xma.hu f41 = f33, f6, f45 + ;; + getf.sig r31 = f43 + getf.sig r24 = f36 + xma.l f38 = f34, f6, f46 + xma.hu f42 = f34, f6, f46 + ;; + getf.sig r28 = f40 + getf.sig r25 = f37 + xma.l f39 = f35, f6, f47 + xma.hu f43 = f35, f6, f47 + ;; + getf.sig r29 = f41 + getf.sig r26 = f38 + br .Lcj5 + +.grt5: + mov r30 = 0 + xma.l f36 = f32, f6, f44 + xma.hu f40 = f32, f6, f44 + ;; + ldf8 f32 = [up], 8 + xma.l f37 = f33, f6, f45 + ldf8 f44 = [rp], 8 + xma.hu f41 = f33, f6, f45 + ;; + ldf8 f33 = [up], 8 + getf.sig r27 = f39 + ;; + getf.sig r31 = f43 + xma.l f38 = f34, f6, f46 + ldf8 f45 = [rp], 8 + xma.hu f42 = 
f34, f6, f46 + ;; + ldf8 f34 = [up], 8 + getf.sig r24 = f36 + ;; + getf.sig r28 = f40 + xma.l f39 = f35, f6, f47 + ldf8 f46 = [rp], 8 + xma.hu f43 = f35, f6, f47 + ;; + ldf8 f35 = [up], 8 + getf.sig r25 = f37 + br.cloop.dptk .Loop + br .Le0 + + +.Lb10: ldf8 f35 = [up], 8 + ldf8 f47 = [rp], 8 + br.cloop.dptk .grt2 + + xma.l f38 = f7, f6, f8 + xma.hu f42 = f7, f6, f8 + ;; + xma.l f39 = f35, f6, f47 + xma.hu f43 = f35, f6, f47 + ;; + getf.sig r30 = f42 + stf8 [r20] = f38, 8 + getf.sig r27 = f39 + getf.sig r8 = f43 + br .Lcj2 + +.grt2: + ldf8 f32 = [up], 8 + ldf8 f44 = [rp], 8 + ;; + ldf8 f33 = [up], 8 + xma.l f38 = f7, f6, f8 + ldf8 f45 = [rp], 8 + xma.hu f42 = f7, f6, f8 + ;; + ldf8 f34 = [up], 8 + xma.l f39 = f35, f6, f47 + ldf8 f46 = [rp], 8 + xma.hu f43 = f35, f6, f47 + ;; + ldf8 f35 = [up], 8 + ldf8 f47 = [rp], 8 + br.cloop.dptk .grt6 + + stf8 [r20] = f38, 8 + xma.l f36 = f32, f6, f44 + xma.hu f40 = f32, f6, f44 + ;; + getf.sig r30 = f42 + getf.sig r27 = f39 + xma.l f37 = f33, f6, f45 + xma.hu f41 = f33, f6, f45 + ;; + getf.sig r31 = f43 + getf.sig r24 = f36 + xma.l f38 = f34, f6, f46 + xma.hu f42 = f34, f6, f46 + ;; + getf.sig r28 = f40 + getf.sig r25 = f37 + xma.l f39 = f35, f6, f47 + xma.hu f43 = f35, f6, f47 + br .Lcj6 + +.grt6: + mov r29 = 0 + xma.l f36 = f32, f6, f44 + xma.hu f40 = f32, f6, f44 + ;; + ldf8 f32 = [up], 8 + getf.sig r26 = f38 + ;; + getf.sig r30 = f42 + xma.l f37 = f33, f6, f45 + ldf8 f44 = [rp], 8 + xma.hu f41 = f33, f6, f45 + ;; + ldf8 f33 = [up], 8 + getf.sig r27 = f39 + ;; + getf.sig r31 = f43 + xma.l f38 = f34, f6, f46 + ldf8 f45 = [rp], 8 + xma.hu f42 = f34, f6, f46 + ;; + ldf8 f34 = [up], 8 + getf.sig r24 = f36 + br .LL10 + + +.Lb11: ldf8 f34 = [up], 8 + ldf8 f46 = [rp], 8 + ;; + ldf8 f35 = [up], 8 + ldf8 f47 = [rp], 8 + br.cloop.dptk .grt3 + ;; + + xma.l f37 = f7, f6, f8 + xma.hu f41 = f7, f6, f8 + xma.l f38 = f34, f6, f46 + xma.hu f42 = f34, f6, f46 + xma.l f39 = f35, f6, f47 + xma.hu f43 = f35, f6, f47 + ;; + getf.sig r29 = f41 + stf8 [r20] = f37, 8 + getf.sig r26 = f38 + getf.sig r30 = f42 + getf.sig r27 = f39 + getf.sig r8 = f43 + br .Lcj3 + +.grt3: + ldf8 f32 = [up], 8 + xma.l f37 = f7, f6, f8 + ldf8 f44 = [rp], 8 + xma.hu f41 = f7, f6, f8 + ;; + ldf8 f33 = [up], 8 + xma.l f38 = f34, f6, f46 + ldf8 f45 = [rp], 8 + xma.hu f42 = f34, f6, f46 + ;; + ldf8 f34 = [up], 8 + xma.l f39 = f35, f6, f47 + ldf8 f46 = [rp], 8 + xma.hu f43 = f35, f6, f47 + ;; + ldf8 f35 = [up], 8 + getf.sig r25 = f37 C FIXME + ldf8 f47 = [rp], 8 + br.cloop.dptk .grt7 + + getf.sig r29 = f41 + stf8 [r20] = f37, 8 C FIXME + xma.l f36 = f32, f6, f44 + getf.sig r26 = f38 + xma.hu f40 = f32, f6, f44 + ;; + getf.sig r30 = f42 + xma.l f37 = f33, f6, f45 + getf.sig r27 = f39 + xma.hu f41 = f33, f6, f45 + ;; + getf.sig r31 = f43 + xma.l f38 = f34, f6, f46 + getf.sig r24 = f36 + xma.hu f42 = f34, f6, f46 + br .Lcj7 + +.grt7: + getf.sig r29 = f41 + xma.l f36 = f32, f6, f44 + mov r28 = 0 + xma.hu f40 = f32, f6, f44 + ;; + ldf8 f32 = [up], 8 + getf.sig r26 = f38 + ;; + getf.sig r30 = f42 + xma.l f37 = f33, f6, f45 + ldf8 f44 = [rp], 8 + xma.hu f41 = f33, f6, f45 + ;; + ldf8 f33 = [up], 8 + getf.sig r27 = f39 + br .LL11 + + +.Lb00: ldf8 f33 = [up], 8 + ldf8 f45 = [rp], 8 + ;; + ldf8 f34 = [up], 8 + ldf8 f46 = [rp], 8 + ;; + ldf8 f35 = [up], 8 + xma.l f36 = f7, f6, f8 + ldf8 f47 = [rp], 8 + xma.hu f40 = f7, f6, f8 + br.cloop.dptk .grt4 + + xma.l f37 = f33, f6, f45 + xma.hu f41 = f33, f6, f45 + xma.l f38 = f34, f6, f46 + xma.hu f42 = f34, f6, f46 + ;; + getf.sig r28 = f40 + stf8 [r20] = f36, 8 + xma.l 
f39 = f35, f6, f47 + getf.sig r25 = f37 + xma.hu f43 = f35, f6, f47 + ;; + getf.sig r29 = f41 + getf.sig r26 = f38 + getf.sig r30 = f42 + getf.sig r27 = f39 + br .Lcj4 + +.grt4: + ldf8 f32 = [up], 8 + xma.l f37 = f33, f6, f45 + ldf8 f44 = [rp], 8 + xma.hu f41 = f33, f6, f45 + ;; + ldf8 f33 = [up], 8 + xma.l f38 = f34, f6, f46 + ldf8 f45 = [rp], 8 + xma.hu f42 = f34, f6, f46 + ;; + ldf8 f34 = [up], 8 + getf.sig r24 = f36 C FIXME + xma.l f39 = f35, f6, f47 + ldf8 f46 = [rp], 8 + getf.sig r28 = f40 + xma.hu f43 = f35, f6, f47 + ;; + ldf8 f35 = [up], 8 + getf.sig r25 = f37 + ldf8 f47 = [rp], 8 + br.cloop.dptk .grt8 + + getf.sig r29 = f41 + stf8 [r20] = f36, 8 C FIXME + xma.l f36 = f32, f6, f44 + getf.sig r26 = f38 + getf.sig r30 = f42 + xma.hu f40 = f32, f6, f44 + ;; + xma.l f37 = f33, f6, f45 + getf.sig r27 = f39 + xma.hu f41 = f33, f6, f45 + br .Lcj8 + +.grt8: + getf.sig r29 = f41 + xma.l f36 = f32, f6, f44 + mov r31 = 0 + xma.hu f40 = f32, f6, f44 + ;; + ldf8 f32 = [up], 8 + getf.sig r26 = f38 + br .LL00 + + +C *** MAIN LOOP START *** + ALIGN(32) C insn fed cycle # +.Loop: + .pred.rel "mutex", p6, p7 C num by i1 i2 + getf.sig r29 = f41 C 00 16 0 0 + xma.l f36 = f32, f6, f44 C 01 06,15 0 0 + (p6) add r14 = r30, r27, 1 C 02 0 0 + ldf8 f47 = [rp], 8 C 03 0 0 + xma.hu f40 = f32, f6, f44 C 04 06,15 0 0 + (p7) add r14 = r30, r27 C 05 0 0 + ;; + .pred.rel "mutex", p6, p7 + ldf8 f32 = [up], 8 C 06 1 1 + (p6) cmp.leu p8, p9 = r14, r27 C 07 1 1 + (p7) cmp.ltu p8, p9 = r14, r27 C 08 1 1 + getf.sig r26 = f38 C 09 25 2 1 + st8 [r20] = r14, 8 C 10 2 1 + nop.b 0 C 11 2 1 + ;; +.LL00: + .pred.rel "mutex", p8, p9 + getf.sig r30 = f42 C 12 28 3 2 + xma.l f37 = f33, f6, f45 C 13 18,27 3 2 + (p8) add r16 = r31, r24, 1 C 14 3 2 + ldf8 f44 = [rp], 8 C 15 3 2 + xma.hu f41 = f33, f6, f45 C 16 18,27 3 2 + (p9) add r16 = r31, r24 C 17 3 2 + ;; + .pred.rel "mutex", p8, p9 + ldf8 f33 = [up], 8 C 18 4 3 + (p8) cmp.leu p6, p7 = r16, r24 C 19 4 3 + (p9) cmp.ltu p6, p7 = r16, r24 C 20 4 3 + getf.sig r27 = f39 C 21 37 5 3 + st8 [r20] = r16, 8 C 22 5 3 + nop.b 0 C 23 5 3 + ;; +.LL11: + .pred.rel "mutex", p6, p7 + getf.sig r31 = f43 C 24 40 6 4 + xma.l f38 = f34, f6, f46 C 25 30,39 6 4 + (p6) add r14 = r28, r25, 1 C 26 6 4 + ldf8 f45 = [rp], 8 C 27 6 4 + xma.hu f42 = f34, f6, f46 C 28 30,39 6 4 + (p7) add r14 = r28, r25 C 29 6 4 + ;; + .pred.rel "mutex", p6, p7 + ldf8 f34 = [up], 8 C 30 7 5 + (p6) cmp.leu p8, p9 = r14, r25 C 31 7 5 + (p7) cmp.ltu p8, p9 = r14, r25 C 32 7 5 + getf.sig r24 = f36 C 33 01 8 5 + st8 [r20] = r14, 8 C 34 8 5 + nop.b 0 C 35 8 5 + ;; +.LL10: + .pred.rel "mutex", p8, p9 + getf.sig r28 = f40 C 36 04 9 6 + xma.l f39 = f35, f6, f47 C 37 42,03 9 6 + (p8) add r16 = r29, r26, 1 C 38 9 6 + ldf8 f46 = [rp], 8 C 39 9 6 + xma.hu f43 = f35, f6, f47 C 40 42,03 9 6 + (p9) add r16 = r29, r26 C 41 9 6 + ;; + .pred.rel "mutex", p8, p9 + ldf8 f35 = [up], 8 C 42 10 7 + (p8) cmp.leu p6, p7 = r16, r26 C 43 10 7 + (p9) cmp.ltu p6, p7 = r16, r26 C 44 10 7 + getf.sig r25 = f37 C 45 13 11 7 + st8 [r20] = r16, 8 C 46 11 7 + br.cloop.dptk .Loop C 47 11 7 +C *** MAIN LOOP END *** + ;; +.Le0: + .pred.rel "mutex", p6, p7 + getf.sig r29 = f41 C + xma.l f36 = f32, f6, f44 C + (p6) add r14 = r30, r27, 1 C + ldf8 f47 = [rp], 8 C + xma.hu f40 = f32, f6, f44 C + (p7) add r14 = r30, r27 C + ;; + .pred.rel "mutex", p6, p7 + (p6) cmp.leu p8, p9 = r14, r27 C + (p7) cmp.ltu p8, p9 = r14, r27 C + getf.sig r26 = f38 C + st8 [r20] = r14, 8 C + ;; + .pred.rel "mutex", p8, p9 + getf.sig r30 = f42 C + xma.l f37 = f33, f6, f45 C + (p8) add r16 = 
r31, r24, 1 C + xma.hu f41 = f33, f6, f45 C + (p9) add r16 = r31, r24 C + ;; + .pred.rel "mutex", p8, p9 + (p8) cmp.leu p6, p7 = r16, r24 C + (p9) cmp.ltu p6, p7 = r16, r24 C + getf.sig r27 = f39 C + st8 [r20] = r16, 8 C + ;; +.Lcj8: + .pred.rel "mutex", p6, p7 + getf.sig r31 = f43 C + xma.l f38 = f34, f6, f46 C + (p6) add r14 = r28, r25, 1 C + xma.hu f42 = f34, f6, f46 C + (p7) add r14 = r28, r25 C + ;; + .pred.rel "mutex", p6, p7 + (p6) cmp.leu p8, p9 = r14, r25 C + (p7) cmp.ltu p8, p9 = r14, r25 C + getf.sig r24 = f36 C + st8 [r20] = r14, 8 C + ;; +.Lcj7: + .pred.rel "mutex", p8, p9 + getf.sig r28 = f40 C + xma.l f39 = f35, f6, f47 C + (p8) add r16 = r29, r26, 1 C + xma.hu f43 = f35, f6, f47 C + (p9) add r16 = r29, r26 C + ;; + .pred.rel "mutex", p8, p9 + (p8) cmp.leu p6, p7 = r16, r26 C + (p9) cmp.ltu p6, p7 = r16, r26 C + getf.sig r25 = f37 C + st8 [r20] = r16, 8 C + ;; +.Lcj6: + .pred.rel "mutex", p6, p7 + getf.sig r29 = f41 C + (p6) add r14 = r30, r27, 1 C + (p7) add r14 = r30, r27 C + ;; + .pred.rel "mutex", p6, p7 + (p6) cmp.leu p8, p9 = r14, r27 C + (p7) cmp.ltu p8, p9 = r14, r27 C + getf.sig r26 = f38 C + st8 [r20] = r14, 8 C + ;; +.Lcj5: + .pred.rel "mutex", p8, p9 + getf.sig r30 = f42 C + (p8) add r16 = r31, r24, 1 C + (p9) add r16 = r31, r24 C + ;; + .pred.rel "mutex", p8, p9 + (p8) cmp.leu p6, p7 = r16, r24 C + (p9) cmp.ltu p6, p7 = r16, r24 C + getf.sig r27 = f39 C + st8 [r20] = r16, 8 C + ;; +.Lcj4: + .pred.rel "mutex", p6, p7 + getf.sig r8 = f43 C + (p6) add r14 = r28, r25, 1 C + (p7) add r14 = r28, r25 C + ;; + .pred.rel "mutex", p6, p7 + st8 [r20] = r14, 8 C + (p6) cmp.leu p8, p9 = r14, r25 C + (p7) cmp.ltu p8, p9 = r14, r25 C + ;; +.Lcj3: + .pred.rel "mutex", p8, p9 + (p8) add r16 = r29, r26, 1 C + (p9) add r16 = r29, r26 C + ;; + .pred.rel "mutex", p8, p9 + st8 [r20] = r16, 8 C + (p8) cmp.leu p6, p7 = r16, r26 C + (p9) cmp.ltu p6, p7 = r16, r26 C + ;; +.Lcj2: + .pred.rel "mutex", p6, p7 + (p6) add r14 = r30, r27, 1 C + (p7) add r14 = r30, r27 C + ;; + .pred.rel "mutex", p6, p7 + st8 [r20] = r14 C + (p6) cmp.leu p8, p9 = r14, r27 C + (p7) cmp.ltu p8, p9 = r14, r27 C + ;; + (p8) add r8 = 1, r8 C M I + mov.i ar.lc = r2 C I0 + br.ret.sptk.many b0 C B +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/addmul_2.asm b/gmp-6.3.0/mpn/ia64/addmul_2.asm new file mode 100644 index 0000000..86e8de4 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/addmul_2.asm @@ -0,0 +1,715 @@ +dnl IA-64 mpn_addmul_2 -- Multiply an n-limb number with a 2-limb number and +dnl add the result to a (n+1)-limb number. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2004, 2005, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details.
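In portable terms the contract just stated is: {rp, n+1} += {up, n} * {vp, 2}, returning the most significant limb of the (n+2)-limb result. A hedged sketch built from public mpn entry points (ref_addmul_2 is an illustrative name; it assumes the accumulated result fits in n+2 limbs, as it does in mul_basecase-style use):

    #include <gmp.h>

    mp_limb_t
    ref_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
    {
      /* First pass: rp[0..n-1] += up*v0; fold the carry into rp[n]. */
      mp_limb_t c0 = mpn_addmul_1 (rp, up, n, vp[0]);
      mp_limb_t c1 = mpn_add_1 (rp + n, rp + n, 1, c0);
      /* Second pass: rp[1..n] += up*v1, one limb position higher. */
      mp_limb_t c2 = mpn_addmul_1 (rp + 1, up, n, vp[1]);
      return c1 + c2;   /* top limb; cannot wrap when the result fits */
    }

The assembly below fuses the two passes into one software-pipelined loop instead of making two sweeps over the operands.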
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 3.65 +C Itanium 2: 1.625 + +C TODO +C * Clean up variable names, and try to decrease the number of distinct +C registers used. +C * Clean up feed-in code to not require zeroing several registers. +C * Make sure we don't depend on uninitialised predicate registers. +C * Could perhaps save a few cycles by using 1 c/l carry propagation in +C wind-down code. +C * Ultimately rewrite. The problem with this code is that it first uses a +C loaded u value in one xma pair, then leaves it live over several unrelated +C xma pairs, before it uses it again. It should actually be quite possible +C to just swap some aligned xma pairs around. But we should then schedule +C u loads further from the first use. + +C INPUT PARAMETERS +define(`rp',`r32') +define(`up',`r33') +define(`n',`r34') +define(`vp',`r35') + +define(`srp',`r3') + +define(`v0',`f6') +define(`v1',`f7') + +define(`s0',`r14') +define(`acc0',`r15') + +define(`pr0_0',`r16') define(`pr0_1',`r17') +define(`pr0_2',`r18') define(`pr0_3',`r19') + +define(`pr1_0',`r20') define(`pr1_1',`r21') +define(`pr1_2',`r22') define(`pr1_3',`r23') + +define(`acc1_0',`r24') define(`acc1_1',`r25') +define(`acc1_2',`r26') define(`acc1_3',`r27') + +dnl define(`',`r28') +dnl define(`',`r29') +dnl define(`',`r30') +dnl define(`',`r31') + +define(`fp0b_0',`f8') define(`fp0b_1',`f9') +define(`fp0b_2',`f10') define(`fp0b_3',`f11') + +define(`fp1a_0',`f12') define(`fp1a_1',`f13') +define(`fp1a_2',`f14') define(`fp1a_3',`f15') + +define(`fp1b_0',`f32') define(`fp1b_1',`f33') +define(`fp1b_2',`f34') define(`fp1b_3',`f35') + +define(`fp2a_0',`f36') define(`fp2a_1',`f37') +define(`fp2a_2',`f38') define(`fp2a_3',`f39') + +define(`r_0',`f40') define(`r_1',`f41') +define(`r_2',`f42') define(`r_3',`f43') + +define(`u_0',`f44') define(`u_1',`f45') +define(`u_2',`f46') define(`u_3',`f47') + +define(`rx',`f48') +define(`ux',`f49') +define(`ry',`f50') +define(`uy',`f51') + +ASM_START() +PROLOGUE(mpn_addmul_2s) + .prologue + .save ar.lc, r2 + .body + +ifdef(`HAVE_ABI_32',` + {.mmi; addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + addp4 vp = 0, vp C M I +}{.mmi; nop 1 + nop 1 + zxt4 n = n C I + ;; +}') + + {.mmi; ldf8 ux = [up], 8 C M + ldf8 v0 = [vp], 8 C M + mov r2 = ar.lc C I0 +}{.mmi; ldf8 rx = [rp], 8 C M + and r14 = 3, n C M I + add n = -2, n C M I + ;; +}{.mmi; ldf8 uy = [up], 8 C M + ldf8 v1 = [vp] C M + shr.u n = n, 2 C I0 +}{.mmi; ldf8 ry = [rp], -8 C M + cmp.eq p14, p0 = 1, r14 C M I + cmp.eq p11, p0 = 2, r14 C M I + ;; +}{.mmi; add srp = 16, rp C M I + cmp.eq p15, p0 = 3, r14 C M I + mov ar.lc = n C I0 +}{.bbb; (p14) br.dptk L(x01) C B + (p11) br.dptk L(x10) C B + (p15) br.dptk L(x11) C B + ;; +} +L(x00): cmp.ne p6, p0 = r0, r0 C suppress initial xma pair + mov fp2a_3 = f0 + br L(b00) +L(x01): cmp.ne p14, p0 = r0, r0 C suppress initial xma pair + mov fp2a_2 = f0 + br L(b01) +L(x10): cmp.ne p11, p0 = r0, r0 C suppress initial xma pair + mov fp2a_1 = f0 + br L(b10) +L(x11): cmp.ne p15, p0 = r0, r0 C suppress initial xma pair + mov fp2a_0 = f0 + br L(b11) + +EPILOGUE() + +PROLOGUE(mpn_addmul_2) + .prologue + .save ar.lc, r2 + .body + +ifdef(`HAVE_ABI_32',` + {.mmi; addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + addp4 vp = 0, vp C M I +}{.mmi; nop 1 + nop 1 + zxt4 n = n C I + ;; +}') + + {.mmi; ldf8 ux = [up], 8 C M + 
ldf8 v0 = [vp], 8 C M + mov r2 = ar.lc C I0 +}{.mmi; ldf8 rx = [rp], 8 C M + and r14 = 3, n C M I + add n = -2, n C M I + ;; +}{.mmi; ldf8 uy = [up], 8 C M + ldf8 v1 = [vp] C M + shr.u n = n, 2 C I0 +}{.mmi; ldf8 ry = [rp], -8 C M + cmp.eq p14, p0 = 1, r14 C M I + cmp.eq p11, p0 = 2, r14 C M I + ;; +}{.mmi; add srp = 16, rp C M I + cmp.eq p15, p6 = 3, r14 C M I + mov ar.lc = n C I0 +}{.bbb; (p14) br.dptk L(b01) C B + (p11) br.dptk L(b10) C B + (p15) br.dptk L(b11) C B + ;; +} + ALIGN(32) +L(b00): + {.mmi; ldf8 r_1 = [srp], 8 + ldf8 u_1 = [up], 8 + mov acc1_2 = 0 +}{.mmi; mov pr1_2 = 0 + mov pr0_3 = 0 + cmp.ne p8, p9 = r0, r0 + ;; +}{.mfi; ldf8 r_2 = [srp], 8 + xma.l fp0b_3 = ux, v0, rx + cmp.ne p12, p13 = r0, r0 +}{.mfb; ldf8 u_2 = [up], 8 + xma.hu fp1b_3 = ux, v0, rx + br.cloop.dptk L(gt4) +} + xma.l fp0b_0 = uy, v0, ry + xma.hu fp1a_0 = uy, v0, ry + ;; + getfsig acc0 = fp0b_3 + (p6) xma.hu fp2a_3 = ux, v1, fp1b_3 C suppressed for addmul_2s + (p6) xma.l fp1b_3 = ux, v1, fp1b_3 C suppressed for addmul_2s + ;; + xma.l fp0b_1 = u_1, v0, r_1 + xma.hu fp1a_1 = u_1, v0, r_1 + ;; + getfsig pr0_0 = fp0b_0 + xma.l fp1b_0 = uy, v1, fp1a_0 + xma.hu fp2a_0 = uy, v1, fp1a_0 + ;; + getfsig pr1_3 = fp1b_3 + getfsig acc1_3 = fp2a_3 + xma.l fp0b_2 = u_2, v0, r_2 + xma.hu fp1a_2 = u_2, v0, r_2 + br L(cj4) + +L(gt4): xma.l fp0b_0 = uy, v0, ry + xma.hu fp1a_0 = uy, v0, ry + ;; + ldf8 r_3 = [srp], 8 + getfsig acc0 = fp0b_3 + (p6) xma.hu fp2a_3 = ux, v1, fp1b_3 C suppressed for addmul_2s + ldf8 u_3 = [up], 8 + (p6) xma.l fp1b_3 = ux, v1, fp1b_3 C suppressed for addmul_2s + ;; + xma.l fp0b_1 = u_1, v0, r_1 + xma.hu fp1a_1 = u_1, v0, r_1 + ;; + ldf8 r_0 = [srp], 8 + getfsig pr0_0 = fp0b_0 + xma.l fp1b_0 = uy, v1, fp1a_0 + xma.hu fp2a_0 = uy, v1, fp1a_0 + ;; + ldf8 u_0 = [up], 8 + getfsig pr1_3 = fp1b_3 + xma.l fp0b_2 = u_2, v0, r_2 + ;; + getfsig acc1_3 = fp2a_3 + xma.hu fp1a_2 = u_2, v0, r_2 + br L(00) + + + ALIGN(32) +L(b01): + {.mmi; ldf8 r_0 = [srp], 8 C M + ldf8 u_0 = [up], 8 C M + mov acc1_1 = 0 C M I +}{.mmi; mov pr1_1 = 0 C M I + mov pr0_2 = 0 C M I + cmp.ne p6, p7 = r0, r0 C M I + ;; +}{.mfi; ldf8 r_1 = [srp], 8 C M + xma.l fp0b_2 = ux, v0, rx C F + cmp.ne p10, p11 = r0, r0 C M I +}{.mfi; ldf8 u_1 = [up], 8 C M + xma.hu fp1b_2 = ux, v0, rx C F + nop 1 + ;; +} xma.l fp0b_3 = uy, v0, ry C F + xma.hu fp1a_3 = uy, v0, ry C F + ;; + {.mmf; getfsig acc0 = fp0b_2 C M + ldf8 r_2 = [srp], 8 C M + (p14) xma.hu fp2a_2 = ux, v1,fp1b_2 C F suppressed for addmul_2s +}{.mfb; ldf8 u_2 = [up], 8 C M + (p14) xma.l fp1b_2 = ux, v1,fp1b_2 C F suppressed for addmul_2s + br.cloop.dptk L(gt5) +} + xma.l fp0b_0 = u_0, v0, r_0 C F + xma.hu fp1a_0 = u_0, v0, r_0 C F + ;; + getfsig pr0_3 = fp0b_3 C M + xma.l fp1b_3 = uy, v1,fp1a_3 C F + xma.hu fp2a_3 = uy, v1,fp1a_3 C F + ;; + getfsig pr1_2 = fp1b_2 C M + getfsig acc1_2 = fp2a_2 C M + xma.l fp0b_1 = u_1, v0, r_1 C F + xma.hu fp1a_1 = u_1, v0, r_1 C F + br L(cj5) + +L(gt5): xma.l fp0b_0 = u_0, v0, r_0 + xma.hu fp1a_0 = u_0, v0, r_0 + ;; + getfsig pr0_3 = fp0b_3 + ldf8 r_3 = [srp], 8 + xma.l fp1b_3 = uy, v1, fp1a_3 + xma.hu fp2a_3 = uy, v1, fp1a_3 + ;; + ldf8 u_3 = [up], 8 + getfsig pr1_2 = fp1b_2 + xma.l fp0b_1 = u_1, v0, r_1 + ;; + getfsig acc1_2 = fp2a_2 + xma.hu fp1a_1 = u_1, v0, r_1 + br L(01) + + + ALIGN(32) +L(b10): br.cloop.dptk L(gt2) + xma.l fp0b_1 = ux, v0, rx + xma.hu fp1b_1 = ux, v0, rx + ;; + xma.l fp0b_2 = uy, v0, ry + xma.hu fp1a_2 = uy, v0, ry + ;; + stf8 [rp] = fp0b_1, 8 + (p11) xma.hu fp2a_1 = ux, v1, fp1b_1 C suppressed for addmul_2s + (p11) xma.l fp1b_1 = ux, 
v1, fp1b_1 C suppressed for addmul_2s + ;; + getfsig acc0 = fp0b_2 + xma.l fp1b_2 = uy, v1, fp1a_2 + xma.hu fp2a_2 = uy, v1, fp1a_2 + ;; + getfsig pr1_1 = fp1b_1 + getfsig acc1_1 = fp2a_1 + mov ar.lc = r2 + getfsig pr1_2 = fp1b_2 + getfsig r8 = fp2a_2 + ;; + add s0 = pr1_1, acc0 + ;; + st8 [rp] = s0, 8 + cmp.ltu p8, p9 = s0, pr1_1 + sub r31 = -1, acc1_1 + ;; + .pred.rel "mutex", p8, p9 + (p8) add acc0 = pr1_2, acc1_1, 1 + (p9) add acc0 = pr1_2, acc1_1 + (p8) cmp.leu p10, p0 = r31, pr1_2 + (p9) cmp.ltu p10, p0 = r31, pr1_2 + ;; + st8 [rp] = acc0, 8 + (p10) add r8 = 1, r8 + br.ret.sptk.many b0 + + +L(gt2): + {.mmi; ldf8 r_3 = [srp], 8 + ldf8 u_3 = [up], 8 + mov acc1_0 = 0 + ;; +}{.mfi; ldf8 r_0 = [srp], 8 + xma.l fp0b_1 = ux, v0, rx + mov pr1_0 = 0 +}{.mfi; ldf8 u_0 = [up], 8 + xma.hu fp1b_1 = ux, v0, rx + mov pr0_1 = 0 + ;; +} xma.l fp0b_2 = uy, v0, ry + xma.hu fp1a_2 = uy, v0, ry + ;; + getfsig acc0 = fp0b_1 + ldf8 r_1 = [srp], 8 + (p11) xma.hu fp2a_1 = ux, v1, fp1b_1 C suppressed for addmul_2s + (p11) xma.l fp1b_1 = ux, v1, fp1b_1 C suppressed for addmul_2s + ;; + ldf8 u_1 = [up], 8 + xma.l fp0b_3 = u_3, v0, r_3 + xma.hu fp1a_3 = u_3, v0, r_3 + ;; + getfsig pr0_2 = fp0b_2 + ldf8 r_2 = [srp], 8 + xma.l fp1b_2 = uy, v1, fp1a_2 + xma.hu fp2a_2 = uy, v1, fp1a_2 + ;; + ldf8 u_2 = [up], 8 + getfsig pr1_1 = fp1b_1 + ;; + {.mfi; getfsig acc1_1 = fp2a_1 + xma.l fp0b_0 = u_0, v0, r_0 + cmp.ne p8, p9 = r0, r0 +}{.mfb; cmp.ne p12, p13 = r0, r0 + xma.hu fp1a_0 = u_0, v0, r_0 + br.cloop.sptk.clr L(top) +} + br.many L(end) + + + ALIGN(32) +L(b11): ldf8 r_2 = [srp], 8 + mov pr1_3 = 0 + mov pr0_0 = 0 + ;; + ldf8 u_2 = [up], 8 + mov acc1_3 = 0 + br.cloop.dptk L(gt3) + ;; + cmp.ne p6, p7 = r0, r0 + xma.l fp0b_0 = ux, v0, rx + xma.hu fp1b_0 = ux, v0, rx + ;; + cmp.ne p10, p11 = r0, r0 + xma.l fp0b_1 = uy, v0, ry + xma.hu fp1a_1 = uy, v0, ry + ;; + getfsig acc0 = fp0b_0 + (p15) xma.hu fp2a_0 = ux, v1, fp1b_0 C suppressed for addmul_2s + (p15) xma.l fp1b_0 = ux, v1, fp1b_0 C suppressed for addmul_2s + ;; + xma.l fp0b_2 = uy, v1, r_2 + xma.hu fp1a_2 = uy, v1, r_2 + ;; + getfsig pr0_1 = fp0b_1 + xma.l fp1b_1 = u_2, v0, fp1a_1 + xma.hu fp2a_1 = u_2, v0, fp1a_1 + ;; + getfsig pr1_0 = fp1b_0 + getfsig acc1_0 = fp2a_0 + br L(cj3) + +L(gt3): ldf8 r_3 = [srp], 8 + xma.l fp0b_0 = ux, v0, rx + cmp.ne p10, p11 = r0, r0 + ldf8 u_3 = [up], 8 + xma.hu fp1b_0 = ux, v0, rx + cmp.ne p6, p7 = r0, r0 + ;; + xma.l fp0b_1 = uy, v0, ry + xma.hu fp1a_1 = uy, v0, ry + ;; + getfsig acc0 = fp0b_0 + ldf8 r_0 = [srp], 8 + (p15) xma.hu fp2a_0 = ux, v1, fp1b_0 C suppressed for addmul_2s + ldf8 u_0 = [up], 8 + (p15) xma.l fp1b_0 = ux, v1, fp1b_0 C suppressed for addmul_2s + ;; + xma.l fp0b_2 = u_2, v0, r_2 + xma.hu fp1a_2 = u_2, v0, r_2 + ;; + getfsig pr0_1 = fp0b_1 + ldf8 r_1 = [srp], 8 + xma.l fp1b_1 = uy, v1, fp1a_1 + xma.hu fp2a_1 = uy, v1, fp1a_1 + ;; + ldf8 u_1 = [up], 8 + getfsig pr1_0 = fp1b_0 + ;; + getfsig acc1_0 = fp2a_0 + xma.l fp0b_3 = u_3, v0, r_3 + xma.hu fp1a_3 = u_3, v0, r_3 + br L(11) + + +C *** MAIN LOOP START *** + ALIGN(32) +L(top): C 00 + .pred.rel "mutex", p12, p13 + getfsig pr0_3 = fp0b_3 + ldf8 r_3 = [srp], 8 + xma.l fp1b_3 = u_3, v1, fp1a_3 + (p12) add s0 = pr1_0, acc0, 1 + (p13) add s0 = pr1_0, acc0 + xma.hu fp2a_3 = u_3, v1, fp1a_3 + ;; C 01 + .pred.rel "mutex", p8, p9 + .pred.rel "mutex", p12, p13 + ldf8 u_3 = [up], 8 + getfsig pr1_2 = fp1b_2 + (p8) cmp.leu p6, p7 = acc0, pr0_1 + (p9) cmp.ltu p6, p7 = acc0, pr0_1 + (p12) cmp.leu p10, p11 = s0, pr1_0 + (p13) cmp.ltu p10, p11 = s0, pr1_0 + ;; C 02 + .pred.rel 
"mutex", p6, p7 + getfsig acc1_2 = fp2a_2 + st8 [rp] = s0, 8 + xma.l fp0b_1 = u_1, v0, r_1 + (p6) add acc0 = pr0_2, acc1_0, 1 + (p7) add acc0 = pr0_2, acc1_0 + xma.hu fp1a_1 = u_1, v0, r_1 + ;; C 03 +L(01): + .pred.rel "mutex", p10, p11 + getfsig pr0_0 = fp0b_0 + ldf8 r_0 = [srp], 8 + xma.l fp1b_0 = u_0, v1, fp1a_0 + (p10) add s0 = pr1_1, acc0, 1 + (p11) add s0 = pr1_1, acc0 + xma.hu fp2a_0 = u_0, v1, fp1a_0 + ;; C 04 + .pred.rel "mutex", p6, p7 + .pred.rel "mutex", p10, p11 + ldf8 u_0 = [up], 8 + getfsig pr1_3 = fp1b_3 + (p6) cmp.leu p8, p9 = acc0, pr0_2 + (p7) cmp.ltu p8, p9 = acc0, pr0_2 + (p10) cmp.leu p12, p13 = s0, pr1_1 + (p11) cmp.ltu p12, p13 = s0, pr1_1 + ;; C 05 + .pred.rel "mutex", p8, p9 + getfsig acc1_3 = fp2a_3 + st8 [rp] = s0, 8 + xma.l fp0b_2 = u_2, v0, r_2 + (p8) add acc0 = pr0_3, acc1_1, 1 + (p9) add acc0 = pr0_3, acc1_1 + xma.hu fp1a_2 = u_2, v0, r_2 + ;; C 06 +L(00): + .pred.rel "mutex", p12, p13 + getfsig pr0_1 = fp0b_1 + ldf8 r_1 = [srp], 8 + xma.l fp1b_1 = u_1, v1, fp1a_1 + (p12) add s0 = pr1_2, acc0, 1 + (p13) add s0 = pr1_2, acc0 + xma.hu fp2a_1 = u_1, v1, fp1a_1 + ;; C 07 + .pred.rel "mutex", p8, p9 + .pred.rel "mutex", p12, p13 + ldf8 u_1 = [up], 8 + getfsig pr1_0 = fp1b_0 + (p8) cmp.leu p6, p7 = acc0, pr0_3 + (p9) cmp.ltu p6, p7 = acc0, pr0_3 + (p12) cmp.leu p10, p11 = s0, pr1_2 + (p13) cmp.ltu p10, p11 = s0, pr1_2 + ;; C 08 + .pred.rel "mutex", p6, p7 + getfsig acc1_0 = fp2a_0 + st8 [rp] = s0, 8 + xma.l fp0b_3 = u_3, v0, r_3 + (p6) add acc0 = pr0_0, acc1_2, 1 + (p7) add acc0 = pr0_0, acc1_2 + xma.hu fp1a_3 = u_3, v0, r_3 + ;; C 09 +L(11): + .pred.rel "mutex", p10, p11 + getfsig pr0_2 = fp0b_2 + ldf8 r_2 = [srp], 8 + xma.l fp1b_2 = u_2, v1, fp1a_2 + (p10) add s0 = pr1_3, acc0, 1 + (p11) add s0 = pr1_3, acc0 + xma.hu fp2a_2 = u_2, v1, fp1a_2 + ;; C 10 + .pred.rel "mutex", p6, p7 + .pred.rel "mutex", p10, p11 + ldf8 u_2 = [up], 8 + getfsig pr1_1 = fp1b_1 + (p6) cmp.leu p8, p9 = acc0, pr0_0 + (p7) cmp.ltu p8, p9 = acc0, pr0_0 + (p10) cmp.leu p12, p13 = s0, pr1_3 + (p11) cmp.ltu p12, p13 = s0, pr1_3 + ;; C 11 + .pred.rel "mutex", p8, p9 + getfsig acc1_1 = fp2a_1 + st8 [rp] = s0, 8 + xma.l fp0b_0 = u_0, v0, r_0 + (p8) add acc0 = pr0_1, acc1_3, 1 + (p9) add acc0 = pr0_1, acc1_3 + xma.hu fp1a_0 = u_0, v0, r_0 +L(10): br.cloop.sptk.clr L(top) C 12 + ;; +C *** MAIN LOOP END *** +L(end): + .pred.rel "mutex", p12, p13 + {.mfi; getfsig pr0_3 = fp0b_3 + xma.l fp1b_3 = u_3, v1, fp1a_3 + (p12) add s0 = pr1_0, acc0, 1 +}{.mfi; (p13) add s0 = pr1_0, acc0 + xma.hu fp2a_3 = u_3, v1, fp1a_3 + nop 1 + ;; +} .pred.rel "mutex", p8, p9 + .pred.rel "mutex", p12, p13 + {.mmi; getfsig pr1_2 = fp1b_2 + st8 [rp] = s0, 8 + (p8) cmp.leu p6, p7 = acc0, pr0_1 +}{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 + (p12) cmp.leu p10, p11 = s0, pr1_0 + (p13) cmp.ltu p10, p11 = s0, pr1_0 + ;; +} .pred.rel "mutex", p6, p7 + {.mfi; getfsig acc1_2 = fp2a_2 + xma.l fp0b_1 = u_1, v0, r_1 + nop 1 +}{.mmf; (p6) add acc0 = pr0_2, acc1_0, 1 + (p7) add acc0 = pr0_2, acc1_0 + xma.hu fp1a_1 = u_1, v0, r_1 + ;; +} +L(cj5): + .pred.rel "mutex", p10, p11 + {.mfi; getfsig pr0_0 = fp0b_0 + xma.l fp1b_0 = u_0, v1, fp1a_0 + (p10) add s0 = pr1_1, acc0, 1 +}{.mfi; (p11) add s0 = pr1_1, acc0 + xma.hu fp2a_0 = u_0, v1, fp1a_0 + nop 1 + ;; +} .pred.rel "mutex", p6, p7 + .pred.rel "mutex", p10, p11 + {.mmi; getfsig pr1_3 = fp1b_3 + st8 [rp] = s0, 8 + (p6) cmp.leu p8, p9 = acc0, pr0_2 +}{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 + (p10) cmp.leu p12, p13 = s0, pr1_1 + (p11) cmp.ltu p12, p13 = s0, pr1_1 + ;; +} .pred.rel "mutex", p8, p9 
+ {.mfi; getfsig acc1_3 = fp2a_3 + xma.l fp0b_2 = u_2, v0, r_2 + nop 1 +}{.mmf; (p8) add acc0 = pr0_3, acc1_1, 1 + (p9) add acc0 = pr0_3, acc1_1 + xma.hu fp1a_2 = u_2, v0, r_2 + ;; +} +L(cj4): + .pred.rel "mutex", p12, p13 + {.mfi; getfsig pr0_1 = fp0b_1 + xma.l fp1b_1 = u_1, v1, fp1a_1 + (p12) add s0 = pr1_2, acc0, 1 +}{.mfi; (p13) add s0 = pr1_2, acc0 + xma.hu fp2a_1 = u_1, v1, fp1a_1 + nop 1 + ;; +} .pred.rel "mutex", p8, p9 + .pred.rel "mutex", p12, p13 + {.mmi; getfsig pr1_0 = fp1b_0 + st8 [rp] = s0, 8 + (p8) cmp.leu p6, p7 = acc0, pr0_3 +}{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3 + (p12) cmp.leu p10, p11 = s0, pr1_2 + (p13) cmp.ltu p10, p11 = s0, pr1_2 + ;; +} .pred.rel "mutex", p6, p7 + {.mmi; getfsig acc1_0 = fp2a_0 + (p6) add acc0 = pr0_0, acc1_2, 1 + (p7) add acc0 = pr0_0, acc1_2 + ;; +} +L(cj3): + .pred.rel "mutex", p10, p11 + {.mfi; getfsig pr0_2 = fp0b_2 + xma.l fp1b_2 = u_2, v1, fp1a_2 + (p10) add s0 = pr1_3, acc0, 1 +}{.mfi; (p11) add s0 = pr1_3, acc0 + xma.hu fp2a_2 = u_2, v1, fp1a_2 + nop 1 + ;; +} .pred.rel "mutex", p6, p7 + .pred.rel "mutex", p10, p11 + {.mmi; getfsig pr1_1 = fp1b_1 + st8 [rp] = s0, 8 + (p6) cmp.leu p8, p9 = acc0, pr0_0 +}{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0 + (p10) cmp.leu p12, p13 = s0, pr1_3 + (p11) cmp.ltu p12, p13 = s0, pr1_3 + ;; +} .pred.rel "mutex", p8, p9 + {.mmi; getfsig acc1_1 = fp2a_1 + (p8) add acc0 = pr0_1, acc1_3, 1 + (p9) add acc0 = pr0_1, acc1_3 + ;; +} .pred.rel "mutex", p12, p13 + {.mmi; (p12) add s0 = pr1_0, acc0, 1 + (p13) add s0 = pr1_0, acc0 + nop 1 + ;; +} .pred.rel "mutex", p8, p9 + .pred.rel "mutex", p12, p13 + {.mmi; getfsig pr1_2 = fp1b_2 + st8 [rp] = s0, 8 + (p8) cmp.leu p6, p7 = acc0, pr0_1 +}{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 + (p12) cmp.leu p10, p11 = s0, pr1_0 + (p13) cmp.ltu p10, p11 = s0, pr1_0 + ;; +} .pred.rel "mutex", p6, p7 + {.mmi; getfsig r8 = fp2a_2 + (p6) add acc0 = pr0_2, acc1_0, 1 + (p7) add acc0 = pr0_2, acc1_0 + ;; +} .pred.rel "mutex", p10, p11 + {.mmi; (p10) add s0 = pr1_1, acc0, 1 + (p11) add s0 = pr1_1, acc0 + (p6) cmp.leu p8, p9 = acc0, pr0_2 + ;; +} .pred.rel "mutex", p10, p11 + {.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 + (p10) cmp.leu p12, p13 = s0, pr1_1 + (p11) cmp.ltu p12, p13 = s0, pr1_1 + ;; +} .pred.rel "mutex", p8, p9 + {.mmi; st8 [rp] = s0, 8 + (p8) add acc0 = pr1_2, acc1_1, 1 + (p9) add acc0 = pr1_2, acc1_1 + ;; +} .pred.rel "mutex", p8, p9 + {.mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2 + (p9) cmp.ltu p10, p11 = acc0, pr1_2 + (p12) add acc0 = 1, acc0 + ;; +}{.mmi; st8 [rp] = acc0, 8 + (p12) cmpeqor p10, p0 = 0, acc0 + nop 1 + ;; +}{.mib; (p10) add r8 = 1, r8 + mov ar.lc = r2 + br.ret.sptk.many b0 +} +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/aors_n.asm b/gmp-6.3.0/mpn/ia64/aors_n.asm new file mode 100644 index 0000000..7705ce6 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/aors_n.asm @@ -0,0 +1,852 @@ +dnl IA-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2003-2005, 2010, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 2.67 +C Itanium 2: 1.25 + +C TODO +C * Consider using special code for small n, using something like +C "switch (8 * (n >= 8) + (n mod 8))" to enter it and feed-in code. +C * The non-nc code was trimmed cycle for cycle to its current state. It is +C probably hard to save more than an odd cycle there. The nc code is much +C cruder (since tune/speed doesn't have any applicable direct measurements). +C * Without the nc entry points, this becomes around 1800 bytes of object +C code; the nc code adds over 1000 bytes. We should perhaps sacrifice a +C few cycles for the non-nc code and let it fall into the nc code. + +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`vp', `r34') +define(`n', `r35') +define(`cy', `r36') + +ifdef(`OPERATION_add_n',` + define(ADDSUB, add) + define(CND, ltu) + define(INCR, 1) + define(LIM, -1) + define(LIM2, 0) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc) +') +ifdef(`OPERATION_sub_n',` + define(ADDSUB, sub) + define(CND, gtu) + define(INCR, -1) + define(LIM, 0) + define(LIM2, -1) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc) +') + +define(PFDIST, 500) + +C Some useful aliases for registers we use +define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17') +define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27') +define(`w0',`r28') define(`w1',`r29') define(`w2',`r30') define(`w3',`r31') +define(`rpx',`r3') +define(`upadv',`r20') define(`vpadv',`r21') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ASM_START() +PROLOGUE(func_nc) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32',` + addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + nop.i 0 + addp4 vp = 0, vp C M I + nop.m 0 + zxt4 n = n C I + ;; +') + + {.mmi; ld8 r11 = [vp], 8 C M01 + ld8 r10 = [up], 8 C M01 + mov r2 = ar.lc C I0 +}{.mmi; and r14 = 7, n C M I + cmp.lt p15, p14 = 8, n C M I + add n = -6, n C M I + ;; +}{.mmi; add upadv = PFDIST, up C Merging these lines into the feed-in + add vpadv = PFDIST, vp C code could save a cycle per call at + mov r23 = cy C the expense of code size.
+ ;; +}{.mmi; cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + cmp.eq p8, p0 = 3, r14 C M I +}{.bbb; (p6) br.dptk .Lc001 C B + (p7) br.dptk .Lc010 C B + (p8) br.dptk .Lc011 C B + ;; +}{.mmi; cmp.eq p9, p0 = 4, r14 C M I + cmp.eq p10, p0 = 5, r14 C M I + cmp.eq p11, p0 = 6, r14 C M I +}{.bbb; (p9) br.dptk .Lc100 C B + (p10) br.dptk .Lc101 C B + (p11) br.dptk .Lc110 C B + ;; +}{.mmi; ld8 r19 = [vp], 8 C M01 + ld8 r18 = [up], 8 C M01 + cmp.ne p13, p0 = 0, cy C copy cy to p13 M I +}{.mmb; cmp.eq p12, p0 = 7, r14 C M I + nop 0 + (p12) br.dptk .Lc111 C B + ;; +} + +.Lc000: + {.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; add vpadv = PFDIST, vp C M I + ld8 v0 = [vp], 8 C M01 + mov ar.lc = n C I0 +}{.mmi; ld8 u0 = [up], 8 C M01 + ADDSUB w1 = r10, r11 C M I + nop 0 + ;; +}{.mmi; add upadv = PFDIST, up C M I + ld8 v1 = [vp], 8 C M01 + cmp.CND p7, p0 = w1, r10 C M I +}{.mmi; ld8 u1 = [up], 8 C M01 + ADDSUB w2 = r18, r19 C M I + add rpx = 8, rp C M I + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, r18 C M I + (p13) cmpeqor p7, p0 = LIM, w1 C M I +}{.mmi; ld8 u2 = [up], 8 C M01 + (p13) add w1 = INCR, w1 C M I + ADDSUB w3 = u3, v3 C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + (p7) cmpeqor p8, p0 = LIM, w2 C M I +}{.mmb; ld8 u3 = [up], 8 C M01 + (p7) add w2 = INCR, w2 C M I + br L(m0) +} + +.Lc001: + {.mmi; (p15) ld8 v1 = [vp], 8 C M01 + (p15) ld8 u1 = [up], 8 C M01 + ADDSUB w0 = r10, r11 C M I +}{.mmb; nop 0 + nop 0 + (p15) br L(0) + ;; +}{.mmi; cmp.ne p9, p0 = 0, r23 C M I + mov r8 = 0 + cmp.CND p6, p0 = w0, r10 C M I + ;; +}{.mmb; (p9) cmpeqor p6, p0 = LIM, w0 C M I + (p9) add w0 = INCR, w0 C M I + br L(cj1) C B +} +L(0): + {.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + mov ar.lc = n C I0 +}{.mmi; nop 0 + cmp.ne p9, p0 = 0, r23 C M I + nop 0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + cmp.CND p6, p0 = w0, r10 C M I + add rpx = 16, rp C M I +}{.mmb; ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + br L(c1) C B +} + +.Lc010: + {.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + mov r8 = 0 C M I +}{.mmb; ADDSUB w3 = r10, r11 C M I + cmp.ne p8, p0 = 0, r23 C M I + (p15) br L(1) C B + ;; +}{.mmi; cmp.CND p9, p0 = w3, r10 C M I + ADDSUB w0 = u0, v0 C M I + (p8) add w3 = INCR, w3 C M I + ;; +}{.mmb; cmp.CND p6, p0 = w0, u0 C M I + (p8) cmpeqor p9, p0 = LIM2, w3 C M I + br L(cj2) C B +} +L(1): + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + mov ar.lc = n C I0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + cmp.CND p9, p0 = w3, r10 C M I + ;; +}{.mmi; (p8) cmpeqor p9, p0 = LIM, w3 C M I + (p8) add w3 = INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I +}{.mmb; add rpx = 24, rp C M I + nop 0 + br L(m23) C B +} + +.Lc011: + {.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + shr.u n = n, 3 C I0 +}{.mmi; ADDSUB w2 = r10, r11 C M I + cmp.ne p7, p0 = 0, r23 C M I + nop 0 + ;; +}{.mmb; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + (p15) br L(2) C B +}{.mmi; cmp.CND p8, p0 = w2, r10 C M I + ADDSUB w3 = u3, v3 C M I + nop 0 + ;; +}{.mmb; (p7) cmpeqor p8, p0 = LIM, w2 C M I + (p7) add w2 = INCR, w2 C M I + br L(cj3) C B +} +L(2): + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + ADDSUB w3 = u3, v3 C M I + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + cmp.CND p8, p0 = w2, r10 C M I + ;; 
+}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + mov ar.lc = n C I0 +}{.mmi; ld8 u3 = [up], 8 C M01 + (p7) cmpeqor p8, p0 = LIM, w2 C M I + (p7) add w2 = INCR, w2 C M I + ;; +}{.mmi; add rpx = 32, rp C M I + st8 [rp] = w2, 8 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I +}{.mmb; (p8) add w3 = INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I + br L(m23) +} + +.Lc100: + {.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + shr.u n = n, 3 C I0 +}{.mmi; ADDSUB w1 = r10, r11 C M I + nop 0 + nop 0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + add rpx = 8, rp C M I +}{.mmi; cmp.ne p6, p0 = 0, r23 C M I + cmp.CND p7, p0 = w1, r10 C M I + nop 0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + ADDSUB w2 = u2, v2 C M I +}{.mmb; (p6) cmpeqor p7, p0 = LIM, w1 C M I + (p6) add w1 = INCR, w1 C M I + (p14) br L(cj4) + ;; +}{.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + mov ar.lc = n C I0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, u2 C M I + nop 0 +}{.mmi; ld8 u2 = [up], 8 C M01 + nop 0 + ADDSUB w3 = u3, v3 C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + (p7) cmpeqor p8, p0 = LIM, w2 C M I +}{.mmb; ld8 u3 = [up], 8 C M01 + (p7) add w2 = INCR, w2 C M I + br L(m4) +} + +.Lc101: + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + mov ar.lc = n C I0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + ADDSUB w0 = r10, r11 C M I +}{.mmi; cmp.ne p9, p0 = 0, r23 C M I + add rpx = 16, rp C M I + nop 0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + cmp.CND p6, p0 = w0, r10 C M I +}{.mbb; ADDSUB w1 = u1, v1 C M I + (p15) br L(c5) C B + br L(end) C B +} + +.Lc110: + {.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; add upadv = PFDIST, up C M I + add vpadv = PFDIST, vp C M I + mov ar.lc = n C I0 +}{.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + ADDSUB w3 = r10, r11 C M I + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + ADDSUB w0 = u0, v0 C M I +}{.mmi; cmp.CND p9, p0 = w3, r10 C M I + cmp.ne p8, p0 = 0, r23 C M I + add rpx = 24, rp C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + nop 0 +}{.mmb; (p8) cmpeqor p9, p0 = LIM, w3 C M I + (p8) add w3 = INCR, w3 C M I + br L(m67) C B +} + +.Lc111: + {.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; add upadv = PFDIST, up C M I + ld8 v1 = [vp], 8 C M01 + mov ar.lc = n C I0 +}{.mmi; ld8 u1 = [up], 8 C M01 + ADDSUB w2 = r10, r11 C M I + nop 0 + ;; +}{.mmi; add vpadv = PFDIST, vp C M I + ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, r10 C M I +}{.mmi; ld8 u2 = [up], 8 C M01 + ADDSUB w3 = r18, r19 C M I + nop 0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, r18 C M I + (p13) cmpeqor p8, p0 = LIM, w2 C M I +}{.mmi; ld8 u3 = [up], 8 C M01 + (p13) add w2 = INCR, w2 C M I + nop 0 + ;; +}{.mmi; add rpx = 32, rp C M I + st8 [rp] = w2, 8 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I +}{.mmb; (p8) add w3 = INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I + br L(m67) +} +EPILOGUE() + +PROLOGUE(func) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32',` + addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + nop.i 0 + addp4 vp = 0, vp C M I + nop.m 0 + zxt4 n = n C I + ;; +') + + {.mmi; ld8 r11 = [vp], 8 C M01 + ld8 r10 = [up], 8 C M01 + mov r2 = ar.lc C I0 +}{.mmi; and r14 = 7, n C M I + cmp.lt p15, p14 = 8, n C M I + add n = -6, n C M I + ;; 
+}{.mmi; cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + cmp.eq p8, p0 = 3, r14 C M I +}{.bbb; (p6) br.dptk .Lb001 C B + (p7) br.dptk .Lb010 C B + (p8) br.dptk .Lb011 C B + ;; +}{.mmi; cmp.eq p9, p0 = 4, r14 C M I + cmp.eq p10, p0 = 5, r14 C M I + cmp.eq p11, p0 = 6, r14 C M I +}{.bbb; (p9) br.dptk .Lb100 C B + (p10) br.dptk .Lb101 C B + (p11) br.dptk .Lb110 C B + ;; +}{.mmi; ld8 r19 = [vp], 8 C M01 + ld8 r18 = [up], 8 C M01 + cmp.ne p13, p0 = r0, r0 C clear "CF" M I +}{.mmb; cmp.eq p12, p0 = 7, r14 C M I + mov r23 = 0 C M I + (p12) br.dptk .Lb111 C B + ;; +} + +.Lb000: + {.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + ADDSUB w1 = r10, r11 C M I + ;; +}{.mmi; ld8 v1 = [vp], 8 C M01 + cmp.CND p7, p0 = w1, r10 C M I + mov ar.lc = n C I0 +}{.mmi; ld8 u1 = [up], 8 C M01 + ADDSUB w2 = r18, r19 C M I + add rpx = 8, rp C M I + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + cmp.CND p8, p0 = w2, r18 C M I +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + ADDSUB w3 = u3, v3 C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + (p7) cmpeqor p8, p0 = LIM, w2 C M I +}{.mmb; ld8 u3 = [up], 8 C M01 + (p7) add w2 = INCR, w2 C M I + br L(m0) C B +} + + ALIGN(32) +.Lb001: + {.mmi; ADDSUB w0 = r10, r11 C M I + (p15) ld8 v1 = [vp], 8 C M01 + mov r8 = 0 C M I + ;; +}{.mmb; cmp.CND p6, p0 = w0, r10 C M I + (p15) ld8 u1 = [up], 8 C M01 + (p14) br L(cj1) C B + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + shr.u n = n, 3 C I0 +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + cmp.CND p6, p0 = w0, r10 C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + mov ar.lc = n C I0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + ;; +}{.mmi; ld8 v1 = [vp], 8 C M01 + cmp.CND p7, p0 = w1, u1 C M I + ADDSUB w2 = u2, v2 C M I +}{.mmb; ld8 u1 = [up], 8 C M01 + add rpx = 16, rp C M I + br L(m1) C B +} + + ALIGN(32) +.Lb010: + {.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + shr.u n = n, 3 C I0 +}{.mmb; ADDSUB w3 = r10, r11 C M I + nop 0 + (p15) br L(gt2) C B + ;; +}{.mmi; cmp.CND p9, p0 = w3, r10 C M I + ADDSUB w0 = u0, v0 C M I + mov r8 = 0 C M I + ;; +}{.mmb; nop 0 + cmp.CND p6, p0 = w0, u0 C M I + br L(cj2) C B +} +L(gt2): + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + nop 0 + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + mov ar.lc = n C I0 +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + nop 0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, r10 C M I + ADDSUB w0 = u0, v0 C M I +}{.mmb; ld8 u3 = [up], 8 C M01 + add rpx = 24, rp C M I + br L(m23) C B +} + + ALIGN(32) +.Lb011: + {.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + ADDSUB w2 = r10, r11 C M I + ;; +}{.mmb; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + (p15) br L(3) C B +}{.mmb; cmp.CND p8, p0 = w2, r10 C M I + ADDSUB w3 = u3, v3 C M I + br L(cj3) C B +} +L(3): + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + ADDSUB w3 = u3, v3 C M I +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + cmp.CND p8, p0 = w2, r10 C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + mov ar.lc = n C I0 +}{.mmi; ld8 u3 = [up], 8 C M01 + nop 0 + nop 0 + ;; +}{.mmi; add rpx = 32, rp C M I + st8 [rp] = w2, 8 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I +}{.mmb; (p8) add w3 
= INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I + br L(m23) C B +} + + ALIGN(32) +.Lb100: + {.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + ADDSUB w1 = r10, r11 C M I + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + cmp.CND p7, p0 = w1, r10 C M I +}{.mmb; nop 0 + ADDSUB w2 = u2, v2 C M I + (p14) br L(cj4) C B + ;; +} +L(gt4): + {.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + mov ar.lc = n C I0 +}{.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + nop 0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, u2 C M I + nop 0 +}{.mmi; ld8 u2 = [up], 8 C M01 + ADDSUB w3 = u3, v3 C M I + add rpx = 8, rp C M I + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + (p7) cmpeqor p8, p0 = LIM, w2 C M I +}{.mmb; ld8 u3 = [up], 8 C M01 + (p7) add w2 = INCR, w2 C M I + br L(m4) C B +} + + ALIGN(32) +.Lb101: + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + ADDSUB w0 = r10, r11 C M I + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + add rpx = 16, rp C M I +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + nop 0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + cmp.CND p6, p0 = w0, r10 C M I + nop 0 +}{.mmb; ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + (p14) br L(cj5) C B + ;; +} +L(gt5): + {.mmi; ld8 v1 = [vp], 8 C M01 + cmp.CND p7, p0 = w1, u1 C M I + mov ar.lc = n C I0 +}{.mmb; ld8 u1 = [up], 8 C M01 + ADDSUB w2 = u2, v2 C M I + br L(m5) C B +} + + ALIGN(32) +.Lb110: + {.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + ADDSUB w3 = r10, r11 C M I + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + mov ar.lc = n C I0 +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + nop 0 + ;; +}{.mmi; ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, r10 C M I + ADDSUB w0 = u0, v0 C M I +}{.mmb; ld8 u3 = [up], 8 C M01 + add rpx = 24, rp C M I + br L(m67) C B +} + + ALIGN(32) +.Lb111: + {.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + shr.u n = n, 3 C I0 + ;; +}{.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + ADDSUB w2 = r10, r11 C M I + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, r10 C M I + mov ar.lc = n C I0 +}{.mmi; ld8 u2 = [up], 8 C M01 + ADDSUB w3 = r18, r19 C M I + nop 0 + ;; +}{.mmi; add upadv = PFDIST, up + add vpadv = PFDIST, vp + nop 0 +}{.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + cmp.CND p9, p0 = w3, r18 C M I + ;; +}{.mmi; add rpx = 32, rp C M I + st8 [rp] = w2, 8 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I +}{.mmb; (p8) add w3 = INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I + br L(m67) C B +} + +C *** MAIN LOOP START *** + ALIGN(32) +L(top): +L(c5): ld8 v1 = [vp], 8 C M01 + cmp.CND p7, p0 = w1, u1 C M I + (p9) cmpeqor p6, p0 = LIM, w0 C M I + ld8 u1 = [up], 8 C M01 + (p9) add w0 = INCR, w0 C M I + ADDSUB w2 = u2, v2 C M I + ;; +L(m5): ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, u2 C M I + (p6) cmpeqor p7, p0 = LIM, w1 C M I + ld8 u2 = [up], 8 C M01 + (p6) add w1 = INCR, w1 C M I + ADDSUB w3 = u3, v3 C M I + ;; + st8 [rp] = w0, 8 C M23 + ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + (p7) cmpeqor p8, p0 = LIM, w2 C M I + ld8 u3 = [up], 8 C M01 + (p7) add w2 = INCR, w2 C M I + ;; +L(m4): st8 [rp] = w1, 16 C M23 + st8 [rpx] = w2, 32 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I + lfetch [upadv], 64 + (p8) add 
w3 = INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I + ;; +L(m23): st8 [rp] = w3, 8 C M23 + ld8 v0 = [vp], 8 C M01 + cmp.CND p6, p0 = w0, u0 C M I + ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + nop.b 0 + ;; +L(c1): ld8 v1 = [vp], 8 C M01 + cmp.CND p7, p0 = w1, u1 C M I + (p9) cmpeqor p6, p0 = LIM, w0 C M I + ld8 u1 = [up], 8 C M01 + (p9) add w0 = INCR, w0 C M I + ADDSUB w2 = u2, v2 C M I + ;; +L(m1): ld8 v2 = [vp], 8 C M01 + cmp.CND p8, p0 = w2, u2 C M I + (p6) cmpeqor p7, p0 = LIM, w1 C M I + ld8 u2 = [up], 8 C M01 + (p6) add w1 = INCR, w1 C M I + ADDSUB w3 = u3, v3 C M I + ;; + st8 [rp] = w0, 8 C M23 + ld8 v3 = [vp], 8 C M01 + cmp.CND p9, p0 = w3, u3 C M I + (p7) cmpeqor p8, p0 = LIM, w2 C M I + ld8 u3 = [up], 8 C M01 + (p7) add w2 = INCR, w2 C M I + ;; +L(m0): st8 [rp] = w1, 16 C M23 + st8 [rpx] = w2, 32 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I + lfetch [vpadv], 64 + (p8) add w3 = INCR, w3 C M I + ADDSUB w0 = u0, v0 C M I + ;; +L(m67): st8 [rp] = w3, 8 C M23 + ld8 v0 = [vp], 8 C M01 + cmp.CND p6, p0 = w0, u0 C M I + ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + br.cloop.dptk L(top) C B + ;; +C *** MAIN LOOP END *** + +L(end): + {.mmi; (p9) cmpeqor p6, p0 = LIM, w0 C M I + (p9) add w0 = INCR, w0 C M I + mov ar.lc = r2 C I0 +} +L(cj5): + {.mmi; cmp.CND p7, p0 = w1, u1 C M I + ADDSUB w2 = u2, v2 C M I + nop 0 + ;; +}{.mmi; st8 [rp] = w0, 8 C M23 + (p6) cmpeqor p7, p0 = LIM, w1 C M I + (p6) add w1 = INCR, w1 C M I +} +L(cj4): + {.mmi; cmp.CND p8, p0 = w2, u2 C M I + ADDSUB w3 = u3, v3 C M I + nop 0 + ;; +}{.mmi; st8 [rp] = w1, 8 C M23 + (p7) cmpeqor p8, p0 = LIM, w2 C M I + (p7) add w2 = INCR, w2 C M I +} +L(cj3): + {.mmi; cmp.CND p9, p0 = w3, u3 C M I + ADDSUB w0 = u0, v0 C M I + nop 0 + ;; +}{.mmi; st8 [rp] = w2, 8 C M23 + (p8) cmpeqor p9, p0 = LIM, w3 C M I + (p8) add w3 = INCR, w3 C M I +}{.mmi; cmp.CND p6, p0 = w0, u0 C M I + nop 0 + mov r8 = 0 C M I + ;; +} +L(cj2): + {.mmi; st8 [rp] = w3, 8 C M23 + (p9) cmpeqor p6, p0 = LIM, w0 C M I + (p9) add w0 = INCR, w0 C M I + ;; +} +L(cj1): + {.mmb; st8 [rp] = w0, 8 C M23 + (p6) mov r8 = 1 C M I + br.ret.sptk.many b0 C B +} +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/aorsorrlsh1_n.asm b/gmp-6.3.0/mpn/ia64/aorsorrlsh1_n.asm new file mode 100644 index 0000000..9b58b9e --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/aorsorrlsh1_n.asm @@ -0,0 +1,48 @@ +dnl IA-64 mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 3.0 +C Itanium 2: 1.5 + + +define(LSH, 1) + +ifdef(`OPERATION_addlsh1_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n) + +include_mpn(`ia64/aorsorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/ia64/aorsorrlsh2_n.asm b/gmp-6.3.0/mpn/ia64/aorsorrlsh2_n.asm new file mode 100644 index 0000000..39b384a --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/aorsorrlsh2_n.asm @@ -0,0 +1,48 @@ +dnl IA-64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 3.0 +C Itanium 2: 1.5 + + +define(LSH, 2) + +ifdef(`OPERATION_addlsh2_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n) + +include_mpn(`ia64/aorsorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/ia64/aorsorrlshC_n.asm b/gmp-6.3.0/mpn/ia64/aorsorrlshC_n.asm new file mode 100644 index 0000000..2703ce2 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/aorsorrlshC_n.asm @@ -0,0 +1,412 @@ +dnl IA-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
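The two small files above only select LSH and the operation; the heavy lifting is in aorsorrlshC_n.asm, which forms each shifted limb with shrp (x = (v[i] << LSH) | (v[i-1] >> (64-LSH))) and then adds or subtracts with the usual predicated carry scheme. A portable sketch of the LSH = 1 add case (ref_addlsh1_n is an illustrative name; 64-bit limbs, no nails assumed):

    #include <gmp.h>

    /* {rp,n} = {up,n} + 2*{vp,n}; returns the carry limb, 0..2. */
    mp_limb_t
    ref_addlsh1_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
    {
      mp_limb_t cy = 0, vprev = 0;
      for (mp_size_t i = 0; i < n; i++)
        {
          mp_limb_t x = (vp[i] << 1) | (vprev >> 63);  /* cf. shrp */
          mp_limb_t w = up[i] + x;
          mp_limb_t c = w < x;          /* carry out of the add */
          w += cy;
          c += (w < cy);                /* propagate incoming carry */
          rp[i] = w;
          cy = c;
          vprev = vp[i];
        }
      return cy + (vprev >> 63);   /* bit shifted off the top joins the carry */
    }

mpn_sublsh1_n computes {up,n} - 2*{vp,n} and mpn_rsblsh1_n the reverse subtraction 2*{vp,n} - {up,n}, selected through the ADDSUB/CMP macro definitions in the common file.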
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +C cycles/limb +C Itanium: ? +C Itanium 2: 1.5 + +C TODO +C * Use shladd in feed-in code (for mpn_addlshC_n). +C * Rewrite loop to schedule loads closer to use, since we do prefetch. + +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`vp', `r34') +define(`n', `r35') + +ifdef(`DO_add', ` + define(`ADDSUB', `add $1 = $2, $3') + define(`CMP', `cmp.ltu $1,p0 = $2, $3') + define(`INCR', 1) + define(`LIM', -1) + define(`func', mpn_addlsh`'LSH`'_n)') +ifdef(`DO_sub', ` + define(`ADDSUB', `sub $1 = $2, $3') + define(`CMP', `cmp.gtu $1,p0 = $2, $3') + define(`INCR', -1) + define(`LIM', 0) + define(`func', mpn_sublsh`'LSH`'_n)') +ifdef(`DO_rsb', ` + define(`ADDSUB', `sub $1 = $3, $2') + define(`CMP', `cmp.gtu $1,p0 = $2, $4') + define(`INCR', -1) + define(`LIM', 0) + define(`func', mpn_rsblsh`'LSH`'_n)') + +define(PFDIST, 500) + +define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17') +define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21') +define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25') +define(`s0',`r26') define(`s1',`r27') define(`s2',`r28') define(`s3',`r29') +define(`x0',`r30') define(`x1',`r31') define(`x2',`r3') define(`x3',`r9') + +C r3 r8 r9 r10 r11 + +ASM_START() +PROLOGUE(func) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32',` + addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + nop.i 0 + addp4 vp = 0, vp C M I + nop.m 0 + zxt4 n = n C I + ;; +') + {.mmi; ld8 r11 = [vp], 8 C M01 + ld8 r10 = [up], 8 C M01 + mov.i r2 = ar.lc C I0 +}{.mmi; and r14 = 3, n C M I + cmp.lt p15, p0 = 4, n C M I + add n = -5, n C M I + ;; +}{.mmi; cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + cmp.eq p8, p0 = 3, r14 C M I +}{.bbb + (p6) br.dptk .Lb01 C B + (p7) br.dptk .Lb10 C B + (p8) br.dptk .Lb11 C B +} + +.Lb00: + {.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + shr.u n = n, 2 C I0 + ;; +}{.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + shl x3 = r11, LSH C I0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + shrp x0 = v0, r11, 64-LSH C I0 +}{.mmb; ADDSUB( w3, r10, x3) C M I + nop 0 + (p15) br.dpnt .grt4 C B + ;; +}{.mii; CMP( p7, w3, r10, x3) C M II0 + shrp x1 = v1, v0, 64-LSH C I0 + ADDSUB( w0, u0, x0) C M I + ;; +}{.mii; CMP( p8, w0, u0, x0) C M I + shrp x2 = v2, v1, 64-LSH C I0 + ADDSUB( w1, u1, x1) C M I +}{.mmb; nop 0 + nop 0 + br .Lcj4 C B +} +ALIGN(32) +.grt4: + {.mii; ld8 v3 = [vp], 8 C M01 + shrp x0 = v0, r11, 64-LSH C I0 + CMP( p8, w3, r10, x3) C M I + ;; +}{.mmi; ld8 u3 = [up], 8 C M01 + add r11 = PFDIST, vp + shrp x1 = v1, v0, 64-LSH C I0 +}{.mmi; ld8 v0 = [vp], 8 C M01 + ADDSUB( w0, u0, x0) C M I + nop 0 + ;; +}{.mmi; CMP( p6, w0, u0, x0) C M I + add r10 = PFDIST, up + mov.i ar.lc = n C I0 +}{.mmb; ADDSUB( w1, u1, x1) C M I + ld8 u0 = [up], 8 C M01 + br .LL00 C B +} + + ALIGN(32) +.Lb01: +ifdef(`DO_add', +` shladd w2 = r11, LSH, r10 C M I + shr.u r8 = r11, 64-LSH C retval I0 + (p15) br.dpnt .grt1 C B + ;; +',` + shl x2 = r11, LSH C I0 + (p15) br.dpnt .grt1 C B + ;; + ADDSUB( w2, r10, x2) C M I + shr.u r8 = r11, 64-LSH C retval I0 + ;; +') + CMP( p6, w2, r10, x2) C M I + br .Lcj1 + +.grt1: ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + shr.u n = n, 2 C I0 + ;; + ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + mov.i ar.lc = n C FIXME swap with next I0 
+ifdef(`DO_add', +`',` + ADDSUB( w2, r10, x2) +') + ;; + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + shrp x3 = v3, r11, 64-LSH C I0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + shrp x0 = v0, v3, 64-LSH C I0 +}{.mmb; CMP( p6, w2, r10, x2) C M I + ADDSUB( w3, u3, x3) C M I + br.cloop.dptk .grt5 C B + ;; +}{.mmi; CMP( p7, w3, u3, x3) C M I + ADDSUB( w0, u0, x0) C M I + shrp x1 = v1, v0, 64-LSH C I0 +}{.mmb; nop 0 + nop 0 + br .Lcj5 C B +} +.grt5: + {.mmi; add r10 = PFDIST, up + add r11 = PFDIST, vp + shrp x0 = v0, v3, 64-LSH C I0 +}{.mmb; ld8 v3 = [vp], 8 C M01 + CMP( p8, w3, u3, x3) C M I + br .LL01 C B +} + ALIGN(32) +.Lb10: + {.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + shl x1 = r11, LSH C I0 +}{.mmb; nop 0 + nop 0 + (p15) br.dpnt .grt2 C B + ;; +}{.mmi; ADDSUB( w1, r10, x1) C M I + nop 0 + shrp x2 = v2, r11, 64-LSH C I0 + ;; +}{.mmi; CMP( p9, w1, r10, x1) C M I + ADDSUB( w2, u2, x2) C M I + shr.u r8 = v2, 64-LSH C retval I0 + ;; +}{.mmb; CMP( p6, w2, u2, x2) C M I + nop 0 + br .Lcj2 C B +} +.grt2: + {.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + shr.u n = n, 2 C I0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + mov.i ar.lc = n C I0 +}{.mmi; ADDSUB( w1, r10, x1) C M I + nop 0 + nop 0 + ;; +}{.mii; ld8 v1 = [vp], 8 C M01 + shrp x2 = v2, r11, 64-LSH C I0 + CMP( p8, w1, r10, x1) C M I + ;; +}{.mmi; add r10 = PFDIST, up + ld8 u1 = [up], 8 C M01 + shrp x3 = v3, v2, 64-LSH C I0 +}{.mmi; add r11 = PFDIST, vp + ld8 v2 = [vp], 8 C M01 + ADDSUB( w2, u2, x2) C M I + ;; +}{.mmi; CMP( p6, w2, u2, x2) C M I + ld8 u2 = [up], 8 C M01 + shrp x0 = v0, v3, 64-LSH C I0 +}{.mib; ADDSUB( w3, u3, x3) C M I + nop 0 + br.cloop.dpnt L(top) C B +} + br L(end) C B +.Lb11: + {.mmi; ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + shl x0 = r11, LSH C I0 + ;; +}{.mmi; ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + shr.u n = n, 2 C I0 +}{.mmb; nop 0 + nop 0 + (p15) br.dpnt .grt3 C B + ;; +}{.mii; nop 0 + shrp x1 = v1, r11, 64-LSH C I0 + ADDSUB( w0, r10, x0) C M I + ;; +}{.mii; CMP( p8, w0, r10, x0) C M I + shrp x2 = v2, v1, 64-LSH C I0 + ADDSUB( w1, u1, x1) C M I + ;; +}{.mmb; CMP( p9, w1, u1, x1) C M I + ADDSUB( w2, u2, x2) C M I + br .Lcj3 C B +} +.grt3: + {.mmi; ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + shrp x1 = v1, r11, 64-LSH C I0 +}{.mmi; ADDSUB( w0, r10, x0) C M I + nop 0 + nop 0 + ;; +}{.mmi; ld8 v0 = [vp], 8 C M01 + CMP( p6, w0, r10, x0) C M I + mov.i ar.lc = n C I0 +}{.mmi; ld8 u0 = [up], 8 C M01 + ADDSUB( w1, u1, x1) C M I + nop 0 + ;; +}{.mmi; add r10 = PFDIST, up + add r11 = PFDIST, vp + shrp x2 = v2, v1, 64-LSH C I0 +}{.mmb; ld8 v1 = [vp], 8 C M01 + CMP( p8, w1, u1, x1) C M I + br .LL11 C B +} + +C *** MAIN LOOP START *** + ALIGN(32) +L(top): st8 [rp] = w1, 8 C M23 + lfetch [r10], 32 + (p8) cmpeqor p6, p0 = LIM, w2 C M I + (p8) add w2 = INCR, w2 C M I + ld8 v3 = [vp], 8 C M01 + CMP( p8, w3, u3, x3) C M I + ;; +.LL01: ld8 u3 = [up], 8 C M01 + shrp x1 = v1, v0, 64-LSH C I0 + (p6) cmpeqor p8, p0 = LIM, w3 C M I + (p6) add w3 = INCR, w3 C M I + ld8 v0 = [vp], 8 C M01 + ADDSUB( w0, u0, x0) C M I + ;; + st8 [rp] = w2, 8 C M23 + CMP( p6, w0, u0, x0) C M I + nop.b 0 + ld8 u0 = [up], 8 C M01 + lfetch [r11], 32 + ADDSUB( w1, u1, x1) C M I + ;; +.LL00: st8 [rp] = w3, 8 C M23 + shrp x2 = v2, v1, 64-LSH C I0 + (p8) cmpeqor p6, p0 = LIM, w0 C M I + (p8) add w0 = INCR, w0 C M I + ld8 v1 = [vp], 8 C M01 + CMP( p8, w1, u1, x1) C M I + ;; +.LL11: ld8 u1 = [up], 8 C M01 + shrp x3 = v3, v2, 64-LSH C I0 + (p6) cmpeqor p8, p0 = LIM, w1 C M I + 
(p6) add w1 = INCR, w1 C M I + ld8 v2 = [vp], 8 C M01 + ADDSUB( w2, u2, x2) C M I + ;; + {.mmi; st8 [rp] = w0, 8 C M23 + CMP( p6, w2, u2, x2) C M I + shrp x0 = v0, v3, 64-LSH C I0 +}{.mib; + ld8 u2 = [up], 8 C M01 + ADDSUB( w3, u3, x3) C M I + br.cloop.dptk L(top) C B + ;; +} +C *** MAIN LOOP END *** + +L(end): + {.mmi; st8 [rp] = w1, 8 C M23 + (p8) cmpeqor p6, p0 = LIM, w2 C M I + shrp x1 = v1, v0, 64-LSH C I0 +}{.mmi; + (p8) add w2 = INCR, w2 C M I + CMP( p7, w3, u3, x3) C M I + ADDSUB( w0, u0, x0) C M I + ;; +} +.Lcj5: + {.mmi; st8 [rp] = w2, 8 C M23 + (p6) cmpeqor p7, p0 = LIM, w3 C M I + shrp x2 = v2, v1, 64-LSH C I0 +}{.mmi; + (p6) add w3 = INCR, w3 C M I + CMP( p8, w0, u0, x0) C M I + ADDSUB( w1, u1, x1) C M I + ;; +} +.Lcj4: + {.mmi; st8 [rp] = w3, 8 C M23 + (p7) cmpeqor p8, p0 = LIM, w0 C M I + mov.i ar.lc = r2 C I0 +}{.mmi; + (p7) add w0 = INCR, w0 C M I + CMP( p9, w1, u1, x1) C M I + ADDSUB( w2, u2, x2) C M I + ;; +} +.Lcj3: + {.mmi; st8 [rp] = w0, 8 C M23 + (p8) cmpeqor p9, p0 = LIM, w1 C M I + shr.u r8 = v2, 64-LSH C I0 +}{.mmi; + (p8) add w1 = INCR, w1 C M I + CMP( p6, w2, u2, x2) C M I + nop 0 + ;; +} +.Lcj2: + {.mmi; st8 [rp] = w1, 8 C M23 + (p9) cmpeqor p6, p0 = LIM, w2 C M I + (p9) add w2 = INCR, w2 C M I + ;; +} +.Lcj1: + {.mmb; st8 [rp] = w2 C M23 +ifdef(`DO_rsb',` + (p6) add r8 = -1, r8 C M I +',` + (p6) add r8 = 1, r8 C M I +') br.ret.sptk.many b0 C B +} +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/ia64/bdiv_dbm1c.asm new file mode 100644 index 0000000..47e4553 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/bdiv_dbm1c.asm @@ -0,0 +1,516 @@ +dnl IA-64 mpn_bdiv_dbm1. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2009 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 4 +C Itanium 2: 2 + +C TODO +C * Optimize feed-in and wind-down code, both for speed and code size. 
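+
+C Reference (editor's sketch, not part of the original file): the loop below
+C computes, limb by limb, the same thing as the generic C version in
+C mpn/generic/bdiv_dbm1c.c, i.e. roughly:
+C
+C	mp_limb_t
+C	mpn_bdiv_dbm1c (mp_ptr qp, mp_srcptr ap, mp_size_t n,
+C	                mp_limb_t bd, mp_limb_t h)
+C	{
+C	  mp_limb_t p0, p1, cy;
+C	  mp_size_t i;
+C	  for (i = 0; i < n; i++)
+C	    {
+C	      umul_ppmm (p1, p0, ap[i], bd);  /* p1:p0 = ap[i] * bd */
+C	      cy = h < p0;                    /* borrow from the low word */
+C	      h = h - p0;
+C	      qp[i] = h;
+C	      h = h - p1 - cy;                /* fold in high word and borrow */
+C	    }
+C	  return h;
+C	}
+C
+C In the code below the xma.l/xma.hu pairs form p0/p1, and the predicated
+C sub / sub-with-borrow pairs implement the cy handling.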
+ +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`n', `r34') +define(`bd', `r35') + +ASM_START() +PROLOGUE(mpn_bdiv_dbm1c) + .prologue + .save ar.lc, r2 + .body + +ifdef(`HAVE_ABI_32', +` addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + zxt4 n = n C I + ;; +') +{.mmb + mov r15 = r36 C M I + ldf8 f9 = [up], 8 C M + nop.b 0 C B +} +.Lcommon: +{.mii + adds r16 = -1, n C M I + mov r2 = ar.lc C I0 + and r14 = 3, n C M I + ;; +} +{.mii + setf.sig f6 = bd C M2 M3 + shr.u r31 = r16, 2 C I0 + cmp.eq p10, p0 = 0, r14 C M I +} +{.mii + nop.m 0 C M + cmp.eq p11, p0 = 2, r14 C M I + cmp.eq p12, p0 = 3, r14 C M I + ;; +} +{.mii + cmp.ne p6, p7 = r0, r0 C M I + mov.i ar.lc = r31 C I0 + cmp.ne p8, p9 = r0, r0 C M I +} +{.bbb + (p10) br.dptk .Lb00 C B + (p11) br.dptk .Lb10 C B + (p12) br.dptk .Lb11 C B + ;; +} + +.Lb01: br.cloop.dptk .grt1 + ;; + xma.l f38 = f9, f6, f0 + xma.hu f39 = f9, f6, f0 + ;; + getf.sig r26 = f38 + getf.sig r27 = f39 + br .Lcj1 + +.grt1: ldf8 f10 = [r33], 8 + ;; + ldf8 f11 = [r33], 8 + ;; + ldf8 f12 = [r33], 8 + ;; + xma.l f38 = f9, f6, f0 + xma.hu f39 = f9, f6, f0 + ;; + ldf8 f13 = [r33], 8 + ;; + xma.l f32 = f10, f6, f0 + xma.hu f33 = f10, f6, f0 + br.cloop.dptk .grt5 + + ;; + getf.sig r26 = f38 + xma.l f34 = f11, f6, f0 + xma.hu f35 = f11, f6, f0 + ;; + getf.sig r27 = f39 + ;; + getf.sig r20 = f32 + xma.l f36 = f12, f6, f0 + xma.hu f37 = f12, f6, f0 + ;; + getf.sig r21 = f33 + ;; + getf.sig r22 = f34 + xma.l f38 = f13, f6, f0 + xma.hu f39 = f13, f6, f0 + br .Lcj5 + +.grt5: ldf8 f10 = [r33], 8 + ;; + getf.sig r26 = f38 + xma.l f34 = f11, f6, f0 + xma.hu f35 = f11, f6, f0 + ;; + getf.sig r27 = f39 + ldf8 f11 = [r33], 8 + ;; + getf.sig r20 = f32 + xma.l f36 = f12, f6, f0 + xma.hu f37 = f12, f6, f0 + ;; + getf.sig r21 = f33 + ldf8 f12 = [r33], 8 + ;; + getf.sig r22 = f34 + xma.l f38 = f13, f6, f0 + xma.hu f39 = f13, f6, f0 + br .LL01 + +.Lb10: ldf8 f13 = [r33], 8 + br.cloop.dptk .grt2 + ;; + + xma.l f36 = f9, f6, f0 + xma.hu f37 = f9, f6, f0 + ;; + xma.l f38 = f13, f6, f0 + xma.hu f39 = f13, f6, f0 + ;; + getf.sig r24 = f36 + ;; + getf.sig r25 = f37 + ;; + getf.sig r26 = f38 + ;; + getf.sig r27 = f39 + br .Lcj2 + +.grt2: ldf8 f10 = [r33], 8 + ;; + ldf8 f11 = [r33], 8 + ;; + xma.l f36 = f9, f6, f0 + xma.hu f37 = f9, f6, f0 + ;; + ldf8 f12 = [r33], 8 + ;; + xma.l f38 = f13, f6, f0 + xma.hu f39 = f13, f6, f0 + ;; + ldf8 f13 = [r33], 8 + ;; + getf.sig r24 = f36 + xma.l f32 = f10, f6, f0 + xma.hu f33 = f10, f6, f0 + br.cloop.dptk .grt6 + + getf.sig r25 = f37 + ;; + getf.sig r26 = f38 + xma.l f34 = f11, f6, f0 + xma.hu f35 = f11, f6, f0 + ;; + getf.sig r27 = f39 + ;; + getf.sig r20 = f32 + xma.l f36 = f12, f6, f0 + xma.hu f37 = f12, f6, f0 + br .Lcj6 + +.grt6: getf.sig r25 = f37 + ldf8 f10 = [r33], 8 + ;; + getf.sig r26 = f38 + xma.l f34 = f11, f6, f0 + xma.hu f35 = f11, f6, f0 + ;; + getf.sig r27 = f39 + ldf8 f11 = [r33], 8 + ;; + getf.sig r20 = f32 + xma.l f36 = f12, f6, f0 + xma.hu f37 = f12, f6, f0 + br .LL10 + + +.Lb11: ldf8 f12 = [r33], 8 + ;; + ldf8 f13 = [r33], 8 + br.cloop.dptk .grt3 + ;; + + xma.l f34 = f9, f6, f0 + xma.hu f35 = f9, f6, f0 + ;; + xma.l f36 = f12, f6, f0 + xma.hu f37 = f12, f6, f0 + ;; + getf.sig r22 = f34 + xma.l f38 = f13, f6, f0 + xma.hu f39 = f13, f6, f0 + ;; + getf.sig r23 = f35 + ;; + getf.sig r24 = f36 + ;; + getf.sig r25 = f37 + ;; + getf.sig r26 = f38 + br .Lcj3 + +.grt3: ldf8 f10 = [r33], 8 + ;; + xma.l f34 = f9, f6, f0 + xma.hu f35 = f9, f6, f0 + ;; + ldf8 f11 = [r33], 8 + ;; + xma.l f36 = f12, f6, f0 + xma.hu f37 = f12, f6, f0 + 
;; + ldf8 f12 = [r33], 8 + ;; + getf.sig r22 = f34 + xma.l f38 = f13, f6, f0 + xma.hu f39 = f13, f6, f0 + ;; + getf.sig r23 = f35 + ldf8 f13 = [r33], 8 + ;; + getf.sig r24 = f36 + xma.l f32 = f10, f6, f0 + xma.hu f33 = f10, f6, f0 + br.cloop.dptk .grt7 + + getf.sig r25 = f37 + ;; + getf.sig r26 = f38 + xma.l f34 = f11, f6, f0 + xma.hu f35 = f11, f6, f0 + br .Lcj7 + +.grt7: getf.sig r25 = f37 + ldf8 f10 = [r33], 8 + ;; + getf.sig r26 = f38 + xma.l f34 = f11, f6, f0 + xma.hu f35 = f11, f6, f0 + br .LL11 + + +.Lb00: ldf8 f11 = [r33], 8 + ;; + ldf8 f12 = [r33], 8 + ;; + ldf8 f13 = [r33], 8 + br.cloop.dptk .grt4 + ;; + + xma.l f32 = f9, f6, f0 + xma.hu f33 = f9, f6, f0 + ;; + xma.l f34 = f11, f6, f0 + xma.hu f35 = f11, f6, f0 + ;; + getf.sig r20 = f32 + xma.l f36 = f12, f6, f0 + xma.hu f37 = f12, f6, f0 + ;; + getf.sig r21 = f33 + ;; + getf.sig r22 = f34 + xma.l f38 = f13, f6, f0 + xma.hu f39 = f13, f6, f0 + ;; + getf.sig r23 = f35 + ;; + getf.sig r24 = f36 + br .Lcj4 + +.grt4: xma.l f32 = f9, f6, f0 + xma.hu f33 = f9, f6, f0 + ;; + ldf8 f10 = [r33], 8 + ;; + xma.l f34 = f11, f6, f0 + xma.hu f35 = f11, f6, f0 + ;; + ldf8 f11 = [r33], 8 + ;; + getf.sig r20 = f32 + xma.l f36 = f12, f6, f0 + xma.hu f37 = f12, f6, f0 + ;; + getf.sig r21 = f33 + ldf8 f12 = [r33], 8 + ;; + getf.sig r22 = f34 + xma.l f38 = f13, f6, f0 + xma.hu f39 = f13, f6, f0 + ;; + getf.sig r23 = f35 + ldf8 f13 = [r33], 8 + ;; + getf.sig r24 = f36 + xma.l f32 = f10, f6, f0 + xma.hu f33 = f10, f6, f0 + br.cloop.dptk .LL00 + br .Lcj8 + +C *** MAIN LOOP START *** + ALIGN(32) +.Ltop: + .pred.rel "mutex",p6,p7 +C .mfi + getf.sig r24 = f36 + xma.l f32 = f10, f6, f0 + (p6) sub r15 = r19, r27, 1 +C .mfi + st8 [r32] = r19, 8 + xma.hu f33 = f10, f6, f0 + (p7) sub r15 = r19, r27 + ;; +.LL00: +C .mfi + getf.sig r25 = f37 + nop.f 0 + cmp.ltu p6, p7 = r15, r20 +C .mib + ldf8 f10 = [r33], 8 + sub r16 = r15, r20 + nop.b 0 + ;; + +C .mfi + getf.sig r26 = f38 + xma.l f34 = f11, f6, f0 + (p6) sub r15 = r16, r21, 1 +C .mfi + st8 [r32] = r16, 8 + xma.hu f35 = f11, f6, f0 + (p7) sub r15 = r16, r21 + ;; +.LL11: +C .mfi + getf.sig r27 = f39 + nop.f 0 + cmp.ltu p6, p7 = r15, r22 +C .mib + ldf8 f11 = [r33], 8 + sub r17 = r15, r22 + nop.b 0 + ;; + +C .mfi + getf.sig r20 = f32 + xma.l f36 = f12, f6, f0 + (p6) sub r15 = r17, r23, 1 +C .mfi + st8 [r32] = r17, 8 + xma.hu f37 = f12, f6, f0 + (p7) sub r15 = r17, r23 + ;; +.LL10: +C .mfi + getf.sig r21 = f33 + nop.f 0 + cmp.ltu p6, p7 = r15, r24 +C .mib + ldf8 f12 = [r33], 8 + sub r18 = r15, r24 + nop.b 0 + ;; + +C .mfi + getf.sig r22 = f34 + xma.l f38 = f13, f6, f0 + (p6) sub r15 = r18, r25, 1 +C .mfi + st8 [r32] = r18, 8 + xma.hu f39 = f13, f6, f0 + (p7) sub r15 = r18, r25 + ;; +.LL01: +C .mfi + getf.sig r23 = f35 + nop.f 0 + cmp.ltu p6, p7 = r15, r26 +C .mib + ldf8 f13 = [r33], 8 + sub r19 = r15, r26 + br.cloop.sptk.few .Ltop +C *** MAIN LOOP END *** + ;; + + getf.sig r24 = f36 + xma.l f32 = f10, f6, f0 + (p6) sub r15 = r19, r27, 1 + st8 [r32] = r19, 8 + xma.hu f33 = f10, f6, f0 + (p7) sub r15 = r19, r27 + ;; +.Lcj8: getf.sig r25 = f37 + cmp.ltu p6, p7 = r15, r20 + sub r16 = r15, r20 + ;; + getf.sig r26 = f38 + xma.l f34 = f11, f6, f0 + (p6) sub r15 = r16, r21, 1 + st8 [r32] = r16, 8 + xma.hu f35 = f11, f6, f0 + (p7) sub r15 = r16, r21 + ;; +.Lcj7: getf.sig r27 = f39 + cmp.ltu p6, p7 = r15, r22 + sub r17 = r15, r22 + ;; + getf.sig r20 = f32 + xma.l f36 = f12, f6, f0 + (p6) sub r15 = r17, r23, 1 + st8 [r32] = r17, 8 + xma.hu f37 = f12, f6, f0 + (p7) sub r15 = r17, r23 + ;; +.Lcj6: getf.sig r21 = f33 + cmp.ltu p6, 
p7 = r15, r24 + sub r18 = r15, r24 + ;; + getf.sig r22 = f34 + xma.l f38 = f13, f6, f0 + (p6) sub r15 = r18, r25, 1 + st8 [r32] = r18, 8 + xma.hu f39 = f13, f6, f0 + (p7) sub r15 = r18, r25 + ;; +.Lcj5: getf.sig r23 = f35 + cmp.ltu p6, p7 = r15, r26 + sub r19 = r15, r26 + ;; + getf.sig r24 = f36 + (p6) sub r15 = r19, r27, 1 + st8 [r32] = r19, 8 + (p7) sub r15 = r19, r27 + ;; +.Lcj4: getf.sig r25 = f37 + cmp.ltu p6, p7 = r15, r20 + sub r16 = r15, r20 + ;; + getf.sig r26 = f38 + (p6) sub r15 = r16, r21, 1 + st8 [r32] = r16, 8 + (p7) sub r15 = r16, r21 + ;; +.Lcj3: getf.sig r27 = f39 + cmp.ltu p6, p7 = r15, r22 + sub r17 = r15, r22 + ;; + (p6) sub r15 = r17, r23, 1 + st8 [r32] = r17, 8 + (p7) sub r15 = r17, r23 + ;; +.Lcj2: cmp.ltu p6, p7 = r15, r24 + sub r18 = r15, r24 + ;; + (p6) sub r15 = r18, r25, 1 + st8 [r32] = r18, 8 + (p7) sub r15 = r18, r25 + ;; +.Lcj1: cmp.ltu p6, p7 = r15, r26 + sub r19 = r15, r26 + ;; + (p6) sub r8 = r19, r27, 1 + st8 [r32] = r19 + (p7) sub r8 = r19, r27 + mov ar.lc = r2 + br.ret.sptk.many b0 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/cnd_aors_n.asm b/gmp-6.3.0/mpn/ia64/cnd_aors_n.asm new file mode 100644 index 0000000..edd0552 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/cnd_aors_n.asm @@ -0,0 +1,264 @@ +dnl IA-64 mpn_cnd_add_n/mpn_cnd_sub_n. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: ? 
+C Itanium 2: 1.5 + +C INPUT PARAMETERS +define(`cnd', `r32') +define(`rp', `r33') +define(`up', `r34') +define(`vp', `r35') +define(`n', `r36') + +ifdef(`OPERATION_cnd_add_n',` + define(ADDSUB, add) + define(CND, ltu) + define(INCR, 1) + define(LIM, -1) + define(func, mpn_cnd_add_n) +') +ifdef(`OPERATION_cnd_sub_n',` + define(ADDSUB, sub) + define(CND, gtu) + define(INCR, -1) + define(LIM, 0) + define(func, mpn_cnd_sub_n) +') + +define(PFDIST, 160) + +C Some useful aliases for registers we use +define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17') +define(`x0',`r20') define(`x1',`r21') define(`x2',`r22') define(`x3',`r23') +define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27') +define(`w0',`r28') define(`w1',`r29') define(`w2',`r30') define(`w3',`r31') +define(`up1',`up') define(`up2',`r8') define(`upadv',`r1') +define(`vp1',`vp') define(`vp2',`r9') define(`vpadv',`r11') +define(`rp1',`rp') define(`rp2',`r10') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ASM_START() +PROLOGUE(func) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32',` + addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + nop.i 0 + addp4 vp = 0, vp C M I + nop.m 0 + zxt4 n = n C I + ;; +') + {.mmi; and r3 = 3, n C M I + add n = -1, n C M I + mov r2 = ar.lc C I0 +}{.mmi; cmp.ne p6, p7 = 0, cnd C M I + add vp2 = 8, vp C M I + add up2 = 8, up C M I + ;; +}{.mmi; add upadv = PFDIST, up C M I + add vpadv = PFDIST, vp C M I + shr.u n = n, 2 C I0 + .pred.rel "mutex", p6, p7 +}{.mmi; add rp2 = 8, rp C M I + (p6) mov cnd = -1 C M I + (p7) mov cnd = 0 C M I + ;; +} cmp.eq p9, p0 = 1, r3 C M I + cmp.eq p7, p0 = 2, r3 C M I + cmp.eq p8, p0 = 3, r3 C M I + (p9) br L(b1) C B + (p7) br L(b2) C B + (p8) br L(b3) C B + ;; +L(b0): + {.mmi; ld8 v2 = [vp1], 16 C M01 + ld8 v3 = [vp2], 16 C M01 + mov ar.lc = n C I0 + ;; +} ld8 u2 = [up1], 16 C M01 + ld8 u3 = [up2], 16 C M01 + and x2 = v2, cnd C M I + and x3 = v3, cnd C M I + ;; + ADDSUB w2 = u2, x2 C M I + ADDSUB w3 = u3, x3 C M I + ;; + ld8 v0 = [vp1], 16 C M01 + ld8 v1 = [vp2], 16 C M01 + cmp.CND p8, p0 = w2, u2 C M I + cmp.CND p9, p0 = w3, u3 C M I + br L(lo0) + +L(b1): ld8 v1 = [vp1], 8 C M01 + add vp2 = 8, vp2 C M I + add rp2 = 8, rp2 C M I + ;; + ld8 u1 = [up1], 8 C M01 + add up2 = 8, up2 C M I + and x1 = v1, cnd C M I + ;; + ADDSUB w1 = u1, x1 C M I + cmp.ne p10, p0 = 0, n + add n = -1, n + ;; + cmp.CND p7, p0 = w1, u1 C M I + st8 [rp1] = w1, 8 C M23 + (p10) br L(b0) + ;; + mov r8 = 0 C M I + br L(e1) + +L(b3): ld8 v3 = [vp1], 8 C M01 + add vp2 = 8, vp2 C M I + add rp2 = 8, rp2 C M I + ;; + ld8 u3 = [up1], 8 C M01 + add up2 = 8, up2 C M I + and x3 = v3, cnd C M I + ;; + ADDSUB w3 = u3, x3 C M I + ;; + cmp.CND p9, p0 = w3, u3 C M I + st8 [rp1] = w3, 8 C M23 + C fall through + +L(b2): + {.mmi; ld8 v0 = [vp1], 16 C M01 + ld8 v1 = [vp2], 16 C M01 + mov ar.lc = n C I0 + ;; +} ld8 u0 = [up1], 16 C M01 + ld8 u1 = [up2], 16 C M01 + and x0 = v0, cnd C M I + and x1 = v1, cnd C M I + ;; + ADDSUB w0 = u0, x0 C M I + ADDSUB w1 = u1, x1 C M I + br.cloop.dptk L(gt2) C B + ;; + cmp.CND p6, p0 = w0, u0 C M I + br L(e2) C B +L(gt2): + ld8 v2 = [vp1], 16 C M01 + ld8 v3 = [vp2], 16 C M01 + cmp.CND p6, p0 = w0, u0 C M I + cmp.CND p7, p0 = w1, u1 C M I + br L(lo2) C B + + +C *** MAIN LOOP START *** +C ALIGN(32) +L(top): + {.mmi; ld8 v2 = [vp1], 16 C M01 + ld8 v3 = [vp2], 16 C M01 + cmp.CND p6, p0 = w0, u0 C M I +}{.mmi; st8 [rp1] = w2, 16 C M23 + st8 [rp2] = w3, 16 C M23 + cmp.CND p7, p0 = w1, u1 C M I + ;; +} +L(lo2): + {.mmi; ld8 u2 = [up1], 16 C M01 + 
ld8 u3 = [up2], 16 C M01 + (p9) cmpeqor p6, p0 = LIM, w0 C M I +}{.mmi; and x2 = v2, cnd C M I + and x3 = v3, cnd C M I + (p9) add w0 = INCR, w0 C M I + ;; +}{.mmi; ADDSUB w2 = u2, x2 C M I + (p6) cmpeqor p7, p0 = LIM, w1 C M I + (p6) add w1 = INCR, w1 C M I +}{.mmi; ADDSUB w3 = u3, x3 C M I + lfetch [upadv], 32 + nop 0 + ;; +}{.mmi; ld8 v0 = [vp1], 16 C M01 + ld8 v1 = [vp2], 16 C M01 + cmp.CND p8, p0 = w2, u2 C M I +}{.mmi; st8 [rp1] = w0, 16 C M23 + st8 [rp2] = w1, 16 C M23 + cmp.CND p9, p0 = w3, u3 C M I + ;; +} +L(lo0): + {.mmi; ld8 u0 = [up1], 16 C M01 + ld8 u1 = [up2], 16 C M01 + (p7) cmpeqor p8, p0 = LIM, w2 C M I +}{.mmi; and x0 = v0, cnd C M I + and x1 = v1, cnd C M I + (p7) add w2 = INCR, w2 C M I + ;; +}{.mmi; ADDSUB w0 = u0, x0 C M I + (p8) cmpeqor p9, p0 = LIM, w3 C M I + (p8) add w3 = INCR, w3 C M I +}{.mmb; ADDSUB w1 = u1, x1 C M I + lfetch [vpadv], 32 + br.cloop.dptk L(top) C B + ;; +} +C *** MAIN LOOP END *** + + +L(end): + {.mmi; st8 [rp1] = w2, 16 C M23 + st8 [rp2] = w3, 16 C M23 + cmp.CND p6, p0 = w0, u0 C M I + ;; +} +L(e2): + {.mmi; cmp.CND p7, p0 = w1, u1 C M I + (p9) cmpeqor p6, p0 = LIM, w0 C M I + (p9) add w0 = INCR, w0 C M I + ;; +}{.mmi; mov r8 = 0 C M I + (p6) cmpeqor p7, p0 = LIM, w1 C M I + (p6) add w1 = INCR, w1 C M I + ;; +}{.mmi; st8 [rp1] = w0, 16 C M23 + st8 [rp2] = w1, 16 C M23 + mov ar.lc = r2 C I0 +} +L(e1): + {.mmb; nop 0 + (p7) mov r8 = 1 C M I + br.ret.sptk.many b0 C B +} +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/copyd.asm b/gmp-6.3.0/mpn/ia64/copyd.asm new file mode 100644 index 0000000..b94a1af --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/copyd.asm @@ -0,0 +1,186 @@ +dnl IA-64 mpn_copyd -- copy limb vector, decrementing. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2001, 2002, 2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 1 +C Itanium 2: 0.5 + +C INPUT PARAMETERS +C rp = r32 +C sp = r33 +C n = r34 + +ASM_START() +PROLOGUE(mpn_copyd) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32', +` addp4 r32 = 0, r32 + addp4 r33 = 0, r33 + sxt4 r34 = r34 + ;; +') +{.mmi + shladd r32 = r34, 3, r32 + shladd r33 = r34, 3, r33 + mov.i r2 = ar.lc +} +{.mmi + and r14 = 3, r34 + cmp.ge p14, p15 = 3, r34 + add r34 = -4, r34 + ;; +} +{.mmi + cmp.eq p8, p0 = 1, r14 + cmp.eq p10, p0 = 2, r14 + cmp.eq p12, p0 = 3, r14 +} +{.bbb + (p8) br.dptk .Lb01 + (p10) br.dptk .Lb10 + (p12) br.dptk .Lb11 +} + +.Lb00: C n = 0, 4, 8, 12, ... 
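+C (Editor's note, not in the original:) each .LbXX feed-in path below,
+C selected by the n mod 4 dispatch above, primes a 4-way unrolled,
+C software-pipelined copy loop (.Loop); short counts (n <= 3, via p14)
+C branch straight to the .LsXX wind-down stores.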
+ add r32 = -8, r32 + add r33 = -8, r33 + (p14) br.dptk .Ls00 + ;; + add r21 = -8, r33 + ld8 r16 = [r33], -16 + shr r15 = r34, 2 + ;; + ld8 r17 = [r21], -16 + mov.i ar.lc = r15 + ld8 r18 = [r33], -16 + add r20 = -8, r32 + ;; + ld8 r19 = [r21], -16 + br.cloop.dptk .Loop + ;; + br.sptk .Lend + ;; + +.Lb01: C n = 1, 5, 9, 13, ... + add r21 = -8, r33 + add r20 = -8, r32 + add r33 = -16, r33 + add r32 = -16, r32 + ;; + ld8 r19 = [r21], -16 + shr r15 = r34, 2 + (p14) br.dptk .Ls01 + ;; + ld8 r16 = [r33], -16 + mov.i ar.lc = r15 + ;; + ld8 r17 = [r21], -16 + ld8 r18 = [r33], -16 + br.sptk .Li01 + ;; + +.Lb10: C n = 2,6, 10, 14, ... + add r21 = -16, r33 + shr r15 = r34, 2 + add r20 = -16, r32 + add r32 = -8, r32 + add r33 = -8, r33 + ;; + ld8 r18 = [r33], -16 + ld8 r19 = [r21], -16 + mov.i ar.lc = r15 + (p14) br.dptk .Ls10 + ;; + ld8 r16 = [r33], -16 + ld8 r17 = [r21], -16 + br.sptk .Li10 + ;; + +.Lb11: C n = 3, 7, 11, 15, ... + add r21 = -8, r33 + add r20 = -8, r32 + add r33 = -16, r33 + add r32 = -16, r32 + ;; + ld8 r17 = [r21], -16 + shr r15 = r34, 2 + ;; + ld8 r18 = [r33], -16 + mov.i ar.lc = r15 + ld8 r19 = [r21], -16 + (p14) br.dptk .Ls11 + ;; + ld8 r16 = [r33], -16 + br.sptk .Li11 + ;; + + ALIGN(32) +.Loop: +.Li00: +{.mmb + st8 [r32] = r16, -16 + ld8 r16 = [r33], -16 + nop.b 0 +} +.Li11: +{.mmb + st8 [r20] = r17, -16 + ld8 r17 = [r21], -16 + nop.b 0 + ;; +} +.Li10: +{.mmb + st8 [r32] = r18, -16 + ld8 r18 = [r33], -16 + nop.b 0 +} +.Li01: +{.mmb + st8 [r20] = r19, -16 + ld8 r19 = [r21], -16 + br.cloop.dptk .Loop + ;; +} +.Lend: st8 [r32] = r16, -16 +.Ls11: st8 [r20] = r17, -16 + ;; +.Ls10: st8 [r32] = r18, -16 +.Ls01: st8 [r20] = r19, -16 +.Ls00: mov.i ar.lc = r2 + br.ret.sptk.many b0 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/copyi.asm b/gmp-6.3.0/mpn/ia64/copyi.asm new file mode 100644 index 0000000..49ed192 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/copyi.asm @@ -0,0 +1,182 @@ +dnl IA-64 mpn_copyi -- copy limb vector, incrementing. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2001, 2002, 2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
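+
+C Reference (editor's sketch, not part of the original file): this is the
+C incrementing limb copy, equivalent to
+C
+C	void
+C	mpn_copyi (mp_ptr rp, mp_srcptr up, mp_size_t n)
+C	{
+C	  mp_size_t i;
+C	  for (i = 0; i < n; i++)	/* ascending order, so overlapping */
+C	    rp[i] = up[i];		/* operands with rp <= up work */
+C	}
+C
+C The code below 4-way unrolls this loop, dispatching on n mod 4 in the
+C same way as copyd.asm above.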
+ +include(`../config.m4') + +C cycles/limb +C Itanium: 1 +C Itanium 2: 0.5 + +C INPUT PARAMETERS +C rp = r32 +C sp = r33 +C n = r34 + +ASM_START() +PROLOGUE(mpn_copyi) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32', +` addp4 r32 = 0, r32 + addp4 r33 = 0, r33 + sxt4 r34 = r34 + ;; +') +{.mmi + nop 0 + nop 0 + mov.i r2 = ar.lc +} +{.mmi + and r14 = 3, r34 + cmp.ge p14, p15 = 3, r34 + add r34 = -4, r34 + ;; +} +{.mmi + cmp.eq p8, p0 = 1, r14 + cmp.eq p10, p0 = 2, r14 + cmp.eq p12, p0 = 3, r14 +} +{.bbb + (p8) br.dptk .Lb01 + (p10) br.dptk .Lb10 + (p12) br.dptk .Lb11 +} + +.Lb00: C n = 0, 4, 8, 12, ... + (p14) br.dptk .Ls00 + ;; + add r21 = 8, r33 + ld8 r16 = [r33], 16 + shr r15 = r34, 2 + ;; + ld8 r17 = [r21], 16 + mov.i ar.lc = r15 + ld8 r18 = [r33], 16 + add r20 = 8, r32 + ;; + ld8 r19 = [r21], 16 + br.cloop.dptk .Loop + ;; + br.sptk .Lend + ;; + +.Lb01: C n = 1, 5, 9, 13, ... + add r21 = 0, r33 + add r20 = 0, r32 + add r33 = 8, r33 + add r32 = 8, r32 + ;; + ld8 r19 = [r21], 16 + shr r15 = r34, 2 + (p14) br.dptk .Ls01 + ;; + ld8 r16 = [r33], 16 + mov.i ar.lc = r15 + ;; + ld8 r17 = [r21], 16 + ld8 r18 = [r33], 16 + br.sptk .Li01 + ;; + +.Lb10: C n = 2,6, 10, 14, ... + add r21 = 8, r33 + add r20 = 8, r32 + ld8 r18 = [r33], 16 + shr r15 = r34, 2 + ;; + ld8 r19 = [r21], 16 + mov.i ar.lc = r15 + (p14) br.dptk .Ls10 + ;; + ld8 r16 = [r33], 16 + ld8 r17 = [r21], 16 + br.sptk .Li10 + ;; + +.Lb11: C n = 3, 7, 11, 15, ... + add r21 = 0, r33 + add r20 = 0, r32 + add r33 = 8, r33 + add r32 = 8, r32 + ;; + ld8 r17 = [r21], 16 + shr r15 = r34, 2 + ;; + ld8 r18 = [r33], 16 + mov.i ar.lc = r15 + ld8 r19 = [r21], 16 + (p14) br.dptk .Ls11 + ;; + ld8 r16 = [r33], 16 + br.sptk .Li11 + ;; + + ALIGN(32) +.Loop: +.Li00: +{.mmb + st8 [r32] = r16, 16 + ld8 r16 = [r33], 16 + nop.b 0 +} +.Li11: +{.mmb + st8 [r20] = r17, 16 + ld8 r17 = [r21], 16 + nop.b 0 + ;; +} +.Li10: +{.mmb + st8 [r32] = r18, 16 + ld8 r18 = [r33], 16 + nop.b 0 +} +.Li01: +{.mmb + st8 [r20] = r19, 16 + ld8 r19 = [r21], 16 + br.cloop.dptk .Loop + ;; +} +.Lend: st8 [r32] = r16, 16 +.Ls11: st8 [r20] = r17, 16 + ;; +.Ls10: st8 [r32] = r18, 16 +.Ls01: st8 [r20] = r19, 16 +.Ls00: mov.i ar.lc = r2 + br.ret.sptk.many b0 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/dive_1.asm b/gmp-6.3.0/mpn/ia64/dive_1.asm new file mode 100644 index 0000000..5e4a273 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/dive_1.asm @@ -0,0 +1,236 @@ +dnl IA-64 mpn_divexact_1 -- mpn by limb exact division. + +dnl Contributed to the GNU project by Torbjorn Granlund and Kevin Ryde. + +dnl Copyright 2003-2005, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 16 +C Itanium 2: 8 + +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`n', `r34') +define(`divisor', `r35') + +define(`lshift', `r24') +define(`rshift', `r25') + +C This code is a bit messy, and not as similar to mode1o.asm as desired. + +C The critical path during initialization is for computing the inverse of the +C divisor. Since odd divisors are probably common, we conditionally execute +C the initial count_trailing_zeros code and the downshift. + +C Possible improvement: Merge more of the feed-in code into the inverse +C computation. + +ASM_START() + .text + .align 32 +.Ltab: +data1 0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF +data1 0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF +data1 0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF +data1 0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF +data1 0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF +data1 0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F +data1 0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F +data1 0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F +data1 0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F +data1 0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F +data1 0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F +data1 0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F +data1 0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F +data1 0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F +data1 0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F +data1 0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF + + +PROLOGUE(mpn_divexact_1) + .prologue + .save ar.lc, r2 + .body + + {.mmi; add r8 = -1, divisor C M0 + nop 0 C M1 + tbit.z p8, p9 = divisor, 0 C I0 +} +ifdef(`HAVE_ABI_32', +` addp4 rp = 0, rp C M2 rp extend + addp4 up = 0, up C M3 up extend + sxt4 n = n') C I1 size extend + ;; +.Lhere: + {.mmi; ld8 r20 = [up], 8 C M0 up[0] + (p8) andcm r8 = r8, divisor C M1 + mov r15 = ip C I0 .Lhere + ;; +}{.mii + .pred.rel "mutex", p8, p9 + (p9) mov rshift = 0 C M0 + (p8) popcnt rshift = r8 C I0 r8 = cnt_lo_zeros(divisor) + cmp.eq p6, p10 = 1, n C I1 + ;; +}{.mii; add r9 = .Ltab-.Lhere, r15 C M0 + (p8) shr.u divisor = divisor, rshift C I0 + nop 0 C I1 + ;; +}{.mmi; add n = -4, n C M0 size-1 + (p10) ld8 r21 = [up], 8 C M1 up[1] + mov r14 = 2 C M1 2 +}{.mfi; setf.sig f6 = divisor C M2 divisor + mov f9 = f0 C M3 carry FIXME + zxt1 r3 = divisor C I1 divisor low byte + ;; +}{.mmi; add r3 = r9, r3 C M0 table offset ip and index + sub r16 = 0, divisor C M1 -divisor + mov r2 = ar.lc C I0 +}{.mmi; sub lshift = 64, rshift C M2 + setf.sig f13 = r14 C M3 2 in significand + mov r17 = -1 C I1 -1 + ;; +}{.mmi; ld1 r3 = [r3] C M0 inverse, 8 bits + nop 0 C M1 + mov ar.lc = n C I0 size-1 loop count +}{.mmi; setf.sig f12 = r16 C M2 -divisor + setf.sig f8 = r17 C M3 -1 + cmp.eq p7, p0 = -2, n C I1 + ;; +}{.mmi; setf.sig f7 = r3 C M2 inverse, 8 bits + cmp.eq p8, p0 = -1, n C M0 + shr.u r23 = r20, rshift C I0 + ;; +} + + C f6 divisor + C f7 inverse, being calculated + C f8 -1, will be -inverse + C f9 carry + C f12 -divisor + C f13 2 + C f14 scratch + + xmpy.l f14 = f13, f7 
C Newton 2*i + xmpy.l f7 = f7, f7 C Newton i*i + ;; + xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 16 bits + ;; + setf.sig f10 = r23 C speculative, used iff n = 1 + xmpy.l f14 = f13, f7 C Newton 2*i + shl r22 = r21, lshift C speculative, used iff n > 1 + xmpy.l f7 = f7, f7 C Newton i*i + ;; + or r31 = r22, r23 C speculative, used iff n > 1 + xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 32 bits + shr.u r23 = r21, rshift C speculative, used iff n > 1 + ;; + setf.sig f11 = r31 C speculative, used iff n > 1 + xmpy.l f14 = f13, f7 C Newton 2*i + xmpy.l f7 = f7, f7 C Newton i*i + ;; + xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 64 bits + + (p7) br.cond.dptk .Ln2 + (p10) br.cond.dptk .grt3 + ;; + +.Ln1: xmpy.l f12 = f10, f7 C q = ulimb * inverse + br .Lx1 + +.Ln2: + xmpy.l f8 = f7, f8 C -inverse = inverse * -1 + xmpy.l f12 = f11, f7 C q = ulimb * inverse + setf.sig f11 = r23 + br .Lx2 + +.grt3: + ld8 r21 = [up], 8 C up[2] + xmpy.l f8 = f7, f8 C -inverse = inverse * -1 + ;; + shl r22 = r21, lshift + ;; + xmpy.l f12 = f11, f7 C q = ulimb * inverse + ;; + or r31 = r22, r23 + shr.u r23 = r21, rshift + ;; + setf.sig f11 = r31 + (p8) br.cond.dptk .Lx3 C branch for n = 3 + ;; + ld8 r21 = [up], 8 + br .Lent + +.Ltop: ld8 r21 = [up], 8 + xma.l f12 = f9, f8, f10 C q = c * -inverse + si + nop.b 0 + ;; +.Lent: add r16 = 160, up + shl r22 = r21, lshift + nop.b 0 + ;; + stf8 [rp] = f12, 8 + xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c) + nop.b 0 + nop.m 0 + xmpy.l f10 = f11, f7 C si = ulimb * inverse + nop.b 0 + ;; + or r31 = r22, r23 + shr.u r23 = r21, rshift + nop.b 0 + ;; + lfetch [r16] + setf.sig f11 = r31 + br.cloop.sptk.few.clr .Ltop + + + xma.l f12 = f9, f8, f10 C q = c * -inverse + si + ;; +.Lx3: stf8 [rp] = f12, 8 + xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c) + xmpy.l f10 = f11, f7 C si = ulimb * inverse + ;; + setf.sig f11 = r23 + ;; + xma.l f12 = f9, f8, f10 C q = c * -inverse + si + ;; +.Lx2: stf8 [rp] = f12, 8 + xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c) + xmpy.l f10 = f11, f7 C si = ulimb * inverse + ;; + xma.l f12 = f9, f8, f10 C q = c * -inverse + si + ;; +.Lx1: stf8 [rp] = f12, 8 + mov ar.lc = r2 C I0 + br.ret.sptk.many b0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/ia64/divrem_1.asm b/gmp-6.3.0/mpn/ia64/divrem_1.asm new file mode 100644 index 0000000..e887820 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/divrem_1.asm @@ -0,0 +1,477 @@ +dnl IA-64 mpn_divrem_1 and mpn_preinv_divrem_1 -- Divide an mpn number by an +dnl unnormalized limb. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2002, 2004, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C Itanium: 40-42 +C Itanium 2: 29-30 + +C This was generated by gcc, then the loops were optimized. The preinv entry +C point was shoehorned into the file. Lots of things outside the loops could +C be streamlined. It would probably be a good idea to merge the loops for +C normalized and unnormalized divisor, since the shifting stuff is done for +C free in parallel with other operations. It would even be possible to merge +C all loops, if the ld8 were made conditional. + +C TODO +C * Consider delaying inversion for normalized mpn_divrem_1 entry till after +C computing leading limb. +C * Inline and interleave limb inversion code with loop setup code. + +ASM_START() + +C HP's assembler requires these declarations for importing mpn_invert_limb + .global mpn_invert_limb + .type mpn_invert_limb,@function + +C INPUT PARAMETERS +C rp = r32 +C qxn = r33 +C up = r34 +C n = r35 +C vl = r36 +C vlinv = r37 (preinv only) +C cnt = r38 (preinv only) + +PROLOGUE(mpn_preinv_divrem_1) + .prologue + .save ar.pfs, r42 + alloc r42 = ar.pfs, 7, 8, 1, 0 + .save ar.lc, r44 + mov r44 = ar.lc + .save rp, r41 + mov r41 = b0 + .body +ifdef(`HAVE_ABI_32', +` addp4 r32 = 0, r32 + sxt4 r33 = r33 + addp4 r34 = 0, r34 + sxt4 r35 = r35 + ;; +') + mov r40 = r38 + shladd r34 = r35, 3, r34 + ;; + adds r34 = -8, r34 + ;; + ld8 r39 = [r34], -8 + ;; + + add r15 = r35, r33 + ;; + mov r8 = r37 + shladd r32 = r15, 3, r32 C r32 = rp + n + qxn + cmp.le p8, p0 = 0, r36 + ;; + adds r32 = -8, r32 C r32 = rp + n + qxn - 1 + cmp.leu p6, p7 = r36, r39 + (p8) br.cond.dpnt .Lpunnorm + ;; + + (p6) addl r15 = 1, r0 + (p7) mov r15 = r0 + ;; + (p6) sub r38 = r39, r36 + (p7) mov r38 = r39 + st8 [r32] = r15, -8 + adds r35 = -2, r35 C un -= 2 + br .Lpn + +.Lpunnorm: + (p6) add r34 = 8, r34 + mov r38 = 0 C r = 0 + shl r36 = r36, r40 + (p6) br.cond.dptk .Lpu + ;; + shl r38 = r39, r40 C r = ahigh << cnt + cmp.ne p8, p0 = 1, r35 + st8 [r32] = r0, -8 + adds r35 = -1, r35 C un-- + (p8) br.cond.dpnt .Lpu + + mov r23 = 1 + ;; + setf.sig f6 = r8 + setf.sig f12 = r23 + br .L435 +EPILOGUE() + + +PROLOGUE(mpn_divrem_1) + .prologue + .save ar.pfs, r42 + alloc r42 = ar.pfs, 5, 8, 1, 0 + .save ar.lc, r44 + mov r44 = ar.lc + .save rp, r41 + mov r41 = b0 + .body +ifdef(`HAVE_ABI_32', +` addp4 r32 = 0, r32 + sxt4 r33 = r33 + addp4 r34 = 0, r34 + sxt4 r35 = r35 + ;; +') + mov r38 = r0 + add r15 = r35, r33 + ;; + cmp.ne p6, p7 = 0, r15 + ;; + (p7) mov r8 = r0 + (p7) br.cond.dpnt .Lret + shladd r14 = r15, 3, r32 C r14 = rp + n + qxn + cmp.le p6, p7 = 0, r36 + ;; + adds r32 = -8, r14 C r32 = rp + n + qxn - 1 + (p6) br.cond.dpnt .Lunnorm + cmp.eq p6, p7 = 0, r35 + (p6) br.cond.dpnt .L179 + shladd r14 = r35, 3, r34 + ;; + adds r14 = -8, r14 + adds r35 = -1, r35 + ;; + ld8 r38 = [r14] + ;; + cmp.leu p6, p7 = r36, r38 + ;; + (p6) addl r15 = 1, r0 + (p7) mov r15 = r0 + ;; + st8 [r32] = r15, -8 + (p6) sub r38 = r38, r36 + +.L179: + mov r45 = r36 + adds r35 = -1, r35 + br.call.sptk.many b0 = mpn_invert_limb + ;; + shladd r34 = r35, 3, r34 +.Lpn: + mov r23 = 1 + ;; + setf.sig f6 = r8 + setf.sig f12 = r23 + cmp.le p6, p7 = 0, r35 + mov r40 = 0 + (p7) br.cond.dpnt .L435 + setf.sig f10 = r36 + mov ar.lc = r35 + setf.sig f7 = r38 + ;; + sub r28 = -1, r36 +C Develop quotient limbs for normalized divisor +.Loop1: C 00 C q=r18 nh=r38/f7 + 
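+C (Editor's note, not in the original:) each pass divides the two-limb
+C value {nh, nl} by the invariant divisor d using the inverse computed by
+C mpn_invert_limb: q = HI(nh * dinv) + nh slightly underestimates the true
+C quotient limb, so the code multiplies q back by d, subtracts the product
+C from {nh, nl}, and applies a few conditional q++ / r -= d corrections
+C before storing q and keeping the remainder as the next nh.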
ld8 r20 = [r34], -8 + xma.hu f11 = f7, f6, f0 + ;; C 04 + xma.l f8 = f11, f12, f7 C q = q + nh + ;; C 08 + getf.sig r18 = f8 + xma.hu f9 = f8, f10, f0 + xma.l f8 = f8, f10, f0 + ;; C 12 + getf.sig r16 = f9 + C 13 + getf.sig r15 = f8 + ;; C 18 + cmp.ltu p6, p7 = r20, r15 + sub r15 = r20, r15 + sub r16 = r38, r16 + ;; C 19 + (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0? + (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0? + (p6) add r16 = -1, r16 + (p0) cmp.ne.unc p6, p7 = r0, r0 + ;; C 20 + (p8) cmp.ltu p6, p7 = r15, r36 + (p8) sub r15 = r15, r36 + (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; C 21 + .pred.rel "mutex",p6,p7 + (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0 still? + (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0 still? + cmp.ltu p6, p7 = r15, r36 C speculative + sub r28 = r15, r36 C speculative, just for cmp + ;; C 22 + (p8) cmp.ltu p6, p7 = r28, r36 C redo last cmp if needed + (p8) mov r15 = r28 + (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; C 23 + (p6) setf.sig f7 = r15 + (p7) sub r15 = r15, r36 + (p7) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; C 24 + (p7) setf.sig f7 = r15 + st8 [r32] = r18, -8 + mov r38 = r15 + br.cloop.dptk .Loop1 + C 29/30 + br.sptk .L435 + ;; +.Lunnorm: + mux1 r16 = r36, @rev + cmp.eq p6, p7 = 0, r35 + (p6) br.cond.dpnt .L322 + shladd r34 = r35, 3, r34 + ;; + adds r34 = -8, r34 + ;; + ld8 r39 = [r34] + ;; + cmp.leu p6, p7 = r36, r39 + (p6) br.cond.dptk .L322 + adds r34 = -8, r34 + ;; + mov r38 = r39 + ;; + cmp.ne p6, p7 = 1, r15 + st8 [r32] = r0, -8 + ;; + (p7) mov r8 = r38 + (p7) br.cond.dpnt .Lret + adds r35 = -1, r35 +.L322: + sub r14 = r0, r16 + ;; + or r14 = r16, r14 + ;; + mov r16 = -8 + czx1.l r14 = r14 + ;; + shladd r16 = r14, 3, r16 + ;; + shr.u r14 = r36, r16 + ;; + cmp.geu p6, p7 = 15, r14 + ;; + (p7) shr.u r14 = r14, 4 + (p7) adds r16 = 4, r16 + ;; + cmp.geu p6, p7 = 3, r14 + ;; + (p7) shr.u r14 = r14, 2 + (p7) adds r16 = 2, r16 + ;; + tbit.nz p6, p7 = r14, 1 + ;; + .pred.rel "mutex",p6,p7 + (p6) sub r40 = 62, r16 + (p7) sub r40 = 63, r16 + ;; + shl r45 = r36, r40 + shl r36 = r36, r40 + shl r38 = r38, r40 + br.call.sptk.many b0 = mpn_invert_limb + ;; +.Lpu: + mov r23 = 1 + ;; + setf.sig f6 = r8 + setf.sig f12 = r23 + cmp.eq p6, p7 = 0, r35 + (p6) br.cond.dpnt .L435 + sub r16 = 64, r40 + adds r35 = -2, r35 + ;; + ld8 r39 = [r34], -8 + cmp.le p6, p7 = 0, r35 + ;; + shr.u r14 = r39, r16 + ;; + or r38 = r14, r38 + (p7) br.cond.dpnt .Lend3 + ;; + mov r22 = r16 + setf.sig f10 = r36 + setf.sig f7 = r38 + mov ar.lc = r35 + ;; +C Develop quotient limbs for unnormalized divisor +.Loop3: + ld8 r14 = [r34], -8 + xma.hu f11 = f7, f6, f0 + ;; + xma.l f8 = f11, f12, f7 C q = q + nh + ;; + getf.sig r18 = f8 + xma.hu f9 = f8, f10, f0 + shl r20 = r39, r40 + xma.l f8 = f8, f10, f0 + shr.u r24 = r14, r22 + ;; + getf.sig r16 = f9 + getf.sig r15 = f8 + or r20 = r24, r20 + ;; + cmp.ltu p6, p7 = r20, r15 + sub r15 = r20, r15 + sub r16 = r38, r16 + ;; + (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0? + (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0? + (p6) add r16 = -1, r16 + (p0) cmp.ne.unc p6, p7 = r0, r0 + ;; + (p8) cmp.ltu p6, p7 = r15, r36 + (p8) sub r15 = r15, r36 + (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; + .pred.rel "mutex",p6,p7 + (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0 still? + (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0 still? 
+ cmp.ltu p6, p7 = r15, r36 C speculative + sub r28 = r15, r36 C speculative, just for cmp + ;; + (p8) cmp.ltu p6, p7 = r28, r36 C redo last cmp if needed + (p8) mov r15 = r28 + (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; + (p6) setf.sig f7 = r15 + (p7) sub r15 = r15, r36 + (p7) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; + (p7) setf.sig f7 = r15 + st8 [r32] = r18, -8 + mov r39 = r14 + mov r38 = r15 + br.cloop.dptk .Loop3 + ;; +.Lend3: + setf.sig f10 = r36 + setf.sig f7 = r38 + ;; + xma.hu f11 = f7, f6, f0 + ;; + xma.l f8 = f11, f12, f7 C q = q + nh + ;; + getf.sig r18 = f8 + xma.hu f9 = f8, f10, f0 + shl r20 = r39, r40 + xma.l f8 = f8, f10, f0 + ;; + getf.sig r16 = f9 + getf.sig r15 = f8 + ;; + cmp.ltu p6, p7 = r20, r15 + sub r15 = r20, r15 + sub r16 = r38, r16 + ;; + (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0? + (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0? + (p6) add r16 = -1, r16 + (p0) cmp.ne.unc p6, p7 = r0, r0 + ;; + (p8) cmp.ltu p6, p7 = r15, r36 + (p8) sub r15 = r15, r36 + (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; + .pred.rel "mutex",p6,p7 + (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0 still? + (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0 still? + ;; + (p8) sub r15 = r15, r36 + (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; + cmp.ltu p6, p7 = r15, r36 + ;; + (p7) sub r15 = r15, r36 + (p7) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; + st8 [r32] = r18, -8 + mov r38 = r15 +.L435: + adds r35 = -1, r33 + cmp.le p6, p7 = 1, r33 + (p7) br.cond.dpnt .Lend4 + ;; + setf.sig f7 = r38 + setf.sig f10 = r36 + mov ar.lc = r35 + ;; +.Loop4: + xma.hu f11 = f7, f6, f0 + ;; + xma.l f8 = f11, f12, f7 C q = q + nh + ;; + getf.sig r18 = f8 + xma.hu f9 = f8, f10, f0 + xma.l f8 = f8, f10, f0 + ;; + getf.sig r16 = f9 + getf.sig r15 = f8 + ;; + cmp.ltu p6, p7 = 0, r15 + sub r15 = 0, r15 + sub r16 = r38, r16 + ;; + (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0? + (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0? + (p6) add r16 = -1, r16 + (p0) cmp.ne.unc p6, p7 = r0, r0 + ;; + (p8) cmp.ltu p6, p7 = r15, r36 + (p8) sub r15 = r15, r36 + (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; + .pred.rel "mutex",p6,p7 + (p6) cmp.ne p8, p9 = 1, r16 C is rH != 0 still? + (p7) cmp.ne p8, p9 = 0, r16 C is rH != 0 still? + cmp.ltu p6, p7 = r15, r36 C speculative + sub r28 = r15, r36 C speculative, just for cmp + ;; + (p8) cmp.ltu p6, p7 = r28, r36 C redo last cmp if needed + (p8) mov r15 = r28 + (p8) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; + (p6) setf.sig f7 = r15 + (p7) sub r15 = r15, r36 + (p7) add r18 = 1, r18 C q = q + 1; done if: rH > 0 + ;; + (p7) setf.sig f7 = r15 + st8 [r32] = r18, -8 + mov r38 = r15 + br.cloop.dptk .Loop4 + ;; +.Lend4: + shr.u r8 = r38, r40 +.Lret: + mov ar.pfs = r42 + mov ar.lc = r44 + mov b0 = r41 + br.ret.sptk.many b0 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/divrem_2.asm b/gmp-6.3.0/mpn/ia64/divrem_2.asm new file mode 100644 index 0000000..9864311 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/divrem_2.asm @@ -0,0 +1,280 @@ +dnl IA-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number. + +dnl Copyright 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C norm frac +C itanium 1 +C itanium 2 29 29 + + +C TODO +C * Inline and interleave limb inversion code with loop setup code. +C * We should use explicit bundling in much of the code, since it typically +C cuts some cycles with the GNU assembler. + + +ASM_START() + +C HP's assembler requires these declarations for importing mpn_invert_limb + .global mpn_invert_limb + .type mpn_invert_limb,@function + +C INPUT PARAMETERS +C qp = r32 +C fn = r33 +C np = r34 +C nn = r35 +C dp = r36 + +define(`f0x1', `f15') + +ASM_START() +PROLOGUE(mpn_divrem_2) + .prologue +ifdef(`HAVE_ABI_32', +` addp4 r32 = 0, r32 C M I + addp4 r34 = 0, r34 C M I + zxt4 r35 = r35 C I + addp4 r36 = 0, r36 C M I + nop.m 0 + zxt4 r33 = r33 C I + ;; +') + .save ar.pfs, r42 + alloc r42 = ar.pfs, 5, 9, 1, 0 + shladd r34 = r35, 3, r34 + adds r14 = 8, r36 + mov r43 = r1 + ;; + adds r15 = -8, r34 + ld8 r39 = [r14] + .save ar.lc, r45 + mov r45 = ar.lc + adds r14 = -16, r34 + mov r40 = r0 + adds r34 = -24, r34 + ;; + ld8 r38 = [r15] + .save rp, r41 + mov r41 = b0 + .body + ld8 r36 = [r36] + ld8 r37 = [r14] + ;; + cmp.gtu p6, p7 = r39, r38 + (p6) br.cond.dptk .L8 + ;; + cmp.leu p8, p9 = r36, r37 + cmp.geu p6, p7 = r39, r38 + ;; + (p8) cmp4.ne.and.orcm p6, p7 = 0, r0 + (p7) br.cond.dptk .L51 +.L8: + add r14 = r33, r35 // un + fn + mov r46 = r39 // argument to mpn_invert_limb + ;; + adds r35 = -3, r14 + ;; + cmp.gt p12, p0 = r0, r35 + (p12) br.cond.dpnt L(end) + br.call.sptk.many b0 = mpn_invert_limb + ;; + setf.sig f11 = r8 // di (non-final) + setf.sig f34 = r39 // d1 + setf.sig f33 = r36 // d0 + mov r1 = r43 + ;; + mov r17 = 1 + setf.sig f9 = r38 // n2 + xma.l f6 = f11, f34, f0 // t0 = LO(di * d1) + ;; + setf.sig f10 = r37 // n1 + setf.sig f15 = r17 // 1 + xma.hu f8 = f11, f33, f0 // s0 = HI(di * d0) + ;; + getf.sig r17 = f6 + getf.sig r16 = f8 + mov ar.lc = r35 + ;; + sub r18 = r0, r39 // -d1 + add r14 = r17, r36 + ;; + setf.sig f14 = r18 // -d1 + cmp.leu p8, p9 = r17, r14 + add r16 = r14, r16 + ;; + (p9) adds r19 = 0, r0 + (p8) adds r19 = -1, r0 + cmp.gtu p6, p7 = r14, r16 + ;; + (p6) adds r19 = 1, r19 + ;; +ifelse(1,1,` + cmp.gt p7, p6 = r0, r19 + ;; + (p6) adds r8 = -1, r8 // di-- + (p6) sub r14 = r16, r39 // t0 -= d1 + (p6) cmp.ltu p6, p7 = r16, r39 // cy for: t0 - d1 + ;; + (p6) cmp.gt p9, p8 = 1, r19 + (p7) cmp.gt p9, p8 = 0, r19 + (p6) adds r19 = -1, r19 // t1 -= cy + mov r16 = r14 + ;; + (p8) adds r8 = -1, r8 // di-- + (p8) sub r14 = r16, r39 // t0 -= d1 + (p8) cmp.ltu p8, p9 = r16, r39 // cy for: t0 - d1 + ;; + (p8) cmp.gt p7, p6 = 1, r19 + (p9) cmp.gt p7, p6 = 0, r19 + (p8) adds r19 = -1, r19 // t1 -= cy + mov r16 = r14 + ;; + (p6) adds r8 = -1, r8 // di-- + (p6) sub r14 = r16, r39 // t0 -= d1 + (p6) cmp.ltu p6, p7 = r16, r39 // cy for: t0 - d1 + ;; + (p6) cmp.gt p9, p8 = 1, r19 + (p7) cmp.gt p9, p8 = 0, r19 + (p6) adds 
r19 = -1, r19 // t1 -= cy + mov r16 = r14 + ;; + (p8) adds r8 = -1, r8 // di-- + (p8) sub r14 = r16, r39 // t0 -= d1 + (p8) cmp.ltu p8, p9 = r16, r39 // cy for: t0 - d1 + ;; + (p8) adds r19 = -1, r19 // t1 -= cy + mov r16 = r14 +',` + cmp.gt p8, p9 = r0, r19 + (p8) br.cond.dpnt .L46 +.L52: + cmp.leu p6, p7 = r39, r16 + sub r14 = r16, r39 + adds r8 = -1, r8 + ;; + (p7) adds r19 = -1, r19 + mov r16 = r14 + ;; + (p7) cmp.gt p8, p9 = r0, r19 + (p9) br.cond.dptk .L52 +.L46: +') + setf.sig f32 = r8 // di + shladd r32 = r35, 3, r32 + ;; + + ALIGN(16) +L(top): nop 0 + nop 0 + cmp.gt p8, p9 = r33, r35 + ;; + (p8) mov r37 = r0 + (p9) ld8 r37 = [r34], -8 + xma.hu f8 = f9, f32, f10 // 0,29 + xma.l f12 = f9, f32, f10 // 0 + ;; + getf.sig r20 = f12 // q0 4 + xma.l f13 = f15, f8, f9 // q += n2 4 + sub r8 = -1, r36 // bitnot d0 + ;; + getf.sig r18 = f13 // 8 + xma.l f7 = f14, f13, f10 // 8 + xma.l f6 = f33, f13, f33 // t0 = LO(d0*q+d0) 8 + xma.hu f9 = f33, f13, f33 // t1 = HI(d0*q+d0) 9 + ;; + getf.sig r38 = f7 // n1 12 + getf.sig r16 = f6 // 13 + getf.sig r19 = f9 // 14 + ;; + sub r38 = r38, r39 // n1 -= d1 17 + ;; + cmp.ne p9, p0 = r0, r0 // clear p9 + cmp.leu p10, p11 = r16, r37 // cy for: n0 - t0 18 + ;; + sub r37 = r37, r16 // n0 -= t0 19 + (p11) sub r38 = r38, r19, 1 // n1 -= t1 - cy 19 + (p10) sub r38 = r38, r19 // n1 -= t1 19 + ;; + cmp.gtu p6, p7 = r20, r38 // n1 >= q0 20 + ;; + (p7) cmp.ltu p9, p0 = r8, r37 // 21 + (p6) add r18 = 1, r18 // + (p7) add r37 = r37, r36 // 21 + (p7) add r38 = r38, r39 // 21 + ;; + setf.sig f10 = r37 // n1 22 + (p9) add r38 = 1, r38 // 22 + ;; + setf.sig f9 = r38 // n2 23 + cmp.gtu p6, p7 = r39, r38 // 23 + (p7) br.cond.spnt L(fix) +L(bck): st8 [r32] = r18, -8 + adds r35 = -1, r35 + br.cloop.sptk.few L(top) + ;; + +L(end): add r14 = 8, r34 + add r15 = 16, r34 + mov b0 = r41 + ;; + st8 [r14] = r37 + st8 [r15] = r38 + mov ar.pfs = r42 + mov r8 = r40 + mov ar.lc = r45 + br.ret.sptk.many b0 + ;; +.L51: + .pred.rel "mutex", p8, p9 + sub r37 = r37, r36 + (p9) sub r38 = r38, r39, 1 + (p8) sub r38 = r38, r39 + adds r40 = 1, r0 + br .L8 + ;; + +L(fix): cmp.geu p6, p7 = r39, r38 + cmp.leu p8, p9 = r36, r37 + ;; + (p8) cmp4.ne.and.orcm p6, p7 = 0, r0 + (p6) br.cond.dptk L(bck) + sub r37 = r37, r36 + (p9) sub r38 = r38, r39, 1 + (p8) sub r38 = r38, r39 + adds r18 = 1, r18 + ;; + setf.sig f9 = r38 // n2 + setf.sig f10 = r37 // n1 + br L(bck) + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/gcd_11.asm b/gmp-6.3.0/mpn/ia64/gcd_11.asm new file mode 100644 index 0000000..6137227 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/gcd_11.asm @@ -0,0 +1,110 @@ +dnl Itanium-2 mpn_gcd_11 + +dnl Copyright 2002-2005, 2012, 2013, 2015, 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bitpair (1x1 gcd) +C Itanium: ? +C Itanium 2: 4.5 + + +ASM_START() + +C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. + +deflit(MAXSHIFT, 7) +deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) + + .rodata + ALIGN(m4_lshift(1,MAXSHIFT)) C align table to allow using dep +ctz_table: + data1 MAXSHIFT +forloop(i,1,MASK, +` data1 m4_count_trailing_zeros(i)-1 +') + +define(`x0', r32) +define(`y0', r33) + +PROLOGUE(mpn_gcd_11) + .prologue + .body + addl r22 = @ltoff(ctz_table), r1 + ;; + ld8 r22 = [r22] + br L(ent) + ;; + + ALIGN(32) +L(top): + .pred.rel "mutex", p6,p7 + {.mmi; (p7) mov y0 = x0 + (p6) sub x0 = x0, y0 + dep r21 = r19, r22, 0, MAXSHIFT C concat(table,lowbits) +}{.mmi; and r20 = MASK, r19 + (p7) mov x0 = r19 + and r23 = 6, r19 + ;; +}{.mmi; cmp.eq p6,p0 = 4, r23 + cmp.eq p7,p0 = 0, r23 + shr.u x0 = x0, 1 C shift-by-1, always OK +}{.mmb; ld1 r16 = [r21] + cmp.eq p10,p0 = 0, r20 + (p10) br.spnt.few.clr L(count_better) + ;; +} +L(bck): + .pred.rel "mutex", p6,p7 + {.mii; nop 0 + (p6) shr.u x0 = x0, 1 C u was ...100 before shift-by-1 above + (p7) shr.u x0 = x0, r16 C u was ...000 before shift-by-1 above + ;; +} +L(ent): + {.mmi; sub r19 = y0, x0 + cmp.gtu p6,p7 = x0, y0 + cmp.ne p8,p0 = x0, y0 +}{.mmb; nop 0 + nop 0 + (p8) br.sptk.few.clr L(top) +} + +L(end): mov r8 = y0 + br.ret.sptk.many b0 + +L(count_better): + add r20 = -1, x0 + ;; + andcm r23 = r20, x0 + ;; + popcnt r16 = r23 + br L(bck) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/ia64/gmp-mparam.h b/gmp-6.3.0/mpn/ia64/gmp-mparam.h new file mode 100644 index 0000000..34d2bf3 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/gmp-mparam.h @@ -0,0 +1,212 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 900MHz Itanium2 (olympic.gmplib.org) */ +/* FFT tuning limit = 59,194,709 */ +/* Generated by tuneup.c, 2019-10-13, gcc 4.2 */ + +#define MOD_1_1P_METHOD 2 /* 17.40% faster than 1 */ +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 18 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 1.35% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 10 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define DIV_1_VS_MUL_1_PERCENT 316 + +#define MUL_TOOM22_THRESHOLD 47 +#define MUL_TOOM33_THRESHOLD 89 +#define MUL_TOOM44_THRESHOLD 220 +#define MUL_TOOM6H_THRESHOLD 327 +#define MUL_TOOM8H_THRESHOLD 454 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 153 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 143 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 153 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 226 + +#define SQR_BASECASE_THRESHOLD 11 +#define SQR_TOOM2_THRESHOLD 98 +#define SQR_TOOM3_THRESHOLD 135 +#define SQR_TOOM4_THRESHOLD 272 +#define SQR_TOOM6_THRESHOLD 354 +#define SQR_TOOM8_THRESHOLD 490 + +#define MULMID_TOOM42_THRESHOLD 99 + +#define MULMOD_BNM1_THRESHOLD 23 +#define SQRMOD_BNM1_THRESHOLD 27 + +#define MUL_FFT_MODF_THRESHOLD 840 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 840, 5}, { 30, 6}, { 16, 5}, { 33, 6}, \ + { 17, 5}, { 36, 6}, { 35, 7}, { 18, 6}, \ + { 37, 7}, { 19, 6}, { 42, 7}, { 37, 8}, \ + { 19, 7}, { 43, 8}, { 23, 7}, { 47, 8}, \ + { 43, 9}, { 23, 8}, { 51, 9}, { 27, 8}, \ + { 57, 9}, { 31, 8}, { 63, 9}, { 35, 8}, \ + { 71, 9}, { 43,10}, { 23, 9}, { 55,10}, \ + { 31, 9}, { 71,10}, { 39, 9}, { 83,10}, \ + { 47, 9}, { 99,10}, { 55,11}, { 31,10}, \ + { 87,11}, { 47,10}, { 111,12}, { 31,11}, \ + { 63,10}, { 135,11}, { 79,10}, { 167,11}, \ + { 95,10}, { 191,11}, { 111,12}, { 63,11}, \ + { 143,10}, { 287,11}, { 159,12}, { 95,11}, \ + { 207,13}, { 63,12}, { 127,11}, { 271,12}, \ + { 159,11}, { 335,10}, { 671,12}, { 191,10}, \ + { 799,12}, { 223,13}, { 127,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 671,13}, { 191,12}, \ + { 383,11}, { 799,10}, { 1599,12}, { 415,11}, \ + { 863,14}, { 127,13}, { 255,12}, { 543,11}, \ + { 1119,12}, { 607,13}, { 319,12}, { 735,11}, \ + { 1471,12}, { 863,13}, { 447,12}, { 927,11}, \ + { 1855,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1055,11}, { 2111,12}, { 1119,13}, { 575,12}, \ + { 1247,13}, { 639,12}, { 1311,13}, { 703,12}, \ + { 1471,13}, { 831,12}, { 1727,13}, { 895,12}, \ + { 1791,13}, { 959,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2239,13}, { 1215,14}, { 639,13}, \ + { 1471,14}, { 767,13}, { 1727,14}, { 895,13}, \ + { 1855,12}, { 3711,13}, { 1919,15}, { 511,14}, \ + { 1023,13}, { 2111,12}, { 4223,13}, { 2175,14}, \ + { 1151,13}, { 2495,14}, { 1279,13}, { 2623,14}, \ + { 1407,15}, { 767,14}, { 1663,13}, { 3455,14}, \ + { 1919,16}, { 511,15}, { 1023,14}, { 2175,13}, \ + { 4479,14}, { 2431,15}, { 1279,14}, { 2943,15}, \ + { 1535,14}, { 3455,15}, { 1791,14}, { 16384,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + 
{8388608,24} } +#define MUL_FFT_TABLE3_SIZE 149 +#define MUL_FFT_THRESHOLD 8576 + +#define SQR_FFT_MODF_THRESHOLD 765 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 765, 5}, { 36, 6}, { 37, 7}, { 19, 6}, \ + { 42, 7}, { 43, 8}, { 37, 9}, { 19, 8}, \ + { 43, 9}, { 23, 8}, { 49, 9}, { 27, 8}, \ + { 57, 9}, { 43,10}, { 23, 9}, { 55,10}, \ + { 31, 9}, { 71,10}, { 39, 9}, { 83,10}, \ + { 47, 9}, { 99,10}, { 55,11}, { 31,10}, \ + { 87,11}, { 47,10}, { 111,12}, { 31,11}, \ + { 63,10}, { 135,11}, { 79,10}, { 175,11}, \ + { 95,10}, { 199,11}, { 111,12}, { 63,11}, \ + { 159,12}, { 95,11}, { 191,10}, { 399,11}, \ + { 207,13}, { 63,12}, { 127,10}, { 511, 9}, \ + { 1023,10}, { 527,11}, { 271,12}, { 159,10}, \ + { 703,12}, { 191,11}, { 399,10}, { 799,11}, \ + { 431,12}, { 223,13}, { 127,12}, { 255,11}, \ + { 527,10}, { 1055,11}, { 559,12}, { 287,11}, \ + { 607,10}, { 1215,11}, { 703,13}, { 191,12}, \ + { 383,11}, { 799,12}, { 415,11}, { 863,12}, \ + { 447,14}, { 127,13}, { 255,12}, { 511,11}, \ + { 1055,12}, { 543,11}, { 1119,12}, { 607,11}, \ + { 1215,12}, { 735,13}, { 383,12}, { 799,11}, \ + { 1599,12}, { 863,13}, { 447,12}, { 991,14}, \ + { 255,13}, { 511,12}, { 1055,11}, { 2111,12}, \ + { 1119,13}, { 575,12}, { 1215,13}, { 639,12}, \ + { 1311,13}, { 703,12}, { 1407,14}, { 383,13}, \ + { 767,12}, { 1599,13}, { 831,12}, { 1727,13}, \ + { 895,12}, { 1791,13}, { 959,12}, { 1919,15}, \ + { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1087,12}, { 2239,13}, { 1151,12}, { 2303,13}, \ + { 1215,14}, { 639,13}, { 1279,12}, { 2559,13}, \ + { 1471,14}, { 767,13}, { 1727,14}, { 895,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2239,14}, \ + { 1151,13}, { 2495,14}, { 1279,13}, { 2623,14}, \ + { 1407,15}, { 767,14}, { 1663,13}, { 3455,14}, \ + { 1919,16}, { 511,15}, { 1023,14}, { 2175,13}, \ + { 4479,14}, { 2431,15}, { 1279,14}, { 2943,15}, \ + { 1535,14}, { 3455,15}, { 1791,14}, { 16384,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 153 +#define SQR_FFT_THRESHOLD 6272 + +#define MULLO_BASECASE_THRESHOLD 39 +#define MULLO_DC_THRESHOLD 0 /* never mpn_mullo_basecase */ +#define MULLO_MUL_N_THRESHOLD 17050 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 134 +#define SQRLO_SQR_THRESHOLD 12322 + +#define DC_DIV_QR_THRESHOLD 73 +#define DC_DIVAPPR_Q_THRESHOLD 262 +#define DC_BDIV_QR_THRESHOLD 111 +#define DC_BDIV_Q_THRESHOLD 315 + +#define INV_MULMOD_BNM1_THRESHOLD 92 +#define INV_NEWTON_THRESHOLD 15 +#define INV_APPR_THRESHOLD 17 + +#define BINV_NEWTON_THRESHOLD 280 +#define REDC_1_TO_REDC_2_THRESHOLD 0 /* always */ +#define REDC_2_TO_REDC_N_THRESHOLD 172 + +#define MU_DIV_QR_THRESHOLD 1470 +#define MU_DIVAPPR_Q_THRESHOLD 1210 +#define MUPI_DIV_QR_THRESHOLD 0 /* always */ +#define MU_BDIV_QR_THRESHOLD 1566 +#define MU_BDIV_Q_THRESHOLD 1787 + +#define POWM_SEC_TABLE 3,22,139,1867 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 42 +#define SET_STR_DC_THRESHOLD 1339 +#define SET_STR_PRECOMPUTE_THRESHOLD 3934 + +#define FAC_DSC_THRESHOLD 866 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 20 +#define HGCD2_DIV1_METHOD 3 /* 13.73% faster than 1 */ +#define HGCD_THRESHOLD 129 +#define HGCD_APPR_THRESHOLD 202 +#define HGCD_REDUCE_THRESHOLD 4455 +#define GCD_DC_THRESHOLD 658 +#define GCDEXT_DC_THRESHOLD 469 +#define JACOBI_BASE_METHOD 2 /* 0.62% faster than 4 */ + +/* Tuneup completed successfully, took 199042 seconds 
*/ diff --git a/gmp-6.3.0/mpn/ia64/hamdist.asm b/gmp-6.3.0/mpn/ia64/hamdist.asm new file mode 100644 index 0000000..477df4c --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/hamdist.asm @@ -0,0 +1,365 @@ +dnl IA-64 mpn_hamdist -- mpn hamming distance. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2003-2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 2 +C Itanium 2: 1 + +C INPUT PARAMETERS +define(`up', `r32') +define(`vp', `r33') +define(`n', `r34') + +define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19') +define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23') +define(`x0',`r24') define(`x1',`r25') define(`x2',`r26') define(`x3',`r27') +define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31') +define(`s',`r8') + + +ASM_START() +PROLOGUE(mpn_hamdist) + .prologue +ifdef(`HAVE_ABI_32', +` addp4 up = 0, up C M I + addp4 vp = 0, vp C M I + zxt4 n = n C I + ;; +') + + {.mmi; ld8 r10 = [up], 8 C load first ulimb M01 + ld8 r11 = [vp], 8 C load first vlimb M01 + mov.i r2 = ar.lc C save ar.lc I0 +}{.mmi; and r14 = 3, n C M I + cmp.lt p15, p0 = 4, n C small count? 
M I + add n = -5, n C M I + ;; +}{.mmi; cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + cmp.eq p8, p0 = 3, r14 C M I +}{.bbb + (p6) br.dptk .Lb01 C B + (p7) br.dptk .Lb10 C B + (p8) br.dptk .Lb11 C B +} + + +.Lb00: ld8 u1 = [up], 8 C M01 + ld8 v1 = [vp], 8 C M01 + shr.u n = n, 2 C I0 + xor x0 = r10, r11 C M I + ;; + ld8 u2 = [up], 8 C M01 + ld8 v2 = [vp], 8 C M01 + mov.i ar.lc = n C I0 + xor x1 = u1, v1 C M I + ;; + ld8 u3 = [up], 8 C M01 + ld8 v3 = [vp], 8 C M01 + xor x2 = u2, v2 C M I + mov s = 0 C M I + (p15) br.cond.dptk .grt4 C B + ;; + popcnt c0 = x0 C I0 + xor x3 = u3, v3 C M I + ;; + popcnt c1 = x1 C I0 + ;; + popcnt c2 = x2 C I0 + br .Lcj4 C B + +.grt4: ld8 u0 = [up], 8 C M01 + ld8 v0 = [vp], 8 C M01 + xor x1 = u1, v1 C M I + ;; + ld8 u1 = [up], 8 C M01 + ld8 v1 = [vp], 8 C M01 + xor x2 = u2, v2 C M I + ;; + ld8 u2 = [up], 8 C M01 + ld8 v2 = [vp], 8 C M01 + popcnt c0 = x0 C I0 + xor x3 = u3, v3 C M I + ;; + ld8 u3 = [up], 8 C M01 + ld8 v3 = [vp], 8 C M01 + popcnt c1 = x1 C I0 + xor x0 = u0, v0 C M I + br.cloop.dpnt .grt8 C B + + popcnt c2 = x2 C I0 + xor x1 = u1, v1 C M I + br .Lcj8 C B + +.grt8: ld8 u0 = [up], 8 C M01 + ld8 v0 = [vp], 8 C M01 + popcnt c2 = x2 C I0 + xor x1 = u1, v1 C M I + br .LL00 C B + + +.Lb01: xor x3 = r10, r11 C M I + shr.u n = n, 2 C I0 + (p15) br.cond.dptk .grt1 C B + ;; + popcnt r8 = x3 C I0 + br.ret.sptk.many b0 C B + +.grt1: ld8 u0 = [up], 8 C M01 + ld8 v0 = [vp], 8 C M01 + mov.i ar.lc = n C I0 + ;; + ld8 u1 = [up], 8 C M01 + ld8 v1 = [vp], 8 C M01 + mov s = 0 C M I + ;; + ld8 u2 = [up], 8 C M01 + ld8 v2 = [vp], 8 C M01 + ;; + ld8 u3 = [up], 8 C M01 + ld8 v3 = [vp], 8 C M01 + xor x0 = u0, v0 C M I + br.cloop.dpnt .grt5 C B + + xor x1 = u1, v1 C M I + ;; + popcnt c3 = x3 C I0 + xor x2 = u2, v2 C M I + ;; + popcnt c0 = x0 C I0 + xor x3 = u3, v3 C M I + ;; + popcnt c1 = x1 C I0 + br .Lcj5 C B + +.grt5: ld8 u0 = [up], 8 C M01 + ld8 v0 = [vp], 8 C M01 + xor x1 = u1, v1 C M I + ;; + ld8 u1 = [up], 8 C M01 + ld8 v1 = [vp], 8 C M01 + popcnt c3 = x3 C I0 + xor x2 = u2, v2 C M I + ;; + ld8 u2 = [up], 8 C M01 + ld8 v2 = [vp], 8 C M01 + popcnt c0 = x0 C I0 + xor x3 = u3, v3 C M I + ;; + ld8 u3 = [up], 8 C M01 + ld8 v3 = [vp], 8 C M01 + popcnt c1 = x1 C I0 + xor x0 = u0, v0 C M I + br.cloop.dpnt .Loop C B + br .Lend C B + + +.Lb10: ld8 u3 = [up], 8 C M01 + ld8 v3 = [vp], 8 C M01 + xor x2 = r10, r11 C M I + (p15) br.cond.dptk .grt2 C B + ;; + xor x3 = u3, v3 C M I + ;; + popcnt c2 = x2 C I0 + ;; + popcnt c3 = x3 C I0 + ;; + add s = c2, c3 C M I + br.ret.sptk.many b0 C B + +.grt2: ld8 u0 = [up], 8 C M01 + ld8 v0 = [vp], 8 C M01 + shr.u n = n, 2 C I0 + ;; + ld8 u1 = [up], 8 C M01 + ld8 v1 = [vp], 8 C M01 + mov.i ar.lc = n C I0 + mov s = 0 C M I + ;; + ld8 u2 = [up], 8 C M01 + ld8 v2 = [vp], 8 C M01 + xor x3 = u3, v3 C M I + ;; + ld8 u3 = [up], 8 C M01 + ld8 v3 = [vp], 8 C M01 + xor x0 = u0, v0 C M I + br.cloop.dptk .grt6 C B + + popcnt c2 = x2 C I0 + xor x1 = u1, v1 C M I + ;; + popcnt c3 = x3 C I0 + xor x2 = u2, v2 C M I + ;; + popcnt c0 = x0 C I0 + xor x3 = u3, v3 C M I + br .Lcj6 C B + +.grt6: ld8 u0 = [up], 8 C M01 + ld8 v0 = [vp], 8 C M01 + popcnt c2 = x2 C I0 + xor x1 = u1, v1 C M I + ;; + ld8 u1 = [up], 8 C M01 + ld8 v1 = [vp], 8 C M01 + popcnt c3 = x3 C I0 + xor x2 = u2, v2 C M I + ;; + ld8 u2 = [up], 8 C M01 + ld8 v2 = [vp], 8 C M01 + popcnt c0 = x0 C I0 + xor x3 = u3, v3 C M I + br .LL10 C B + + +.Lb11: ld8 u2 = [up], 8 C M01 + ld8 v2 = [vp], 8 C M01 + shr.u n = n, 2 C I0 + xor x1 = r10, r11 C M I + ;; + ld8 u3 = [up], 8 C M01 + ld8 v3 = [vp], 8 
C M01 + xor x2 = u2, v2 C M I + (p15) br.cond.dptk .grt3 C B + ;; + xor x3 = u3, v3 C M I + ;; + popcnt c1 = x1 C I0 + ;; + popcnt c2 = x2 C I0 + ;; + popcnt c3 = x3 C I0 + ;; + add s = c1, c2 C M I + ;; + add s = s, c3 C M I + br.ret.sptk.many b0 C B + +.grt3: ld8 u0 = [up], 8 C M01 + ld8 v0 = [vp], 8 C M01 + mov.i ar.lc = n C I0 + ;; + ld8 u1 = [up], 8 C M01 + ld8 v1 = [vp], 8 C M01 + mov s = 0 C M I + ;; + ld8 u2 = [up], 8 C M01 + ld8 v2 = [vp], 8 C M01 + xor x3 = u3, v3 C M I + ;; + ld8 u3 = [up], 8 C M01 + ld8 v3 = [vp], 8 C M01 + popcnt c1 = x1 C I0 + xor x0 = u0, v0 C M I + br.cloop.dptk .grt7 C B + popcnt c2 = x2 C I0 + xor x1 = u1, v1 C M I + ;; + popcnt c3 = x3 C I0 + xor x2 = u2, v2 C M I + br .Lcj7 C B + +.grt7: ld8 u0 = [up], 8 C M01 + ld8 v0 = [vp], 8 C M01 + popcnt c2 = x2 C I0 + xor x1 = u1, v1 C M I + ;; + ld8 u1 = [up], 8 C M01 + ld8 v1 = [vp], 8 C M01 + popcnt c3 = x3 C I0 + xor x2 = u2, v2 C M I + br .LL11 C B + + + ALIGN(32) +.Loop: ld8 u0 = [up], 8 C M01 + ld8 v0 = [vp], 8 C M01 + popcnt c2 = x2 C I0 + add s = s, c3 C M I + xor x1 = u1, v1 C M I + nop.b 1 C - + ;; +.LL00: ld8 u1 = [up], 8 C M01 + ld8 v1 = [vp], 8 C M01 + popcnt c3 = x3 C I0 + add s = s, c0 C M I + xor x2 = u2, v2 C M I + nop.b 1 C - + ;; +.LL11: ld8 u2 = [up], 8 C M01 + ld8 v2 = [vp], 8 C M01 + popcnt c0 = x0 C I0 + add s = s, c1 C M I + xor x3 = u3, v3 C M I + nop.b 1 C - + ;; +.LL10: ld8 u3 = [up], 8 C M01 + ld8 v3 = [vp], 8 C M01 + popcnt c1 = x1 C I0 + add s = s, c2 C M I + xor x0 = u0, v0 C M I + br.cloop.dptk .Loop C B + ;; + +.Lend: popcnt c2 = x2 C I0 + add s = s, c3 C M I + xor x1 = u1, v1 C M I + ;; +.Lcj8: popcnt c3 = x3 C I0 + add s = s, c0 C M I + xor x2 = u2, v2 C M I + ;; +.Lcj7: popcnt c0 = x0 C I0 + add s = s, c1 C M I + xor x3 = u3, v3 C M I + ;; +.Lcj6: popcnt c1 = x1 C I0 + add s = s, c2 C M I + ;; +.Lcj5: popcnt c2 = x2 C I0 + add s = s, c3 C M I + ;; +.Lcj4: popcnt c3 = x3 C I0 + add s = s, c0 C M I + ;; + add s = s, c1 C M I + ;; + add s = s, c2 C M I + ;; + add s = s, c3 C M I + mov.i ar.lc = r2 C I0 + br.ret.sptk.many b0 C B +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/ia64-defs.m4 b/gmp-6.3.0/mpn/ia64/ia64-defs.m4 new file mode 100644 index 0000000..f71d280 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/ia64-defs.m4 @@ -0,0 +1,147 @@ +divert(-1) + + +dnl Copyright 2000, 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl ia64 assembler comments are C++ style "//" to the end of line. 
gas +dnl also accepts "#" as a comment, if it's the first non-blank on a line. +dnl +dnl BSD m4 can't handle a multi-character comment like "//" (see notes in +dnl mpn/asm-defs.m4). For now the default "#" is left, but with care taken +dnl not to put any macros after "foo#" (since of course they won't expand). + + +define(`ASM_START', +m4_assert_numargs(0) +`') + + +dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo) +dnl EPILOGUE_cpu(GSYM_PREFIX`'foo) +dnl +dnl 32-byte alignment is used for the benefit of itanium-2, where the code +dnl fetcher will only take 2 bundles from a 32-byte aligned target. At +dnl 16mod32 it only reads 1 in the first cycle. This might not make any +dnl difference if the rotate buffers are full or there's other work holding +dnl up execution, but we use 32-bytes to give the best chance of peak +dnl throughput. +dnl +dnl We can use .align here despite the gas bug noted in mpn/ia64/README, +dnl since we're not expecting to execute across a PROLOGUE(), at least not +dnl currently. + +define(`PROLOGUE_cpu', +m4_assert_numargs(1) + ` + .text + .align 32 + .global $1# + .proc $1# +$1:') + +define(`EPILOGUE_cpu', +m4_assert_numargs(1) + ` + .endp $1# +') + +define(`DATASTART', + `dnl + DATA +$1:') +define(`DATAEND',`dnl') + +define(`ASM_END',`dnl') + + +dnl Usage: ALIGN(bytes) +dnl +dnl Emit a ".align" directive. "bytes" is eval()ed, so can be an +dnl expression. +dnl +dnl This version overrides the definition in mpn/asm-defs.m4. We suppress +dnl any .align if the gas byte-swapped-nops bug was detected by configure +dnl GMP_ASM_IA64_ALIGN_OK. + +define(`ALIGN', +m4_assert_numargs(1) +m4_assert_defined(`IA64_ALIGN_OK') +`ifelse(IA64_ALIGN_OK,no,, +`.align eval($1)')') + + +dnl Usage: ASSERT([pr] [,code]) +dnl +dnl Require that the given predicate register is true after executing the +dnl test code. For example, +dnl +dnl ASSERT(p6, +dnl ` cmp.eq p6,p0 = r3, r4') +dnl +dnl If the predicate register argument is empty then nothing is tested, the +dnl code is just executed. This can be used for setups required by later +dnl ASSERTs. The code argument can be omitted to just test a predicate +dnl with no special setup code. +dnl +dnl For convenience, stops are inserted before and after the code emitted. + +define(ASSERT, +m4_assert_numargs_range(1,2) +m4_assert_defined(`WANT_ASSERT') +`ifelse(WANT_ASSERT,1, +` ;; +ifelse(`$2',,, +`$2 + ;; +') +ifelse(`$1',,, +`($1) br .LASSERTok`'ASSERT_label_counter ;; + cmp.ne p6,p6 = r0, r0 C illegal instruction + ;; +.LASSERTok`'ASSERT_label_counter: +define(`ASSERT_label_counter',eval(ASSERT_label_counter+1)) +') +')') +define(`ASSERT_label_counter',1) + +define(`getfsig', `getf.sig') +define(`setfsig', `setf.sig') +define(`cmpeq', `cmp.eq') +define(`cmpne', `cmp.ne') +define(`cmpltu', `cmp.ltu') +define(`cmpleu', `cmp.leu') +define(`cmpgtu', `cmp.gtu') +define(`cmpgeu', `cmp.geu') +define(`cmple', `cmp.le') +define(`cmpgt', `cmp.gt') +define(`cmpeqor', `cmp.eq.or') +define(`cmpequc', `cmp.eq.unc') + +divert diff --git a/gmp-6.3.0/mpn/ia64/invert_limb.asm b/gmp-6.3.0/mpn/ia64/invert_limb.asm new file mode 100644 index 0000000..5effdda --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/invert_limb.asm @@ -0,0 +1,105 @@ +dnl IA-64 mpn_invert_limb -- Invert a normalized limb. + +dnl Contributed to the GNU project by Torbjorn Granlund and Kevin Ryde. + +dnl Copyright 2000, 2002, 2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C d = r32 + +C cycles +C Itanium: 74 +C Itanium 2: 50+6 + +C It should be possible to avoid the xmpy.hu and the following tests by +C explicitly chopping in the last fma. That would save about 10 cycles. + +ASM_START() + .sdata + .align 16 +ifdef(`HAVE_DOUBLE_IEEE_LITTLE_ENDIAN',` +.LC0: data4 0x00000000, 0x80000000, 0x0000403f, 0x00000000 C 2^64 +.LC1: data4 0x00000000, 0x80000000, 0x0000407f, 0x00000000 C 2^128 + +',`ifdef(`HAVE_DOUBLE_IEEE_BIG_ENDIAN',` +.LC0: data4 0x403f8000, 0x00000000, 0x00000000, 0x00000000 C 2^64 +.LC1: data4 0x407f8000, 0x00000000, 0x00000000, 0x00000000 C 2^128 + +',`m4_error(`Oops, need to know float endianness +')')') + + +PROLOGUE(mpn_invert_limb) + C 00 + addl r14 = @gprel(.LC0), gp + addl r15 = @gprel(.LC1), gp + setf.sig f7 = r32 + add r9 = r32, r32 C check for d = 2^63 + ;; C 01 + ldfe f10 = [r14] C 2^64 + ldfe f8 = [r15] C 2^128 + cmp.eq p6, p0 = 0, r9 C check for d = 2^63 + mov r8 = -1 C retval for 2^63 + (p6) br.ret.spnt.many b0 + ;; C 07 + fmpy.s1 f11 = f7, f10 C f11 = d * 2^64 + fnma.s1 f6 = f7, f10, f8 C f6 = 2^128 - d * 2^64 + ;; C 11 + frcpa.s1 f8, p6 = f6, f7 + ;; C 15 + (p6) fnma.s1 f9 = f7, f8, f1 + (p6) fmpy.s1 f10 = f6, f8 + ;; C 19 + (p6) fmpy.s1 f11 = f9, f9 + (p6) fma.s1 f10 = f9, f10, f10 + ;; C 23 + (p6) fma.s1 f8 = f9, f8, f8 + (p6) fma.s1 f9 = f11, f10, f10 + ;; C 27 + (p6) fma.s1 f8 = f11, f8, f8 + (p6) fnma.s1 f10 = f7, f9, f6 + ;; C 31 + (p6) fma.s1 f8 = f10, f8, f9 + ;; C 35 + fcvt.fxu.trunc.s1 f8 = f8 + ;; C 39 + getf.sig r8 = f8 + xmpy.hu f10 = f8, f7 C di * d + ;; C 43 + getf.sig r14 = f10 + andcm r9 = -1, r32 C one's complement + ;; C 48 + cmp.ltu p6, p0 = r9, r14 C got overflow? + ;; C 49 + (p6) add r8 = -1, r8 C adjust di down + br.ret.sptk.many b0 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/logops_n.asm b/gmp-6.3.0/mpn/ia64/logops_n.asm new file mode 100644 index 0000000..e4a2f61 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/logops_n.asm @@ -0,0 +1,292 @@ +dnl IA-64 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n, +dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2003-2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 2 +C Itanium 2: 1 + +C TODO +C * Use rp,rpx scheme of aors_n.asm to allow parallel stores (useful in +C wind-down code). + +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`vp', `r34') +define(`n', `r35') + +ifdef(`OPERATION_and_n', +` define(`func',`mpn_and_n') + define(`logop', `and $1 = $2, $3') + define(`notormov', `mov $1 = $2')') +ifdef(`OPERATION_andn_n', +` define(`func',`mpn_andn_n') + define(`logop', `andcm $1 = $2, $3') + define(`notormov', `mov $1 = $2')') +ifdef(`OPERATION_nand_n', +` define(`func',`mpn_nand_n') + define(`logop', `and $1 = $2, $3') + define(`notormov', `sub $1 = -1, $2')') +ifdef(`OPERATION_ior_n', +` define(`func',`mpn_ior_n') + define(`logop', `or $1 = $2, $3') + define(`notormov', `mov $1 = $2')') +ifdef(`OPERATION_iorn_n', +` define(`func',`mpn_iorn_n') + define(`logop', `andcm $1 = $3, $2') + define(`notormov', `sub $1 = -1, $2')') +ifdef(`OPERATION_nior_n', +` define(`func',`mpn_nior_n') + define(`logop', `or $1 = $2, $3') + define(`notormov', `sub $1 = -1, $2')') +ifdef(`OPERATION_xor_n', +` define(`func',`mpn_xor_n') + define(`logop', `xor $1 = $2, $3') + define(`notormov', `mov $1 = $2')') +ifdef(`OPERATION_xnor_n', +` define(`func',`mpn_xnor_n') + define(`logop', `xor $1 = $2, $3') + define(`notormov', `sub $1 = -1, $2')') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +ASM_START() +PROLOGUE(func) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32', +` addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + addp4 vp = 0, vp C M I + nop.m 0 + nop.m 0 + zxt4 n = n C I + ;; +') +{.mmi + ld8 r10 = [up], 8 C M + ld8 r11 = [vp], 8 C M + mov.i r2 = ar.lc C I0 +} +{.mmi + and r14 = 3, n C M I + cmp.lt p15, p14 = 4, n C M I + shr.u n = n, 2 C I0 + ;; +} +{.mmi + cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + cmp.eq p8, p0 = 3, r14 C M I +} +{.bbb + (p6) br.dptk .Lb01 C B + (p7) br.dptk .Lb10 C B + (p8) br.dptk .Lb11 C B +} + +.Lb00: ld8 r17 = [up], 8 C M + ld8 r21 = [vp], 8 C M + add n = -2, n C M I + ;; + ld8 r18 = [up], 8 C M + ld8 r22 = [vp], 8 C M + ;; + ld8 r19 = [up], 8 C M + ld8 r23 = [vp], 8 C M + (p15) br.cond.dpnt .grt4 C B + + logop( r14, r10, r11) C M I + ;; + logop( r15, r17, r21) C M I + notormov( r8, r14) C M I + br .Lcj4 C B + +.grt4: logop( r14, r10, r11) C M I + ld8 r16 = [up], 8 C M + ld8 r20 = [vp], 8 C M + ;; + logop( r15, r17, r21) C M I + ld8 r17 = [up], 8 C M + mov.i ar.lc = n C I0 + notormov( r8, r14) C M I + ld8 
r21 = [vp], 8 C M + br .LL00 C B + +.Lb01: add n = -1, n C M I + logop( r15, r10, r11) C M I + (p15) br.cond.dpnt .grt1 C B + ;; + + notormov( r9, r15) C M I + br .Lcj1 C B + +.grt1: ld8 r16 = [up], 8 C M + ld8 r20 = [vp], 8 C M + ;; + ld8 r17 = [up], 8 C M + ld8 r21 = [vp], 8 C M + mov.i ar.lc = n C I0 + ;; + ld8 r18 = [up], 8 C M + ld8 r22 = [vp], 8 C M + ;; + ld8 r19 = [up], 8 C M + ld8 r23 = [vp], 8 C M + br.cloop.dptk .grt5 C B + ;; + + logop( r14, r16, r20) C M I + notormov( r9, r15) C M I + br .Lcj5 C B + +.grt5: logop( r14, r16, r20) C M I + ld8 r16 = [up], 8 C M + notormov( r9, r15) C M I + ld8 r20 = [vp], 8 C M + br .LL01 C B + +.Lb10: ld8 r19 = [up], 8 C M + ld8 r23 = [vp], 8 C M + (p15) br.cond.dpnt .grt2 C B + + logop( r14, r10, r11) C M I + ;; + logop( r15, r19, r23) C M I + notormov( r8, r14) C M I + br .Lcj2 C B + +.grt2: ld8 r16 = [up], 8 C M + ld8 r20 = [vp], 8 C M + add n = -1, n C M I + ;; + ld8 r17 = [up], 8 C M + ld8 r21 = [vp], 8 C M + logop( r14, r10, r11) C M I + ;; + ld8 r18 = [up], 8 C M + ld8 r22 = [vp], 8 C M + mov.i ar.lc = n C I0 + ;; + logop( r15, r19, r23) C M I + ld8 r19 = [up], 8 C M + notormov( r8, r14) C M I + ld8 r23 = [vp], 8 C M + br.cloop.dptk .Loop C B + br .Lcj6 C B + +.Lb11: ld8 r18 = [up], 8 C M + ld8 r22 = [vp], 8 C M + add n = -1, n C M I + ;; + ld8 r19 = [up], 8 C M + ld8 r23 = [vp], 8 C M + logop( r15, r10, r11) C M I + (p15) br.cond.dpnt .grt3 C B + ;; + + logop( r14, r18, r22) C M I + notormov( r9, r15) C M I + br .Lcj3 C B + +.grt3: ld8 r16 = [up], 8 C M + ld8 r20 = [vp], 8 C M + ;; + ld8 r17 = [up], 8 C M + ld8 r21 = [vp], 8 C M + mov.i ar.lc = n C I0 + ;; + logop( r14, r18, r22) C M I + ld8 r18 = [up], 8 C M + notormov( r9, r15) C M I + ld8 r22 = [vp], 8 C M + br .LL11 C B + +C *** MAIN LOOP START *** + ALIGN(32) +.Loop: st8 [rp] = r8, 8 C M + logop( r14, r16, r20) C M I + notormov( r9, r15) C M I + ld8 r16 = [up], 8 C M + ld8 r20 = [vp], 8 C M + nop.b 0 + ;; +.LL01: st8 [rp] = r9, 8 C M + logop( r15, r17, r21) C M I + notormov( r8, r14) C M I + ld8 r17 = [up], 8 C M + ld8 r21 = [vp], 8 C M + nop.b 0 + ;; +.LL00: st8 [rp] = r8, 8 C M + logop( r14, r18, r22) C M I + notormov( r9, r15) C M I + ld8 r18 = [up], 8 C M + ld8 r22 = [vp], 8 C M + nop.b 0 + ;; +.LL11: st8 [rp] = r9, 8 C M + logop( r15, r19, r23) C M I + notormov( r8, r14) C M I + ld8 r19 = [up], 8 C M + ld8 r23 = [vp], 8 C M + br.cloop.dptk .Loop ;; C B +C *** MAIN LOOP END *** + +.Lcj6: st8 [rp] = r8, 8 C M + logop( r14, r16, r20) C M I + notormov( r9, r15) C M I + ;; +.Lcj5: st8 [rp] = r9, 8 C M + logop( r15, r17, r21) C M I + notormov( r8, r14) C M I + ;; +.Lcj4: st8 [rp] = r8, 8 C M + logop( r14, r18, r22) C M I + notormov( r9, r15) C M I + ;; +.Lcj3: st8 [rp] = r9, 8 C M + logop( r15, r19, r23) C M I + notormov( r8, r14) C M I + ;; +.Lcj2: st8 [rp] = r8, 8 C M + notormov( r9, r15) C M I + ;; +.Lcj1: st8 [rp] = r9, 8 C M + mov.i ar.lc = r2 C I0 + br.ret.sptk.many b0 C B +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/lorrshift.asm b/gmp-6.3.0/mpn/ia64/lorrshift.asm new file mode 100644 index 0000000..694aaf0 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/lorrshift.asm @@ -0,0 +1,358 @@ +dnl IA-64 mpn_lshift/mpn_rshift. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2000-2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 2 +C Itanium 2: 1 + +C This code is scheduled deeply since the plain shift instructions shr and shl +C have a latency of 4 (on Itanium) or 3 (on Itanium 2). Poor scheduling of +C these instructions cause a 10 cycle replay trap on Itanium. + +C The ld8 scheduling should probably be decreased to make the function smaller. +C Good lfetch will make sure we never stall anyway. + +C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair +C at cycle 2. Judicious use of predicates could allow us to issue more ld8's +C in the prologue. + + +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`n', `r34') +define(`cnt',`r35') + +define(`tnc',`r9') + +ifdef(`OPERATION_lshift',` + define(`FSH',`shl') + define(`BSH',`shr.u') + define(`UPD',`-8') + define(`POFF',`-512') + define(`PUPD',`-32') + define(`func',`mpn_lshift') +') +ifdef(`OPERATION_rshift',` + define(`FSH',`shr.u') + define(`BSH',`shl') + define(`UPD',`8') + define(`POFF',`512') + define(`PUPD',`32') + define(`func',`mpn_rshift') +') + +MULFUNC_PROLOGUE(mpn_lshift mpn_rshift) + +ASM_START() +PROLOGUE(func) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32', +` addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + sxt4 n = n C M I + nop.m 0 + nop.m 0 + zxt4 cnt = cnt C I + ;; +') + + {.mmi; cmp.lt p14, p15 = 4, n C M I + and r14 = 3, n C M I + mov.i r2 = ar.lc C I0 +}{.mmi; add r15 = -1, n C M I + sub tnc = 64, cnt C M I + add r16 = -5, n + ;; +}{.mmi; cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + shr.u n = r16, 2 C I0 +}{.mmi; cmp.eq p8, p0 = 3, r14 C M I +ifdef(`OPERATION_lshift', +` shladd up = r15, 3, up C M I + shladd rp = r15, 3, rp') C M I + ;; +}{.mmi; add r11 = POFF, up C M I + ld8 r10 = [up], UPD C M01 + mov.i ar.lc = n C I0 +}{.bbb; + (p6) br.dptk .Lb01 + (p7) br.dptk .Lb10 + (p8) br.dptk .Lb11 + ;; } + +.Lb00: ld8 r19 = [up], UPD + ;; + ld8 r16 = [up], UPD + ;; + ld8 r17 = [up], UPD + BSH r8 = r10, tnc C function return value + ;; + FSH r24 = r10, cnt + BSH r25 = r19, tnc + (p14) br.cond.dptk .grt4 + ;; + FSH r26 = r19, cnt + BSH r27 = r16, tnc + ;; + FSH r20 = r16, cnt + BSH r21 = r17, tnc + ;; + or r14 = r25, r24 + FSH r22 = r17, cnt + BSH r23 = r10, tnc + br .Lr4 + +.grt4: ld8 r18 = [up], UPD + FSH r26 = r19, cnt + BSH r27 = r16, tnc + ;; + ld8 r19 = [up], UPD + FSH r20 = r16, cnt + BSH r21 = r17, tnc + ;; + ld8 r16 = [up], UPD + FSH r22 = r17, cnt + BSH r23 = r18, tnc + ;; + or r14 = r25, r24 + ld8 r17 = [up], UPD + br.cloop.dpnt .Ltop + br 
.Lbot + +.Lb01: + (p15) BSH r8 = r10, tnc C function return value I + (p15) FSH r22 = r10, cnt C I + (p15) br.cond.dptk .Lr1 C return B + +.grt1: ld8 r18 = [up], UPD + ;; + ld8 r19 = [up], UPD + BSH r8 = r10, tnc C function return value + ;; + ld8 r16 = [up], UPD + FSH r22 = r10, cnt + BSH r23 = r18, tnc + ;; + ld8 r17 = [up], UPD + FSH r24 = r18, cnt + BSH r25 = r19, tnc + br.cloop.dpnt .grt5 + ;; + or r15 = r23, r22 + FSH r26 = r19, cnt + BSH r27 = r16, tnc + ;; + FSH r20 = r16, cnt + BSH r21 = r17, tnc + br .Lr5 + +.grt5: ld8 r18 = [up], UPD + FSH r26 = r19, cnt + BSH r27 = r16, tnc + ;; + ld8 r19 = [up], UPD + FSH r20 = r16, cnt + BSH r21 = r17, tnc + ;; + or r15 = r23, r22 + ld8 r16 = [up], UPD + br .LL01 + + +.Lb10: ld8 r17 = [up], UPD + (p14) br.cond.dptk .grt2 + + BSH r8 = r10, tnc C function return value + ;; + FSH r20 = r10, cnt + BSH r21 = r17, tnc + ;; + or r14 = r21, r20 + FSH r22 = r17, cnt + br .Lr2 C return + +.grt2: ld8 r18 = [up], UPD + BSH r8 = r10, tnc C function return value + ;; + ld8 r19 = [up], UPD + FSH r20 = r10, cnt + BSH r21 = r17, tnc + ;; + ld8 r16 = [up], UPD + FSH r22 = r17, cnt + BSH r23 = r18, tnc + ;; + {.mmi; ld8 r17 = [up], UPD + or r14 = r21, r20 + FSH r24 = r18, cnt +}{.mib; nop 0 + BSH r25 = r19, tnc + br.cloop.dpnt .grt6 + ;; } + + FSH r26 = r19, cnt + BSH r27 = r16, tnc + br .Lr6 + +.grt6: ld8 r18 = [up], UPD + FSH r26 = r19, cnt + BSH r27 = r16, tnc + ;; + ld8 r19 = [up], UPD + br .LL10 + + +.Lb11: ld8 r16 = [up], UPD + ;; + ld8 r17 = [up], UPD + BSH r8 = r10, tnc C function return value + (p14) br.cond.dptk .grt3 + ;; + + FSH r26 = r10, cnt + BSH r27 = r16, tnc + ;; + FSH r20 = r16, cnt + BSH r21 = r17, tnc + ;; + or r15 = r27, r26 + FSH r22 = r17, cnt + br .Lr3 C return + +.grt3: ld8 r18 = [up], UPD + FSH r26 = r10, cnt + BSH r27 = r16, tnc + ;; + ld8 r19 = [up], UPD + FSH r20 = r16, cnt + BSH r21 = r17, tnc + ;; + ld8 r16 = [up], UPD + FSH r22 = r17, cnt + BSH r23 = r18, tnc + ;; + ld8 r17 = [up], UPD + br.cloop.dpnt .grt7 + + or r15 = r27, r26 + FSH r24 = r18, cnt + BSH r25 = r19, tnc + br .Lr7 + +.grt7: or r15 = r27, r26 + FSH r24 = r18, cnt + BSH r25 = r19, tnc + ld8 r18 = [up], UPD + br .LL11 + +C *** MAIN LOOP START *** + ALIGN(32) +.Ltop: + {.mmi; st8 [rp] = r14, UPD C M2 + or r15 = r27, r26 C M3 + FSH r24 = r18, cnt C I0 +}{.mmi; ld8 r18 = [up], UPD C M1 + lfetch [r11], PUPD + BSH r25 = r19, tnc C I1 + ;; } +.LL11: + {.mmi; st8 [rp] = r15, UPD + or r14 = r21, r20 + FSH r26 = r19, cnt +}{.mmi; ld8 r19 = [up], UPD + nop.m 0 + BSH r27 = r16, tnc + ;; } +.LL10: + {.mmi; st8 [rp] = r14, UPD + or r15 = r23, r22 + FSH r20 = r16, cnt +}{.mmi; ld8 r16 = [up], UPD + nop.m 0 + BSH r21 = r17, tnc + ;; } +.LL01: + {.mmi; st8 [rp] = r15, UPD + or r14 = r25, r24 + FSH r22 = r17, cnt +}{.mib; ld8 r17 = [up], UPD + BSH r23 = r18, tnc + br.cloop.dptk .Ltop + ;; } +C *** MAIN LOOP END *** + +.Lbot: + {.mmi; st8 [rp] = r14, UPD + or r15 = r27, r26 + FSH r24 = r18, cnt +}{.mib; nop 0 + BSH r25 = r19, tnc + nop 0 + ;; } +.Lr7: + {.mmi; st8 [rp] = r15, UPD + or r14 = r21, r20 + FSH r26 = r19, cnt +}{.mib; nop 0 + BSH r27 = r16, tnc + nop 0 + ;; } +.Lr6: + {.mmi; st8 [rp] = r14, UPD + or r15 = r23, r22 + FSH r20 = r16, cnt +}{.mib; nop 0 + BSH r21 = r17, tnc + nop 0 + ;; } +.Lr5: st8 [rp] = r15, UPD + or r14 = r25, r24 + FSH r22 = r17, cnt + ;; +.Lr4: st8 [rp] = r14, UPD + or r15 = r27, r26 + ;; +.Lr3: st8 [rp] = r15, UPD + or r14 = r21, r20 + ;; +.Lr2: st8 [rp] = r14, UPD + ;; +.Lr1: st8 [rp] = r22, UPD C M23 + mov ar.lc = r2 C I0 + br.ret.sptk.many b0 C B 
+EPILOGUE(func) +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/lshiftc.asm b/gmp-6.3.0/mpn/ia64/lshiftc.asm new file mode 100644 index 0000000..e8cec87 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/lshiftc.asm @@ -0,0 +1,463 @@ +dnl IA-64 mpn_lshiftc. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2000-2005, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: ? +C Itanium 2: 1.25 + +C This code is scheduled deeply since the plain shift instructions shr and shl +C have a latency of 4 (on Itanium) or 3 (on Itanium 2). Poor scheduling of +C these instructions cause a 10 cycle replay trap on Itanium. + +C The ld8 scheduling should probably be decreased to make the function smaller. +C Good lfetch will make sure we never stall anyway. + +C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair +C at cycle 2. Judicious use of predicates could allow us to issue more ld8's +C in the prologue. 
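+C For reference, the operation performed, as a plain C sketch (an
+C illustration only, assuming 64-bit limbs and 1 <= cnt <= 63; it is not
+C GMP's generic code, and it ignores the operand-overlap and scheduling
+C concerns the asm deals with):
+C
+C   mp_limb_t
+C   ref_lshiftc (mp_limb_t *rp, const mp_limb_t *up, long n, unsigned cnt)
+C   {
+C     unsigned tnc = 64 - cnt;
+C     mp_limb_t ret = up[n - 1] >> tnc;   /* bits shifted out the top */
+C     for (long i = n - 1; i > 0; i--)    /* high to low, as for lshift */
+C       rp[i] = ~((up[i] << cnt) | (up[i - 1] >> tnc));
+C     rp[0] = ~(up[0] << cnt);
+C     return ret;                         /* return value not complemented */
+C   }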
+ + +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`n', `r34') +define(`cnt',`r35') + +define(`tnc',`r9') + +define(`FSH',`shl') +define(`BSH',`shr.u') +define(`UPD',`-8') +define(`POFF',`-512') +define(`PUPD',`-32') +define(`func',`mpn_lshiftc') + +ASM_START() +PROLOGUE(mpn_lshiftc) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32', +` addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + sxt4 n = n C M I + nop.m 0 + nop.m 0 + zxt4 cnt = cnt C I + ;; +') + + {.mmi; nop 0 C M I + and r14 = 3, n C M I + mov.i r2 = ar.lc C I0 +}{.mmi; add r15 = -1, n C M I + sub tnc = 64, cnt C M I + nop 0 + ;; +}{.mmi; cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + shr.u n = r15, 2 C I0 +}{.mmi; cmp.eq p8, p0 = 3, r14 C M I + shladd up = r15, 3, up C M I + shladd rp = r15, 3, rp C M I + ;; +}{.mmi; add r11 = POFF, up C M I + ld8 r10 = [up], UPD C M01 + mov.i ar.lc = n C I0 +}{.bbb; + (p6) br.dptk .Lb01 + (p7) br.dptk .Lb10 + (p8) br.dptk .Lb11 + ;; } + +.Lb00: + ld8 r19 = [up], UPD + ;; + ld8 r16 = [up], UPD + ;; + ld8 r17 = [up], UPD + BSH r8 = r10, tnc + br.cloop.dptk L(gt4) + ;; + FSH r24 = r10, cnt + BSH r25 = r19, tnc + ;; + FSH r26 = r19, cnt + BSH r27 = r16, tnc + ;; + FSH r20 = r16, cnt + BSH r21 = r17, tnc + ;; + or r14 = r25, r24 + FSH r22 = r17, cnt + ;; + or r15 = r27, r26 + sub r31 = -1, r14 + br .Lr4 + +L(gt4): + {.mmi; nop 0 + nop 0 + FSH r24 = r10, cnt +}{.mmi; ld8 r18 = [up], UPD + nop 0 + BSH r25 = r19, tnc + ;; } + {.mmi; nop 0 + nop 0 + FSH r26 = r19, cnt +}{.mmi; ld8 r19 = [up], UPD + nop 0 + BSH r27 = r16, tnc + ;; } + {.mmi; nop 0 + nop 0 + FSH r20 = r16, cnt +}{.mmi; ld8 r16 = [up], UPD + nop 0 + BSH r21 = r17, tnc + ;; } + {.mmi; nop 0 + or r14 = r25, r24 + FSH r22 = r17, cnt +}{.mib; ld8 r17 = [up], UPD + BSH r23 = r18, tnc + br.cloop.dptk L(gt8) + ;; } + {.mmi; nop 0 + or r15 = r27, r26 + FSH r24 = r18, cnt +}{.mib; sub r31 = -1, r14 + BSH r25 = r19, tnc + br .Lr8 } + +L(gt8): + or r15 = r27, r26 + FSH r24 = r18, cnt + ld8 r18 = [up], UPD + sub r31 = -1, r14 + BSH r25 = r19, tnc + br .LL00 + +.Lb01: + br.cloop.dptk L(gt1) + ;; + BSH r8 = r10, tnc + FSH r22 = r10, cnt + ;; + sub r31 = -1, r22 + br .Lr1 + ;; +L(gt1): + ld8 r18 = [up], UPD + BSH r8 = r10, tnc + FSH r22 = r10, cnt + ;; + ld8 r19 = [up], UPD + ;; + ld8 r16 = [up], UPD + ;; + ld8 r17 = [up], UPD + BSH r23 = r18, tnc + br.cloop.dptk L(gt5) + ;; + nop 0 + FSH r24 = r18, cnt + BSH r25 = r19, tnc + ;; + nop 0 + FSH r26 = r19, cnt + BSH r27 = r16, tnc + ;; + or r15 = r23, r22 + FSH r20 = r16, cnt + BSH r21 = r17, tnc + ;; + or r14 = r25, r24 + FSH r22 = r17, cnt + sub r31 = -1, r15 + br .Lr5 + +L(gt5): + {.mmi; nop 0 + nop 0 + FSH r24 = r18, cnt +}{.mmi; ld8 r18 = [up], UPD + nop 0 + BSH r25 = r19, tnc + ;; } + {.mmi; nop 0 + nop 0 + FSH r26 = r19, cnt +}{.mmi; ld8 r19 = [up], UPD + nop 0 + BSH r27 = r16, tnc + ;; } + {.mmi; nop 0 + or r15 = r23, r22 + FSH r20 = r16, cnt +}{.mmi; ld8 r16 = [up], UPD + nop 0 + BSH r21 = r17, tnc + ;; } + {.mmi; or r14 = r25, r24 + sub r31 = -1, r15 + FSH r22 = r17, cnt +}{.mib; ld8 r17 = [up], UPD + BSH r23 = r18, tnc + br L(end) + ;; } + +.Lb10: + ld8 r17 = [up], UPD + br.cloop.dptk L(gt2) + ;; + BSH r8 = r10, tnc + FSH r20 = r10, cnt + ;; + BSH r21 = r17, tnc + FSH r22 = r17, cnt + ;; + or r14 = r21, r20 + ;; + sub r31 = -1, r14 + br .Lr2 + ;; +L(gt2): + ld8 r18 = [up], UPD + BSH r8 = r10, tnc + FSH r20 = r10, cnt + ;; + ld8 r19 = [up], UPD + ;; + ld8 r16 = [up], UPD + BSH r21 = r17, tnc + FSH r22 = r17, cnt + ;; + ld8 r17 = [up], UPD + BSH r23 = 
r18, tnc + br.cloop.dptk L(gt6) + ;; + nop 0 + FSH r24 = r18, cnt + BSH r25 = r19, tnc + ;; + or r14 = r21, r20 + FSH r26 = r19, cnt + BSH r27 = r16, tnc + ;; + {.mmi; nop 0 + or r15 = r23, r22 + FSH r20 = r16, cnt +}{.mib; sub r31 = -1, r14 + BSH r21 = r17, tnc + br .Lr6 + ;; } +L(gt6): + {.mmi; nop 0 + nop 0 + FSH r24 = r18, cnt +}{.mmi; ld8 r18 = [up], UPD + nop 0 + BSH r25 = r19, tnc + ;; } + {.mmi; nop 0 + or r14 = r21, r20 + FSH r26 = r19, cnt +}{.mmi; ld8 r19 = [up], UPD + nop 0 + BSH r27 = r16, tnc + ;; } + {.mmi; or r15 = r23, r22 + sub r31 = -1, r14 + FSH r20 = r16, cnt +}{.mib; ld8 r16 = [up], UPD + BSH r21 = r17, tnc + br .LL10 +} + +.Lb11: + ld8 r16 = [up], UPD + ;; + ld8 r17 = [up], UPD + BSH r8 = r10, tnc + FSH r26 = r10, cnt + br.cloop.dptk L(gt3) + ;; + BSH r27 = r16, tnc + ;; + FSH r20 = r16, cnt + BSH r21 = r17, tnc + ;; + FSH r22 = r17, cnt + ;; + or r15 = r27, r26 + ;; + or r14 = r21, r20 + sub r31 = -1, r15 + br .Lr3 + ;; +L(gt3): + ld8 r18 = [up], UPD + ;; + ld8 r19 = [up], UPD + BSH r27 = r16, tnc + ;; + {.mmi; nop 0 + nop 0 + FSH r20 = r16, cnt +}{.mmi; ld8 r16 = [up], UPD + nop 0 + BSH r21 = r17, tnc + ;; +}{.mmi; nop 0 + nop 0 + FSH r22 = r17, cnt +}{.mib; ld8 r17 = [up], UPD + BSH r23 = r18, tnc + br.cloop.dptk L(gt7) + ;; } + or r15 = r27, r26 + FSH r24 = r18, cnt + BSH r25 = r19, tnc + ;; + {.mmi; nop 0 + or r14 = r21, r20 + FSH r26 = r19, cnt +}{.mib; sub r31 = -1, r15 + BSH r27 = r16, tnc + br .Lr7 +} +L(gt7): + {.mmi; nop 0 + or r15 = r27, r26 + FSH r24 = r18, cnt +}{.mmi; ld8 r18 = [up], UPD + nop 0 + BSH r25 = r19, tnc + ;; } + {.mmi; or r14 = r21, r20 + sub r31 = -1, r15 + FSH r26 = r19, cnt +}{.mib; ld8 r19 = [up], UPD + BSH r27 = r16, tnc + br .LL11 +} + +C *** MAIN LOOP START *** + ALIGN(32) +L(top): +.LL01: + {.mmi; st8 [rp] = r31, UPD C M2 + or r15 = r27, r26 C M3 + FSH r24 = r18, cnt C I0 +}{.mmi; ld8 r18 = [up], UPD C M0 + sub r31 = -1, r14 C M1 + BSH r25 = r19, tnc C I1 + ;; } +.LL00: + {.mmi; st8 [rp] = r31, UPD + or r14 = r21, r20 + FSH r26 = r19, cnt +}{.mmi; ld8 r19 = [up], UPD + sub r31 = -1, r15 + BSH r27 = r16, tnc + ;; } +.LL11: + {.mmi; st8 [rp] = r31, UPD + or r15 = r23, r22 + FSH r20 = r16, cnt +}{.mmi; ld8 r16 = [up], UPD + sub r31 = -1, r14 + BSH r21 = r17, tnc + ;; } +.LL10: + {.mmi; st8 [rp] = r31, UPD + or r14 = r25, r24 + FSH r22 = r17, cnt +}{.mmi; ld8 r17 = [up], UPD + sub r31 = -1, r15 + BSH r23 = r18, tnc + ;; } +L(end): lfetch [r11], PUPD + br.cloop.dptk L(top) +C *** MAIN LOOP END *** + + {.mmi; st8 [rp] = r31, UPD + or r15 = r27, r26 + FSH r24 = r18, cnt +}{.mib; sub r31 = -1, r14 + BSH r25 = r19, tnc + nop 0 + ;; } +.Lr8: + {.mmi; st8 [rp] = r31, UPD + or r14 = r21, r20 + FSH r26 = r19, cnt +}{.mib; sub r31 = -1, r15 + BSH r27 = r16, tnc + nop 0 + ;; } +.Lr7: + {.mmi; st8 [rp] = r31, UPD + or r15 = r23, r22 + FSH r20 = r16, cnt +}{.mib; sub r31 = -1, r14 + BSH r21 = r17, tnc + nop 0 + ;; } +.Lr6: st8 [rp] = r31, UPD + or r14 = r25, r24 + FSH r22 = r17, cnt + sub r31 = -1, r15 + ;; +.Lr5: st8 [rp] = r31, UPD + or r15 = r27, r26 + sub r31 = -1, r14 + ;; +.Lr4: st8 [rp] = r31, UPD + or r14 = r21, r20 + sub r31 = -1, r15 + ;; +.Lr3: st8 [rp] = r31, UPD + sub r31 = -1, r14 + ;; +.Lr2: st8 [rp] = r31, UPD + sub r31 = -1, r22 + ;; +.Lr1: st8 [rp] = r31, UPD C M23 + mov ar.lc = r2 C I0 + br.ret.sptk.many b0 C B +EPILOGUE(func) +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm b/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm new file mode 100644 index 0000000..7789117 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/mod_34lsub1.asm @@ -0,0 
+1,237 @@ +dnl IA-64 mpn_mod_34lsub1 + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2003-2005, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: ? +C Itanium 2: 1 + + +C INPUT PARAMETERS +define(`up', `r32') +define(`n', `r33') + +C Some useful aliases for registers we use +define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') +define(`a0',`r17') define(`a1',`r18') define(`a2',`r19') +define(`c0',`r20') define(`c1',`r21') define(`c2',`r22') + +C This is a fairly simple-minded implementation. One could approach 0.67 c/l +C with a more sophisticated implementation. If we're really crazy, we could +C super-unroll, storing carries just in predicate registers, then copy them to +C a general register, and population count them from there. That'd bring us +C close to 3 insn/limb, for nearly 0.5 c/l. + +C Computing n/3 needs 16 cycles, which is a lot of startup overhead. +C We therefore use a plain while-style loop: +C add n = -3, n +C cmp.le p9, p0 = 3, n +C (p9) br.cond .Loop +C Alternatively, we could table n/3 for, say, n < 256, and predicate the +C 16-cycle code. + +C The summing-up code at the end was written quickly, and could surely be +C vastly improved. 
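+C
+C A plain C sketch of the computation (illustrative only; it assumes
+C 64-bit limbs, a compiler with unsigned __int128, and moderate n so the
+C intermediate sums cannot overflow).  The return value is merely
+C congruent to {up,n} mod 2^48-1, using 2^64 == 2^16 and 2^48 == 1
+C (mod 2^48-1), so limb 3k+j carries weight 2^(16*j):
+C
+C   mp_limb_t
+C   ref_mod_34lsub1 (const mp_limb_t *up, long n)
+C   {
+C     unsigned __int128 a[3] = { 0, 0, 0 };
+C     for (long i = 0; i < n; i++)
+C       a[i % 3] += up[i];                /* the three interleaved sums */
+C     unsigned __int128 s = 0;
+C     for (int j = 0; j < 3; j++)
+C       {
+C         unsigned __int128 t = a[j] << (16 * j);
+C         while (t >> 48)                 /* fold 48-bit fields together */
+C           t = (t & 0xffffffffffffULL) + (t >> 48);
+C         s += t;
+C       }
+C     return (mp_limb_t) s;               /* s < 3 * 2^48, fits a limb */
+C   }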
+ +ASM_START() +PROLOGUE(mpn_mod_34lsub1) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32',` + addp4 up = 0, up C M I + nop.m 0 + zxt4 n = n C I + ;; +') + +ifelse(0,1,` + movl r14 = 0xAAAAAAAAAAAAAAAB + ;; + setf.sig f6 = r14 + setf.sig f7 = r33 + ;; + xmpy.hu f6 = f6, f7 + ;; + getf.sig r8 = f6 + ;; + shr.u r8 = r8, 1 C Loop count + ;; + mov.i ar.lc = r8 +') + + ld8 u0 = [up], 8 + cmp.ne p9, p0 = 1, n + (p9) br L(gt1) + ;; + shr.u r8 = u0, 48 + dep.z r27 = u0, 0, 48 + ;; + add r8 = r8, r27 + br.ret.sptk.many b0 + + +L(gt1): + {.mmi; nop.m 0 + mov a0 = 0 + add n = -2, n +}{.mmi; mov c0 = 0 + mov c1 = 0 + mov c2 = 0 + ;; +}{.mmi; ld8 u1 = [up], 8 + mov a1 = 0 + cmp.ltu p6, p0 = r0, r0 C clear p6 +}{.mmb; cmp.gt p9, p0 = 3, n + mov a2 = 0 + (p9) br.cond.dptk L(end) + ;; +} + ALIGN(32) +L(top): + {.mmi; ld8 u2 = [up], 8 + (p6) add c0 = 1, c0 + cmp.ltu p7, p0 = a0, u0 +}{.mmb; sub a0 = a0, u0 + add n = -3, n + nop.b 0 + ;; +}{.mmi; ld8 u0 = [up], 8 + (p7) add c1 = 1, c1 + cmp.ltu p8, p0 = a1, u1 +}{.mmb; sub a1 = a1, u1 + cmp.le p9, p0 = 3, n + nop.b 0 + ;; +}{.mmi; ld8 u1 = [up], 8 + (p8) add c2 = 1, c2 + cmp.ltu p6, p0 = a2, u2 +}{.mmb; sub a2 = a2, u2 + nop.m 0 +dnl br.cloop.dptk L(top) + (p9) br.cond.dptk L(top) + ;; +} +L(end): + cmp.eq p10, p0 = 0, n + cmp.eq p11, p0 = 1, n + (p10) br L(0) + +L(2): + {.mmi; ld8 u2 = [up], 8 + (p6) add c0 = 1, c0 + cmp.ltu p7, p0 = a0, u0 +}{.mmb; sub a0 = a0, u0 + nop.m 0 + (p11) br L(1) + ;; +} ld8 u0 = [up], 8 + (p7) add c1 = 1, c1 + cmp.ltu p8, p0 = a1, u1 + sub a1 = a1, u1 + ;; + (p8) add c2 = 1, c2 + cmp.ltu p6, p0 = a2, u2 + sub a2 = a2, u2 + ;; + (p6) add c0 = 1, c0 + cmp.ltu p7, p0 = a0, u0 + sub a0 = a0, u0 + ;; + (p7) add c1 = 1, c1 + br L(com) + + +L(1): + (p7) add c1 = 1, c1 + cmp.ltu p8, p0 = a1, u1 + sub a1 = a1, u1 + ;; + (p8) add c2 = 1, c2 + cmp.ltu p6, p0 = a2, u2 + sub a2 = a2, u2 + ;; + (p6) add c0 = 1, c0 + br L(com) + + +L(0): + (p6) add c0 = 1, c0 + cmp.ltu p7, p0 = a0, u0 + sub a0 = a0, u0 + ;; + (p7) add c1 = 1, c1 + cmp.ltu p8, p0 = a1, u1 + sub a1 = a1, u1 + ;; + (p8) add c2 = 1, c2 + +L(com): +C | a2 | a1 | a0 | +C | | | | | + shr.u r24 = a0, 48 C 16 bits + shr.u r25 = a1, 32 C 32 bits + shr.u r26 = a2, 16 C 48 bits + ;; + shr.u r10 = c0, 48 C 16 bits, always zero + shr.u r11 = c1, 32 C 32 bits + shr.u r30 = c2, 16 C 48 bits + ;; + dep.z r27 = a0, 0, 48 C 48 bits + dep.z r28 = a1, 16, 32 C 48 bits + dep.z r29 = a2, 32, 16 C 48 bits + dep.z r31 = c0, 0, 48 C 48 bits + dep.z r14 = c1, 16, 32 C 48 bits + dep.z r15 = c2, 32, 16 C 48 bits + ;; + {.mmi; add r24 = r24, r25 + add r26 = r26, r27 + add r28 = r28, r29 +}{.mmi; add r10 = r10, r11 + add r30 = r30, r31 + add r14 = r14, r15 + ;; +} + movl r8 = 0xffffffffffff0 + add r24 = r24, r26 + add r10 = r10, r30 + ;; + add r24 = r24, r28 + add r10 = r10, r14 + ;; + sub r8 = r8, r24 + ;; + add r8 = r8, r10 + br.ret.sptk.many b0 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/mode1o.asm b/gmp-6.3.0/mpn/ia64/mode1o.asm new file mode 100644 index 0000000..14d5e81 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/mode1o.asm @@ -0,0 +1,342 @@ +dnl Itanium-2 mpn_modexact_1c_odd -- mpn by 1 exact remainder. + +dnl Contributed to the GNU project by Kevin Ryde. + +dnl Copyright 2003-2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C Itanium: 15
+C Itanium 2: 8
+
+
+dnl Usage: ABI32(`code')
+dnl
+dnl Emit the given code only under HAVE_ABI_32.
+dnl
+define(ABI32,
+m4_assert_onearg()
+`ifdef(`HAVE_ABI_32',`$1')')
+
+
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t carry);
+C
+C The modexact algorithm is usually conceived as a dependent chain
+C
+C   l = src[i] - c
+C   q = low(l * inverse)
+C   c = high(q*divisor) + (src[i] < c)
+C
+C but here it is restructured so that each iteration is just two xma
+C instructions on the dependent chain,
+C
+C   si = low(src[i] * inverse)
+C   q = low(si - c * inverse)
+C   c = high(q*divisor + c)
+C
+C with the "si" product independent of that chain.  This requires
+C size >= 2, and the calculation of q for the first limb by a different
+C initial scheme.
+C
+C
+C Entry Sequence:
+C
+C In the entry sequence, the critical path is the calculation of the
+C inverse, so this is begun first and optimized. Apart from that, ar.lc is
+C established nice and early so the br.cloop's should predict perfectly.
+C And the load for the low limbs src[0] and src[1] can be initiated long
+C ahead of where they're needed.
+C
+C
+C Inverse Calculation:
+C
+C The initial 8-bit inverse is calculated using a table lookup. If it hits
+C L1 (which is likely if we're called several times) then it should take a
+C total of 4 cycles, otherwise hopefully L2 for 9 cycles. This is considered
+C the best approach, on balance. It could be done bitwise, but that would
+C probably be about 14 cycles (2 per bit beyond the first couple). Or it
+C could be taken from 4 bits to 8 with xmpy doubling as used beyond 8 bits,
+C but that would be about 11 cycles.
+C
+C The table is not the same as binvert_limb_table, instead it's 256 bytes,
+C designed to be indexed by the low byte of the divisor. The divisor is
+C always odd, so the relevant data is every second byte in the table. The
+C padding lets us use zxt1 instead of extr.u, the latter would cost an extra
+C cycle because it must go down I0, and we're using the first I0 slot to get
+C ip. The extra 128 bytes of padding should be insignificant compared to
+C typical ia64 code bloat.
+C
+C Having the table in .text allows us to use IP-relative addressing,
+C avoiding a fetch from ltoff. .rodata is apparently not suitable for use
+C IP-relative, it gets a linker relocation overflow on GNU/Linux.
+C
+C
+C Load Scheduling:
+C
+C In the main loop, the data loads are scheduled for an L2 hit, which means
+C 6 cycles for the data ready to use. In fact we end up 7 cycles ahead. In
+C any case that scheduling is achieved simply by doing the load (and xmpy.l
+C for "si") in the immediately preceding iteration.
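+C
+C As a plain C sketch of the whole recurrence (illustrative only,
+C assuming 64-bit limbs; "binv" names the binary inverse of the divisor,
+C binv*divisor == 1 mod 2^64, and umul_hi a 64x64->high-64 multiply --
+C both hypothetical helpers, not GMP API):
+C
+C   mp_limb_t c = carry;
+C   for (i = 0; i < size; i++)
+C     {
+C       mp_limb_t q = (src[i] - c) * binv;   /* wraps mod 2^64 */
+C       c = umul_hi (q, divisor) + (src[i] < c);
+C     }
+C   return c;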
+C +C The main loop requires size >= 2, and we handle size==1 by an initial +C br.cloop to enter the loop only if size>1. Since ar.lc is established +C early, this should predict perfectly. +C +C +C Not done: +C +C Consideration was given to using a plain "(src[0]-c) % divisor" for +C size==1, but cycle counting suggests about 50 for the sort of approach +C taken by gcc __umodsi3, versus about 47 for the modexact. (Both assuming +C L1 hits for their respective fetching.) +C +C Consideration was given to a test for high 1 + ;; + + C size==1, finish up now + xma.hu f9 = f10, f6, f9 C c = high(q * divisor + c) + mov ar.lc = r2 C I0 + ;; + getf.sig r8 = f9 C M2 return c + br.ret.sptk.many b0 + + + +.Ltop: + C r2 saved ar.lc + C f6 divisor + C f7 inverse + C f8 -inverse + C f9 carry + C f10 src[i] * inverse + C f11 scratch src[i+1] + + add r16 = 160, r32 + ldf8 f11 = [r32], 8 C src[i+1] + ;; + C 2 cycles + + lfetch [r16] + xma.l f10 = f9, f8, f10 C q = c * -inverse + si + ;; + C 3 cycles + +.Lentry: + xma.hu f9 = f10, f6, f9 C c = high(q * divisor + c) + xmpy.l f10 = f11, f7 C si = src[i] * inverse + br.cloop.sptk.few.clr .Ltop + ;; + + + + xma.l f10 = f9, f8, f10 C q = c * -inverse + si + mov ar.lc = r2 C I0 + ;; + xma.hu f9 = f10, f6, f9 C c = high(q * divisor + c) + ;; + getf.sig r8 = f9 C M2 return c + br.ret.sptk.many b0 + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/ia64/mul_1.asm b/gmp-6.3.0/mpn/ia64/mul_1.asm new file mode 100644 index 0000000..21bf6d0 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/mul_1.asm @@ -0,0 +1,584 @@ +dnl IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and +dnl store the result in a second limb vector. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2000-2004, 2006, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 4.0 +C Itanium 2: 2.0 + +C TODO +C * Further optimize feed-in and wind-down code, both for speed and code size. +C * Handle low limb input and results specially, using a common stf8 in the +C epilogue. +C * Use 1 c/l carry propagation scheme in wind-down code. +C * Use extra pointer register for `up' to speed up feed-in loads. +C * Work out final differences with addmul_1.asm. 
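+C
+C For reference, the function's effect in plain C (a sketch, assuming
+C 64-bit limbs and a compiler with unsigned __int128; GMP's portable
+C version is mpn/generic/mul_1.c):
+C
+C   mp_limb_t
+C   ref_mul_1 (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t vl)
+C   {
+C     mp_limb_t cy = 0;                   /* mpn_mul_1c starts this at `cy' */
+C     for (long i = 0; i < n; i++)
+C       {
+C         unsigned __int128 t = (unsigned __int128) up[i] * vl + cy;
+C         rp[i] = (mp_limb_t) t;          /* low limb to the result */
+C         cy = (mp_limb_t) (t >> 64);     /* high limb carries on */
+C       }
+C     return cy;                          /* final carry out */
+C   }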
+ +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`n', `r34') +define(`vl', `r35') +define(`cy', `r36') C for mpn_mul_1c + +ASM_START() +PROLOGUE(mpn_mul_1) + .prologue + .save ar.lc, r2 + .body + +ifdef(`HAVE_ABI_32', +` addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + zxt4 n = n C I + ;; +') +{.mfi + adds r15 = -1, n C M I + mov f9 = f0 C F + mov.i r2 = ar.lc C I0 +} +{.mmi + ldf8 f7 = [up], 8 C M + nop.m 0 C M + and r14 = 3, n C M I + ;; +} +.Lcommon: +{.mii + setf.sig f6 = vl C M2 M3 + shr.u r31 = r15, 2 C I0 + cmp.eq p10, p0 = 0, r14 C M I +} +{.mii + cmp.eq p11, p0 = 2, r14 C M I + cmp.eq p12, p0 = 3, r14 C M I + nop.i 0 C I + ;; +} +{.mii + cmp.ne p6, p7 = r0, r0 C M I + mov.i ar.lc = r31 C I0 + cmp.ne p8, p9 = r0, r0 C M I +} +{.bbb + (p10) br.dptk .Lb00 C B + (p11) br.dptk .Lb10 C B + (p12) br.dptk .Lb11 C B + ;; +} + +.Lb01: mov r20 = 0 + br.cloop.dptk .grt1 C B + + xma.l f39 = f7, f6, f9 C F + xma.hu f43 = f7, f6, f9 C F + ;; + getf.sig r8 = f43 C M2 + stf8 [rp] = f39 C M2 M3 + mov.i ar.lc = r2 C I0 + br.ret.sptk.many b0 C B + +.grt1: + ldf8 f32 = [up], 8 + ;; + ldf8 f33 = [up], 8 + ;; + ldf8 f34 = [up], 8 + xma.l f39 = f7, f6, f9 + xma.hu f43 = f7, f6, f9 + ;; + ldf8 f35 = [up], 8 + br.cloop.dptk .grt5 + + xma.l f36 = f32, f6, f0 + xma.hu f40 = f32, f6, f0 + ;; + stf8 [rp] = f39, 8 + xma.l f37 = f33, f6, f0 + xma.hu f41 = f33, f6, f0 + ;; + getf.sig r21 = f43 + getf.sig r18 = f36 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + ;; + getf.sig r22 = f40 + getf.sig r19 = f37 + xma.l f39 = f35, f6, f0 + xma.hu f43 = f35, f6, f0 + ;; + getf.sig r23 = f41 + getf.sig r16 = f38 + br .Lcj5 + +.grt5: + xma.l f36 = f32, f6, f0 + xma.hu f40 = f32, f6, f0 + ;; + getf.sig r17 = f39 + ldf8 f32 = [up], 8 + xma.l f37 = f33, f6, f0 + xma.hu f41 = f33, f6, f0 + ;; + getf.sig r21 = f43 + ldf8 f33 = [up], 8 + xma.l f38 = f34, f6, f0 + ;; + getf.sig r18 = f36 + xma.hu f42 = f34, f6, f0 + ;; + getf.sig r22 = f40 + ldf8 f34 = [up], 8 + xma.l f39 = f35, f6, f0 + ;; + getf.sig r19 = f37 + xma.hu f43 = f35, f6, f0 + br .LL01 + + +.Lb10: ldf8 f35 = [up], 8 + mov r23 = 0 + br.cloop.dptk .grt2 + + xma.l f38 = f7, f6, f9 + xma.hu f42 = f7, f6, f9 + ;; + stf8 [rp] = f38, 8 + xma.l f39 = f35, f6, f42 + xma.hu f43 = f35, f6, f42 + ;; + getf.sig r8 = f43 + stf8 [rp] = f39 + mov.i ar.lc = r2 + br.ret.sptk.many b0 + + +.grt2: + ldf8 f32 = [up], 8 + ;; + ldf8 f33 = [up], 8 + xma.l f38 = f7, f6, f9 + xma.hu f42 = f7, f6, f9 + ;; + ldf8 f34 = [up], 8 + xma.l f39 = f35, f6, f0 + xma.hu f43 = f35, f6, f0 + ;; + ldf8 f35 = [up], 8 + br.cloop.dptk .grt6 + + stf8 [rp] = f38, 8 + xma.l f36 = f32, f6, f0 + xma.hu f40 = f32, f6, f0 + ;; + getf.sig r20 = f42 + getf.sig r17 = f39 + xma.l f37 = f33, f6, f0 + xma.hu f41 = f33, f6, f0 + ;; + getf.sig r21 = f43 + getf.sig r18 = f36 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + ;; + getf.sig r22 = f40 + getf.sig r19 = f37 + xma.l f39 = f35, f6, f0 + xma.hu f43 = f35, f6, f0 + br .Lcj6 + +.grt6: + getf.sig r16 = f38 + xma.l f36 = f32, f6, f0 + xma.hu f40 = f32, f6, f0 + ;; + getf.sig r20 = f42 + ldf8 f32 = [up], 8 + xma.l f37 = f33, f6, f0 + ;; + getf.sig r17 = f39 + xma.hu f41 = f33, f6, f0 + ;; + getf.sig r21 = f43 + ldf8 f33 = [up], 8 + xma.l f38 = f34, f6, f0 + ;; + getf.sig r18 = f36 + xma.hu f42 = f34, f6, f0 + br .LL10 + + +.Lb11: ldf8 f34 = [up], 8 + mov r22 = 0 + ;; + ldf8 f35 = [up], 8 + br.cloop.dptk .grt3 + ;; + + xma.l f37 = f7, f6, f9 + xma.hu f41 = f7, f6, f9 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + xma.l f39 = f35, 
f6, f0 + xma.hu f43 = f35, f6, f0 + ;; + getf.sig r23 = f41 + stf8 [rp] = f37, 8 + getf.sig r16 = f38 + getf.sig r20 = f42 + getf.sig r17 = f39 + getf.sig r8 = f43 + br .Lcj3 + +.grt3: + ldf8 f32 = [up], 8 + xma.l f37 = f7, f6, f9 + xma.hu f41 = f7, f6, f9 + ;; + ldf8 f33 = [up], 8 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + ;; + getf.sig r19 = f37 + ldf8 f34 = [up], 8 + xma.l f39 = f35, f6, f0 + xma.hu f43 = f35, f6, f0 + ;; + getf.sig r23 = f41 + ldf8 f35 = [up], 8 + br.cloop.dptk .grt7 + + getf.sig r16 = f38 + xma.l f36 = f32, f6, f0 + getf.sig r20 = f42 + xma.hu f40 = f32, f6, f0 + ;; + getf.sig r17 = f39 + xma.l f37 = f33, f6, f0 + getf.sig r21 = f43 + xma.hu f41 = f33, f6, f0 + ;; + getf.sig r18 = f36 + st8 [rp] = r19, 8 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + br .Lcj7 + +.grt7: + getf.sig r16 = f38 + xma.l f36 = f32, f6, f0 + xma.hu f40 = f32, f6, f0 + ;; + getf.sig r20 = f42 + ldf8 f32 = [up], 8 + xma.l f37 = f33, f6, f0 + ;; + getf.sig r17 = f39 + xma.hu f41 = f33, f6, f0 + br .LL11 + + +.Lb00: ldf8 f33 = [up], 8 + mov r21 = 0 + ;; + ldf8 f34 = [up], 8 + ;; + ldf8 f35 = [up], 8 + xma.l f36 = f7, f6, f9 + xma.hu f40 = f7, f6, f9 + br.cloop.dptk .grt4 + + xma.l f37 = f33, f6, f0 + xma.hu f41 = f33, f6, f0 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + ;; + getf.sig r22 = f40 + stf8 [rp] = f36, 8 + xma.l f39 = f35, f6, f0 + getf.sig r19 = f37 + xma.hu f43 = f35, f6, f0 + ;; + getf.sig r23 = f41 + getf.sig r16 = f38 + getf.sig r20 = f42 + getf.sig r17 = f39 + br .Lcj4 + +.grt4: + ldf8 f32 = [up], 8 + xma.l f37 = f33, f6, f0 + xma.hu f41 = f33, f6, f0 + ;; + getf.sig r18 = f36 + ldf8 f33 = [up], 8 + xma.l f38 = f34, f6, f0 + xma.hu f42 = f34, f6, f0 + ;; + getf.sig r22 = f40 + ldf8 f34 = [up], 8 + xma.l f39 = f35, f6, f0 + ;; + getf.sig r19 = f37 + getf.sig r23 = f41 + xma.hu f43 = f35, f6, f0 + ldf8 f35 = [up], 8 + br.cloop.dptk .grt8 + + getf.sig r16 = f38 + xma.l f36 = f32, f6, f0 + getf.sig r20 = f42 + xma.hu f40 = f32, f6, f0 + ;; + getf.sig r17 = f39 + st8 [rp] = r18, 8 + xma.l f37 = f33, f6, f0 + xma.hu f41 = f33, f6, f0 + br .Lcj8 + +.grt8: + getf.sig r16 = f38 + xma.l f36 = f32, f6, f0 + xma.hu f40 = f32, f6, f0 + br .LL00 + + +C *** MAIN LOOP START *** + ALIGN(32) +.Loop: + .pred.rel "mutex",p6,p7 + getf.sig r16 = f38 + xma.l f36 = f32, f6, f0 + (p6) cmp.leu p8, p9 = r24, r17 + st8 [rp] = r24, 8 + xma.hu f40 = f32, f6, f0 + (p7) cmp.ltu p8, p9 = r24, r17 + ;; +.LL00: + .pred.rel "mutex",p8,p9 + getf.sig r20 = f42 + (p8) add r24 = r18, r21, 1 + nop.b 0 + ldf8 f32 = [up], 8 + (p9) add r24 = r18, r21 + nop.b 0 + ;; + .pred.rel "mutex",p8,p9 + getf.sig r17 = f39 + xma.l f37 = f33, f6, f0 + (p8) cmp.leu p6, p7 = r24, r18 + st8 [rp] = r24, 8 + xma.hu f41 = f33, f6, f0 + (p9) cmp.ltu p6, p7 = r24, r18 + ;; +.LL11: + .pred.rel "mutex",p6,p7 + getf.sig r21 = f43 + (p6) add r24 = r19, r22, 1 + nop.b 0 + ldf8 f33 = [up], 8 + (p7) add r24 = r19, r22 + nop.b 0 + ;; + .pred.rel "mutex",p6,p7 + getf.sig r18 = f36 + xma.l f38 = f34, f6, f0 + (p6) cmp.leu p8, p9 = r24, r19 + st8 [rp] = r24, 8 + xma.hu f42 = f34, f6, f0 + (p7) cmp.ltu p8, p9 = r24, r19 + ;; +.LL10: + .pred.rel "mutex",p8,p9 + getf.sig r22 = f40 + (p8) add r24 = r16, r23, 1 + nop.b 0 + ldf8 f34 = [up], 8 + (p9) add r24 = r16, r23 + nop.b 0 + ;; + .pred.rel "mutex",p8,p9 + getf.sig r19 = f37 + xma.l f39 = f35, f6, f0 + (p8) cmp.leu p6, p7 = r24, r16 + st8 [rp] = r24, 8 + xma.hu f43 = f35, f6, f0 + (p9) cmp.ltu p6, p7 = r24, r16 + ;; +.LL01: + .pred.rel "mutex",p6,p7 + getf.sig r23 = f41 + (p6) 
add r24 = r17, r20, 1
+ nop.b 0
+ ldf8 f35 = [up], 8
+ (p7) add r24 = r17, r20
+ br.cloop.dptk .Loop
+C *** MAIN LOOP END ***
+ ;;
+
+.Lcj9:
+ .pred.rel "mutex",p6,p7
+ getf.sig r16 = f38
+ xma.l f36 = f32, f6, f0
+ (p6) cmp.leu p8, p9 = r24, r17
+ st8 [rp] = r24, 8
+ xma.hu f40 = f32, f6, f0
+ (p7) cmp.ltu p8, p9 = r24, r17
+ ;;
+ .pred.rel "mutex",p8,p9
+ getf.sig r20 = f42
+ (p8) add r24 = r18, r21, 1
+ (p9) add r24 = r18, r21
+ ;;
+ .pred.rel "mutex",p8,p9
+ getf.sig r17 = f39
+ xma.l f37 = f33, f6, f0
+ (p8) cmp.leu p6, p7 = r24, r18
+ st8 [rp] = r24, 8
+ xma.hu f41 = f33, f6, f0
+ (p9) cmp.ltu p6, p7 = r24, r18
+ ;;
+.Lcj8:
+ .pred.rel "mutex",p6,p7
+ getf.sig r21 = f43
+ (p6) add r24 = r19, r22, 1
+ (p7) add r24 = r19, r22
+ ;;
+ .pred.rel "mutex",p6,p7
+ getf.sig r18 = f36
+ xma.l f38 = f34, f6, f0
+ (p6) cmp.leu p8, p9 = r24, r19
+ st8 [rp] = r24, 8
+ xma.hu f42 = f34, f6, f0
+ (p7) cmp.ltu p8, p9 = r24, r19
+ ;;
+.Lcj7:
+ .pred.rel "mutex",p8,p9
+ getf.sig r22 = f40
+ (p8) add r24 = r16, r23, 1
+ (p9) add r24 = r16, r23
+ ;;
+ .pred.rel "mutex",p8,p9
+ getf.sig r19 = f37
+ xma.l f39 = f35, f6, f0
+ (p8) cmp.leu p6, p7 = r24, r16
+ st8 [rp] = r24, 8
+ xma.hu f43 = f35, f6, f0
+ (p9) cmp.ltu p6, p7 = r24, r16
+ ;;
+.Lcj6:
+ .pred.rel "mutex",p6,p7
+ getf.sig r23 = f41
+ (p6) add r24 = r17, r20, 1
+ (p7) add r24 = r17, r20
+ ;;
+ .pred.rel "mutex",p6,p7
+ (p6) cmp.leu p8, p9 = r24, r17
+ (p7) cmp.ltu p8, p9 = r24, r17
+ getf.sig r16 = f38
+ st8 [rp] = r24, 8
+ ;;
+.Lcj5:
+ .pred.rel "mutex",p8,p9
+ getf.sig r20 = f42
+ (p8) add r24 = r18, r21, 1
+ (p9) add r24 = r18, r21
+ ;;
+ .pred.rel "mutex",p8,p9
+ (p8) cmp.leu p6, p7 = r24, r18
+ (p9) cmp.ltu p6, p7 = r24, r18
+ getf.sig r17 = f39
+ st8 [rp] = r24, 8
+ ;;
+.Lcj4:
+ .pred.rel "mutex",p6,p7
+ getf.sig r8 = f43
+ (p6) add r24 = r19, r22, 1
+ (p7) add r24 = r19, r22
+ ;;
+ .pred.rel "mutex",p6,p7
+ st8 [rp] = r24, 8
+ (p6) cmp.leu p8, p9 = r24, r19
+ (p7) cmp.ltu p8, p9 = r24, r19
+ ;;
+.Lcj3:
+ .pred.rel "mutex",p8,p9
+ (p8) add r24 = r16, r23, 1
+ (p9) add r24 = r16, r23
+ ;;
+ .pred.rel "mutex",p8,p9
+ st8 [rp] = r24, 8
+ (p8) cmp.leu p6, p7 = r24, r16
+ (p9) cmp.ltu p6, p7 = r24, r16
+ ;;
+.Lcj2:
+ .pred.rel "mutex",p6,p7
+ (p6) add r24 = r17, r20, 1
+ (p7) add r24 = r17, r20
+ ;;
+ .pred.rel "mutex",p6,p7
+ st8 [rp] = r24, 8
+ (p6) cmp.leu p8, p9 = r24, r17
+ (p7) cmp.ltu p8, p9 = r24, r17
+ ;;
+ (p8) add r8 = 1, r8
+ mov.i ar.lc = r2
+ br.ret.sptk.many b0
+EPILOGUE()
+
+PROLOGUE(mpn_mul_1c)
+ .prologue
+ .save ar.lc, r2
+ .body
+
+ifdef(`HAVE_ABI_32',
+` addp4 rp = 0, rp C M I
+ addp4 up = 0, up C M I
+ zxt4 n = n C I
+ ;;
+')
+{.mmi
+ adds r15 = -1, n C M I
+ setf.sig f9 = cy C M2 M3
+ mov.i r2 = ar.lc C I0
+}
+{.mmb
+ ldf8 f7 = [up], 8 C M
+ and r14 = 3, n C M I
+ br.sptk .Lcommon
+ ;;
+}
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/ia64/mul_2.asm b/gmp-6.3.0/mpn/ia64/mul_2.asm
new file mode 100644
index 0000000..5343f64
--- /dev/null
+++ b/gmp-6.3.0/mpn/ia64/mul_2.asm
@@ -0,0 +1,625 @@
+dnl IA-64 mpn_mul_2 -- Multiply an n-limb number with a 2-limb number and
+dnl store the result to an (n+1)-limb number.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2004, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: ? +C Itanium 2: 1.5 + +C TODO +C * Clean up variable names, and try to decrease the number of distinct +C registers used. +C * Clean up feed-in code to not require zeroing several registers. +C * Make sure we don't depend on uninitialized predicate registers. +C * Could perhaps save a few cycles by using 1 c/l carry propagation in +C wind-down code. +C * Ultimately rewrite. The problem with this code is that it first uses a +C loaded u value in one xma pair, then leaves it live over several unrelated +C xma pairs, before it uses it again. It should actually be quite possible +C to just swap some aligned xma pairs around. But we should then schedule +C u loads further from the first use. + +C INPUT PARAMETERS +define(`rp',`r32') +define(`up',`r33') +define(`n',`r34') +define(`vp',`r35') + +define(`srp',`r3') + +define(`v0',`f6') +define(`v1',`f7') + +define(`s0',`r14') +define(`acc0',`r15') + +define(`pr0_0',`r16') define(`pr0_1',`r17') +define(`pr0_2',`r18') define(`pr0_3',`r19') + +define(`pr1_0',`r20') define(`pr1_1',`r21') +define(`pr1_2',`r22') define(`pr1_3',`r23') + +define(`acc1_0',`r24') define(`acc1_1',`r25') +define(`acc1_2',`r26') define(`acc1_3',`r27') + +dnl define(`',`r28') +dnl define(`',`r29') +dnl define(`',`r30') +dnl define(`',`r31') + +define(`fp0b_0',`f8') define(`fp0b_1',`f9') +define(`fp0b_2',`f10') define(`fp0b_3',`f11') + +define(`fp1a_0',`f12') define(`fp1a_1',`f13') +define(`fp1a_2',`f14') define(`fp1a_3',`f15') + +define(`fp1b_0',`f32') define(`fp1b_1',`f33') +define(`fp1b_2',`f34') define(`fp1b_3',`f35') + +define(`fp2a_0',`f36') define(`fp2a_1',`f37') +define(`fp2a_2',`f38') define(`fp2a_3',`f39') + +define(`u_0',`f44') define(`u_1',`f45') +define(`u_2',`f46') define(`u_3',`f47') + +define(`ux',`f49') +define(`uy',`f51') + +ASM_START() +PROLOGUE(mpn_mul_2) + .prologue + .save ar.lc, r2 + .body + +ifdef(`HAVE_ABI_32',` + {.mmi; addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + addp4 vp = 0, vp C M I +}{.mmi; nop 1 + nop 1 + zxt4 n = n C I + ;; +}') + + {.mmi; ldf8 ux = [up], 8 C M + ldf8 v0 = [vp], 8 C M + mov r2 = ar.lc C I0 +}{.mmi; nop 1 C M + and r14 = 3, n C M I + add n = -2, n C M I + ;; +}{.mmi; ldf8 uy = [up], 8 C M + ldf8 v1 = [vp] C M + shr.u n = n, 2 C I0 +}{.mmi; nop 1 C M + cmp.eq p10, p0 = 1, r14 C M I + cmp.eq p11, p0 = 2, r14 C M I + ;; +}{.mmi; nop 1 C M + cmp.eq p12, p0 = 3, r14 C M I + mov ar.lc = n C I0 +}{.bbb; (p10) br.dptk L(b01) C B + (p11) br.dptk L(b10) C B + (p12) br.dptk L(b11) C B + ;; +} + ALIGN(32) +L(b00): 
ldf8 u_1 = [up], 8 + mov acc1_2 = 0 + mov pr1_2 = 0 + mov pr0_3 = 0 + cmp.ne p8, p9 = r0, r0 + ;; + xma.l fp0b_3 = ux, v0, f0 + cmp.ne p12, p13 = r0, r0 + ldf8 u_2 = [up], 8 + xma.hu fp1a_3 = ux, v0, f0 + br.cloop.dptk L(gt4) + + xma.l fp0b_0 = uy, v0, f0 + xma.hu fp1a_0 = uy, v0, f0 + ;; + getfsig acc0 = fp0b_3 + xma.l fp1b_3 = ux, v1, fp1a_3 + xma.hu fp2a_3 = ux, v1, fp1a_3 + ;; + xma.l fp0b_1 = u_1, v0, f0 + xma.hu fp1a_1 = u_1, v0, f0 + ;; + getfsig pr0_0 = fp0b_0 + xma.l fp1b_0 = uy, v1, fp1a_0 + xma.hu fp2a_0 = uy, v1, fp1a_0 + ;; + getfsig pr1_3 = fp1b_3 + getfsig acc1_3 = fp2a_3 + xma.l fp0b_2 = u_2, v0, f0 + xma.hu fp1a_2 = u_2, v0, f0 + br L(cj4) + +L(gt4): xma.l fp0b_0 = uy, v0, f0 + xma.hu fp1a_0 = uy, v0, f0 + ;; + getfsig acc0 = fp0b_3 + xma.l fp1b_3 = ux, v1, fp1a_3 + ldf8 u_3 = [up], 8 + xma.hu fp2a_3 = ux, v1, fp1a_3 + ;; + xma.l fp0b_1 = u_1, v0, f0 + xma.hu fp1a_1 = u_1, v0, f0 + ;; + getfsig pr0_0 = fp0b_0 + xma.l fp1b_0 = uy, v1, fp1a_0 + xma.hu fp2a_0 = uy, v1, fp1a_0 + ;; + ldf8 u_0 = [up], 8 + getfsig pr1_3 = fp1b_3 + xma.l fp0b_2 = u_2, v0, f0 + ;; + getfsig acc1_3 = fp2a_3 + xma.hu fp1a_2 = u_2, v0, f0 + br L(00) + + + ALIGN(32) +L(b01): ldf8 u_0 = [up], 8 C M + mov acc1_1 = 0 C M I + mov pr1_1 = 0 C M I + mov pr0_2 = 0 C M I + cmp.ne p6, p7 = r0, r0 C M I + ;; + xma.l fp0b_2 = ux, v0, f0 C F + cmp.ne p10, p11 = r0, r0 C M I + ldf8 u_1 = [up], 8 C M + xma.hu fp1a_2 = ux, v0, f0 C F + ;; + xma.l fp0b_3 = uy, v0, f0 C F + xma.hu fp1a_3 = uy, v0, f0 C F + ;; + getfsig acc0 = fp0b_2 C M + xma.l fp1b_2 = ux, v1,fp1a_2 C F + ldf8 u_2 = [up], 8 C M + xma.hu fp2a_2 = ux, v1,fp1a_2 C F + br.cloop.dptk L(gt5) + + xma.l fp0b_0 = u_0, v0, f0 C F + xma.hu fp1a_0 = u_0, v0, f0 C F + ;; + getfsig pr0_3 = fp0b_3 C M + xma.l fp1b_3 = uy, v1,fp1a_3 C F + xma.hu fp2a_3 = uy, v1,fp1a_3 C F + ;; + getfsig pr1_2 = fp1b_2 C M + getfsig acc1_2 = fp2a_2 C M + xma.l fp0b_1 = u_1, v0, f0 C F + xma.hu fp1a_1 = u_1, v0, f0 C F + br L(cj5) + +L(gt5): xma.l fp0b_0 = u_0, v0, f0 + xma.hu fp1a_0 = u_0, v0, f0 + ;; + getfsig pr0_3 = fp0b_3 + xma.l fp1b_3 = uy, v1, fp1a_3 + xma.hu fp2a_3 = uy, v1, fp1a_3 + ;; + ldf8 u_3 = [up], 8 + getfsig pr1_2 = fp1b_2 + xma.l fp0b_1 = u_1, v0, f0 + ;; + getfsig acc1_2 = fp2a_2 + xma.hu fp1a_1 = u_1, v0, f0 + br L(01) + + + ALIGN(32) +L(b10): br.cloop.dptk L(gt2) + xma.l fp0b_1 = ux, v0, f0 + xma.hu fp1a_1 = ux, v0, f0 + ;; + xma.l fp0b_2 = uy, v0, f0 + xma.hu fp1a_2 = uy, v0, f0 + ;; + stf8 [rp] = fp0b_1, 8 + xma.l fp1b_1 = ux, v1, fp1a_1 + xma.hu fp2a_1 = ux, v1, fp1a_1 + ;; + getfsig acc0 = fp0b_2 + xma.l fp1b_2 = uy, v1, fp1a_2 + xma.hu fp2a_2 = uy, v1, fp1a_2 + ;; + getfsig pr1_1 = fp1b_1 + getfsig acc1_1 = fp2a_1 + mov ar.lc = r2 + getfsig pr1_2 = fp1b_2 + getfsig r8 = fp2a_2 + ;; + add s0 = pr1_1, acc0 + ;; + st8 [rp] = s0, 8 + cmp.ltu p8, p9 = s0, pr1_1 + sub r31 = -1, acc1_1 + ;; + .pred.rel "mutex", p8, p9 + (p8) add acc0 = pr1_2, acc1_1, 1 + (p9) add acc0 = pr1_2, acc1_1 + (p8) cmp.leu p10, p0 = r31, pr1_2 + (p9) cmp.ltu p10, p0 = r31, pr1_2 + ;; + st8 [rp] = acc0, 8 + (p10) add r8 = 1, r8 + br.ret.sptk.many b0 + +L(gt2): ldf8 u_3 = [up], 8 + mov acc1_0 = 0 + mov pr1_0 = 0 + ;; + mov pr0_1 = 0 + xma.l fp0b_1 = ux, v0, f0 + ldf8 u_0 = [up], 8 + xma.hu fp1a_1 = ux, v0, f0 + ;; + xma.l fp0b_2 = uy, v0, f0 + xma.hu fp1a_2 = uy, v0, f0 + ;; + getfsig acc0 = fp0b_1 + xma.l fp1b_1 = ux, v1, fp1a_1 + xma.hu fp2a_1 = ux, v1, fp1a_1 + ;; + ldf8 u_1 = [up], 8 + xma.l fp0b_3 = u_3, v0, f0 + xma.hu fp1a_3 = u_3, v0, f0 + ;; + getfsig pr0_2 = fp0b_2 + xma.l fp1b_2 = 
uy, v1, fp1a_2 + xma.hu fp2a_2 = uy, v1, fp1a_2 + ;; + ldf8 u_2 = [up], 8 + getfsig pr1_1 = fp1b_1 + ;; + {.mfi; getfsig acc1_1 = fp2a_1 + xma.l fp0b_0 = u_0, v0, f0 + cmp.ne p8, p9 = r0, r0 +}{.mfb; cmp.ne p12, p13 = r0, r0 + xma.hu fp1a_0 = u_0, v0, f0 + br L(10) +} + + ALIGN(32) +L(b11): mov acc1_3 = 0 + mov pr1_3 = 0 + mov pr0_0 = 0 + ldf8 u_2 = [up], 8 + cmp.ne p6, p7 = r0, r0 + br.cloop.dptk L(gt3) + ;; + xma.l fp0b_0 = ux, v0, f0 + xma.hu fp1a_0 = ux, v0, f0 + ;; + cmp.ne p10, p11 = r0, r0 + xma.l fp0b_1 = uy, v0, f0 + xma.hu fp1a_1 = uy, v0, f0 + ;; + getfsig acc0 = fp0b_0 + xma.l fp1b_0 = ux, v1, fp1a_0 + xma.hu fp2a_0 = ux, v1, fp1a_0 + ;; + xma.l fp0b_2 = u_2, v0, f0 + xma.hu fp1a_2 = u_2, v0, f0 + ;; + getfsig pr0_1 = fp0b_1 + xma.l fp1b_1 = uy, v1, fp1a_1 + xma.hu fp2a_1 = uy, v1, fp1a_1 + ;; + getfsig pr1_0 = fp1b_0 + getfsig acc1_0 = fp2a_0 + br L(cj3) + +L(gt3): xma.l fp0b_0 = ux, v0, f0 + cmp.ne p10, p11 = r0, r0 + ldf8 u_3 = [up], 8 + xma.hu fp1a_0 = ux, v0, f0 + ;; + xma.l fp0b_1 = uy, v0, f0 + xma.hu fp1a_1 = uy, v0, f0 + ;; + getfsig acc0 = fp0b_0 + xma.l fp1b_0 = ux, v1, fp1a_0 + ldf8 u_0 = [up], 8 + xma.hu fp2a_0 = ux, v1, fp1a_0 + ;; + xma.l fp0b_2 = u_2, v0, f0 + xma.hu fp1a_2 = u_2, v0, f0 + ;; + getfsig pr0_1 = fp0b_1 + xma.l fp1b_1 = uy, v1, fp1a_1 + xma.hu fp2a_1 = uy, v1, fp1a_1 + ;; + ldf8 u_1 = [up], 8 + getfsig pr1_0 = fp1b_0 + ;; + getfsig acc1_0 = fp2a_0 + xma.l fp0b_3 = u_3, v0, f0 + xma.hu fp1a_3 = u_3, v0, f0 + br L(11) + + +C *** MAIN LOOP START *** + ALIGN(32) +L(top): C 00 + .pred.rel "mutex", p8, p9 + .pred.rel "mutex", p12, p13 + ldf8 u_3 = [up], 8 + getfsig pr1_2 = fp1b_2 + (p8) cmp.leu p6, p7 = acc0, pr0_1 + (p9) cmp.ltu p6, p7 = acc0, pr0_1 + (p12) cmp.leu p10, p11 = s0, pr1_0 + (p13) cmp.ltu p10, p11 = s0, pr1_0 + ;; C 01 + .pred.rel "mutex", p6, p7 + getfsig acc1_2 = fp2a_2 + st8 [rp] = s0, 8 + xma.l fp0b_1 = u_1, v0, f0 + (p6) add acc0 = pr0_2, acc1_0, 1 + (p7) add acc0 = pr0_2, acc1_0 + xma.hu fp1a_1 = u_1, v0, f0 + ;; C 02 +L(01): + .pred.rel "mutex", p10, p11 + getfsig pr0_0 = fp0b_0 + xma.l fp1b_0 = u_0, v1, fp1a_0 + (p10) add s0 = pr1_1, acc0, 1 + (p11) add s0 = pr1_1, acc0 + xma.hu fp2a_0 = u_0, v1, fp1a_0 + nop 1 + ;; C 03 + .pred.rel "mutex", p6, p7 + .pred.rel "mutex", p10, p11 + ldf8 u_0 = [up], 8 + getfsig pr1_3 = fp1b_3 + (p6) cmp.leu p8, p9 = acc0, pr0_2 + (p7) cmp.ltu p8, p9 = acc0, pr0_2 + (p10) cmp.leu p12, p13 = s0, pr1_1 + (p11) cmp.ltu p12, p13 = s0, pr1_1 + ;; C 04 + .pred.rel "mutex", p8, p9 + getfsig acc1_3 = fp2a_3 + st8 [rp] = s0, 8 + xma.l fp0b_2 = u_2, v0, f0 + (p8) add acc0 = pr0_3, acc1_1, 1 + (p9) add acc0 = pr0_3, acc1_1 + xma.hu fp1a_2 = u_2, v0, f0 + ;; C 05 +L(00): + .pred.rel "mutex", p12, p13 + getfsig pr0_1 = fp0b_1 + xma.l fp1b_1 = u_1, v1, fp1a_1 + (p12) add s0 = pr1_2, acc0, 1 + (p13) add s0 = pr1_2, acc0 + xma.hu fp2a_1 = u_1, v1, fp1a_1 + nop 1 + ;; C 06 + .pred.rel "mutex", p8, p9 + .pred.rel "mutex", p12, p13 + ldf8 u_1 = [up], 8 + getfsig pr1_0 = fp1b_0 + (p8) cmp.leu p6, p7 = acc0, pr0_3 + (p9) cmp.ltu p6, p7 = acc0, pr0_3 + (p12) cmp.leu p10, p11 = s0, pr1_2 + (p13) cmp.ltu p10, p11 = s0, pr1_2 + ;; C 07 + .pred.rel "mutex", p6, p7 + getfsig acc1_0 = fp2a_0 + st8 [rp] = s0, 8 + xma.l fp0b_3 = u_3, v0, f0 + (p6) add acc0 = pr0_0, acc1_2, 1 + (p7) add acc0 = pr0_0, acc1_2 + xma.hu fp1a_3 = u_3, v0, f0 + ;; C 08 +L(11): + .pred.rel "mutex", p10, p11 + getfsig pr0_2 = fp0b_2 + xma.l fp1b_2 = u_2, v1, fp1a_2 + (p10) add s0 = pr1_3, acc0, 1 + (p11) add s0 = pr1_3, acc0 + xma.hu fp2a_2 = u_2, v1, fp1a_2 
+ nop 1 + ;; C 09 + .pred.rel "mutex", p6, p7 + .pred.rel "mutex", p10, p11 + ldf8 u_2 = [up], 8 + getfsig pr1_1 = fp1b_1 + (p6) cmp.leu p8, p9 = acc0, pr0_0 + (p7) cmp.ltu p8, p9 = acc0, pr0_0 + (p10) cmp.leu p12, p13 = s0, pr1_3 + (p11) cmp.ltu p12, p13 = s0, pr1_3 + ;; C 10 + .pred.rel "mutex", p8, p9 + getfsig acc1_1 = fp2a_1 + st8 [rp] = s0, 8 + xma.l fp0b_0 = u_0, v0, f0 + (p8) add acc0 = pr0_1, acc1_3, 1 + (p9) add acc0 = pr0_1, acc1_3 + xma.hu fp1a_0 = u_0, v0, f0 + ;; C 11 +L(10): + .pred.rel "mutex", p12, p13 + getfsig pr0_3 = fp0b_3 + xma.l fp1b_3 = u_3, v1, fp1a_3 + (p12) add s0 = pr1_0, acc0, 1 + (p13) add s0 = pr1_0, acc0 + xma.hu fp2a_3 = u_3, v1, fp1a_3 + br.cloop.dptk L(top) + ;; +C *** MAIN LOOP END *** + + .pred.rel "mutex", p8, p9 + .pred.rel "mutex", p12, p13 + {.mmi; getfsig pr1_2 = fp1b_2 + st8 [rp] = s0, 8 + (p8) cmp.leu p6, p7 = acc0, pr0_1 +}{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 + (p12) cmp.leu p10, p11 = s0, pr1_0 + (p13) cmp.ltu p10, p11 = s0, pr1_0 + ;; +} .pred.rel "mutex", p6, p7 + {.mfi; getfsig acc1_2 = fp2a_2 + xma.l fp0b_1 = u_1, v0, f0 + nop 1 +}{.mmf; (p6) add acc0 = pr0_2, acc1_0, 1 + (p7) add acc0 = pr0_2, acc1_0 + xma.hu fp1a_1 = u_1, v0, f0 + ;; +} +L(cj5): + .pred.rel "mutex", p10, p11 + {.mfi; getfsig pr0_0 = fp0b_0 + xma.l fp1b_0 = u_0, v1, fp1a_0 + (p10) add s0 = pr1_1, acc0, 1 +}{.mfi; (p11) add s0 = pr1_1, acc0 + xma.hu fp2a_0 = u_0, v1, fp1a_0 + nop 1 + ;; +} .pred.rel "mutex", p6, p7 + .pred.rel "mutex", p10, p11 + {.mmi; getfsig pr1_3 = fp1b_3 + st8 [rp] = s0, 8 + (p6) cmp.leu p8, p9 = acc0, pr0_2 +}{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 + (p10) cmp.leu p12, p13 = s0, pr1_1 + (p11) cmp.ltu p12, p13 = s0, pr1_1 + ;; +} .pred.rel "mutex", p8, p9 + {.mfi; getfsig acc1_3 = fp2a_3 + xma.l fp0b_2 = u_2, v0, f0 + nop 1 +}{.mmf; (p8) add acc0 = pr0_3, acc1_1, 1 + (p9) add acc0 = pr0_3, acc1_1 + xma.hu fp1a_2 = u_2, v0, f0 + ;; +} +L(cj4): + .pred.rel "mutex", p12, p13 + {.mfi; getfsig pr0_1 = fp0b_1 + xma.l fp1b_1 = u_1, v1, fp1a_1 + (p12) add s0 = pr1_2, acc0, 1 +}{.mfi; (p13) add s0 = pr1_2, acc0 + xma.hu fp2a_1 = u_1, v1, fp1a_1 + nop 1 + ;; +} .pred.rel "mutex", p8, p9 + .pred.rel "mutex", p12, p13 + {.mmi; getfsig pr1_0 = fp1b_0 + st8 [rp] = s0, 8 + (p8) cmp.leu p6, p7 = acc0, pr0_3 +}{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3 + (p12) cmp.leu p10, p11 = s0, pr1_2 + (p13) cmp.ltu p10, p11 = s0, pr1_2 + ;; +} .pred.rel "mutex", p6, p7 + {.mmi; getfsig acc1_0 = fp2a_0 + (p6) add acc0 = pr0_0, acc1_2, 1 + (p7) add acc0 = pr0_0, acc1_2 + ;; +} +L(cj3): + .pred.rel "mutex", p10, p11 + {.mfi; getfsig pr0_2 = fp0b_2 + xma.l fp1b_2 = u_2, v1, fp1a_2 + (p10) add s0 = pr1_3, acc0, 1 +}{.mfi; (p11) add s0 = pr1_3, acc0 + xma.hu fp2a_2 = u_2, v1, fp1a_2 + nop 1 + ;; +} .pred.rel "mutex", p6, p7 + .pred.rel "mutex", p10, p11 + {.mmi; getfsig pr1_1 = fp1b_1 + st8 [rp] = s0, 8 + (p6) cmp.leu p8, p9 = acc0, pr0_0 +}{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0 + (p10) cmp.leu p12, p13 = s0, pr1_3 + (p11) cmp.ltu p12, p13 = s0, pr1_3 + ;; +} .pred.rel "mutex", p8, p9 + {.mmi; getfsig acc1_1 = fp2a_1 + (p8) add acc0 = pr0_1, acc1_3, 1 + (p9) add acc0 = pr0_1, acc1_3 + ;; +} .pred.rel "mutex", p12, p13 + {.mmi; (p12) add s0 = pr1_0, acc0, 1 + (p13) add s0 = pr1_0, acc0 + nop 1 + ;; +} .pred.rel "mutex", p8, p9 + .pred.rel "mutex", p12, p13 + {.mmi; getfsig pr1_2 = fp1b_2 + st8 [rp] = s0, 8 + (p8) cmp.leu p6, p7 = acc0, pr0_1 +}{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1 + (p12) cmp.leu p10, p11 = s0, pr1_0 + (p13) cmp.ltu p10, p11 = s0, pr1_0 + ;; +} .pred.rel 
"mutex", p6, p7 + {.mmi; getfsig r8 = fp2a_2 + (p6) add acc0 = pr0_2, acc1_0, 1 + (p7) add acc0 = pr0_2, acc1_0 + ;; +} .pred.rel "mutex", p10, p11 + {.mmi; (p10) add s0 = pr1_1, acc0, 1 + (p11) add s0 = pr1_1, acc0 + (p6) cmp.leu p8, p9 = acc0, pr0_2 + ;; +} .pred.rel "mutex", p10, p11 + {.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2 + (p10) cmp.leu p12, p13 = s0, pr1_1 + (p11) cmp.ltu p12, p13 = s0, pr1_1 + ;; +} .pred.rel "mutex", p8, p9 + {.mmi; st8 [rp] = s0, 8 + (p8) add acc0 = pr1_2, acc1_1, 1 + (p9) add acc0 = pr1_2, acc1_1 + ;; +} .pred.rel "mutex", p8, p9 + {.mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2 + (p9) cmp.ltu p10, p11 = acc0, pr1_2 + (p12) add acc0 = 1, acc0 + ;; +}{.mmi; st8 [rp] = acc0, 8 + (p12) cmpeqor p10, p0 = 0, acc0 + nop 1 + ;; +}{.mib; (p10) add r8 = 1, r8 + mov ar.lc = r2 + br.ret.sptk.many b0 +} +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/popcount.asm b/gmp-6.3.0/mpn/ia64/popcount.asm new file mode 100644 index 0000000..c0b5c5c --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/popcount.asm @@ -0,0 +1,200 @@ +dnl IA-64 mpn_popcount -- mpn population count. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2000-2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 1.5 +C Itanium 2: 1 + +C INPUT PARAMETERS +define(`up', `r32') +define(`n', `r33') + +define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19') +define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31') +define(`s',`r8') + + +ASM_START() +PROLOGUE(mpn_popcount) + .prologue +ifdef(`HAVE_ABI_32', +` addp4 up = 0, up C M I + nop.m 0 + zxt4 n = n C I + ;; +') + + {.mmi; add r9 = 512, up C prefetch pointer M I + ld8 r10 = [up], 8 C load first limb M01 + mov.i r2 = ar.lc C save ar.lc I0 +}{.mmi; and r14 = 3, n C M I + cmp.lt p15, p14 = 4, n C small count? 
M I + add n = -5, n C M I + ;; +}{.mmi; cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + cmp.eq p8, p0 = 3, r14 C M I +}{.bbb + (p6) br.dptk .Lb01 C B + (p7) br.dptk .Lb10 C B + (p8) br.dptk .Lb11 C B +} + + +.Lb00: ld8 u1 = [up], 8 C M01 + shr.u n = n, 2 C I0 + mov s = 0 C M I + ;; + ld8 u2 = [up], 8 C M01 + popcnt c0 = r10 C I0 + mov.i ar.lc = n C I0 + ;; + ld8 u3 = [up], 8 C M01 + popcnt c1 = u1 C I0 + (p15) br.cond.dptk .grt4 C B + ;; + nop.m 0 C - + nop.m 0 C - + popcnt c2 = u2 C I0 + ;; + mov s = c0 C M I + popcnt c3 = u3 C I0 + br .Lcj4 C B + +.grt4: ld8 u0 = [up], 8 C M01 + popcnt c2 = u2 C I0 + br .LL00 C B + + +.Lb01: + popcnt s = r10 C I0 + (p14) br.ret.sptk.many b0 C B + +.grt1: ld8 u0 = [up], 8 C M01 + shr.u n = n, 2 C I0 + ;; + ld8 u1 = [up], 8 C M01 + mov.i ar.lc = n C I0 + ;; + ld8 u2 = [up], 8 C M01 + popcnt c0 = u0 C I0 + mov c3 = 0 C I0 + + ;; + ld8 u3 = [up], 8 C M01 + popcnt c1 = u1 C I0 + br.cloop.dptk .Loop C B + br .Lend C B + + +.Lb10: ld8 u3 = [up], 8 C M01 + shr.u n = n, 2 C I0 + (p15) br.cond.dptk .grt2 C B + + popcnt s = r10 C I0 + ;; + popcnt c3 = u3 C I0 + br .Lcj2 C B + +.grt2: ld8 u0 = [up], 8 C M01 + mov.i ar.lc = n C I0 + popcnt c2 = r10 C I0 + ;; + ld8 u1 = [up], 8 C M01 + popcnt c3 = u3 C I0 + mov s = 0 C M I + ;; + ld8 u2 = [up], 8 C M01 + popcnt c0 = u0 C I0 + br .LL10 C B + + +.Lb11: ld8 u2 = [up], 8 C M01 + shr.u n = n, 2 C I0 + mov s = 0 C M I + ;; + ld8 u3 = [up], 8 C M01 + popcnt s = r10 C I0 + (p15) br.cond.dptk .grt3 C B + + popcnt c2 = u2 C I0 + ;; + popcnt c3 = u3 C I0 + br .Lcj3 C B + +.grt3: ld8 u0 = [up], 8 C M01 + popcnt c2 = u2 C I0 + mov.i ar.lc = n C I0 + mov c1 = 0 + ;; + ld8 u1 = [up], 8 C M01 + popcnt c3 = u3 C I0 + br .LL11 C B + + +.Loop: ld8 u0 = [up], 8 C M01 + popcnt c2 = u2 C I0 + add s = s, c3 C M I + ;; +.LL00: ld8 u1 = [up], 8 C M01 + popcnt c3 = u3 C I0 + add s = s, c0 C M I + ;; +.LL11: ld8 u2 = [up], 8 C M01 + popcnt c0 = u0 C I0 + add s = s, c1 C M I + ;; +.LL10: ld8 u3 = [up], 8 C M01 + popcnt c1 = u1 C I0 + add s = s, c2 C M I + lfetch [r9], 32 C M01 + nop.m 0 C - + br.cloop.dptk .Loop C B + ;; + +.Lend: popcnt c2 = u2 C I0 + add s = s, c3 C M I + ;; + popcnt c3 = u3 C I0 + add s = s, c0 C M I + ;; +.Lcj4: add s = s, c1 C M I + ;; +.Lcj3: add s = s, c2 C M I + ;; +.Lcj2: add s = s, c3 C M I + mov.i ar.lc = r2 C I0 + br.ret.sptk.many b0 C B +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/ia64/rsh1aors_n.asm b/gmp-6.3.0/mpn/ia64/rsh1aors_n.asm new file mode 100644 index 0000000..3c7defb --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/rsh1aors_n.asm @@ -0,0 +1,447 @@ +dnl IA-64 mpn_rsh1add_n/mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2003-2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 2.5 +C Itanium 2: 1.5 + +C TODO +C * Rewrite function entry code using aorslsh1_n.asm style. +C * Micro-optimize feed-in and wind-down code. + +C INPUT PARAMETERS +define(`rp',`r32') +define(`up',`r33') +define(`vp',`r34') +define(`n',`r35') + +ifdef(`OPERATION_rsh1add_n',` + define(ADDSUB, add) + define(PRED, ltu) + define(INCR, 1) + define(LIM, -1) + define(func, mpn_rsh1add_n) +') +ifdef(`OPERATION_rsh1sub_n',` + define(ADDSUB, sub) + define(PRED, gtu) + define(INCR, -1) + define(LIM, 0) + define(func, mpn_rsh1sub_n) +') + +C Some useful aliases for registers we use +define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17') +define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21') +define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25') +define(`x0',`r26') define(`x1',`r9') define(`x2',`r30') define(`x3',`r31') + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n) + +ASM_START() +PROLOGUE(func) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32',` + addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + addp4 vp = 0, vp C M I + nop.m 0 + nop.m 0 + zxt4 n = n C I + ;; +') + {.mmi; ld8 r11 = [vp], 8 C M01 + ld8 r10 = [up], 8 C M01 + mov.i r2 = ar.lc C I0 +}{.mmi; and r14 = 3, n C M I + cmp.lt p15, p0 = 4, n C M I + add n = -4, n C M I + ;; +}{.mmi; cmp.eq p6, p0 = 1, r14 C M I + cmp.eq p7, p0 = 2, r14 C M I + cmp.eq p8, p0 = 3, r14 C M I +}{.bbb + (p6) br.dptk .Lb01 C B + (p7) br.dptk .Lb10 C B + (p8) br.dptk .Lb11 C B +} + +.Lb00: ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + shr.u n = n, 2 C I0 + ;; + ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + ADDSUB w3 = r10, r11 C M I + ;; + ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + (p15) br.dpnt .grt4 C B + ;; + + cmp.PRED p7, p0 = w3, r10 C M I + and r8 = 1, w3 C M I + ADDSUB w0 = u0, v0 C M I + ;; + cmp.PRED p8, p0 = w0, u0 C M I + ADDSUB w1 = u1, v1 C M I + ;; + cmp.PRED p9, p0 = w1, u1 C M I + (p7) cmp.eq.or p8, p0 = LIM, w0 C M I + (p7) add w0 = INCR, w0 C M I + ;; + shrp x3 = w0, w3, 1 C I0 + ADDSUB w2 = u2, v2 C M I + (p8) cmp.eq.or p9, p0 = LIM, w1 C M I + (p8) add w1 = INCR, w1 C M I + br .Lcj4 C B + +.grt4: ld8 v3 = [vp], 8 C M01 + cmp.PRED p7, p0 = w3, r10 C M I + ld8 u3 = [up], 8 C M01 + and r8 = 1, w3 C M I + ;; + ADDSUB w0 = u0, v0 C M I + ld8 v0 = [vp], 8 C M01 + add n = -1, n + ;; + cmp.PRED p8, p0 = w0, u0 C M I + ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + ;; + ld8 v1 = [vp], 8 C M01 + mov.i ar.lc = n C I0 + cmp.PRED p9, p0 = w1, u1 C M I + ld8 u1 = [up], 8 C M01 + (p7) cmp.eq.or p8, p0 = LIM, w0 C M I + (p7) add w0 = INCR, w0 C M I + ;; + ADDSUB w2 = u2, v2 C M I + ld8 v2 = [vp], 8 C M01 + shrp x3 = w0, w3, 1 C I0 + (p8) cmp.eq.or p9, p0 = LIM, w1 C M I + (p8) add w1 = INCR, w1 C M I + br .LL00 C B + + +.Lb01: ADDSUB w2 = r10, r11 C M I + shr.u n = n, 2 C I0 + (p15) br.dpnt .grt1 C B + ;; + + cmp.PRED p6, p7 = w2, r10 C M I + shr.u x2 = w2, 1 C I0 + and r8 = 1, w2 C M I + ;; + (p6) dep x2 = -1, x2, 63, 1 C I0 + br .Lcj1 C B + +.grt1: ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + ;; + ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + mov.i ar.lc = n C FIXME swap with next I0 + ;; + ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + ;; + ld8 
v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + cmp.PRED p6, p0 = w2, r10 C M I + and r8 = 1, w2 C M I + ADDSUB w3 = u3, v3 C M I + br.cloop.dptk .grt5 C B + ;; + + cmp.PRED p7, p0 = w3, u3 C M I + ;; + ADDSUB w0 = u0, v0 C M I + (p6) cmp.eq.or p7, p0 = LIM, w3 C M I + (p6) add w3 = INCR, w3 C M I + ;; + cmp.PRED p8, p0 = w0, u0 C M I + shrp x2 = w3, w2, 1 C I0 + ADDSUB w1 = u1, v1 C M I + ;; + cmp.PRED p9, p0 = w1, u1 C M I + (p7) cmp.eq.or p8, p0 = LIM, w0 C M I + (p7) add w0 = INCR, w0 C M I + br .Lcj5 C B + +.grt5: ld8 v3 = [vp], 8 C M01 + cmp.PRED p7, p0 = w3, u3 C M I + ld8 u3 = [up], 8 C M01 + ;; + ADDSUB w0 = u0, v0 C M I + ld8 v0 = [vp], 8 C M01 + (p6) cmp.eq.or p7, p0 = LIM, w3 C M I + (p6) add w3 = INCR, w3 C M I + ;; + cmp.PRED p8, p0 = w0, u0 C M I + shrp x2 = w3, w2, 1 C I0 + ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + ;; + ld8 v1 = [vp], 8 C M01 + cmp.PRED p9, p0 = w1, u1 C M I + ld8 u1 = [up], 8 C M01 + (p7) cmp.eq.or p8, p0 = LIM, w0 C M I + (p7) add w0 = INCR, w0 C M I + br .LL01 C B + + +.Lb10: ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + shr.u n = n, 2 C I0 + ADDSUB w1 = r10, r11 C M I + (p15) br.dpnt .grt2 C B + ;; + + cmp.PRED p9, p0 = w1, r10 C M I + and r8 = 1, w1 C M I + ADDSUB w2 = u2, v2 C M I + ;; + cmp.PRED p6, p0 = w2, u2 C M I + ;; + (p9) cmp.eq.or p6, p0 = LIM, w2 C M I + (p9) add w2 = INCR, w2 C M I + ;; + shrp x1 = w2, w1, 1 C I0 + shr.u x2 = w2, 1 C I0 + br .Lcj2 C B + +.grt2: ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + ;; + ld8 v0 = [vp], 8 C M01 + ld8 u0 = [up], 8 C M01 + mov.i ar.lc = n C I0 + ;; + ld8 v1 = [vp], 8 C M01 + cmp.PRED p9, p0 = w1, r10 C M I + ld8 u1 = [up], 8 C M01 + and r8 = 1, w1 C M I + ;; + ADDSUB w2 = u2, v2 C M I + ld8 v2 = [vp], 8 C M01 + ;; + cmp.PRED p6, p0 = w2, u2 C M I + ld8 u2 = [up], 8 C M01 + ADDSUB w3 = u3, v3 C M I + br.cloop.dptk .grt6 C B + ;; + + cmp.PRED p7, p0 = w3, u3 C M I + (p9) cmp.eq.or p6, p0 = LIM, w2 C M I + (p9) add w2 = INCR, w2 C M I + ;; + shrp x1 = w2, w1, 1 C I0 + ADDSUB w0 = u0, v0 C M I + (p6) cmp.eq.or p7, p0 = LIM, w3 C M I + (p6) add w3 = INCR, w3 C M I + br .Lcj6 C B + +.grt6: ld8 v3 = [vp], 8 C M01 + cmp.PRED p7, p0 = w3, u3 C M I + ld8 u3 = [up], 8 C M01 + (p9) cmp.eq.or p6, p0 = LIM, w2 C M I + (p9) add w2 = INCR, w2 C M I + ;; + shrp x1 = w2, w1, 1 C I0 + ADDSUB w0 = u0, v0 C M I + ld8 v0 = [vp], 8 C M01 + (p6) cmp.eq.or p7, p0 = LIM, w3 C M I + (p6) add w3 = INCR, w3 C M I + br .LL10 C B + + +.Lb11: ld8 v1 = [vp], 8 C M01 + ld8 u1 = [up], 8 C M01 + shr.u n = n, 2 C I0 + ;; + ld8 v2 = [vp], 8 C M01 + ld8 u2 = [up], 8 C M01 + ADDSUB w0 = r10, r11 C M I + (p15) br.dpnt .grt3 C B + ;; + + cmp.PRED p8, p0 = w0, r10 C M I + ADDSUB w1 = u1, v1 C M I + and r8 = 1, w0 C M I + ;; + cmp.PRED p9, p0 = w1, u1 C M I + ;; + ADDSUB w2 = u2, v2 C M I + (p8) cmp.eq.or p9, p0 = LIM, w1 C M I + (p8) add w1 = INCR, w1 C M I + ;; + cmp.PRED p6, p0 = w2, u2 C M I + shrp x0 = w1, w0, 1 C I0 + ;; + (p9) cmp.eq.or p6, p0 = LIM, w2 C M I + (p9) add w2 = INCR, w2 C M I + br .Lcj3 C B + +.grt3: ld8 v3 = [vp], 8 C M01 + ld8 u3 = [up], 8 C M01 + ;; + ld8 v0 = [vp], 8 C M01 + mov.i ar.lc = n C I0 + cmp.PRED p8, p0 = w0, r10 C M I + ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + and r8 = 1, w0 C M I + ;; + ld8 v1 = [vp], 8 C M01 + cmp.PRED p9, p0 = w1, u1 C M I + ld8 u1 = [up], 8 C M01 + ;; + ADDSUB w2 = u2, v2 C M I + ld8 v2 = [vp], 8 C M01 + (p8) cmp.eq.or p9, p0 = LIM, w1 C M I + (p8) add w1 = INCR, w1 C M I + ;; + cmp.PRED p6, p0 = w2, u2 C M I + shrp x0 = w1, w0, 1 C I0 + ld8 u2 = [up], 8 C M01 + 
ADDSUB w3 = u3, v3 C M I + br.cloop.dptk .grt7 C B + ;; + + cmp.PRED p7, p0 = w3, u3 C M I + (p9) cmp.eq.or p6, p0 = LIM, w2 C M I + (p9) add w2 = INCR, w2 C M I + br .Lcj7 C B + +.grt7: ld8 v3 = [vp], 8 C M01 + cmp.PRED p7, p0 = w3, u3 C M I + ld8 u3 = [up], 8 C M01 + (p9) cmp.eq.or p6, p0 = LIM, w2 C M I + (p9) add w2 = INCR, w2 C M I + br .LL11 C B + + +C *** MAIN LOOP START *** + ALIGN(32) +.Loop: st8 [rp] = x3, 8 C M23 + ld8 v3 = [vp], 8 C M01 + cmp.PRED p7, p0 = w3, u3 C M I + ld8 u3 = [up], 8 C M01 + (p9) cmp.eq.or p6, p0 = LIM, w2 C M I + (p9) add w2 = INCR, w2 C M I + ;; +.LL11: st8 [rp] = x0, 8 C M23 + shrp x1 = w2, w1, 1 C I0 + ADDSUB w0 = u0, v0 C M I + ld8 v0 = [vp], 8 C M01 + (p6) cmp.eq.or p7, p0 = LIM, w3 C M I + (p6) add w3 = INCR, w3 C M I + ;; +.LL10: cmp.PRED p8, p0 = w0, u0 C M I + shrp x2 = w3, w2, 1 C I0 + nop.b 0 + ld8 u0 = [up], 8 C M01 + ADDSUB w1 = u1, v1 C M I + nop.b 0 + ;; + st8 [rp] = x1, 8 C M23 + ld8 v1 = [vp], 8 C M01 + cmp.PRED p9, p0 = w1, u1 C M I + ld8 u1 = [up], 8 C M01 + (p7) cmp.eq.or p8, p0 = LIM, w0 C M I + (p7) add w0 = INCR, w0 C M I + ;; +.LL01: st8 [rp] = x2, 8 C M23 + shrp x3 = w0, w3, 1 C I0 + ADDSUB w2 = u2, v2 C M I + ld8 v2 = [vp], 8 C M01 + (p8) cmp.eq.or p9, p0 = LIM, w1 C M I + (p8) add w1 = INCR, w1 C M I + ;; +.LL00: cmp.PRED p6, p0 = w2, u2 C M I + shrp x0 = w1, w0, 1 C I0 + nop.b 0 + ld8 u2 = [up], 8 C M01 + ADDSUB w3 = u3, v3 C M I + br.cloop.dptk .Loop C B + ;; +C *** MAIN LOOP END *** + +.Lskip: st8 [rp] = x3, 8 C M23 + cmp.PRED p7, p0 = w3, u3 C M I + (p9) cmp.eq.or p6, p0 = LIM, w2 C M I + (p9) add w2 = INCR, w2 C M I + ;; +.Lcj7: st8 [rp] = x0, 8 C M23 + shrp x1 = w2, w1, 1 C I0 + ADDSUB w0 = u0, v0 C M I + (p6) cmp.eq.or p7, p0 = LIM, w3 C M I + (p6) add w3 = INCR, w3 C M I + ;; +.Lcj6: cmp.PRED p8, p0 = w0, u0 C M I + shrp x2 = w3, w2, 1 C I0 + ADDSUB w1 = u1, v1 C M I + ;; + st8 [rp] = x1, 8 C M23 + cmp.PRED p9, p0 = w1, u1 C M I + (p7) cmp.eq.or p8, p0 = LIM, w0 C M I + (p7) add w0 = INCR, w0 C M I + ;; +.Lcj5: st8 [rp] = x2, 8 C M23 + shrp x3 = w0, w3, 1 C I0 + ADDSUB w2 = u2, v2 C M I + (p8) cmp.eq.or p9, p0 = LIM, w1 C M I + (p8) add w1 = INCR, w1 C M I + ;; +.Lcj4: cmp.PRED p6, p0 = w2, u2 C M I + shrp x0 = w1, w0, 1 C I0 + ;; + st8 [rp] = x3, 8 C M23 + (p9) cmp.eq.or p6, p0 = LIM, w2 C M I + (p9) add w2 = INCR, w2 C M I + ;; +.Lcj3: st8 [rp] = x0, 8 C M23 + shrp x1 = w2, w1, 1 C I0 + shr.u x2 = w2, 1 C I0 + ;; +.Lcj2: st8 [rp] = x1, 8 C M23 + (p6) dep x2 = -1, x2, 63, 1 C I0 + ;; +.Lcj1: st8 [rp] = x2 C M23 + mov.i ar.lc = r2 C I0 + br.ret.sptk.many b0 C B +EPILOGUE() diff --git a/gmp-6.3.0/mpn/ia64/sec_tabselect.asm b/gmp-6.3.0/mpn/ia64/sec_tabselect.asm new file mode 100644 index 0000000..9b11cde --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/sec_tabselect.asm @@ -0,0 +1,148 @@ +dnl IA-64 mpn_sec_tabselect. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: ? +C Itanium 2: 2.5 + +C NOTES +C * Using software pipelining could trivially yield 2 c/l without unrolling, +C or 1+epsilon with unrolling. (This code was modelled after the powerpc64 +C code, for simplicity.) + +C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which) +define(`rp', `r32') +define(`tp', `r33') +define(`n', `r34') +define(`nents', `r35') +define(`which', `r36') + +define(`mask', `r8') + +define(`rp1', `r32') +define(`tp1', `r33') +define(`rp2', `r14') +define(`tp2', `r15') + +ASM_START() +PROLOGUE(mpn_sec_tabselect) + .prologue + .save ar.lc, r2 + .body +ifdef(`HAVE_ABI_32',` + {.mmi; addp4 rp = 0, rp C M I + addp4 tp = 0, tp C M I + zxt4 n = n C I +}{.mii; nop 0 + zxt4 nents = nents C I + zxt4 which = which C I + ;; +}') + {.mmi; add rp2 = 8, rp1 + add tp2 = 8, tp1 + add r6 = -2, n + ;; +}{.mmi; cmp.eq p10, p0 = 1, n + and r9 = 1, n C set cr0 for use in inner loop + shr.u r6 = r6, 1 C inner loop count + ;; +}{.mmi; cmp.eq p8, p0 = 0, r9 + sub which = nents, which + shl n = n, 3 + ;; +} +L(outer): + {.mmi; cmp.eq p6, p7 = which, nents C are we at the selected table entry? + nop 0 + mov ar.lc = r6 C I0 + ;; +}{.mmb; + (p6) mov mask = -1 + (p7) mov mask = 0 + (p8) br.dptk L(top) C branch to loop entry if n even + ;; +}{.mmi; ld8 r16 = [tp1], 8 + add tp2 = 8, tp2 + nop 0 + ;; +}{.mmi; ld8 r18 = [rp1] + and r16 = r16, mask + nop 0 + ;; +}{.mmi; andcm r18 = r18, mask + ;; + or r16 = r16, r18 + nop 0 + ;; +}{.mmb; st8 [rp1] = r16, 8 + add rp2 = 8, rp2 + (p10) br.dpnt L(end) +} + ALIGN(32) +L(top): + {.mmi; ld8 r16 = [tp1], 16 + ld8 r17 = [tp2], 16 + nop 0 + ;; +}{.mmi; ld8 r18 = [rp1] + and r16 = r16, mask + nop 0 +}{.mmi; ld8 r19 = [rp2] + and r17 = r17, mask + nop 0 + ;; +}{.mmi; andcm r18 = r18, mask + andcm r19 = r19, mask + nop 0 + ;; +}{.mmi; or r16 = r16, r18 + or r17 = r17, r19 + nop 0 + ;; +}{.mmb; st8 [rp1] = r16, 16 + st8 [rp2] = r17, 16 + br.cloop.dptk L(top) + ;; +} +L(end): + {.mmi; sub rp1 = rp1, n C move rp back to beginning + sub rp2 = rp2, n C move rp back to beginning + cmp.ne p9, p0 = 1, nents +}{.mmb; add nents = -1, nents + nop 0 + (p9) br.dptk L(outer) + ;; +}{.mib; nop 0 + nop 0 + br.ret.sptk.many b0 +} +EPILOGUE() diff --git a/gmp-6.3.0/mpn/ia64/sqr_diag_addlsh1.asm b/gmp-6.3.0/mpn/ia64/sqr_diag_addlsh1.asm new file mode 100644 index 0000000..727f489 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/sqr_diag_addlsh1.asm @@ -0,0 +1,156 @@ +dnl IA-64 mpn_sqr_diag_addlsh1 + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2010, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: ? +C Itanium 2: 2 Unrolling could bring it to 1.5 + epsilon + +C Exact performance table. The 2nd line is this code, the 3rd line is ctop- +C less code. In an assembly sqr_basecase, the ctop-full numbers will become a +C few cycles better since we can mitigate the many I0 instructions. +C +C 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 +C - 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 Needs updating +C - 13 16 17 18 20 21 23 25 26 30 31 31 33 34 36 38 39 42 43 + +C We should keep in mind that this code takes linear time in a O(n^2) context +C and that it will only be used under SQR_TOOM2_THRESHOLD, which might become +C around 60. Keeping overhead down for smallish operands (< 10) is more +C important than optimal cycle counts. + +C TODO +C * Make sure we don't depend on uninitialised r-registers, f-registers, or +C * p-registers. +C * Optimise by doing first two loop iterations in function header. + +C INPUT PARAMETERS +define(`rp_param', `r32') define(`rp', `r14') C size: 2n +define(`tp_param', `r33') define(`tp', `r15') C size: 2n - 2 +define(`up_param', `r34') define(`up', `r31') C size: n +define(`n', `r35') + +ifdef(`HAVE_ABI_32',` + define(`ABI64', `') + define(`ABI32', `$1') +',` + define(`ABI64', `$1') + define(`ABI32', `') +') + +ASM_START() +PROLOGUE(mpn_sqr_diag_addlsh1) + + .prologue + .save ar.pfs, r2 + .save ar.lc, r3 + .body + + {.mii; alloc r2 = ar.pfs, 4,24,0,24 C M + mov r3 = ar.lc C I0 + ABI64(` nop 4711 ') + ABI32(` zxt4 n = n ') +}{.mmi; ABI64(` mov tp = tp_param ') C M I + ABI32(` addp4 tp = 0, tp_param') C M I + ABI64(` mov up = up_param ') C M I + ABI32(` addp4 up = 0, up_param') C M I + ABI64(` mov rp = rp_param ') C M I + ABI32(` addp4 rp = 0, rp_param') C M I + ;; +}{.mmi; ld8 r36 = [tp], 8 C M + add r20 = -2, n C M I + mov r9 = ar.ec C I0 + ;; +}{.mmi; ld8 r32 = [tp], 8 C M + mov r16 = 0 C M I + mov ar.ec = 7 C I0 + ;; +}{.mmi; nop 4711 + mov r44 = 0 C M I + mov ar.lc = r20 C I0 + ;; +}{.mii; mov r33 = 0 + mov r10 = pr C I0 + mov pr.rot = 0x30000 C I0 + ;; +} br.cexit.spnt.few.clr L(end) + +dnl *** MAIN LOOP START *** + ALIGN(32) +L(top): + {.mfi; (p18) ldf8 f33 = [up], 8 C M + (p20) xma.l f36 = f35, f35, f42 C F + (p41) cmpequc p50, p0 = -1, r44 C M I +}{.mfi; setfsig f40 = r16 C M23 + (p20) xma.hu f38 = f35, f35, f42 C F + (p23) add r50 = r41, r49 C M I + ;; +}{.mmi; (p16) ld8 r36 = [tp], 8 C M + (p23) cmpltu p40, p0 = r50, r41 C cyout hi M I + (p19) shrp r45 = r38, r35, 63 C non-critical I0 +}{.mmi; (p21) getfsig r39 = f39 C hi M2 + (p24) st8 [rp] = r51, 8 C hi M23 + (p41) add r44 = 1, r44 C M I + ;; +}{.mmi; (p16) ld8 r32 = [tp], 8 C M + (p50) cmpeqor p40, p0 = -1, r50 C cyout hi M I + (p17) shrp r16 = r33, r37, 63 C critical I0 +}{.mmi; (p21) getfsig r42 = f37 C lo M2 + (p23) st8 [rp] = r44, 8 C lo M23 + 
(p50) add r50 = 1, r50 C M I + ;; +} br.ctop.sptk.few.clr L(top) C B +dnl *** MAIN LOOP END *** + ;; +L(end): + {.mmi; nop 4711 + (p41) add r44 = 1, r44 C M I + shr.u r48 = r39, 63 C I0 + ;; +}{.mmi; st8 [rp] = r51, 8 C M23 + (p41) cmpequc p6, p0 = 0, r44 C M I + add r50 = r41, r48 C M I + ;; +}{.mmi; st8 [rp] = r44, 8 C M23 + (p6) add r50 = 1, r50 C M I + mov ar.lc = r3 C I0 + ;; +}{.mii; st8 [rp] = r50 C M23 + mov ar.ec = r9 C I0 + mov pr = r10 C I0 + ;; +}{.mib; nop 4711 + mov ar.pfs = r2 C I0 + br.ret.sptk.many b0 C B +} +EPILOGUE() diff --git a/gmp-6.3.0/mpn/ia64/submul_1.asm b/gmp-6.3.0/mpn/ia64/submul_1.asm new file mode 100644 index 0000000..cb2a552 --- /dev/null +++ b/gmp-6.3.0/mpn/ia64/submul_1.asm @@ -0,0 +1,647 @@ +dnl IA-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the +dnl result from a second limb vector. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2000-2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 4.0 +C Itanium 2: 2.25 (alignment dependent, sometimes it seems to need 3 c/l) + +C TODO +C * Optimize feed-in and wind-down code, both for speed and code size. +C * Handle low limb input and results specially, using a common stf8 in the +C epilogue. +C * Delay r8, r10 initialization, put cmp-p6 in 1st bundle and br .Ldone in +C 2nd bundle. This will allow the bbb bundle to be one cycle earlier and +C save a cycle. 
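What the heavily scheduled code below computes is, again, a short loop. A minimal C sketch under the same assumptions as the earlier ones (64-bit limbs, GCC/Clang unsigned __int128; the portable version is mpn/generic/submul_1.c):

    #include <stdint.h>

    /* Sketch of mpn_submul_1: {rp,n} -= {up,n} * vl, returning the final
       borrow (the "cylimb").  */
    uint64_t
    submul_1 (uint64_t *rp, const uint64_t *up, long n, uint64_t vl)
    {
      uint64_t cy = 0;
      for (long i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * vl;
          uint64_t lo = (uint64_t) p + cy;        /* low product + carry in  */
          cy = (uint64_t) (p >> 64) + (lo < cy);  /* high product + overflow */
          uint64_t r = rp[i];
          rp[i] = r - lo;
          cy += r < lo;                           /* borrow out of the sub   */
        }
      return cy;
    }

The asm reaches the same result a little differently: it negates vl up front (sub vl = r0, vl) so that each xma pair can fold the rp limb into the multiply-accumulate, with the cmp.ltu/add chains propagating the running borrow in r8.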
+ +C INPUT PARAMETERS +define(`rp', `r32') +define(`up', `r33') +define(`n', `r34') +define(`vl', `r35') + +ASM_START() +PROLOGUE(mpn_submul_1) + .prologue + .save ar.lc, r2 + .body + +ifdef(`HAVE_ABI_32', +` addp4 rp = 0, rp C M I + addp4 up = 0, up C M I + zxt4 n = n C I + ;; +') +{.mmi + mov r10 = rp C M I + mov r9 = up C M I + sub vl = r0, vl C M I negate vl +} +{.mmi + ldf8 f8 = [rp], 8 C M + ldf8 f7 = [up], 8 C M + add r19 = -1, n C M I n - 1 + ;; +} +{.mmi + cmp.eq p6, p0 = 0, vl C M I + mov r8 = 0 C M I zero cylimb + mov r2 = ar.lc C I0 +} +{.mmi + setf.sig f6 = vl C M2 M3 + and r14 = 3, n C M I + shr.u r19 = r19, 2 C I0 + ;; +} +{.mmb + nop 0 + cmp.eq p10, p0 = 0, r14 C M I + (p6) br.spnt .Ldone C B vl == 0 +} +{.mmi + cmp.eq p11, p0 = 2, r14 C M I + cmp.eq p12, p0 = 3, r14 C M I + mov ar.lc = r19 C I0 +} +{.bbb + (p10) br.dptk .Lb00 C B + (p11) br.dptk .Lb10 C B + (p12) br.dptk .Lb11 C B + ;; +} + +.Lb01: br.cloop.dptk .grt1 + + xma.l f39 = f7, f6, f8 + xma.hu f43 = f7, f6, f8 + ;; + getf.sig r27 = f39 C lo + getf.sig r31 = f43 C hi + ld8 r20 = [r9], 8 + br .Lcj1 + +.grt1: ldf8 f44 = [rp], 8 + ldf8 f32 = [up], 8 + ;; + ldf8 f45 = [rp], 8 + ldf8 f33 = [up], 8 + ;; + ldf8 f46 = [rp], 8 + xma.l f39 = f7, f6, f8 + ldf8 f34 = [up], 8 + xma.hu f43 = f7, f6, f8 + ;; + ldf8 f47 = [rp], 8 + xma.l f36 = f32, f6, f44 + ldf8 f35 = [up], 8 + xma.hu f40 = f32, f6, f44 + br.cloop.dptk .grt5 + ;; + + getf.sig r27 = f39 C lo + xma.l f37 = f33, f6, f45 + ld8 r20 = [r9], 8 + xma.hu f41 = f33, f6, f45 + ;; + getf.sig r31 = f43 C hi + getf.sig r24 = f36 C lo + xma.l f38 = f34, f6, f46 + ld8 r21 = [r9], 8 + xma.hu f42 = f34, f6, f46 + ;; + getf.sig r28 = f40 C hi + getf.sig r25 = f37 C lo + xma.l f39 = f35, f6, f47 + ld8 r22 = [r9], 8 + xma.hu f43 = f35, f6, f47 + ;; + getf.sig r29 = f41 C hi + getf.sig r26 = f38 C lo + ld8 r23 = [r9], 8 + br .Lcj5 + +.grt5: ldf8 f44 = [rp], 8 + ldf8 f32 = [up], 8 + ;; + getf.sig r27 = f39 C lo + xma.l f37 = f33, f6, f45 + ld8 r20 = [r9], 8 + xma.hu f41 = f33, f6, f45 + ;; + ldf8 f45 = [rp], 8 + getf.sig r31 = f43 C hi + ldf8 f33 = [up], 8 + ;; + getf.sig r24 = f36 C lo + xma.l f38 = f34, f6, f46 + ld8 r21 = [r9], 8 + xma.hu f42 = f34, f6, f46 + ;; + ldf8 f46 = [rp], 8 + getf.sig r28 = f40 C hi + ldf8 f34 = [up], 8 + ;; + getf.sig r25 = f37 C lo + xma.l f39 = f35, f6, f47 + ld8 r22 = [r9], 8 + xma.hu f43 = f35, f6, f47 + ;; + ldf8 f47 = [rp], 8 + getf.sig r29 = f41 C hi + ldf8 f35 = [up], 8 + ;; + getf.sig r26 = f38 C lo + xma.l f36 = f32, f6, f44 + ld8 r23 = [r9], 8 + xma.hu f40 = f32, f6, f44 + br.cloop.dptk .Loop + br .Lend + + +.Lb10: ldf8 f47 = [rp], 8 + ldf8 f35 = [up], 8 + br.cloop.dptk .grt2 + + xma.l f38 = f7, f6, f8 + xma.hu f42 = f7, f6, f8 + ;; + xma.l f39 = f35, f6, f47 + xma.hu f43 = f35, f6, f47 + ;; + getf.sig r26 = f38 C lo + getf.sig r30 = f42 C hi + ld8 r23 = [r9], 8 + ;; + getf.sig r27 = f39 C lo + getf.sig r31 = f43 C hi + ld8 r20 = [r9], 8 + br .Lcj2 + +.grt2: ldf8 f44 = [rp], 8 + ldf8 f32 = [up], 8 + ;; + ldf8 f45 = [rp], 8 + ldf8 f33 = [up], 8 + xma.l f38 = f7, f6, f8 + xma.hu f42 = f7, f6, f8 + ;; + ldf8 f46 = [rp], 8 + ldf8 f34 = [up], 8 + xma.l f39 = f35, f6, f47 + xma.hu f43 = f35, f6, f47 + ;; + ldf8 f47 = [rp], 8 + ldf8 f35 = [up], 8 + ;; + getf.sig r26 = f38 C lo + xma.l f36 = f32, f6, f44 + ld8 r23 = [r9], 8 + xma.hu f40 = f32, f6, f44 + br.cloop.dptk .grt6 + + getf.sig r30 = f42 C hi + ;; + getf.sig r27 = f39 C lo + xma.l f37 = f33, f6, f45 + ld8 r20 = [r9], 8 + xma.hu f41 = f33, f6, f45 + ;; + getf.sig r31 = f43 C hi + getf.sig r24 = 
f36 C lo + xma.l f38 = f34, f6, f46 + ld8 r21 = [r9], 8 + xma.hu f42 = f34, f6, f46 + ;; + getf.sig r28 = f40 C hi + getf.sig r25 = f37 C lo + xma.l f39 = f35, f6, f47 + ld8 r22 = [r9], 8 + xma.hu f43 = f35, f6, f47 + br .Lcj6 + +.grt6: ldf8 f44 = [rp], 8 + getf.sig r30 = f42 C hi + ldf8 f32 = [up], 8 + ;; + getf.sig r27 = f39 C lo + xma.l f37 = f33, f6, f45 + ld8 r20 = [r9], 8 + xma.hu f41 = f33, f6, f45 + ;; + ldf8 f45 = [rp], 8 + getf.sig r31 = f43 C hi + ldf8 f33 = [up], 8 + ;; + getf.sig r24 = f36 C lo + xma.l f38 = f34, f6, f46 + ld8 r21 = [r9], 8 + xma.hu f42 = f34, f6, f46 + ;; + ldf8 f46 = [rp], 8 + getf.sig r28 = f40 C hi + ldf8 f34 = [up], 8 + ;; + getf.sig r25 = f37 C lo + xma.l f39 = f35, f6, f47 + ld8 r22 = [r9], 8 + xma.hu f43 = f35, f6, f47 + br .LL10 + + +.Lb11: ldf8 f46 = [rp], 8 + ldf8 f34 = [up], 8 + ;; + ldf8 f47 = [rp], 8 + ldf8 f35 = [up], 8 + br.cloop.dptk .grt3 + + xma.l f37 = f7, f6, f8 + xma.hu f41 = f7, f6, f8 + ;; + xma.l f38 = f34, f6, f46 + xma.hu f42 = f34, f6, f46 + ;; + getf.sig r25 = f37 C lo + xma.l f39 = f35, f6, f47 + xma.hu f43 = f35, f6, f47 + ;; + getf.sig r29 = f41 C hi + ld8 r22 = [r9], 8 + ;; + getf.sig r26 = f38 C lo + getf.sig r30 = f42 C hi + ld8 r23 = [r9], 8 + ;; + getf.sig r27 = f39 C lo + getf.sig r31 = f43 C hi + ld8 r20 = [r9], 8 + br .Lcj3 + +.grt3: ldf8 f44 = [rp], 8 + xma.l f37 = f7, f6, f8 + ldf8 f32 = [up], 8 + xma.hu f41 = f7, f6, f8 + ;; + ldf8 f45 = [rp], 8 + xma.l f38 = f34, f6, f46 + ldf8 f33 = [up], 8 + xma.hu f42 = f34, f6, f46 + ;; + ldf8 f46 = [rp], 8 + ldf8 f34 = [up], 8 + ;; + getf.sig r25 = f37 C lo + xma.l f39 = f35, f6, f47 + ld8 r22 = [r9], 8 + xma.hu f43 = f35, f6, f47 + ;; + ldf8 f47 = [rp], 8 + getf.sig r29 = f41 C hi + ldf8 f35 = [up], 8 + ;; + getf.sig r26 = f38 C lo + xma.l f36 = f32, f6, f44 + ld8 r23 = [r9], 8 + xma.hu f40 = f32, f6, f44 + br.cloop.dptk .grt7 + ;; + + getf.sig r30 = f42 C hi + getf.sig r27 = f39 C lo + xma.l f37 = f33, f6, f45 + ld8 r20 = [r9], 8 + xma.hu f41 = f33, f6, f45 + ;; + getf.sig r31 = f43 C hi + getf.sig r24 = f36 C lo + xma.l f38 = f34, f6, f46 + ld8 r21 = [r9], 8 + xma.hu f42 = f34, f6, f46 + br .Lcj7 + +.grt7: ldf8 f44 = [rp], 8 + getf.sig r30 = f42 C hi + ldf8 f32 = [up], 8 + ;; + getf.sig r27 = f39 C lo + xma.l f37 = f33, f6, f45 + ld8 r20 = [r9], 8 + xma.hu f41 = f33, f6, f45 + ;; + ldf8 f45 = [rp], 8 + getf.sig r31 = f43 C hi + ldf8 f33 = [up], 8 + ;; + getf.sig r24 = f36 C lo + xma.l f38 = f34, f6, f46 + ld8 r21 = [r9], 8 + xma.hu f42 = f34, f6, f46 + br .LL11 + + +.Lb00: ldf8 f45 = [rp], 8 + ldf8 f33 = [up], 8 + ;; + ldf8 f46 = [rp], 8 + ldf8 f34 = [up], 8 + ;; + ldf8 f47 = [rp], 8 + xma.l f36 = f7, f6, f8 + ldf8 f35 = [up], 8 + xma.hu f40 = f7, f6, f8 + br.cloop.dptk .grt4 + + xma.l f37 = f33, f6, f45 + xma.hu f41 = f33, f6, f45 + ;; + getf.sig r24 = f36 C lo + xma.l f38 = f34, f6, f46 + ld8 r21 = [r9], 8 + xma.hu f42 = f34, f6, f46 + ;; + getf.sig r28 = f40 C hi + xma.l f39 = f35, f6, f47 + getf.sig r25 = f37 C lo + ld8 r22 = [r9], 8 + xma.hu f43 = f35, f6, f47 + ;; + getf.sig r29 = f41 C hi + getf.sig r26 = f38 C lo + ld8 r23 = [r9], 8 + ;; + getf.sig r30 = f42 C hi + getf.sig r27 = f39 C lo + ld8 r20 = [r9], 8 + br .Lcj4 + +.grt4: ldf8 f44 = [rp], 8 + xma.l f37 = f33, f6, f45 + ldf8 f32 = [up], 8 + xma.hu f41 = f33, f6, f45 + ;; + ldf8 f45 = [rp], 8 + ldf8 f33 = [up], 8 + xma.l f38 = f34, f6, f46 + getf.sig r24 = f36 C lo + ld8 r21 = [r9], 8 + xma.hu f42 = f34, f6, f46 + ;; + ldf8 f46 = [rp], 8 + getf.sig r28 = f40 C hi + ldf8 f34 = [up], 8 + xma.l f39 = f35, f6, f47 + 
getf.sig r25 = f37 C lo + ld8 r22 = [r9], 8 + xma.hu f43 = f35, f6, f47 + ;; + ldf8 f47 = [rp], 8 + getf.sig r29 = f41 C hi + ldf8 f35 = [up], 8 + ;; + getf.sig r26 = f38 C lo + xma.l f36 = f32, f6, f44 + ld8 r23 = [r9], 8 + xma.hu f40 = f32, f6, f44 + br.cloop.dptk .grt8 + ;; + + getf.sig r30 = f42 C hi + getf.sig r27 = f39 C lo + xma.l f37 = f33, f6, f45 + ld8 r20 = [r9], 8 + xma.hu f41 = f33, f6, f45 + br .Lcj8 + +.grt8: ldf8 f44 = [rp], 8 + getf.sig r30 = f42 C hi + ldf8 f32 = [up], 8 + ;; + getf.sig r27 = f39 C lo + xma.l f37 = f33, f6, f45 + ld8 r20 = [r9], 8 + xma.hu f41 = f33, f6, f45 + br .LL00 + + ALIGN(32) +.Loop: +{.mmi + ldf8 f44 = [rp], 8 + cmp.ltu p6, p0 = r27, r8 C lo cmp + sub r14 = r27, r8 C lo sub +} +{.mmi + getf.sig r30 = f42 C hi + ldf8 f32 = [up], 8 + sub r8 = r20, r31 C hi sub + ;; C 01 +} +{.mmf + getf.sig r27 = f39 C lo + st8 [r10] = r14, 8 + xma.l f37 = f33, f6, f45 +} +{.mfi + ld8 r20 = [r9], 8 + xma.hu f41 = f33, f6, f45 + (p6) add r8 = 1, r8 + ;; C 02 +} +{.mmi +.LL00: ldf8 f45 = [rp], 8 + cmp.ltu p6, p0 = r24, r8 + sub r14 = r24, r8 +} +{.mmi + getf.sig r31 = f43 C hi + ldf8 f33 = [up], 8 + sub r8 = r21, r28 + ;; C 03 +} +{.mmf + getf.sig r24 = f36 C lo + st8 [r10] = r14, 8 + xma.l f38 = f34, f6, f46 +} +{.mfi + ld8 r21 = [r9], 8 + xma.hu f42 = f34, f6, f46 + (p6) add r8 = 1, r8 + ;; C 04 +} +{.mmi +.LL11: ldf8 f46 = [rp], 8 + cmp.ltu p6, p0 = r25, r8 + sub r14 = r25, r8 +} +{.mmi + getf.sig r28 = f40 C hi + ldf8 f34 = [up], 8 + sub r8 = r22, r29 + ;; C 05 +} +{.mmf + getf.sig r25 = f37 C lo + st8 [r10] = r14, 8 + xma.l f39 = f35, f6, f47 +} +{.mfi + ld8 r22 = [r9], 8 + xma.hu f43 = f35, f6, f47 + (p6) add r8 = 1, r8 + ;; C 06 +} +{.mmi +.LL10: ldf8 f47 = [rp], 8 + cmp.ltu p6, p0 = r26, r8 + sub r14 = r26, r8 +} +{.mmi + getf.sig r29 = f41 C hi + ldf8 f35 = [up], 8 + sub r8 = r23, r30 + ;; C 07 +} +{.mmf + getf.sig r26 = f38 C lo + st8 [r10] = r14, 8 + xma.l f36 = f32, f6, f44 +} +{.mfi + ld8 r23 = [r9], 8 + xma.hu f40 = f32, f6, f44 + (p6) add r8 = 1, r8 +} + br.cloop.dptk .Loop + ;; + +.Lend: + cmp.ltu p6, p0 = r27, r8 + sub r14 = r27, r8 + getf.sig r30 = f42 + sub r8 = r20, r31 + ;; + getf.sig r27 = f39 + st8 [r10] = r14, 8 + xma.l f37 = f33, f6, f45 + ld8 r20 = [r9], 8 + xma.hu f41 = f33, f6, f45 + (p6) add r8 = 1, r8 + ;; +.Lcj8: + cmp.ltu p6, p0 = r24, r8 + sub r14 = r24, r8 + getf.sig r31 = f43 + sub r8 = r21, r28 + ;; + getf.sig r24 = f36 + st8 [r10] = r14, 8 + xma.l f38 = f34, f6, f46 + ld8 r21 = [r9], 8 + xma.hu f42 = f34, f6, f46 + (p6) add r8 = 1, r8 + ;; +.Lcj7: + cmp.ltu p6, p0 = r25, r8 + sub r14 = r25, r8 + getf.sig r28 = f40 + sub r8 = r22, r29 + ;; + getf.sig r25 = f37 + st8 [r10] = r14, 8 + xma.l f39 = f35, f6, f47 + ld8 r22 = [r9], 8 + xma.hu f43 = f35, f6, f47 + (p6) add r8 = 1, r8 + ;; +.Lcj6: + cmp.ltu p6, p0 = r26, r8 + sub r14 = r26, r8 + getf.sig r29 = f41 + sub r8 = r23, r30 + ;; + getf.sig r26 = f38 + st8 [r10] = r14, 8 + ld8 r23 = [r9], 8 + (p6) add r8 = 1, r8 + ;; +.Lcj5: + cmp.ltu p6, p0 = r27, r8 + sub r14 = r27, r8 + getf.sig r30 = f42 + sub r8 = r20, r31 + ;; + getf.sig r27 = f39 + st8 [r10] = r14, 8 + ld8 r20 = [r9], 8 + (p6) add r8 = 1, r8 + ;; +.Lcj4: + cmp.ltu p6, p0 = r24, r8 + sub r14 = r24, r8 + getf.sig r31 = f43 + sub r8 = r21, r28 + ;; + st8 [r10] = r14, 8 + (p6) add r8 = 1, r8 + ;; +.Lcj3: + cmp.ltu p6, p0 = r25, r8 + sub r14 = r25, r8 + sub r8 = r22, r29 + ;; + st8 [r10] = r14, 8 + (p6) add r8 = 1, r8 + ;; +.Lcj2: + cmp.ltu p6, p0 = r26, r8 + sub r14 = r26, r8 + sub r8 = r23, r30 + ;; + st8 [r10] = r14, 8 + (p6) 
add r8 = 1, r8 + ;; +.Lcj1: + cmp.ltu p6, p0 = r27, r8 + sub r14 = r27, r8 + sub r8 = r20, r31 + ;; + st8 [r10] = r14, 8 + mov ar.lc = r2 + (p6) add r8 = 1, r8 + br.ret.sptk.many b0 +.Ldone: mov ar.lc = r2 + br.ret.sptk.many b0 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/invert.c b/gmp-6.3.0/mpn/invert.c new file mode 120000 index 0000000..c3c7bc4 --- /dev/null +++ b/gmp-6.3.0/mpn/invert.c @@ -0,0 +1 @@ +../mpn/generic/invert.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/invertappr.c b/gmp-6.3.0/mpn/invertappr.c new file mode 120000 index 0000000..baebc1b --- /dev/null +++ b/gmp-6.3.0/mpn/invertappr.c @@ -0,0 +1 @@ +../mpn/generic/invertappr.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/ior_n.c b/gmp-6.3.0/mpn/ior_n.c new file mode 120000 index 0000000..0a553d9 --- /dev/null +++ b/gmp-6.3.0/mpn/ior_n.c @@ -0,0 +1 @@ +../mpn/generic/logops_n.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/iorn_n.c b/gmp-6.3.0/mpn/iorn_n.c new file mode 120000 index 0000000..0a553d9 --- /dev/null +++ b/gmp-6.3.0/mpn/iorn_n.c @@ -0,0 +1 @@ +../mpn/generic/logops_n.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/jacbase.c b/gmp-6.3.0/mpn/jacbase.c new file mode 120000 index 0000000..e05b9ea --- /dev/null +++ b/gmp-6.3.0/mpn/jacbase.c @@ -0,0 +1 @@ +../mpn/generic/jacbase.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/jacobi.c b/gmp-6.3.0/mpn/jacobi.c new file mode 120000 index 0000000..293d08a --- /dev/null +++ b/gmp-6.3.0/mpn/jacobi.c @@ -0,0 +1 @@ +../mpn/generic/jacobi.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/jacobi_2.c b/gmp-6.3.0/mpn/jacobi_2.c new file mode 120000 index 0000000..5a39ad8 --- /dev/null +++ b/gmp-6.3.0/mpn/jacobi_2.c @@ -0,0 +1 @@ +../mpn/generic/jacobi_2.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/jacobitab.h b/gmp-6.3.0/mpn/jacobitab.h new file mode 100644 index 0000000..4bdbfcc --- /dev/null +++ b/gmp-6.3.0/mpn/jacobitab.h @@ -0,0 +1,13 @@ + 0, 0, 0, 0, 0,12, 8, 4, 1, 1, 1, 1, 1,13, 9, 5, + 2, 2, 2, 2, 2, 6,10,14, 3, 3, 3, 3, 3, 7,11,15, + 4,16, 6,18, 4, 0,12, 8, 5,17, 7,19, 5, 1,13, 9, + 6,18, 4,16, 6,10,14, 2, 7,19, 5,17, 7,11,15, 3, + 8,10, 9,11, 8, 4, 0,12, 9,11, 8,10, 9, 5, 1,13, +10, 9,11, 8,10,14, 2, 6,11, 8,10, 9,11,15, 3, 7, +12,22,24,20,12, 8, 4, 0,13,23,25,21,13, 9, 5, 1, +25,21,13,23,14, 2, 6,10,24,20,12,22,15, 3, 7,11, +16, 6,18, 4,16,16,16,16,17, 7,19, 5,17,17,17,17, +18, 4,16, 6,18,22,19,23,19, 5,17, 7,19,23,18,22, +20,12,22,24,20,20,20,20,21,13,23,25,21,21,21,21, +22,24,20,12,22,19,23,18,23,25,21,13,23,18,22,19, +24,20,12,22,15, 3, 7,11,25,21,13,23,14, 2, 6,10, diff --git a/gmp-6.3.0/mpn/lisp/gmpasm-mode.el b/gmp-6.3.0/mpn/lisp/gmpasm-mode.el new file mode 100644 index 0000000..6f2fea0 --- /dev/null +++ b/gmp-6.3.0/mpn/lisp/gmpasm-mode.el @@ -0,0 +1,385 @@ +;;; gmpasm-mode.el -- GNU MP asm and m4 editing mode. + + +;; Copyright 1999-2002 Free Software Foundation, Inc. + +;; This file is part of the GNU MP Library. +;; +;; The GNU MP Library is free software; you can redistribute it and/or modify +;; it under the terms of either: +;; +;; * the GNU Lesser General Public License as published by the Free +;; Software Foundation; either version 3 of the License, or (at your +;; option) any later version. +;; +;; or +;; +;; * the GNU General Public License as published by the Free Software +;; Foundation; either version 2 of the License, or (at your option) any +;; later version. +;; +;; or both in parallel, as here. 
+;; +;; The GNU MP Library is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +;; for more details. +;; +;; You should have received copies of the GNU General Public License and the +;; GNU Lesser General Public License along with the GNU MP Library. If not, +;; see https://www.gnu.org/licenses/. + + +;;; Commentary: +;; +;; gmpasm-mode is a major mode for editing m4 processed assembler code and +;; m4 macro files in GMP. It's similar to m4-mode, but has a number of +;; settings better suited to GMP. +;; +;; +;; Install +;; ------- +;; +;; To make M-x gmpasm-mode available, put gmpasm-mode.el somewhere in your +;; load-path and the following in your .emacs +;; +;; (autoload 'gmpasm-mode "gmpasm-mode" nil t) +;; +;; To use gmpasm-mode automatically on all .asm and .m4 files, put the +;; following in your .emacs +;; +;; (add-to-list 'auto-mode-alist '("\\.asm\\'" . gmpasm-mode)) +;; (add-to-list 'auto-mode-alist '("\\.m4\\'" . gmpasm-mode)) +;; +;; To have gmpasm-mode only on gmp files, try instead something like the +;; following, which uses it only in a directory starting with "gmp", or a +;; sub-directory of such. +;; +;; (add-to-list 'auto-mode-alist +;; '("/gmp.*/.*\\.\\(asm\\|m4\\)\\'" . gmpasm-mode)) +;; +;; Byte compiling will slightly speed up loading. If you want a docstring +;; in the autoload you can use M-x update-file-autoloads if you set it up +;; right. +;; +;; +;; Emacsen +;; ------- +;; +;; GNU Emacs 20.x, 21.x and XEmacs 20.x all work well. GNU Emacs 19.x +;; should work if replacements for the various 20.x-isms are available, +;; though comment-region with "C" doesn't do the right thing. + + +;;; Code: + +(defgroup gmpasm nil + "GNU MP m4 and asm editing." + :prefix "gmpasm-" + :group 'languages) + +(defcustom gmpasm-mode-hook nil + "*Hook called by `gmpasm-mode'." + :type 'hook + :group 'gmpasm) + +(defcustom gmpasm-comment-start-regexp "\\([#;!@*|C]\\|//\\)" + "*Regexp matching possible comment styles. +See `gmpasm-mode' docstring for how this is used. + +Commenting styles within GMP include + # - alpha, i386, i960, vax, traditional unix + ; - a29k, clipper, hppa, m88k, ppc + ! - sh, sparc, z8000 + | - m68k + @ - arm + * - cray + C - GMP m4, see mpn/asm-defs.m4 + // - ia64" + :type 'regexp + :group 'gmpasm) + + +(defun gmpasm-add-to-list-second (list-var element) + "(gmpasm-add-to-list-second LIST-VAR ELEMENT) + +Add ELEMENT to LIST-VAR as the second element in the list, if it isn't +already in the list. If LIST-VAR is nil, then ELEMENT is just added as the +sole element in the list. + +This is like `add-to-list', but it puts the new value second in the list. + +The first cons cell is copied rather than changed in-place, so references to +the list elsewhere won't be affected." + + (if (member element (symbol-value list-var)) + (symbol-value list-var) + (set list-var + (if (symbol-value list-var) + (cons (car (symbol-value list-var)) + (cons element + (cdr (symbol-value list-var)))) + (list element))))) + + +(defun gmpasm-remove-from-list (list-var element) + "(gmpasm-remove-from-list LIST-VAR ELEMENT) + +Remove ELEMENT from LIST-VAR, using `copy-sequence' and `delete'. +This is vaguely like `add-to-list', but the element is removed from the list. +The list is copied rather than changed in-place, so references to it elsewhere +aren't affected." 
+ +;; Only the portion of the list up to the removed element needs to be +;; copied, but there's no need to bother arranging that, since this function +;; is only used for a couple of initializations. + + (set list-var (delete element (copy-sequence (symbol-value list-var))))) + + +(defvar gmpasm-mode-map + (let ((map (make-sparse-keymap))) + + ;; assembler and dnl commenting + (define-key map "\C-c\C-c" 'comment-region) + (define-key map "\C-c\C-d" 'gmpasm-comment-region-dnl) + + ;; kill an M-x compile, since it's not hard to put m4 into an infinite + ;; loop + (define-key map "\C-c\C-k" 'kill-compilation) + + map) + "Keymap for `gmpasm-mode'.") + + +(defvar gmpasm-mode-syntax-table + (let ((table (make-syntax-table))) + ;; underscore left as a symbol char, like C mode + + ;; m4 quotes + (modify-syntax-entry ?` "('" table) + (modify-syntax-entry ?' ")`" table) + + table) + "Syntax table used in `gmpasm-mode'. + +'#' and '\n' aren't set as comment syntax. In m4 these are a comment +outside quotes, but not inside. Omitting a syntax entry ensures that when +inside quotes emacs treats parentheses and apostrophes the same way that m4 +does. When outside quotes this is not quite right, but having it right when +nesting expressions is more important. + +'*', '!' or '|' aren't set up as comment syntax either, on CPUs which use +these for comments. The GMP macro setups don't set them in m4 changecom(), +since that prevents them being used in eval() expressions, and on that basis +they don't change the way quotes and parentheses are treated by m4 and +should be treated by emacs.") + + +(defvar gmpasm-font-lock-keywords + (eval-when-compile + (list + (cons + (concat + "\\b" + (regexp-opt + '("deflit" "defreg" "defframe" "defframe_pushl" + "define_not_for_expansion" + "m4_error" "m4_warning" + "ASM_START" "ASM_END" + "PROLOGUE" "PROLOGUE_GP" "MULFUNC_PROLOGUE" "EPILOGUE" + "DATASTART" "DATAEND" + "forloop" + "TEXT" "DATA" "ALIGN" "W32" "FLOAT64" + "builtin" "changecom" "changequote" "changeword" "debugfile" + "debugmode" "decr" "define" "defn" "divert" "divnum" "dumpdef" + "errprint" "esyscmd" "eval" "__file__" "format" "gnu" "ifdef" + "ifelse" "include" "incr" "index" "indir" "len" "__line__" + "m4exit" "m4wrap" "maketemp" "patsubst" "popdef" "pushdef" + "regexp" "shift" "sinclude" "substr" "syscmd" "sysval" + "traceoff" "traceon" "translit" "undefine" "undivert" "unix") + t) + "\\b") 'font-lock-keyword-face))) + + "`font-lock-keywords' for `gmpasm-mode'. + +The keywords are m4 builtins and some of the GMP macros used in asm files. +L doesn't look good fontified, so it's omitted. + +The right assembler comment regexp is added dynamically buffer-local (with +dnl too).") + + +;; Initialized if gmpasm-mode finds filladapt loaded. +(defvar gmpasm-filladapt-token-table nil + "Filladapt token table used in `gmpasm-mode'.") +(defvar gmpasm-filladapt-token-match-table nil + "Filladapt token match table used in `gmpasm-mode'.") +(defvar gmpasm-filladapt-token-conversion-table nil + "Filladapt token conversion table used in `gmpasm-mode'.") + + +;;;###autoload +(defun gmpasm-mode () + "A major mode for editing GNU MP asm and m4 files. + +\\{gmpasm-mode-map} +`comment-start' and `comment-end' are set buffer-local to assembler +commenting appropriate for the CPU by looking for something matching +`gmpasm-comment-start-regexp' at the start of a line, or "#" is used if +there's no match (if "#" isn't what you want, type in a desired comment +and do \\[gmpasm-mode] to reinitialize).
+ +`adaptive-fill-regexp' is set buffer-local to the standard regexp with +`comment-start' and dnl added. If filladapt.el has been loaded it similarly +gets `comment-start' and dnl added as buffer-local fill prefixes. + +Font locking has the m4 builtins, some of the GMP macros, m4 dnl commenting, +and assembler commenting (based on the `comment-start' determined). + +Note that `gmpasm-comment-start-regexp' is only matched as a whole word, so +the `C' in it is only matched as a whole word, not on something that happens +to start with `C'. Also it's only the particular `comment-start' determined +that's added for filling etc, not the whole `gmpasm-comment-start-regexp'. + +`gmpasm-mode-hook' is run after initializations are complete." + + (interactive) + (kill-all-local-variables) + (setq major-mode 'gmpasm-mode + mode-name "gmpasm") + (use-local-map gmpasm-mode-map) + (set-syntax-table gmpasm-mode-syntax-table) + (setq fill-column 76) + + ;; Short instructions might fit with 32, but anything with labels or + ;; expressions soon needs the comments pushed out to column 40. + (setq comment-column 40) + + ;; Don't want to find out the hard way which dumb assemblers don't like a + ;; missing final newline. + (set (make-local-variable 'require-final-newline) t) + + ;; The first match of gmpasm-comment-start-regexp at the start of a line + ;; determines comment-start, or "#" if no match. + (set (make-local-variable 'comment-start) + (save-excursion + (goto-char (point-min)) + (if (re-search-forward + (concat "^\\(" gmpasm-comment-start-regexp "\\)\\(\\s-\\|$\\)") + nil t) + (match-string 1) + "#"))) + (set (make-local-variable 'comment-end) "") + + ;; If comment-start ends in an alphanumeric then \b is used to match it + ;; only as a separate word. The test is for an alphanumeric rather than + ;; \w since we might try # or ! as \w characters but without wanting \b on + ;; them. + (let ((comment-regexp + (concat (regexp-quote comment-start) + (if (string-match "[a-zA-Z0-9]\\'" comment-start) "\\b")))) + + ;; Whitespace is required before a comment-start so m4 $# doesn't match + ;; when comment-start is "#". + (set (make-local-variable 'comment-start-skip) + (concat "\\(^\\|\\s-\\)\\(\\\\|" comment-regexp "\\)[ \t]*")) + + ;; Comment fontification based on comment-start, and always with dnl. + ;; Same treatment of a space before "#" as in comment-start-skip, but + ;; don't fontify that space. + (add-to-list (make-local-variable 'gmpasm-font-lock-keywords) + (list (concat "\\(^\\|\\s-\\)\\(\\(\\\\|" + comment-regexp + "\\).*$\\)") + 2 'font-lock-comment-face)) + + (set (make-local-variable 'font-lock-defaults) + '(gmpasm-font-lock-keywords + t ; no syntactic fontification (of strings etc) + nil ; no case-fold + ((?_ . "w")) ; _ part of a word while fontifying + )) + + ;; Paragraphs are separated by blank lines, or lines with only dnl or + ;; comment-start. + (set (make-local-variable 'paragraph-separate) + (concat "[ \t\f]*\\(\\(" comment-regexp "\\|dnl\\)[ \t]*\\)*$")) + (set (make-local-variable 'paragraph-start) + (concat "\f\\|" paragraph-separate)) + + ;; Some sort of "def...(" m4 define, possibly with ` for quoting. + ;; Could do something with PROLOGUE here, but in GMP the filename is + ;; enough, it's not normally necessary to say the function name. 
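+  ;; E.g. hypothetical lines "define(`mpn_add_n'" or
+  ;; "deflit(UNROLL_THRESHOLD, 16)" would make add-log record
+  ;; "mpn_add_n" or "UNROLL_THRESHOLD" as the current function name.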
+ (set (make-local-variable 'add-log-current-defun-header-regexp) + "^def[a-z0-9_]+(`?\\([a-zA-Z0-9_]+\\)") + + ;; Adaptive fill gets dnl and comment-start as comment style prefixes on + ;; top of the standard regexp (which has # and ; already actually). + (set (make-local-variable 'adaptive-fill-regexp) + (concat "[ \t]*\\(\\(" + comment-regexp + "\\|dnl\\|[-|#;>*]+\\|(?[0-9]+[.)]\\)[ \t]*\\)*")) + (set (make-local-variable 'adaptive-fill-first-line-regexp) + "\\`\\([ \t]*dnl\\)?[ \t]*\\'") + + (when (fboundp 'filladapt-mode) + (unless gmpasm-filladapt-token-table + (setq gmpasm-filladapt-token-table + filladapt-token-table) + (setq gmpasm-filladapt-token-match-table + filladapt-token-match-table) + (setq gmpasm-filladapt-token-conversion-table + filladapt-token-conversion-table) + + ;; Numbered bullet points like "2.1" get matched at the start of a + ;; line when it's really something like "2.1 cycles/limb", so remove + ;; this from the list. The regexp for "1.", "2." etc is left + ;; though. + (gmpasm-remove-from-list 'gmpasm-filladapt-token-table + '("[0-9]+\\(\\.[0-9]+\\)+[ \t]" + bullet)) + + ;; "%" as a comment prefix interferes with register names on some + ;; CPUs, like %eax on x86, so remove this. + (gmpasm-remove-from-list 'gmpasm-filladapt-token-table + '("%+" postscript-comment)) + + (add-to-list 'gmpasm-filladapt-token-match-table + '(gmpasm-comment gmpasm-comment)) + (add-to-list 'gmpasm-filladapt-token-conversion-table + '(gmpasm-comment . exact))) + + (set (make-local-variable 'filladapt-token-table) + gmpasm-filladapt-token-table) + (set (make-local-variable 'filladapt-token-match-table) + gmpasm-filladapt-token-match-table) + (set (make-local-variable 'filladapt-token-conversion-table) + gmpasm-filladapt-token-conversion-table) + + ;; Add dnl and comment-start as fill prefixes. + ;; Comments in filladapt.el say filladapt-token-table must begin + ;; with ("^" beginning-of-line), so put our addition second. + (gmpasm-add-to-list-second 'filladapt-token-table + (list (concat "dnl[ \t]\\|" comment-regexp) + 'gmpasm-comment)))) + + (run-hooks 'gmpasm-mode-hook)) + + +(defun gmpasm-comment-region-dnl (beg end &optional arg) + "(gmpasm-comment-region-dnl BEG END &optional ARG) + +Comment or uncomment each line in the region using `dnl'. +With \\[universal-argument] prefix arg, uncomment each line in region. +This is `comment-region', but using \"dnl\"." + + (interactive "r\nP") + (let ((comment-start "dnl") + (comment-end "")) + (comment-region beg end arg))) + + +(provide 'gmpasm-mode) + +;;; gmpasm-mode.el ends here diff --git a/gmp-6.3.0/mpn/loongarch/64/add_n.asm b/gmp-6.3.0/mpn/loongarch/64/add_n.asm new file mode 100644 index 0000000..e0832a0 --- /dev/null +++ b/gmp-6.3.0/mpn/loongarch/64/add_n.asm @@ -0,0 +1,64 @@ +dnl Loongarch mpn_add_n + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp_arg',`$r4') +define(`ap', `$r5') +define(`bp', `$r6') +define(`n', `$r7') + +define(`rp', `$r8') + +ASM_START() +PROLOGUE(mpn_add_n) + alsl.d rp, n, rp_arg, 3 + alsl.d ap, n, ap, 3 + alsl.d bp, n, bp, 3 + sub.d n, $r0, n + slli.d n, n, 3 + or $r4, $r0, $r0 + +L(top): ldx.d $r14, ap, n + ldx.d $r13, bp, n + add.d $r12, $r14, $r13 + sltu $r15, $r12, $r13 C cy0 + add.d $r14, $r12, $r4 + sltu $r16, $r14, $r4 C cy1 set iff r4=1 & r12=111...1 + stx.d $r14, rp, n + addi.d n, n, 8 + or $r4, $r15, $r16 + bnez n, L(top) + + jr $r1 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/loongarch/64/aorslsh1_n.asm b/gmp-6.3.0/mpn/loongarch/64/aorslsh1_n.asm new file mode 100644 index 0000000..ea70b13 --- /dev/null +++ b/gmp-6.3.0/mpn/loongarch/64/aorslsh1_n.asm @@ -0,0 +1,50 @@ +dnl Loongarch mpn_addlsh1_n, mpn_sublsh1_n. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 1) +define(RSH, 63) + +ifdef(`OPERATION_addlsh1_n',` + define(`ADDSUB', `add.d') + define(`CARRY', `sltu $1,$2,$3') + define(`func', `mpn_addlsh1_n') +') +ifdef(`OPERATION_sublsh1_n',` + define(`ADDSUB', `sub.d') + define(`CARRY', `sltu $1,$3,$2') + define(`func', `mpn_sublsh1_n') +') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n) +include_mpn(`loongarch/64/aorslshC_n.asm') diff --git a/gmp-6.3.0/mpn/loongarch/64/aorslsh2_n.asm b/gmp-6.3.0/mpn/loongarch/64/aorslsh2_n.asm new file mode 100644 index 0000000..6f03d06 --- /dev/null +++ b/gmp-6.3.0/mpn/loongarch/64/aorslsh2_n.asm @@ -0,0 +1,50 @@ +dnl Loongarch mpn_addlsh2_n, mpn_sublsh2_n. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 2) +define(RSH, 62) + +ifdef(`OPERATION_addlsh2_n',` + define(`ADDSUB', `add.d') + define(`CARRY', `sltu $1,$2,$3') + define(`func', `mpn_addlsh2_n') +') +ifdef(`OPERATION_sublsh2_n',` + define(`ADDSUB', `sub.d') + define(`CARRY', `sltu $1,$3,$2') + define(`func', `mpn_sublsh2_n') +') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n) +include_mpn(`loongarch/64/aorslshC_n.asm') diff --git a/gmp-6.3.0/mpn/loongarch/64/aorslshC_n.asm b/gmp-6.3.0/mpn/loongarch/64/aorslshC_n.asm new file mode 100644 index 0000000..dd34188 --- /dev/null +++ b/gmp-6.3.0/mpn/loongarch/64/aorslshC_n.asm @@ -0,0 +1,116 @@ +dnl Loongarch mpn_addlshC_n/mpn_sublshC_n + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
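+
+dnl The unrolled loop below keeps two carry chains per limb: the bits
+dnl shifted out of the top of each bp limb, and the add/sub carry proper.
+dnl A minimal C sketch of the addlshC case (illustrative only, assuming
+dnl 64-bit limbs; the names are invented here, this is not GMP's generic
+dnl C code):
+dnl
+dnl   typedef unsigned long limb;
+dnl   limb addlshC (limb *rp, const limb *ap, const limb *bp, long n)
+dnl   {
+dnl     limb shift_in = 0, cy = 0;
+dnl     for (long i = 0; i < n; i++) {
+dnl       limb s = (bp[i] << LSH) | shift_in;   /* the alsl.d */
+dnl       shift_in = bp[i] >> RSH;              /* srli.d, bits for next limb */
+dnl       limb t = ap[i] + s;  limb c1 = t < s; /* ADDSUB, CARRY */
+dnl       limb r = t + cy;     limb c2 = r < t;
+dnl       rp[i] = r;  cy = c1 | c2;             /* c1, c2 never both set */
+dnl     }
+dnl     return cy + shift_in;                   /* the add.d at L(end) */
+dnl   }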
+ +include(`../config.m4') + + +define(`rp',`$a0') +define(`ap',`$a1') +define(`bp',`$a2') +define(`n', `$a3') + +define(`i', `$a4') + + +ASM_START() +PROLOGUE(func) + srli.d i, n, 2 + move $a5, $zero + move $t7, $zero + + andi $t0, n, 1 + andi $t1, n, 2 + bnez $t0, L(bx1) +L(bx0): beqz $t1, L(b0) +L(b10): addi.d bp, bp, -16 + addi.d ap, ap, -16 + addi.d rp, rp, -16 + b L(b2) +L(bx1): bnez $t1, L(b11) +L(b01): addi.d bp, bp, -24 + addi.d ap, ap, -24 + addi.d rp, rp, -24 + b L(b1) +L(b11): addi.d bp, bp, -8 + addi.d ap, ap, -8 + addi.d rp, rp, -8 + b L(b3) + +L(top): addi.d bp, bp, 32 + addi.d ap, ap, 32 + addi.d rp, rp, 32 +L(b0): addi.d i, i, -1 + ld.d $t0, bp, 0 + alsl.d $t6, $t0, $t7, LSH + ld.d $t2, ap, 0 + ADDSUB $t4, $t2, $t6 + CARRY( $a6, $t4, $t2, $t6) + srli.d $t7, $t0, RSH + ADDSUB $t5, $t4, $a5 + CARRY( $a5, $t5, $t4, $a5) + st.d $t5, rp, 0 + or $a5, $a5, $a6 +L(b3): ld.d $t0, bp, 8 + alsl.d $t6, $t0, $t7, LSH + ld.d $t2, ap, 8 + ADDSUB $t4, $t2, $t6 + CARRY( $a6, $t4, $t2, $t6) + srli.d $t7, $t0, RSH + ADDSUB $t5, $t4, $a5 + CARRY( $a5, $t5, $t4, $a5) + st.d $t5, rp, 8 + or $a5, $a5, $a6 +L(b2): ld.d $t0, bp, 16 + alsl.d $t6, $t0, $t7, LSH + ld.d $t2, ap, 16 + ADDSUB $t4, $t2, $t6 + CARRY( $a6, $t4, $t2, $t6) + srli.d $t7, $t0, RSH + ADDSUB $t5, $t4, $a5 + CARRY( $a5, $t5, $t4, $a5) + st.d $t5, rp, 16 + or $a5, $a5, $a6 +L(b1): ld.d $t0, bp, 24 + alsl.d $t6, $t0, $t7, LSH + ld.d $t2, ap, 24 + ADDSUB $t4, $t2, $t6 + CARRY( $a6, $t4, $t2, $t6) + srli.d $t7, $t0, RSH + ADDSUB $t5, $t4, $a5 + CARRY( $a5, $t5, $t4, $a5) + st.d $t5, rp, 24 + or $a5, $a5, $a6 + bnez i, L(top) + +L(end): add.d $a0, $a5, $t7 + jr $r1 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/loongarch/64/aorsmul_1.asm b/gmp-6.3.0/mpn/loongarch/64/aorsmul_1.asm new file mode 100644 index 0000000..49de51d --- /dev/null +++ b/gmp-6.3.0/mpn/loongarch/64/aorsmul_1.asm @@ -0,0 +1,120 @@ +dnl Loongarch mpn_addmul_1 and mpn_submul_1 + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
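+
+dnl Per limb, the code below forms the 128-bit product with a mul.d/mulh.du
+dnl pair and folds both the low-add carry and the running carry into the
+dnl high half.  A C sketch of the addmul_1 case (illustrative only,
+dnl assuming 64-bit limbs and unsigned __int128; not GMP's generic code):
+dnl
+dnl   typedef unsigned long limb;
+dnl   limb addmul_1 (limb *rp, const limb *ap, long n, limb b0)
+dnl   {
+dnl     limb cy = 0;
+dnl     for (long i = 0; i < n; i++) {
+dnl       unsigned __int128 p = (unsigned __int128) ap[i] * b0;
+dnl       limb lo = (limb) p, hi = (limb) (p >> 64); /* mul.d, mulh.du */
+dnl       limb t = rp[i] + lo;  hi += t < lo;        /* low-add carry */
+dnl       limb r = t + cy;      hi += r < t;         /* running carry */
+dnl       rp[i] = r;  cy = hi;
+dnl     }
+dnl     return cy;
+dnl   }
+dnl
+dnl mpn_submul_1 has the same shape with the two adds replaced by
+dnl subtracts and the borrow comparisons reversed, which is what the
+dnl ADDSUB and CMPCY macros below select.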
+ +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', `$a0') +define(`ap', `$a1') +define(`n', `$a2') +define(`b0', `$a3') + +define(`cy', `$a4') +define(`i', `$a5') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add.d') + define(`CMPCY', `sltu $1, $2, $3') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub.d') + define(`CMPCY', `sltu $1, $3, $2') + define(`func', `mpn_submul_1') +') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1) + +define(`BLOCK', ` + mul.d $t1, $t2, b0 + mulh.du $t0, $t2, b0 + ld.d $t2, ap, $1 + ADDSUB $t5, $t3, $t1 + CMPCY( $t4, $t5, $t3) + ld.d $t3, rp, $1 + ADDSUB $t6, $t5, cy + add.d $t4, $t4, $t0 + CMPCY( $t5, $t6, $t5) + st.d $t6, rp, eval($1-8) + add.d cy, $t4, $t5') + +ASM_START() + +ifdef(`OPERATION_addmul_1', ` +PROLOGUE(mpn_addmul_1c) + srli.d i, n, 2 + b L(ent) +EPILOGUE() +') + +PROLOGUE(func) + srli.d i, n, 2 + or cy, $r0, $r0 +L(ent): ld.d $t2, ap, 0 + ld.d $t3, rp, 0 + + andi $t0, n, 1 + andi $t1, n, 2 + bnez $t0, L(bx1) +L(bx0): beqz $t1, L(b0) +L(b10): addi.d ap, ap, -16 + addi.d rp, rp, -16 + b L(b2) +L(bx1): beqz $t1, L(b01) +L(b11): addi.d ap, ap, -8 + addi.d rp, rp, -8 + b L(b3) +L(b01): addi.d ap, ap, 8 + addi.d rp, rp, 8 + beqz i, L(end) + +L(top): +L(b1): BLOCK(0) +L(b0): BLOCK(8) + addi.d i, i, -1 +L(b3): BLOCK(16) +L(b2): BLOCK(24) + addi.d ap, ap, 32 + addi.d rp, rp, 32 + bnez i, L(top) + +L(end): mul.d $t1, $t2, b0 + mulh.du $t0, $t2, b0 + ADDSUB $t5, $t3, $t1 + CMPCY( $t4, $t5, $t3) + ADDSUB $t6, $t5, cy + add.d $t4, $t4, $t0 + CMPCY( $t5, $t6, $t5) + st.d $t6, rp, -8 + add.d $a0, $t4, $t5 + jr $r1 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/loongarch/64/cnd_aors_n.asm b/gmp-6.3.0/mpn/loongarch/64/cnd_aors_n.asm new file mode 100644 index 0000000..deff3d3 --- /dev/null +++ b/gmp-6.3.0/mpn/loongarch/64/cnd_aors_n.asm @@ -0,0 +1,99 @@ +dnl Loongarch mpn_cnd_add_n and mpn_cnd_sub_n. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2016, 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
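+
+dnl The condition is expanded into an all-ones or all-zeros mask up front
+dnl (the sltui/addi.d pair below), and each vp limb is ANDed with it, so
+dnl no branch ever depends on cnd.  C sketch of cnd_add_n (illustrative
+dnl only, assuming 64-bit limbs):
+dnl
+dnl   typedef unsigned long limb;
+dnl   limb cnd_add_n (limb cnd, limb *rp, const limb *up,
+dnl                   const limb *vp, long n)
+dnl   {
+dnl     limb mask = -(limb) (cnd != 0);  /* all ones iff cnd is nonzero */
+dnl     limb cy = 0;
+dnl     for (long i = 0; i < n; i++) {
+dnl       limb v = vp[i] & mask;         /* vp[i], or 0 when cnd == 0 */
+dnl       limb t = up[i] + v;  limb c1 = t < v;
+dnl       limb r = t + cy;     limb c2 = r < t;
+dnl       rp[i] = r;  cy = c1 + c2;      /* the add.d of the two sltu bits */
+dnl     }
+dnl     return cy;
+dnl   }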
+ +include(`../config.m4') + +C INPUT PARAMETERS +define(`cnd', `$a0') +define(`rp', `$a1') +define(`up', `$a2') +define(`vp', `$a3') +define(`n', `$a4') + +define(`mask', `$t5') + +ifdef(`OPERATION_cnd_add_n',` + define(`ADDSUB', `add.d') + define(`CMPCY', `sltu $1, $2, $3') + define(`func', `mpn_cnd_add_n') +') +ifdef(`OPERATION_cnd_sub_n',` + define(`ADDSUB', `sub.d') + define(`CMPCY', `sltu $1, $3, $4') + define(`func', `mpn_cnd_sub_n') +') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ASM_START() +PROLOGUE(func) + move $t6, $zero + + sltui mask, cnd, 1 + addi.d mask, mask, -1 + + andi $t0, n, 1 + beqz $t0, L(top) + addi.d up, up, 8 + addi.d vp, vp, -8 + addi.d rp, rp, -8 + addi.d n, n, -1 + b L(mid) + +L(top): ld.d $a7, vp, 0 + ld.d $a5, up, 0 + addi.d n, n, -2 C bookkeeping + addi.d up, up, 16 C bookkeeping + and $a7, $a7, mask + ADDSUB $t0, $a5, $a7 + CMPCY( $t2, $t0, $a5, $a7) + ADDSUB $t4, $t0, $t6 C cycle 3, 9, ... + CMPCY( $t3, $t4, $t0, $t6) C cycle 4, 10, ... + st.d $t4, rp, 0 + add.d $t6, $t2, $t3 C cycle 5, 11, ... +L(mid): ld.d $a7, vp, 8 + ld.d $a5, up, -8 + addi.d vp, vp, 16 C bookkeeping + addi.d rp, rp, 16 C bookkeeping + and $a7, $a7, mask + ADDSUB $t1, $a5, $a7 + CMPCY( $t2, $t1, $a5, $a7) + ADDSUB $t4, $t1, $t6 C cycle 0, 6, ... + CMPCY( $t3, $t4, $t1, $t6) C cycle 1, 7, ... + st.d $t4, rp, -8 + add.d $t6, $t2, $t3 C cycle 2, 8, ... + bnez n, L(top) + +L(end): move $a0, $t6 + jr $r1 +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/loongarch/64/copyd.asm b/gmp-6.3.0/mpn/loongarch/64/copyd.asm new file mode 100644 index 0000000..1d10b28 --- /dev/null +++ b/gmp-6.3.0/mpn/loongarch/64/copyd.asm @@ -0,0 +1,75 @@ +dnl Loongarch mpn_copyd + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
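+
+dnl Limbs are copied in decreasing address order, which makes the copy
+dnl safe for overlapping operands with rp > ap.  In C terms (sketch,
+dnl 64-bit limbs):
+dnl
+dnl   for (long i = n - 1; i >= 0; i--)
+dnl     rp[i] = ap[i];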
+ +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', `$a0') +define(`ap', `$a1') +define(`n', `$a2') + +define(`i', `$a3') + +ASM_START() +PROLOGUE(mpn_copyd) + alsl.d ap, n, ap, 3 + alsl.d rp, n, rp, 3 + srli.d i, n, 2 + beqz i, L(end) + +L(top): addi.d i, i, -1 + ld.d $t0, ap, -8 + st.d $t0, rp, -8 + ld.d $t1, ap, -16 + st.d $t1, rp, -16 + ld.d $t2, ap, -24 + st.d $t2, rp, -24 + ld.d $t3, ap, -32 + st.d $t3, rp, -32 + addi.d ap, ap, -32 + addi.d rp, rp, -32 + bnez i, L(top) + +L(end): andi $t1, n, 2 + beqz $t1, L(b0x) + ld.d $t0, ap, -8 + st.d $t0, rp, -8 + ld.d $t1, ap, -16 + st.d $t1, rp, -16 + addi.d ap, ap, -16 + addi.d rp, rp, -16 +L(b0x): andi $t0, n, 1 + beqz $t0, L(bx0) + ld.d $t0, ap, -8 + st.d $t0, rp, -8 +L(bx0): jr $r1 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/loongarch/64/copyi.asm b/gmp-6.3.0/mpn/loongarch/64/copyi.asm new file mode 100644 index 0000000..a52401c --- /dev/null +++ b/gmp-6.3.0/mpn/loongarch/64/copyi.asm @@ -0,0 +1,73 @@ +dnl Loongarch mpn_copyi + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', `$a0') +define(`ap', `$a1') +define(`n', `$a2') + +define(`i', `$a3') + +ASM_START() +PROLOGUE(mpn_copyi) + srli.d i, n, 2 + beqz i, L(end) + +L(top): addi.d i, i, -1 + ld.d $t0, ap, 0 + st.d $t0, rp, 0 + ld.d $t1, ap, 8 + st.d $t1, rp, 8 + ld.d $t2, ap, 16 + st.d $t2, rp, 16 + ld.d $t3, ap, 24 + st.d $t3, rp, 24 + addi.d ap, ap, 32 + addi.d rp, rp, 32 + bnez i, L(top) + +L(end): andi $t1, n, 2 + beqz $t1, L(b0x) + ld.d $t0, ap, 0 + st.d $t0, rp, 0 + ld.d $t1, ap, 8 + st.d $t1, rp, 8 + addi.d ap, ap, 16 + addi.d rp, rp, 16 +L(b0x): andi $t0, n, 1 + beqz $t0, L(bx0) + ld.d $t0, ap, 0 + st.d $t0, rp, 0 +L(bx0): jr $r1 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/loongarch/64/lshift.asm b/gmp-6.3.0/mpn/loongarch/64/lshift.asm new file mode 100644 index 0000000..71eef3b --- /dev/null +++ b/gmp-6.3.0/mpn/loongarch/64/lshift.asm @@ -0,0 +1,120 @@ +dnl Loongarch mpn_lshift + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp_arg',`$a0') +define(`ap', `$a1') +define(`n', `$a2') +define(`cnt', `$a3') + +define(`rp', `$a4') +define(`tnc', `$t8') +define(`i', `$a7') + +ASM_START() +PROLOGUE(mpn_lshift) + alsl.d ap, n, ap, 3 + alsl.d rp, n, rp_arg, 3 + sub.d tnc, $zero, cnt + srli.d i, n, 2 + + ld.d $t0, ap, -8 + srl.d $a0, $t0, tnc + + andi $t6, n, 1 + andi $t7, n, 2 + bnez $t6, L(bx1) + + sll.d $t3, $t0, cnt + ld.d $t0, ap, -16 + addi.d i, i, -1 + bnez $t7, L(b10) + addi.d rp, rp, 16 + b L(b0) +L(b10): addi.d ap, ap, -16 + bge i, $zero, L(b2) +L(eq2): srl.d $t4, $t0, tnc + sll.d $t2, $t0, cnt + or $t4, $t3, $t4 + st.d $t4, rp, -8 + st.d $t2, rp, -16 + jr $r1 + +L(bx1): sll.d $t2, $t0, cnt + bnez $t7, L(b11) + bnez i, L(gt1) + st.d $t2, rp, -8 + jr $r1 +L(gt1): ld.d $t0, ap, -16 + addi.d ap, ap, -8 + addi.d rp, rp, 8 + addi.d i, i, -1 + b L(b1) +L(b11): ld.d $t0, ap, -16 + addi.d ap, ap, 8 + addi.d rp, rp, 24 + b L(b3) + +L(top): addi.d ap, ap, -32 + addi.d rp, rp, -32 + addi.d i, i, -1 +L(b2): srl.d $t4, $t0, tnc + sll.d $t2, $t0, cnt + ld.d $t0, ap, -8 + or $t4, $t3, $t4 + st.d $t4, rp, -8 +L(b1): srl.d $t4, $t0, tnc + sll.d $t3, $t0, cnt + ld.d $t0, ap, -16 + or $t4, $t2, $t4 + st.d $t4, rp, -16 +L(b0): srl.d $t4, $t0, tnc + sll.d $t2, $t0, cnt + ld.d $t0, ap, -24 + or $t4, $t3, $t4 + st.d $t4, rp, -24 +L(b3): srl.d $t4, $t0, tnc + sll.d $t3, $t0, cnt + ld.d $t0, ap, -32 + or $t4, $t2, $t4 + st.d $t4, rp, -32 + bnez i, L(top) + +L(end): srl.d $t4, $t0, tnc + sll.d $t2, $t0, cnt + or $t4, $t3, $t4 + st.d $t4, rp, -40 + st.d $t2, rp, -48 + jr $r1 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/loongarch/64/mul_1.asm b/gmp-6.3.0/mpn/loongarch/64/mul_1.asm new file mode 100644 index 0000000..8f84709 --- /dev/null +++ b/gmp-6.3.0/mpn/loongarch/64/mul_1.asm @@ -0,0 +1,97 @@ +dnl Loongarch mpn_mul_1 + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', `$a0') +define(`ap', `$a1') +define(`n', `$a2') +define(`b0', `$a3') + +define(`cy', `$a4') +define(`i', `$a5') + +define(`BLOCK', ` + mul.d $t1, $t2, b0 + mulh.du $t0, $t2, b0 + ld.d $t2, ap, $1 + add.d $t6, $t1, cy + sltu $t5, $t6, $t1 + st.d $t6, rp, eval($1-8) + add.d cy, $t0, $t5') + +ASM_START() + +PROLOGUE(mpn_mul_1c) + srli.d i, n, 2 + b L(ent) +EPILOGUE() + +PROLOGUE(mpn_mul_1) + srli.d i, n, 2 + or cy, $r0, $r0 +L(ent): ld.d $t2, ap, 0 + + andi $t0, n, 1 + andi $t1, n, 2 + bnez $t0, L(bx1) +L(bx0): beqz $t1, L(b0) +L(b10): addi.d ap, ap, -16 + addi.d rp, rp, -16 + b L(b2) +L(bx1): beqz $t1, L(b01) +L(b11): addi.d ap, ap, -8 + addi.d rp, rp, -8 + b L(b3) +L(b01): addi.d ap, ap, 8 + addi.d rp, rp, 8 + beqz i, L(end) + +L(top): +L(b1): BLOCK(0) +L(b0): BLOCK(8) + addi.d i, i, -1 +L(b3): BLOCK(16) +L(b2): BLOCK(24) + addi.d ap, ap, 32 + addi.d rp, rp, 32 + bnez i, L(top) + +L(end): mul.d $t1, $t2, b0 + mulh.du $t0, $t2, b0 + add.d $t6, $t1, cy + sltu $t5, $t6, $t1 + st.d $t6, rp, -8 + add.d $a0, $t0, $t5 + jr $r1 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/loongarch/64/rshift.asm b/gmp-6.3.0/mpn/loongarch/64/rshift.asm new file mode 100644 index 0000000..a183576 --- /dev/null +++ b/gmp-6.3.0/mpn/loongarch/64/rshift.asm @@ -0,0 +1,119 @@ +dnl Loongarch mpn_rshift + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
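+
+dnl Each result limb combines two source limbs, and the bits shifted out
+dnl of the bottom of ap[0] are the return value.  C sketch (illustrative
+dnl only, 64-bit limbs, 1 <= cnt <= 63); the asm gets 64-cnt for free as
+dnl tnc = -cnt, since sll.d/srl.d take their shift count mod 64:
+dnl
+dnl   typedef unsigned long limb;
+dnl   limb rshift (limb *rp, const limb *ap, long n, int cnt)
+dnl   {
+dnl     limb ret = ap[0] << (64 - cnt);   /* bits falling off the bottom */
+dnl     for (long i = 0; i < n - 1; i++)
+dnl       rp[i] = (ap[i] >> cnt) | (ap[i+1] << (64 - cnt));
+dnl     rp[n-1] = ap[n-1] >> cnt;
+dnl     return ret;
+dnl   }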
+ +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp_arg',`$a0') +define(`ap', `$a1') +define(`n', `$a2') +define(`cnt', `$a3') + +define(`rp', `$a4') +define(`tnc', `$t8') +define(`i', `$a7') + +ASM_START() +PROLOGUE(mpn_rshift) + move rp, rp_arg + sub.d tnc, $zero, cnt + srli.d i, n, 2 + + ld.d $t0, ap, 0 + sll.d $a0, $t0, tnc + + andi $t6, n, 1 + andi $t7, n, 2 + bnez $t6, L(bx1) + + srl.d $t3, $t0, cnt + ld.d $t0, ap, 8 + addi.d i, i, -1 + bnez $t7, L(b10) + addi.d rp, rp, -16 + b L(b0) +L(b10): addi.d ap, ap, 16 + bge i, $zero, L(b2) +L(eq2): sll.d $t4, $t0, tnc + srl.d $t2, $t0, cnt + or $t4, $t3, $t4 + st.d $t4, rp, 0 + st.d $t2, rp, 8 + jr $r1 + +L(bx1): srl.d $t2, $t0, cnt + bnez $t7, L(b11) + bnez i, L(gt1) + st.d $t2, rp, 0 + jr $r1 +L(gt1): ld.d $t0, ap, 8 + addi.d ap, ap, 8 + addi.d rp, rp, -8 + addi.d i, i, -1 + b L(b1) +L(b11): ld.d $t0, ap, 8 + addi.d ap, ap, -8 + addi.d rp, rp, -24 + b L(b3) + +L(top): addi.d ap, ap, 32 + addi.d rp, rp, 32 + addi.d i, i, -1 +L(b2): sll.d $t4, $t0, tnc + srl.d $t2, $t0, cnt + ld.d $t0, ap, 0 + or $t4, $t3, $t4 + st.d $t4, rp, 0 +L(b1): sll.d $t4, $t0, tnc + srl.d $t3, $t0, cnt + ld.d $t0, ap, 8 + or $t4, $t2, $t4 + st.d $t4, rp, 8 +L(b0): sll.d $t4, $t0, tnc + srl.d $t2, $t0, cnt + ld.d $t0, ap, 16 + or $t4, $t3, $t4 + st.d $t4, rp, 16 +L(b3): sll.d $t4, $t0, tnc + srl.d $t3, $t0, cnt + ld.d $t0, ap, 24 + or $t4, $t2, $t4 + st.d $t4, rp, 24 + bnez i, L(top) + +L(end): sll.d $t4, $t0, tnc + srl.d $t2, $t0, cnt + or $t4, $t3, $t4 + st.d $t4, rp, 32 + st.d $t2, rp, 40 + jr $r1 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/loongarch/64/sub_n.asm b/gmp-6.3.0/mpn/loongarch/64/sub_n.asm new file mode 100644 index 0000000..50821d9 --- /dev/null +++ b/gmp-6.3.0/mpn/loongarch/64/sub_n.asm @@ -0,0 +1,106 @@ +dnl Loongarch mpn_sub_n + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
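+
+dnl Two sltu tests per limb track the borrow, one for the a-b subtract
+dnl and one for subtracting the incoming borrow; they can never both be
+dnl set, so or-ing them gives the outgoing borrow.  C sketch (illustrative
+dnl only, 64-bit limbs):
+dnl
+dnl   typedef unsigned long limb;
+dnl   limb sub_n (limb *rp, const limb *ap, const limb *bp, long n)
+dnl   {
+dnl     limb bw = 0;
+dnl     for (long i = 0; i < n; i++) {
+dnl       limb t = ap[i] - bp[i];  limb b1 = ap[i] < bp[i];
+dnl       limb r = t - bw;         limb b2 = t < bw;
+dnl       rp[i] = r;  bw = b1 | b2;
+dnl     }
+dnl     return bw;
+dnl   }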
+ +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', `$a0') +define(`ap', `$a1') +define(`bp', `$a2') +define(`n', `$a3') + +define(`i', `$a7') + +ASM_START() +PROLOGUE(mpn_sub_n) + srli.d i, n, 2 + move $t8, $zero + + andi $t0, n, 1 + andi $t1, n, 2 + bnez $t0, L(bx1) +L(bx0): beqz $t1, L(b0) +L(b10): addi.d bp, bp, -16 + addi.d ap, ap, -16 + addi.d rp, rp, -16 + b L(b2) +L(bx1): bnez $t1, L(b11) +L(b01): addi.d bp, bp, -24 + addi.d ap, ap, -24 + addi.d rp, rp, -24 + b L(b1) +L(b11): addi.d bp, bp, -8 + addi.d ap, ap, -8 + addi.d rp, rp, -8 + b L(b3) + +L(top): addi.d bp, bp, 32 + addi.d ap, ap, 32 + addi.d rp, rp, 32 +L(b0): addi.d i, i, -1 + ld.d $t4, bp, 0 + ld.d $t0, ap, 0 + sltu $a4, $t0, $t4 + sub.d $t0, $t0, $t4 + sltu $a5, $t0, $t8 C 0 + sub.d $t0, $t0, $t8 C 0 + or $t8, $a4, $a5 C 1 + st.d $t0, rp, 0 +L(b3): ld.d $t5, bp, 8 + ld.d $t1, ap, 8 + sltu $a4, $t1, $t5 + sub.d $t1, $t1, $t5 + sltu $a5, $t1, $t8 C 2 + sub.d $t1, $t1, $t8 C 2 + or $t8, $a4, $a5 C 3 + st.d $t1, rp, 8 +L(b2): ld.d $t4, bp, 16 + ld.d $t0, ap, 16 + sltu $a4, $t0, $t4 + sub.d $t0, $t0, $t4 + sltu $a5, $t0, $t8 C 4 + sub.d $t0, $t0, $t8 C 4 + or $t8, $a4, $a5 C 5 + st.d $t0, rp, 16 +L(b1): ld.d $t5, bp, 24 + ld.d $t1, ap, 24 + sltu $a4, $t1, $t5 + sub.d $t1, $t1, $t5 + sltu $a5, $t1, $t8 C 6 + sub.d $t1, $t1, $t8 C 6 + or $t8, $a4, $a5 C 7 + st.d $t1, rp, 24 + bnez i, L(top) + + move $a0, $t8 + jr $r1 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/lshift.asm b/gmp-6.3.0/mpn/lshift.asm new file mode 120000 index 0000000..0b79b3c --- /dev/null +++ b/gmp-6.3.0/mpn/lshift.asm @@ -0,0 +1 @@ +../mpn/x86/p6/mmx/lshift.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/lshiftc.c b/gmp-6.3.0/mpn/lshiftc.c new file mode 120000 index 0000000..8ed4a6d --- /dev/null +++ b/gmp-6.3.0/mpn/lshiftc.c @@ -0,0 +1 @@ +../mpn/generic/lshiftc.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/m4-ccas b/gmp-6.3.0/mpn/m4-ccas new file mode 100755 index 0000000..16d80c6 --- /dev/null +++ b/gmp-6.3.0/mpn/m4-ccas @@ -0,0 +1,107 @@ +#!/bin/sh +# +# A helper script for Makeasm.am .asm.lo rule. + +# Copyright 2001 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of either: +# +# * the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# or +# +# * the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# or both in parallel, as here. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received copies of the GNU General Public License and the +# GNU Lesser General Public License along with the GNU MP Library. If not, +# see https://www.gnu.org/licenses/. + + +# Usage: m4-ccas --m4=M4 CC ... file.asm ... +# +# Process file.asm with the given M4 plus any -D arguments, then +# assemble with the given CC plus all arguments. +# +# The M4 command must be in a single --m4= argument, and will be split +# on whitespace. When CC is invoked file.asm is replaced with a +# temporary .s file which is the M4 output. 
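+#
+# A hypothetical invocation (arguments invented purely for illustration)
+# might be
+#
+#   m4-ccas --m4="m4 -I.." gcc -c -DOPERATION_add_n add_n.asm -o add_n.o
+#
+# which would run "m4 -I.. -DOPERATION_add_n add_n.asm >tmp-add_n.s" and
+# then "gcc -c -DOPERATION_add_n tmp-add_n.s -o add_n.o".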
+# +# To allow parallel builds, the temp file name is based on the .asm +# file name, which will be the output object filename for all uses we +# put this script to. + +M4= +CC= +DEFS= +ASM= +SEEN_O=no + +for i in "$@"; do + case $i in + --m4=*) + M4=`echo "$i" | sed 's/^--m4=//'` + ;; + -D*) + DEFS="$DEFS $i" + CC="$CC $i" + ;; + *.asm) + if test -n "$ASM"; then + echo "Only one .asm file permitted" + exit 1 + fi + BASENAME=`echo "$i" | sed -e 's/\.asm$//' -e 's/^.*[\\/:]//'` + TMP=tmp-$BASENAME.s + ASM=$i + CC="$CC $TMP" + ;; + -o) + SEEN_O=yes + CC="$CC $i" + ;; + *) + CC="$CC $i" + ;; + esac +done + +if test -z "$M4"; then + echo "No --m4 specified" + exit 1 +fi + +if test -z "$ASM"; then + echo "No .asm specified" + exit 1 +fi + +# Libtool adds its own -o when sending output to .libs/foo.o, but not +# when just wanting foo.o in the current directory. We need an +# explicit -o in both cases since we're assembling tmp-foo.s. +# +if test $SEEN_O = no; then + CC="$CC -o $BASENAME.o" +fi + +echo "$M4 $DEFS $ASM >$TMP" +$M4 $DEFS $ASM >$TMP || exit + +echo "$CC" +$CC || exit + +# Comment this out to preserve .s intermediates +rm -f $TMP diff --git a/gmp-6.3.0/mpn/m68k/README b/gmp-6.3.0/mpn/m68k/README new file mode 100644 index 0000000..5261564 --- /dev/null +++ b/gmp-6.3.0/mpn/m68k/README @@ -0,0 +1,138 @@ +Copyright 2001, 2003, 2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + + M68K MPN SUBROUTINES + + +This directory contains mpn functions for various m68k family chips. + + +CODE ORGANIZATION + + m68k m68000, m68010, m68060 + m68k/mc68020 m68020, m68030, m68040, and CPU32 + + +The m5200 "coldfire", which is m68000 less a few instructions, currently has +no assembler code support. + + +STATUS + +The code herein is old and poorly maintained. If somebody really cared, it +could be optimized substantially. For example, + +* mpn_add_n and mpn_sub_n could, with more unrolling, be improved from 6 to + close to 4 c/l (on m68040). + +* The multiplication loops could be sped up by using the FPU. + +* mpn_lshift by 31 should use the special-case mpn_rshift by 1 code, and + vice versa mpn_rshift by 31 should use the special lshift by 1, when + operand overlap permits. + +* On 68000, mpn_mul_1, mpn_addmul_1 and mpn_submul_1 could check for a + 16-bit multiplier and use two multiplies per limb, not four. + + Similarly various other _1 operations like mpn_mod_1, mpn_divrem_1, + mpn_divexact_1, mpn_modexact_1c_odd. + +* On 68000, mpn_lshift and mpn_rshift could use a roll and mask instead of + lsrl and lsll.
This promises to be a speedup, effectively trading a 6+2*n + shift for one or two 4 cycle masks. Suggested by Jean-Charles Meyrignac. + +* config.guess detects 68000, 68010, CPU32 and 68020 by running some code, + but relies on system information for 030, 040 and 060. Can they be + identified by running some code? Currently this only makes a difference + to the compiler options selected, since we have no specific asm code for + those chips. + +One novel idea for 68000 would be to use a 16-bit limb instead of 32 bits. +This would suit the native 16x16 multiply, but might make it difficult to +get full value from the native 32x32 add/sub/etc. This would be an ABI +option, and would select "__GMP_SHORT_LIMB" in gmp.h. + +Naturally an entirely new set of asm subroutines would be needed for a +16-bit limb. Also there are various places in the C code assuming +limb>=long, which would need to be updated, eg. mpz_set_ui. Some of the +nails changes may have helped cover some of this. + + +ASM FILES + +The .asm files are put through m4 for macro processing, and with the help of +configure give either MIT or Motorola syntax. The generic mpn/asm-defs.m4 +is used, together with mpn/m68k/m68k-defs.m4. See comments in those files. + +Not all possible syntax variations are covered. GCC config/m68k for +instance has things like $ for immediates on CRDS or reversed cmp order for +AT&T SGS. These could probably be handled if anyone really needs it. + + +CALLING CONVENTIONS + +The SVR4 standard has an int of 32 bits, and all parameters 32-bit aligned +on the stack. + +PalmOS and perhaps various embedded systems intended for 68000 however use +an int of 16 bits and parameters only 16-bit aligned on the stack. This is +generated by "gcc -mshort" (and is the default for the PalmOS gcc port, we +believe). + +The asm files adapt to these two ABIs by checking sizeof(unsigned), coming +through config.m4 as SIZEOF_UNSIGNED. Only mpn_lshift and mpn_rshift are +affected, all other routines take longs and pointers, which are 32 bits in +both cases. + +Strictly speaking the size of an int doesn't determine the stack padding +convention. But if int is 16 bits then we can definitely say the host +system is not SVR4, and therefore may as well assume we're in 16-bit stack +alignment. + + +REFERENCES + +"Motorola M68000 Family Programmer's Reference Manual", available online, + + http://e-www.motorola.com/brdata/PDFDB/docs/M68000PM.pdf + +"System V Application Binary Interface: Motorola 68000 Processor Family +Supplement", AT&T, 1990, ISBN 0-13-877553-6. Has details of calling +conventions and ELF style PIC coding. + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/gmp-6.3.0/mpn/m68k/aors_n.asm b/gmp-6.3.0/mpn/m68k/aors_n.asm new file mode 100644 index 0000000..f7d379e --- /dev/null +++ b/gmp-6.3.0/mpn/m68k/aors_n.asm @@ -0,0 +1,99 @@ +dnl mc68020 mpn_add_n, mpn_sub_n -- add or subtract limb vectors + +dnl Copyright 1992, 1994, 1996, 1999-2003, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version.
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 68040: 6 + +ifdef(`OPERATION_add_n',` + define(M4_inst, addxl) + define(M4_function_n, mpn_add_n) +',`ifdef(`OPERATION_sub_n',` + define(M4_inst, subxl) + define(M4_function_n, mpn_sub_n) +', +`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n) + + +C INPUT PARAMETERS +C res_ptr (sp + 4) +C s1_ptr (sp + 8) +C s2_ptr (sp + 12) +C size (sp + 16) + + +PROLOGUE(M4_function_n) + +C Save used registers on the stack. + movel d2, M(-,sp) + movel a2, M(-,sp) + +C Copy the arguments to registers. Better use movem? + movel M(sp,12), a2 + movel M(sp,16), a0 + movel M(sp,20), a1 + movel M(sp,24), d2 + + eorw #1, d2 + lsrl #1, d2 + bcc L(L1) + subql #1, d2 C clears cy as side effect + +L(Loop): + movel M(a0,+), d0 + movel M(a1,+), d1 + M4_inst d1, d0 + movel d0, M(a2,+) +L(L1): movel M(a0,+), d0 + movel M(a1,+), d1 + M4_inst d1, d0 + movel d0, M(a2,+) + + dbf d2, L(Loop) C loop until 16 lsb of %4 == -1 + subxl d0, d0 C d0 <= -cy; save cy as 0 or -1 in d0 + subl #0x10000, d2 + bcs L(L2) + addl d0, d0 C restore cy + bra L(Loop) + +L(L2): + negl d0 + +C Restore used registers from stack frame. + movel M(sp,+), a2 + movel M(sp,+), d2 + + rts + +EPILOGUE(M4_function_n) diff --git a/gmp-6.3.0/mpn/m68k/gmp-mparam.h b/gmp-6.3.0/mpn/m68k/gmp-mparam.h new file mode 100644 index 0000000..9ac7b41 --- /dev/null +++ b/gmp-6.3.0/mpn/m68k/gmp-mparam.h @@ -0,0 +1,76 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2000-2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* 25MHz 68040 */
+
+/* Generated by tuneup.c, 2004-02-05, gcc 3.2 */
+
+#define MUL_TOOM22_THRESHOLD 14
+#define MUL_TOOM33_THRESHOLD 90
+
+#define SQR_BASECASE_THRESHOLD 5
+#define SQR_TOOM2_THRESHOLD 28
+#define SQR_TOOM3_THRESHOLD 98
+
+#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_DC_THRESHOLD 55
+#define POWM_THRESHOLD 65
+
+#define HGCD_THRESHOLD 116
+#define GCD_ACCEL_THRESHOLD 3
+#define GCD_DC_THRESHOLD 590
+#define JACOBI_BASE_METHOD 2
+
+#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define USE_PREINV_DIVREM_1 0
+#define USE_PREINV_MOD_1 0
+#define DIVREM_2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MODEXACT_1_ODD_THRESHOLD MP_SIZE_T_MAX /* never */
+
+#define GET_STR_DC_THRESHOLD 18
+#define GET_STR_PRECOMPUTE_THRESHOLD 43
+#define SET_STR_THRESHOLD 937
+
+#define MUL_FFT_TABLE { 336, 672, 1408, 3584, 10240, 24576, 0 }
+#define MUL_FFT_MODF_THRESHOLD 296
+#define MUL_FFT_THRESHOLD 1728
+
+#define SQR_FFT_TABLE { 336, 736, 1408, 3584, 10240, 24576, 0 }
+#define SQR_FFT_MODF_THRESHOLD 296
+#define SQR_FFT_THRESHOLD 2304
diff --git a/gmp-6.3.0/mpn/m68k/lshift.asm b/gmp-6.3.0/mpn/m68k/lshift.asm
new file mode 100644
index 0000000..f202abf
--- /dev/null
+++ b/gmp-6.3.0/mpn/m68k/lshift.asm
@@ -0,0 +1,175 @@
+dnl mc68020 mpn_lshift -- mpn left shift.
+
+dnl Copyright 1996, 1999-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C shift==1 shift>1
+C 68040: 5 12
+
+
+C mp_limb_t mpn_lshift (mp_ptr res_ptr, mp_srcptr s_ptr, mp_size_t s_size,
+C unsigned cnt);
+C
+C The "cnt" parameter is either 16 bits or 32 bits depending on
+C SIZEOF_UNSIGNED (see ABI notes in mpn/m68k/README). The value is of
+C course only 1 to 31. When loaded as 16 bits there's garbage in the upper
+C half, hence the use of cmpw. The shift instructions take their count
+C modulo 64, so the upper part doesn't matter to them either.
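+C
+C A worked illustration (added for clarity, not in the original source):
+C mpn_lshift returns the bits shifted out at the high end.  Shifting the
+C two-limb operand {0x80000000, 0x80000000} (least significant limb first)
+C left by cnt=1 stores {0, 1} and returns 1, since the operand is
+C 2**63 + 2**31 and doubling it carries a single bit out above the top limb.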
+C
+
+C INPUT PARAMETERS
+C res_ptr (sp + 4)
+C s_ptr (sp + 8)
+C s_size (sp + 12)
+C cnt (sp + 16)
+
+define(res_ptr, `a1')
+define(s_ptr, `a0')
+define(s_size, `d6')
+define(cnt, `d4')
+
+ifdef(`SIZEOF_UNSIGNED',,
+`m4_error(`SIZEOF_UNSIGNED not defined, should be in config.m4
+')')
+
+PROLOGUE(mpn_lshift)
+C Save used registers on the stack.
+ moveml d2-d6/a2, M(-,sp)
+
+C Copy the arguments to registers.
+ movel M(sp,28), res_ptr
+ movel M(sp,32), s_ptr
+ movel M(sp,36), s_size
+ifelse(SIZEOF_UNSIGNED,2,
+` movew M(sp,40), cnt',
+` movel M(sp,40), cnt')
+
+ moveql #1, d5
+ cmpw d5, cnt
+ bne L(Lnormal)
+ cmpl s_ptr, res_ptr
+ bls L(Lspecial) C jump if s_ptr >= res_ptr
+
+ifelse(scale_available_p,1,`
+ lea M(s_ptr,s_size,l,4), a2
+',`
+ movel s_size, d0
+ asll #2, d0
+ lea M(s_ptr,d0,l), a2
+')
+ cmpl res_ptr, a2
+ bls L(Lspecial) C jump if res_ptr >= s_ptr + s_size
+
+L(Lnormal):
+ moveql #32, d5
+ subl cnt, d5
+
+ifelse(scale_available_p,1,`
+ lea M(s_ptr,s_size,l,4), s_ptr
+ lea M(res_ptr,s_size,l,4), res_ptr
+',`
+ movel s_size, d0
+ asll #2, d0
+ addl d0, s_ptr
+ addl d0, res_ptr
+')
+ movel M(-,s_ptr), d2
+ movel d2, d0
+ lsrl d5, d0 C compute carry limb
+
+ lsll cnt, d2
+ movel d2, d1
+ subql #1, s_size
+ beq L(Lend)
+ lsrl #1, s_size
+ bcs L(L1)
+ subql #1, s_size
+
+L(Loop):
+ movel M(-,s_ptr), d2
+ movel d2, d3
+ lsrl d5, d3
+ orl d3, d1
+ movel d1, M(-,res_ptr)
+ lsll cnt, d2
+L(L1):
+ movel M(-,s_ptr), d1
+ movel d1, d3
+ lsrl d5, d3
+ orl d3, d2
+ movel d2, M(-,res_ptr)
+ lsll cnt, d1
+
+ dbf s_size, L(Loop)
+ subl #0x10000, s_size
+ bcc L(Loop)
+
+L(Lend):
+ movel d1, M(-,res_ptr) C store least significant limb
+
+C Restore used registers from stack frame.
+ moveml M(sp,+), d2-d6/a2
+ rts
+
+C We loop from least significant end of the arrays, which is only
+C permissible if the source and destination don't overlap, since the
+C function is documented to work for overlapping source and destination.
+
+L(Lspecial):
+ clrl d0 C initialize carry
+ eorw #1, s_size
+ lsrl #1, s_size
+ bcc L(LL1)
+ subql #1, s_size
+
+L(LLoop):
+ movel M(s_ptr,+), d2
+ addxl d2, d2
+ movel d2, M(res_ptr,+)
+L(LL1):
+ movel M(s_ptr,+), d2
+ addxl d2, d2
+ movel d2, M(res_ptr,+)
+
+ dbf s_size, L(LLoop)
+ addxl d0, d0 C save cy in lsb
+ subl #0x10000, s_size
+ bcs L(LLend)
+ lsrl #1, d0 C restore cy
+ bra L(LLoop)
+
+L(LLend):
+C Restore used registers from stack frame.
+ moveml M(sp,+), d2-d6/a2
+ rts
+
+EPILOGUE(mpn_lshift)
diff --git a/gmp-6.3.0/mpn/m68k/m68k-defs.m4 b/gmp-6.3.0/mpn/m68k/m68k-defs.m4
new file mode 100644
index 0000000..15289f6
--- /dev/null
+++ b/gmp-6.3.0/mpn/m68k/m68k-defs.m4
@@ -0,0 +1,230 @@
+divert(-1)
+
+dnl m4 macros for 68k assembler.
+
+dnl Copyright 2001-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl The default m4 `#' commenting interferes with the assembler syntax for
+dnl immediates.  `|' would be correct, but it interferes with "||" in
+dnl eval().  Would like to disable commenting, but that's not possible (see
+dnl mpn/asm-defs.m4), so use `;' which should be harmless.
+
+changecom(;)
+
+
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+dnl Same as the standard PROLOGUE, but align to 2 bytes not 4.
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+` TEXT
+ ALIGN(2)
+ GLOBL `$1' GLOBL_ATTR
+ TYPE(`$1',`function')
+`$1'LABEL_SUFFIX')
+
+
+dnl Usage: d0, etc
+dnl
+dnl Expand to d0 or %d0 according to the assembler's requirements.
+dnl
+dnl Actually d0 expands to `d0' or %`d0', the quotes protecting against
+dnl further expansion.  Definitions are made even if d0 is to be just `d0',
+dnl so that any m4 quoting problems will show up everywhere, not just on a
+dnl %d0 system.
+dnl
+dnl Care must be taken with quoting when using these in a definition.  For
+dnl instance the quotes in the following are essential or two %'s will be
+dnl produced when `counter' is used.
+dnl
+dnl define(counter, `d7')
+dnl
+
+dnl Called: m68k_reg(r)
+define(m68k_reg,
+m4_assert_numargs(1)
+m4_assert_defined(`WANT_REGISTER_PERCENT')
+`ifelse(WANT_REGISTER_PERCENT,yes,%)`$1'')
+
+dnl Usage: m68k_defreg(r)
+define(m68k_defreg,
+m4_assert_numargs(1)
+`deflit($1,`m68k_reg(`$1')')')
+
+m68k_defreg(d0)
+m68k_defreg(d1)
+m68k_defreg(d2)
+m68k_defreg(d3)
+m68k_defreg(d4)
+m68k_defreg(d5)
+m68k_defreg(d6)
+m68k_defreg(d7)
+
+m68k_defreg(a0)
+m68k_defreg(a1)
+m68k_defreg(a2)
+m68k_defreg(a3)
+m68k_defreg(a4)
+m68k_defreg(a5)
+m68k_defreg(a6)
+m68k_defreg(a7)
+
+m68k_defreg(sp)
+m68k_defreg(pc)
+
+
+dnl Usage: M(base)
+dnl        M(base,displacement)
+dnl        M(base,index,size)
+dnl        M(base,index,size,scale)
+dnl        M(base,+)
+dnl        M(-,base)
+dnl
+dnl `base' is an address register, `index' is a data register, `size' is w
+dnl or l, and scale is 1, 2, 4 or 8.
+dnl
+dnl M(-,base) has its arguments that way around to emphasise it's a
+dnl pre-decrement, as opposed to M(base,+) a post-increment.
+dnl
+dnl Enhancement: Add the memory indirect modes, if/when they're needed.
+
+define(M,
+m4_assert_numargs_range(1,4)
+m4_assert_defined(`WANT_ADDRESSING')
+`ifelse(WANT_ADDRESSING,mit,
+`ifelse($#,1, ``$1'@')dnl
+ifelse($#,2,
+`ifelse($2,+, ``$1'@+',
+`ifelse($1,-, ``$2'@-',
+ ``$1'@($2)')')')dnl
+ifelse($#,3, ``$1'@(`$2':`$3')')dnl
+ifelse($#,4, ``$1'@(`$2':`$3':$4)')',
+
+dnl WANT_ADDRESSING `motorola'
+`ifelse($#,1, `(`$1')')dnl
+ifelse($#,2,
+`ifelse($2,+, `(`$1')+',
+`ifelse($1,-, `-(`$2')',
+ `$2(`$1')')')')dnl
+ifelse($#,3, `(`$1',`$2'.$3)')dnl
+ifelse($#,4, `(`$1',`$2'.$3*$4)')')')
+
+
+dnl Usage: addl etc
+dnl
+dnl m68k instructions with special handling for the suffix, with for
+dnl instance addl expanding to addl or add.l as necessary.
+dnl
+dnl See also t-m68k-defs.pl which verifies all mnemonics used in the asm
+dnl files have entries here.
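+
+dnl A quick illustration (added for clarity, not in the original file):
+dnl with WANT_REGISTER_PERCENT `yes', WANT_ADDRESSING `motorola' and
+dnl WANT_DOT_SIZE `yes', the source line
+dnl
+dnl	movel	M(sp,+), d2
+dnl
+dnl comes out as "move.l (%sp)+, %d2", whereas an MIT-style configuration
+dnl (WANT_REGISTER_PERCENT `no', WANT_ADDRESSING `mit', WANT_DOT_SIZE `no')
+dnl gives "movel sp@+, d2".  Both forms follow directly from the m68k_reg,
+dnl M and suffix-handling definitions in this file.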
+ +dnl Called: m68k_insn(mnemonic,suffix) +define(m68k_insn, +m4_assert_numargs(2) +m4_assert_defined(`WANT_DOT_SIZE') +`ifelse(WANT_DOT_SIZE,yes, ``$1'.``$2''', + ``$1$2'')') + +dnl Usage: m68k_definsn(mnemonic,suffix) +define(m68k_definsn, +m4_assert_numargs(2) +`deflit($1`'$2,`m68k_insn(`$1',`$2')')') + +m68k_definsn(add, l) +m68k_definsn(addx, l) +m68k_definsn(addq, l) +m68k_definsn(asl, l) +m68k_definsn(cmp, l) +m68k_definsn(cmp, w) +m68k_definsn(clr, l) +m68k_definsn(divu, l) +m68k_definsn(eor, w) +m68k_definsn(lsl, l) +m68k_definsn(lsr, l) +m68k_definsn(move, l) +m68k_definsn(move, w) +m68k_definsn(movem,l) +m68k_definsn(moveq,l) +m68k_definsn(mulu, l) +m68k_definsn(neg, l) +m68k_definsn(or, l) +m68k_definsn(roxl, l) +m68k_definsn(roxr, l) +m68k_definsn(sub, l) +m68k_definsn(subx, l) +m68k_definsn(subq, l) + + +dnl Usage: bra etc +dnl +dnl Expand to `bra', `jra' or `jbra' according to what the assembler will +dnl accept. The latter two give variable-sized branches in gas. +dnl +dnl See also t-m68k-defs.pl which verifies all the bXX branches used in the +dnl asm files have entries here. + +dnl Called: m68k_branch(cond) +define(m68k_branch, +m4_assert_numargs(1) +m4_assert_defined(`WANT_BRANCHES') +`ifelse(WANT_BRANCHES,jra, `j$1', +`ifelse(WANT_BRANCHES,jbra,`jb$1', + ``b$1'')')') + +dnl Called: m68k_defbranch(cond) +define(m68k_defbranch, +m4_assert_numargs(1) +`deflit(b$1,`m68k_branch(`$1')')') + +m68k_defbranch(ra) +m68k_defbranch(cc) +m68k_defbranch(cs) +m68k_defbranch(ls) +m68k_defbranch(eq) +m68k_defbranch(ne) + + +dnl Usage: scale_available_p +dnl +dnl Expand to 1 if a scale factor can be used in addressing modes, or 0 if +dnl not. M(a0,d0,l,4), meaning a0+d0*4, is not available in 68000 or +dnl 68010, but is in CPU32 and in 68020 and up. + +define(scale_available_p, +`m4_ifdef_anyof_p( +`HAVE_HOST_CPU_m68360' +`HAVE_HOST_CPU_m68020' +`HAVE_HOST_CPU_m68030' +`HAVE_HOST_CPU_m68040' +`HAVE_HOST_CPU_m68060')') + + +divert diff --git a/gmp-6.3.0/mpn/m68k/mc68020/aorsmul_1.asm b/gmp-6.3.0/mpn/m68k/mc68020/aorsmul_1.asm new file mode 100644 index 0000000..4ee30ad --- /dev/null +++ b/gmp-6.3.0/mpn/m68k/mc68020/aorsmul_1.asm @@ -0,0 +1,101 @@ +dnl mc68020 mpn_addmul_1, mpn_submul_1 -- add or subtract mpn multiple. + +dnl Copyright 1992, 1994, 1996, 1999-2002, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C 68040: 25 + +ifdef(`OPERATION_addmul_1',` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) +',`ifdef(`OPERATION_submul_1',` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) +', +`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + + +C INPUT PARAMETERS +C res_ptr (sp + 4) +C s1_ptr (sp + 8) +C s1_size (sp + 12) +C s2_limb (sp + 16) + +define(res_ptr, `a0') +define(s1_ptr, `a1') +define(s1_size, `d2') +define(s2_limb, `d4') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +PROLOGUE(M4_function_1) + +C Save used registers on the stack. + moveml d2-d5, M(-,sp) + +C Copy the arguments to registers. Better use movem? + movel M(sp,20), res_ptr + movel M(sp,24), s1_ptr + movel M(sp,28), s1_size + movel M(sp,32), s2_limb + + eorw #1, s1_size + clrl d1 + clrl d5 + lsrl #1, s1_size + bcc L(L1) + subql #1, s1_size + subl d0, d0 C (d0,cy) <= (0,0) + +L(Loop): + movel M(s1_ptr,+), d3 + mulul s2_limb, d1:d3 + addxl d0, d3 + addxl d5, d1 + M4_inst d3, M(res_ptr,+) +L(L1): movel M(s1_ptr,+), d3 + mulul s2_limb, d0:d3 + addxl d1, d3 + addxl d5, d0 + M4_inst d3, M(res_ptr,+) + + dbf s1_size, L(Loop) + addxl d5, d0 + subl #0x10000, s1_size + bcc L(Loop) + +C Restore used registers from stack frame. + moveml M(sp,+), d2-d5 + + rts + +EPILOGUE(M4_function_1) diff --git a/gmp-6.3.0/mpn/m68k/mc68020/mul_1.asm b/gmp-6.3.0/mpn/m68k/mc68020/mul_1.asm new file mode 100644 index 0000000..f5fbb30 --- /dev/null +++ b/gmp-6.3.0/mpn/m68k/mc68020/mul_1.asm @@ -0,0 +1,96 @@ +dnl mc68020 mpn_mul_1 -- mpn by limb multiply + +dnl Copyright 1992, 1994, 1996, 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 68040: 24 + +C INPUT PARAMETERS +C res_ptr (sp + 4) +C s1_ptr (sp + 8) +C s1_size (sp + 12) +C s2_limb (sp + 16) + + +define(res_ptr, `a0') +define(s1_ptr, `a1') +define(s1_size, `d2') +define(s2_limb, `d4') + + +PROLOGUE(mpn_mul_1) + +C Save used registers on the stack. + moveml d2-d4, M(-,sp) + +C movel d2, M(-,sp) +C movel d3, M(-,sp) +C movel d4, M(-,sp) + +C Copy the arguments to registers. Better use movem? 
+ movel M(sp,16), res_ptr + movel M(sp,20), s1_ptr + movel M(sp,24), s1_size + movel M(sp,28), s2_limb + + eorw #1, s1_size + clrl d1 + lsrl #1, s1_size + bcc L(L1) + subql #1, s1_size + subl d0, d0 C (d0,cy) <= (0,0) + +L(Loop): + movel M(s1_ptr,+), d3 + mulul s2_limb, d1:d3 + addxl d0, d3 + movel d3, M(res_ptr,+) +L(L1): movel M(s1_ptr,+), d3 + mulul s2_limb, d0:d3 + addxl d1, d3 + movel d3, M(res_ptr,+) + + dbf s1_size, L(Loop) + clrl d3 + addxl d3, d0 + subl #0x10000, s1_size + bcc L(Loop) + +C Restore used registers from stack frame. + moveml M(sp,+), d2-d4 + +C movel M(sp,+),d4 +C movel M(sp,+),d3 +C movel M(sp,+),d2 + + rts + +EPILOGUE(mpn_mul_1) diff --git a/gmp-6.3.0/mpn/m68k/mc68020/udiv.asm b/gmp-6.3.0/mpn/m68k/mc68020/udiv.asm new file mode 100644 index 0000000..aadeab9 --- /dev/null +++ b/gmp-6.3.0/mpn/m68k/mc68020/udiv.asm @@ -0,0 +1,45 @@ +dnl mc68020 mpn_udiv_qrnnd -- 2x1 limb division + +dnl Copyright 1999-2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C mp_limb_t mpn_udiv_qrnnd (mp_limb_t *rp, +C mp_limb_t nh, mp_limb_t nl, mp_limb_t d); +C + +PROLOGUE(mpn_udiv_qrnnd) + movel M(sp,4), a0 C rp + movel M(sp,8), d1 C nh + movel M(sp,12), d0 C nl + divul M(sp,16), d1:d0 + movel d1, M(a0) C r + rts +EPILOGUE(mpn_udiv_qrnnd) diff --git a/gmp-6.3.0/mpn/m68k/mc68020/umul.asm b/gmp-6.3.0/mpn/m68k/mc68020/umul.asm new file mode 100644 index 0000000..f19314e --- /dev/null +++ b/gmp-6.3.0/mpn/m68k/mc68020/umul.asm @@ -0,0 +1,44 @@ +dnl mc68020 mpn_umul_ppmm -- limb by limb multiplication + +dnl Copyright 1999-2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_umul_ppmm (mp_limb_t *lp, mp_limb_t x, mp_limb_t y);
+C
+
+PROLOGUE(mpn_umul_ppmm)
+ movel M(sp,4), a0 C lp
+ movel M(sp,8), d1 C x
+ movel M(sp,12), d0 C y
+ mulul d0, d0:d1
+ movel d1, M(a0) C low
+ rts
+EPILOGUE(mpn_umul_ppmm)
diff --git a/gmp-6.3.0/mpn/m68k/rshift.asm b/gmp-6.3.0/mpn/m68k/rshift.asm
new file mode 100644
index 0000000..21b5f89
--- /dev/null
+++ b/gmp-6.3.0/mpn/m68k/rshift.asm
@@ -0,0 +1,175 @@
+dnl mc68020 mpn_rshift -- mpn right shift.
+
+dnl Copyright 1996, 1999-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C shift==1 shift>1
+C 68040: 9 12
+
+
+C mp_limb_t mpn_rshift (mp_ptr res_ptr, mp_srcptr s_ptr, mp_size_t s_size,
+C unsigned cnt);
+C
+C The "cnt" parameter is either 16 bits or 32 bits depending on
+C SIZEOF_UNSIGNED (see ABI notes in mpn/m68k/README). The value is of
+C course only 1 to 31. When loaded as 16 bits there's garbage in the upper
+C half, hence the use of cmpw. The shift instructions take their count
+C modulo 64, so the upper part doesn't matter to them either.
+C
+
+C INPUT PARAMETERS
+C res_ptr (sp + 4)
+C s_ptr (sp + 8)
+C s_size (sp + 12)
+C cnt (sp + 16)
+
+define(res_ptr, `a1')
+define(s_ptr, `a0')
+define(s_size, `d6')
+define(cnt, `d4')
+
+ifdef(`SIZEOF_UNSIGNED',,
+`m4_error(`SIZEOF_UNSIGNED not defined, should be in config.m4
+')')
+
+PROLOGUE(mpn_rshift)
+C Save used registers on the stack.
+ moveml d2-d6/a2, M(-,sp)
+
+C Copy the arguments to registers.
+ movel M(sp,28), res_ptr
+ movel M(sp,32), s_ptr
+ movel M(sp,36), s_size
+ifelse(SIZEOF_UNSIGNED,2,
+` movew M(sp,40), cnt',
+` movel M(sp,40), cnt')
+
+ moveql #1, d5
+ cmpw d5, cnt
+ bne L(Lnormal)
+ cmpl res_ptr, s_ptr
+ bls L(Lspecial) C jump if res_ptr >= s_ptr
+
+ifelse(scale_available_p,1,`
+ lea M(res_ptr,s_size,l,4), a2
+',`
+ movel s_size, d0
+ asll #2, d0
+ lea M(res_ptr,d0,l), a2
+')
+ cmpl s_ptr, a2
+ bls L(Lspecial) C jump if s_ptr >= res_ptr + s_size
+
+L(Lnormal):
+ moveql #32, d5
+ subl cnt, d5
+ movel M(s_ptr,+), d2
+ movel d2, d0
+ lsll d5, d0 C compute carry limb
+
+ lsrl cnt, d2
+ movel d2, d1
+ subql #1, s_size
+ beq L(Lend)
+ lsrl #1, s_size
+ bcs L(L1)
+ subql #1, s_size
+
+L(Loop):
+ movel M(s_ptr,+), d2
+ movel d2, d3
+ lsll d5, d3
+ orl d3, d1
+ movel d1, M(res_ptr,+)
+ lsrl cnt, d2
+L(L1):
+ movel M(s_ptr,+), d1
+ movel d1, d3
+ lsll d5, d3
+ orl d3, d2
+ movel d2, M(res_ptr,+)
+ lsrl cnt, d1
+
+ dbf s_size, L(Loop)
+ subl #0x10000, s_size
+ bcc L(Loop)
+
+L(Lend):
+ movel d1, M(res_ptr) C store most significant limb
+
+C Restore used registers from stack frame.
+ moveml M(sp,+), d2-d6/a2
+ rts
+
+C We loop from most significant end of the arrays, which is only permissible
+C if the source and destination don't overlap, since the function is
+C documented to work for overlapping source and destination.
+
+L(Lspecial):
+ifelse(scale_available_p,1,`
+ lea M(s_ptr,s_size,l,4), s_ptr
+ lea M(res_ptr,s_size,l,4), res_ptr
+',`
+ movel s_size, d0
+ asll #2, d0
+ addl d0, s_ptr
+ addl d0, res_ptr
+')
+
+ clrl d0 C initialize carry
+ eorw #1, s_size
+ lsrl #1, s_size
+ bcc L(LL1)
+ subql #1, s_size
+
+L(LLoop):
+ movel M(-,s_ptr), d2
+ roxrl #1, d2
+ movel d2, M(-,res_ptr)
+L(LL1):
+ movel M(-,s_ptr), d2
+ roxrl #1, d2
+ movel d2, M(-,res_ptr)
+
+ dbf s_size, L(LLoop)
+ roxrl #1, d0 C save cy in msb
+ subl #0x10000, s_size
+ bcs L(LLend)
+ addl d0, d0 C restore cy
+ bra L(LLoop)
+
+L(LLend):
+C Restore used registers from stack frame.
+ moveml M(sp,+), d2-d6/a2
+ rts
+
+EPILOGUE(mpn_rshift)
diff --git a/gmp-6.3.0/mpn/m68k/t-m68k-defs.pl b/gmp-6.3.0/mpn/m68k/t-m68k-defs.pl
new file mode 100644
index 0000000..91c21fa
--- /dev/null
+++ b/gmp-6.3.0/mpn/m68k/t-m68k-defs.pl
@@ -0,0 +1,91 @@
+#! /usr/bin/perl -w
+
+# Copyright 2001, 2003 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of either:
+#
+#   * the GNU Lesser General Public License as published by the Free
+#     Software Foundation; either version 3 of the License, or (at your
+#     option) any later version.
+#
+# or
+#
+#   * the GNU General Public License as published by the Free Software
+#     Foundation; either version 2 of the License, or (at your option) any
+#     later version.
+#
+# or both in parallel, as here.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+#
+# You should have received copies of the GNU General Public License and the
+# GNU Lesser General Public License along with the GNU MP Library.  If not,
+# see https://www.gnu.org/licenses/.
+
+
+# Usage: perl t-m68k-defs.pl [-t]
+#
+# Run this in the mpn/m68k source directory to check that m68k-defs.m4 has
+# m68k_defbranch()s or m68k_definsn()s for each instruction used in *.asm
+# and */*.asm.  Print nothing if everything is ok.
The -t option prints
+# some diagnostic traces.
+
+use strict;
+use Getopt::Std;
+
+my %opt;
+getopts('t', \%opt);
+
+my %branch;
+my %insn;
+
+open(FD, "<m68k-defs.m4") or die "Cannot open m68k-defs.m4: $!";
+while (<FD>) {
+  if (/^m68k_defbranch\(\s*(.*)\)/)       { $branch{"b".$1} = 1; }
+  if (/^m68k_definsn\(\s*(.*),\s*(.*)\)/) { $insn{$1.$2} = 1; }
+}
+close(FD);
+
+print "branches: ", join(" ",keys(%branch)), "\n" if $opt{'t'};
+print "insns: ", join(" ",keys(%insn)), "\n" if $opt{'t'};
+
+
+foreach my $file (glob("*.asm"), glob("*/*.asm")) {
+  print "file $file\n" if $opt{'t'};
+
+  open(FD, "<$file") or die "Cannot open $file: $!";
+  while (<FD>) {
+    if (/^[ \t]*C/) { next; };
+    if (/^\t([a-z0-9]+)/) {
+      my $opcode = $1;
+      print "opcode $1\n" if $opt{'t'};
+
+      # instructions with an l, w or b suffix should have a definsn
+      # (unless they're already a defbranch)
+      if ($opcode =~ /[lwb]$/
+          && ! defined $insn{$opcode}
+          && ! defined $branch{$opcode})
+      {
+        print "$file: $.: missing m68k_definsn: $opcode\n";
+      }
+
+      # instructions bXX should have a defbranch (unless they're
+      # already a definsn)
+      if ($opcode =~ /^b/
+          && ! defined $insn{$opcode}
+          && ! defined $branch{$opcode})
+      {
+        print "$file: $.: missing m68k_defbranch: $opcode\n";
+      }
+    }
+  }
+  close(FD);
+}
diff --git a/gmp-6.3.0/mpn/m88k/README b/gmp-6.3.0/mpn/m88k/README
new file mode 100644
index 0000000..1b51e83
--- /dev/null
+++ b/gmp-6.3.0/mpn/m88k/README
@@ -0,0 +1,61 @@
+Copyright 2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+                        M88K MPN SUBROUTINES
+
+This directory contains mpn functions for various m88k family chips.
+
+CODE ORGANIZATION
+
+	m88k          m88000, m88100
+	m88k/mc88110  m88110
+
+STATUS
+
+The code herein is old and poorly maintained.
+
+* The .s files assume the system uses a "_" underscore prefix, which
+  should be controlled by configure.
+
+* The mc88110/*.S files are using the defunct "sysdep.h" configuration
+  scheme and won't compile.
+
+Conversion to the current m4 .asm style wouldn't be difficult.
+
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/gmp-6.3.0/mpn/m88k/add_n.s b/gmp-6.3.0/mpn/m88k/add_n.s
new file mode 100644
index 0000000..dbdb22f
--- /dev/null
+++ b/gmp-6.3.0/mpn/m88k/add_n.s
@@ -0,0 +1,113 @@
+; mc88100 mpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+; +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of either: +; +; * the GNU Lesser General Public License as published by the Free +; Software Foundation; either version 3 of the License, or (at your +; option) any later version. +; +; or +; +; * the GNU General Public License as published by the Free Software +; Foundation; either version 2 of the License, or (at your option) any +; later version. +; +; or both in parallel, as here. +; +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +; for more details. +; +; You should have received copies of the GNU General Public License and the +; GNU Lesser General Public License along with the GNU MP Library. If not, +; see https://www.gnu.org/licenses/. + + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; s2_ptr r4 +; size r5 + +; This code has been optimized to run one instruction per clock, avoiding +; load stalls and writeback contention. As a result, the instruction +; order is not always natural. + +; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100, +; but on the 88110, it seems to run much slower, 6.6 clocks/limb. + + text + align 16 + global ___gmpn_add_n +___gmpn_add_n: + ld r6,r3,0 ; read first limb from s1_ptr + extu r10,r5,3 + ld r7,r4,0 ; read first limb from s2_ptr + + subu.co r5,r0,r5 ; (clear carry as side effect) + mak r5,r5,3<4> + bcnd eq0,r5,Lzero + + or r12,r0,lo16(Lbase) + or.u r12,r12,hi16(Lbase) + addu r12,r12,r5 ; r12 is address for entering in loop + + extu r5,r5,2 ; divide by 4 + subu r2,r2,r5 ; adjust res_ptr + subu r3,r3,r5 ; adjust s1_ptr + subu r4,r4,r5 ; adjust s2_ptr + + or r8,r6,r0 + + jmp.n r12 + or r9,r7,r0 + +Loop: addu r3,r3,32 + st r8,r2,28 + addu r4,r4,32 + ld r6,r3,0 + addu r2,r2,32 + ld r7,r4,0 +Lzero: subu r10,r10,1 ; add 0 + 8r limbs (adj loop cnt) +Lbase: ld r8,r3,4 + addu.cio r6,r6,r7 + ld r9,r4,4 + st r6,r2,0 + ld r6,r3,8 ; add 7 + 8r limbs + addu.cio r8,r8,r9 + ld r7,r4,8 + st r8,r2,4 + ld r8,r3,12 ; add 6 + 8r limbs + addu.cio r6,r6,r7 + ld r9,r4,12 + st r6,r2,8 + ld r6,r3,16 ; add 5 + 8r limbs + addu.cio r8,r8,r9 + ld r7,r4,16 + st r8,r2,12 + ld r8,r3,20 ; add 4 + 8r limbs + addu.cio r6,r6,r7 + ld r9,r4,20 + st r6,r2,16 + ld r6,r3,24 ; add 3 + 8r limbs + addu.cio r8,r8,r9 + ld r7,r4,24 + st r8,r2,20 + ld r8,r3,28 ; add 2 + 8r limbs + addu.cio r6,r6,r7 + ld r9,r4,28 + st r6,r2,24 + bcnd.n ne0,r10,Loop ; add 1 + 8r limbs + addu.cio r8,r8,r9 + + st r8,r2,28 ; store most significant limb + + jmp.n r1 + addu.ci r2,r0,r0 ; return carry-out from most sign. limb diff --git a/gmp-6.3.0/mpn/m88k/mc88110/add_n.S b/gmp-6.3.0/mpn/m88k/mc88110/add_n.S new file mode 100644 index 0000000..c3b12b3 --- /dev/null +++ b/gmp-6.3.0/mpn/m88k/mc88110/add_n.S @@ -0,0 +1,209 @@ +; mc88110 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +; sum in a third limb vector. + +; Copyright 1995, 1996, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. +; +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of either: +; +; * the GNU Lesser General Public License as published by the Free +; Software Foundation; either version 3 of the License, or (at your +; option) any later version. 
+; +; or +; +; * the GNU General Public License as published by the Free Software +; Foundation; either version 2 of the License, or (at your option) any +; later version. +; +; or both in parallel, as here. +; +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +; for more details. +; +; You should have received copies of the GNU General Public License and the +; GNU Lesser General Public License along with the GNU MP Library. If not, +; see https://www.gnu.org/licenses/. + + +; INPUT PARAMETERS +#define res_ptr r2 +#define s1_ptr r3 +#define s2_ptr r4 +#define size r5 + +#include "sysdep.h" + + text + align 16 + global C_SYMBOL_NAME(__gmpn_add_n) +C_SYMBOL_NAME(__gmpn_add_n): + addu.co r0,r0,r0 ; clear cy flag + xor r12,s2_ptr,res_ptr + bb1 2,r12,L1 +; ** V1a ** +L0: bb0 2,res_ptr,L_v1 ; branch if res_ptr is aligned? +/* Add least significant limb separately to align res_ptr and s2_ptr */ + ld r10,s1_ptr,0 + addu s1_ptr,s1_ptr,4 + ld r8,s2_ptr,0 + addu s2_ptr,s2_ptr,4 + subu size,size,1 + addu.co r6,r10,r8 + st r6,res_ptr,0 + addu res_ptr,res_ptr,4 +L_v1: cmp r12,size,2 + bb1 lt,r12,Lend2 + + ld r10,s1_ptr,0 + ld r12,s1_ptr,4 + ld.d r8,s2_ptr,0 + subu size,size,10 + bcnd lt0,size,Lfin1 +/* Add blocks of 8 limbs until less than 8 limbs remain */ + align 8 +Loop1: subu size,size,8 + addu.cio r6,r10,r8 + ld r10,s1_ptr,8 + addu.cio r7,r12,r9 + ld r12,s1_ptr,12 + ld.d r8,s2_ptr,8 + st.d r6,res_ptr,0 + addu.cio r6,r10,r8 + ld r10,s1_ptr,16 + addu.cio r7,r12,r9 + ld r12,s1_ptr,20 + ld.d r8,s2_ptr,16 + st.d r6,res_ptr,8 + addu.cio r6,r10,r8 + ld r10,s1_ptr,24 + addu.cio r7,r12,r9 + ld r12,s1_ptr,28 + ld.d r8,s2_ptr,24 + st.d r6,res_ptr,16 + addu.cio r6,r10,r8 + ld r10,s1_ptr,32 + addu.cio r7,r12,r9 + ld r12,s1_ptr,36 + addu s1_ptr,s1_ptr,32 + ld.d r8,s2_ptr,32 + addu s2_ptr,s2_ptr,32 + st.d r6,res_ptr,24 + addu res_ptr,res_ptr,32 + bcnd ge0,size,Loop1 + +Lfin1: addu size,size,8-2 + bcnd lt0,size,Lend1 +/* Add blocks of 2 limbs until less than 2 limbs remain */ +Loope1: addu.cio r6,r10,r8 + ld r10,s1_ptr,8 + addu.cio r7,r12,r9 + ld r12,s1_ptr,12 + ld.d r8,s2_ptr,8 + st.d r6,res_ptr,0 + subu size,size,2 + addu s1_ptr,s1_ptr,8 + addu s2_ptr,s2_ptr,8 + addu res_ptr,res_ptr,8 + bcnd ge0,size,Loope1 +Lend1: addu.cio r6,r10,r8 + addu.cio r7,r12,r9 + st.d r6,res_ptr,0 + + bb0 0,size,Lret1 +/* Add last limb */ + ld r10,s1_ptr,8 + ld r8,s2_ptr,8 + addu.cio r6,r10,r8 + st r6,res_ptr,8 + +Lret1: jmp.n r1 + addu.ci r2,r0,r0 ; return carry-out from most sign. limb + +L1: xor r12,s1_ptr,res_ptr + bb1 2,r12,L2 +; ** V1b ** + or r12,r0,s2_ptr + or s2_ptr,r0,s1_ptr + or s1_ptr,r0,r12 + br L0 + +; ** V2 ** +/* If we come here, the alignment of s1_ptr and res_ptr as well as the + alignment of s2_ptr and res_ptr differ. Since there are only two ways + things can be aligned (that we care about) we now know that the alignment + of s1_ptr and s2_ptr are the same. 
*/ + +L2: cmp r12,size,1 + bb1 eq,r12,Ljone + bb0 2,s1_ptr,L_v2 ; branch if s1_ptr is aligned +/* Add least significant limb separately to align res_ptr and s2_ptr */ + ld r10,s1_ptr,0 + addu s1_ptr,s1_ptr,4 + ld r8,s2_ptr,0 + addu s2_ptr,s2_ptr,4 + subu size,size,1 + addu.co r6,r10,r8 + st r6,res_ptr,0 + addu res_ptr,res_ptr,4 + +L_v2: subu size,size,8 + bcnd lt0,size,Lfin2 +/* Add blocks of 8 limbs until less than 8 limbs remain */ + align 8 +Loop2: subu size,size,8 + ld.d r8,s1_ptr,0 + ld.d r6,s2_ptr,0 + addu.cio r8,r8,r6 + st r8,res_ptr,0 + addu.cio r9,r9,r7 + st r9,res_ptr,4 + ld.d r8,s1_ptr,8 + ld.d r6,s2_ptr,8 + addu.cio r8,r8,r6 + st r8,res_ptr,8 + addu.cio r9,r9,r7 + st r9,res_ptr,12 + ld.d r8,s1_ptr,16 + ld.d r6,s2_ptr,16 + addu.cio r8,r8,r6 + st r8,res_ptr,16 + addu.cio r9,r9,r7 + st r9,res_ptr,20 + ld.d r8,s1_ptr,24 + ld.d r6,s2_ptr,24 + addu.cio r8,r8,r6 + st r8,res_ptr,24 + addu.cio r9,r9,r7 + st r9,res_ptr,28 + addu s1_ptr,s1_ptr,32 + addu s2_ptr,s2_ptr,32 + addu res_ptr,res_ptr,32 + bcnd ge0,size,Loop2 + +Lfin2: addu size,size,8-2 + bcnd lt0,size,Lend2 +Loope2: ld.d r8,s1_ptr,0 + ld.d r6,s2_ptr,0 + addu.cio r8,r8,r6 + st r8,res_ptr,0 + addu.cio r9,r9,r7 + st r9,res_ptr,4 + subu size,size,2 + addu s1_ptr,s1_ptr,8 + addu s2_ptr,s2_ptr,8 + addu res_ptr,res_ptr,8 + bcnd ge0,size,Loope2 +Lend2: bb0 0,size,Lret2 +/* Add last limb */ +Ljone: ld r10,s1_ptr,0 + ld r8,s2_ptr,0 + addu.cio r6,r10,r8 + st r6,res_ptr,0 + +Lret2: jmp.n r1 + addu.ci r2,r0,r0 ; return carry-out from most sign. limb diff --git a/gmp-6.3.0/mpn/m88k/mc88110/addmul_1.s b/gmp-6.3.0/mpn/m88k/mc88110/addmul_1.s new file mode 100644 index 0000000..321221f --- /dev/null +++ b/gmp-6.3.0/mpn/m88k/mc88110/addmul_1.s @@ -0,0 +1,70 @@ +; mc88110 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and +; store the product in a second limb vector. + +; Copyright 1996, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. +; +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of either: +; +; * the GNU Lesser General Public License as published by the Free +; Software Foundation; either version 3 of the License, or (at your +; option) any later version. +; +; or +; +; * the GNU General Public License as published by the Free Software +; Foundation; either version 2 of the License, or (at your option) any +; later version. +; +; or both in parallel, as here. +; +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +; for more details. +; +; You should have received copies of the GNU General Public License and the +; GNU Lesser General Public License along with the GNU MP Library. If not, +; see https://www.gnu.org/licenses/. 
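+
+; A worked illustration (added for clarity, not in the original source):
+; mpn_addmul_1 computes {res_ptr, size} += {s1_ptr, size} * s2_limb and
+; returns the carry-out limb in r2.  With size=1, res={7}, s1={5} and
+; s2_limb=3 it leaves res={22} and returns 0; with s1={0x80000000} and
+; s2_limb=4 it leaves res={7} and returns 2, since the low 32 bits of
+; 2**33 are zero.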
+ + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; size r4 +; s2_limb r5 + + text + align 16 + global ___gmpn_addmul_1 +___gmpn_addmul_1: + lda r3,r3[r4] + lda r8,r2[r4] ; RES_PTR in r8 since r2 is retval + subu r4,r0,r4 + addu.co r2,r0,r0 ; r2 = cy = 0 + + ld r6,r3[r4] + addu r4,r4,1 + subu r8,r8,4 + bcnd.n eq0,r4,Lend + mulu.d r10,r6,r5 + +Loop: ld r7,r8[r4] + ld r6,r3[r4] + addu.cio r9,r11,r2 + addu.ci r2,r10,r0 + addu.co r9,r9,r7 + st r9,r8[r4] + addu r4,r4,1 + mulu.d r10,r6,r5 + bcnd ne0,r4,Loop + +Lend: ld r7,r8,0 + addu.cio r9,r11,r2 + addu.ci r2,r10,r0 + addu.co r9,r9,r7 + st r9,r8,0 + jmp.n r1 + addu.ci r2,r2,r0 diff --git a/gmp-6.3.0/mpn/m88k/mc88110/mul_1.s b/gmp-6.3.0/mpn/m88k/mc88110/mul_1.s new file mode 100644 index 0000000..28fd14b --- /dev/null +++ b/gmp-6.3.0/mpn/m88k/mc88110/mul_1.s @@ -0,0 +1,68 @@ +; mc88110 __gmpn_mul_1 -- Multiply a limb vector with a single limb and +; store the product in a second limb vector. + +; Copyright 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. +; +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of either: +; +; * the GNU Lesser General Public License as published by the Free +; Software Foundation; either version 3 of the License, or (at your +; option) any later version. +; +; or +; +; * the GNU General Public License as published by the Free Software +; Foundation; either version 2 of the License, or (at your option) any +; later version. +; +; or both in parallel, as here. +; +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +; for more details. +; +; You should have received copies of the GNU General Public License and the +; GNU Lesser General Public License along with the GNU MP Library. If not, +; see https://www.gnu.org/licenses/. + + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; size r4 +; s2_limb r5 + + text + align 16 + global ___gmpn_mul_1 +___gmpn_mul_1: + ; Make S1_PTR and RES_PTR point at the end of their blocks + ; and negate SIZE. + lda r3,r3[r4] + lda r8,r2[r4] ; RES_PTR in r8 since r2 is retval + subu r4,r0,r4 + + addu.co r2,r0,r0 ; r2 = cy = 0 + + ld r6,r3[r4] + addu r4,r4,1 + mulu.d r10,r6,r5 + bcnd.n eq0,r4,Lend + subu r8,r8,8 + +Loop: ld r6,r3[r4] + addu.cio r9,r11,r2 + or r2,r10,r0 ; could be avoided if unrolled + addu r4,r4,1 + mulu.d r10,r6,r5 + bcnd.n ne0,r4,Loop + st r9,r8[r4] + +Lend: addu.cio r9,r11,r2 + st r9,r8,4 + jmp.n r1 + addu.ci r2,r10,r0 diff --git a/gmp-6.3.0/mpn/m88k/mc88110/sub_n.S b/gmp-6.3.0/mpn/m88k/mc88110/sub_n.S new file mode 100644 index 0000000..f0a8ecb --- /dev/null +++ b/gmp-6.3.0/mpn/m88k/mc88110/sub_n.S @@ -0,0 +1,285 @@ +; mc88110 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +; store difference in a third limb vector. + +; Copyright 1995, 1996, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. +; +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of either: +; +; * the GNU Lesser General Public License as published by the Free +; Software Foundation; either version 3 of the License, or (at your +; option) any later version. +; +; or +; +; * the GNU General Public License as published by the Free Software +; Foundation; either version 2 of the License, or (at your option) any +; later version. 
+; +; or both in parallel, as here. +; +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +; for more details. +; +; You should have received copies of the GNU General Public License and the +; GNU Lesser General Public License along with the GNU MP Library. If not, +; see https://www.gnu.org/licenses/. + + +; INPUT PARAMETERS +#define res_ptr r2 +#define s1_ptr r3 +#define s2_ptr r4 +#define size r5 + +#include "sysdep.h" + + text + align 16 + global C_SYMBOL_NAME(__gmpn_sub_n) +C_SYMBOL_NAME(__gmpn_sub_n): + subu.co r0,r0,r0 ; set cy flag + xor r12,s2_ptr,res_ptr + bb1 2,r12,L1 +; ** V1a ** +L0: bb0 2,res_ptr,L_v1 ; branch if res_ptr is aligned +/* Add least significant limb separately to align res_ptr and s2_ptr */ + ld r10,s1_ptr,0 + addu s1_ptr,s1_ptr,4 + ld r8,s2_ptr,0 + addu s2_ptr,s2_ptr,4 + subu size,size,1 + subu.co r6,r10,r8 + st r6,res_ptr,0 + addu res_ptr,res_ptr,4 +L_v1: cmp r12,size,2 + bb1 lt,r12,Lend2 + + ld r10,s1_ptr,0 + ld r12,s1_ptr,4 + ld.d r8,s2_ptr,0 + subu size,size,10 + bcnd lt0,size,Lfin1 +/* Add blocks of 8 limbs until less than 8 limbs remain */ + align 8 +Loop1: subu size,size,8 + subu.cio r6,r10,r8 + ld r10,s1_ptr,8 + subu.cio r7,r12,r9 + ld r12,s1_ptr,12 + ld.d r8,s2_ptr,8 + st.d r6,res_ptr,0 + subu.cio r6,r10,r8 + ld r10,s1_ptr,16 + subu.cio r7,r12,r9 + ld r12,s1_ptr,20 + ld.d r8,s2_ptr,16 + st.d r6,res_ptr,8 + subu.cio r6,r10,r8 + ld r10,s1_ptr,24 + subu.cio r7,r12,r9 + ld r12,s1_ptr,28 + ld.d r8,s2_ptr,24 + st.d r6,res_ptr,16 + subu.cio r6,r10,r8 + ld r10,s1_ptr,32 + subu.cio r7,r12,r9 + ld r12,s1_ptr,36 + addu s1_ptr,s1_ptr,32 + ld.d r8,s2_ptr,32 + addu s2_ptr,s2_ptr,32 + st.d r6,res_ptr,24 + addu res_ptr,res_ptr,32 + bcnd ge0,size,Loop1 + +Lfin1: addu size,size,8-2 + bcnd lt0,size,Lend1 +/* Add blocks of 2 limbs until less than 2 limbs remain */ +Loope1: subu.cio r6,r10,r8 + ld r10,s1_ptr,8 + subu.cio r7,r12,r9 + ld r12,s1_ptr,12 + ld.d r8,s2_ptr,8 + st.d r6,res_ptr,0 + subu size,size,2 + addu s1_ptr,s1_ptr,8 + addu s2_ptr,s2_ptr,8 + addu res_ptr,res_ptr,8 + bcnd ge0,size,Loope1 +Lend1: subu.cio r6,r10,r8 + subu.cio r7,r12,r9 + st.d r6,res_ptr,0 + + bb0 0,size,Lret1 +/* Add last limb */ + ld r10,s1_ptr,8 + ld r8,s2_ptr,8 + subu.cio r6,r10,r8 + st r6,res_ptr,8 + +Lret1: addu.ci r2,r0,r0 ; return carry-out from most sign. 
limb + jmp.n r1 + xor r2,r2,1 + +L1: xor r12,s1_ptr,res_ptr + bb1 2,r12,L2 +; ** V1b ** + bb0 2,res_ptr,L_v1b ; branch if res_ptr is aligned +/* Add least significant limb separately to align res_ptr and s1_ptr */ + ld r10,s2_ptr,0 + addu s2_ptr,s2_ptr,4 + ld r8,s1_ptr,0 + addu s1_ptr,s1_ptr,4 + subu size,size,1 + subu.co r6,r8,r10 + st r6,res_ptr,0 + addu res_ptr,res_ptr,4 +L_v1b: cmp r12,size,2 + bb1 lt,r12,Lend2 + + ld r10,s2_ptr,0 + ld r12,s2_ptr,4 + ld.d r8,s1_ptr,0 + subu size,size,10 + bcnd lt0,size,Lfin1b +/* Add blocks of 8 limbs until less than 8 limbs remain */ + align 8 +Loop1b: subu size,size,8 + subu.cio r6,r8,r10 + ld r10,s2_ptr,8 + subu.cio r7,r9,r12 + ld r12,s2_ptr,12 + ld.d r8,s1_ptr,8 + st.d r6,res_ptr,0 + subu.cio r6,r8,r10 + ld r10,s2_ptr,16 + subu.cio r7,r9,r12 + ld r12,s2_ptr,20 + ld.d r8,s1_ptr,16 + st.d r6,res_ptr,8 + subu.cio r6,r8,r10 + ld r10,s2_ptr,24 + subu.cio r7,r9,r12 + ld r12,s2_ptr,28 + ld.d r8,s1_ptr,24 + st.d r6,res_ptr,16 + subu.cio r6,r8,r10 + ld r10,s2_ptr,32 + subu.cio r7,r9,r12 + ld r12,s2_ptr,36 + addu s2_ptr,s2_ptr,32 + ld.d r8,s1_ptr,32 + addu s1_ptr,s1_ptr,32 + st.d r6,res_ptr,24 + addu res_ptr,res_ptr,32 + bcnd ge0,size,Loop1b + +Lfin1b: addu size,size,8-2 + bcnd lt0,size,Lend1b +/* Add blocks of 2 limbs until less than 2 limbs remain */ +Loope1b:subu.cio r6,r8,r10 + ld r10,s2_ptr,8 + subu.cio r7,r9,r12 + ld r12,s2_ptr,12 + ld.d r8,s1_ptr,8 + st.d r6,res_ptr,0 + subu size,size,2 + addu s1_ptr,s1_ptr,8 + addu s2_ptr,s2_ptr,8 + addu res_ptr,res_ptr,8 + bcnd ge0,size,Loope1b +Lend1b: subu.cio r6,r8,r10 + subu.cio r7,r9,r12 + st.d r6,res_ptr,0 + + bb0 0,size,Lret1b +/* Add last limb */ + ld r10,s2_ptr,8 + ld r8,s1_ptr,8 + subu.cio r6,r8,r10 + st r6,res_ptr,8 + +Lret1b: addu.ci r2,r0,r0 ; return carry-out from most sign. limb + jmp.n r1 + xor r2,r2,1 + +; ** V2 ** +/* If we come here, the alignment of s1_ptr and res_ptr as well as the + alignment of s2_ptr and res_ptr differ. Since there are only two ways + things can be aligned (that we care about) we now know that the alignment + of s1_ptr and s2_ptr are the same. 
*/ + +L2: cmp r12,size,1 + bb1 eq,r12,Ljone + bb0 2,s1_ptr,L_v2 ; branch if s1_ptr is aligned +/* Add least significant limb separately to align res_ptr and s2_ptr */ + ld r10,s1_ptr,0 + addu s1_ptr,s1_ptr,4 + ld r8,s2_ptr,0 + addu s2_ptr,s2_ptr,4 + subu size,size,1 + subu.co r6,r10,r8 + st r6,res_ptr,0 + addu res_ptr,res_ptr,4 + +L_v2: subu size,size,8 + bcnd lt0,size,Lfin2 +/* Add blocks of 8 limbs until less than 8 limbs remain */ + align 8 +Loop2: subu size,size,8 + ld.d r8,s1_ptr,0 + ld.d r6,s2_ptr,0 + subu.cio r8,r8,r6 + st r8,res_ptr,0 + subu.cio r9,r9,r7 + st r9,res_ptr,4 + ld.d r8,s1_ptr,8 + ld.d r6,s2_ptr,8 + subu.cio r8,r8,r6 + st r8,res_ptr,8 + subu.cio r9,r9,r7 + st r9,res_ptr,12 + ld.d r8,s1_ptr,16 + ld.d r6,s2_ptr,16 + subu.cio r8,r8,r6 + st r8,res_ptr,16 + subu.cio r9,r9,r7 + st r9,res_ptr,20 + ld.d r8,s1_ptr,24 + ld.d r6,s2_ptr,24 + subu.cio r8,r8,r6 + st r8,res_ptr,24 + subu.cio r9,r9,r7 + st r9,res_ptr,28 + addu s1_ptr,s1_ptr,32 + addu s2_ptr,s2_ptr,32 + addu res_ptr,res_ptr,32 + bcnd ge0,size,Loop2 + +Lfin2: addu size,size,8-2 + bcnd lt0,size,Lend2 +Loope2: ld.d r8,s1_ptr,0 + ld.d r6,s2_ptr,0 + subu.cio r8,r8,r6 + st r8,res_ptr,0 + subu.cio r9,r9,r7 + st r9,res_ptr,4 + subu size,size,2 + addu s1_ptr,s1_ptr,8 + addu s2_ptr,s2_ptr,8 + addu res_ptr,res_ptr,8 + bcnd ge0,size,Loope2 +Lend2: bb0 0,size,Lret2 +/* Add last limb */ +Ljone: ld r10,s1_ptr,0 + ld r8,s2_ptr,0 + subu.cio r6,r10,r8 + st r6,res_ptr,0 + +Lret2: addu.ci r2,r0,r0 ; return carry-out from most sign. limb + jmp.n r1 + xor r2,r2,1 diff --git a/gmp-6.3.0/mpn/m88k/mul_1.s b/gmp-6.3.0/mpn/m88k/mul_1.s new file mode 100644 index 0000000..c8abdc0 --- /dev/null +++ b/gmp-6.3.0/mpn/m88k/mul_1.s @@ -0,0 +1,136 @@ +; mc88100 __gmpn_mul_1 -- Multiply a limb vector with a single limb and +; store the product in a second limb vector. + +; Copyright 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. +; +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of either: +; +; * the GNU Lesser General Public License as published by the Free +; Software Foundation; either version 3 of the License, or (at your +; option) any later version. +; +; or +; +; * the GNU General Public License as published by the Free Software +; Foundation; either version 2 of the License, or (at your option) any +; later version. +; +; or both in parallel, as here. +; +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +; for more details. +; +; You should have received copies of the GNU General Public License and the +; GNU Lesser General Public License along with the GNU MP Library. If not, +; see https://www.gnu.org/licenses/. + + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; size r4 +; s2_limb r5 + +; Common overhead is about 11 cycles/invocation. + +; The speed for S2_LIMB >= 0x10000 is approximately 21 cycles/limb. (The +; pipeline stalls 2 cycles due to WB contention.) + +; The speed for S2_LIMB < 0x10000 is approximately 16 cycles/limb. (The +; pipeline stalls 2 cycles due to WB contention and 1 cycle due to latency.) + +; To enhance speed: +; 1. Unroll main loop 4-8 times. +; 2. Schedule code to avoid WB contention. 
It might be tempting to move the
+; ld instruction in the loops down to save 2 cycles (less WB contention),
+; but that loses because the ultimate value will be read from outside
+; the allocated space.  But if we handle the ultimate multiplication in
+; the tail, we can do this.
+; 3. Make the multiplication with fewer instructions.  I think the code for
+; (S2_LIMB >= 0x10000) is not minimal.
+; With these techniques the (S2_LIMB >= 0x10000) case would run in 17 or
+; fewer cycles/limb; the (S2_LIMB < 0x10000) case would run in 11
+; cycles/limb.  (Assuming infinite unrolling.)
+
+	text
+	align 16
+	global ___gmpn_mul_1
+___gmpn_mul_1:
+
+	; Make S1_PTR and RES_PTR point at the end of their blocks
+	; and negate SIZE.
+	lda r3,r3[r4]
+	lda r6,r2[r4]	; RES_PTR in r6 since r2 is retval
+	subu r4,r0,r4
+
+	addu.co r2,r0,r0	; r2 = cy = 0
+	ld r9,r3[r4]
+	mask r7,r5,0xffff	; r7 = lo(S2_LIMB)
+	extu r8,r5,16	; r8 = hi(S2_LIMB)
+	bcnd.n eq0,r8,Lsmall	; jump if (hi(S2_LIMB) == 0)
+	subu r6,r6,4
+
+; General code for any value of S2_LIMB.
+
+	; Make a stack frame and save r25 and r26
+	subu r31,r31,16
+	st.d r25,r31,8
+
+	; Enter the loop in the middle
+	br.n L1
+	addu r4,r4,1
+
+Loop:	ld r9,r3[r4]
+	st r26,r6[r4]
+; bcnd ne0,r0,0	; bubble
+	addu r4,r4,1
+L1:	mul r26,r9,r5	; low word of product	mul_1	WB ld
+	mask r12,r9,0xffff	; r12 = lo(s1_limb)	mask_1
+	mul r11,r12,r7	; r11 = prod_0	mul_2	WB mask_1
+	mul r10,r12,r8	; r10 = prod_1a	mul_3
+	extu r13,r9,16	; r13 = hi(s1_limb)	extu_1	WB mul_1
+	mul r12,r13,r7	; r12 = prod_1b	mul_4	WB extu_1
+	mul r25,r13,r8	; r25 = prod_2	mul_5	WB mul_2
+	extu r11,r11,16	; r11 = hi(prod_0)	extu_2	WB mul_3
+	addu r10,r10,r11	;	addu_1	WB extu_2
+; bcnd ne0,r0,0	; bubble	WB addu_1
+	addu.co r10,r10,r12	;	WB mul_4
+	mask.u r10,r10,0xffff	; move the 16 most significant bits...
+	addu.ci r10,r10,r0	; ...to the low half of the word...
+	rot r10,r10,16	; ...and put carry in pos 16.
+	addu.co r26,r26,r2	; add old carry limb
+	bcnd.n ne0,r4,Loop
+	addu.ci r2,r25,r10	; compute new carry limb
+
+	st r26,r6[r4]
+	ld.d r25,r31,8
+	jmp.n r1
+	addu r31,r31,16
+
+; Fast code for S2_LIMB < 0x10000
+Lsmall:
+	; Enter the loop in the middle
+	br.n SL1
+	addu r4,r4,1
+
+SLoop:	ld r9,r3[r4]	;
+	st r8,r6[r4]	;
+	addu r4,r4,1	;
+SL1:	mul r8,r9,r5	; low word of product
+	mask r12,r9,0xffff	; r12 = lo(s1_limb)
+	extu r13,r9,16	; r13 = hi(s1_limb)
+	mul r11,r12,r7	; r11 = prod_0
+	mul r12,r13,r7	; r12 = prod_1b
+	addu.cio r8,r8,r2	; add old carry limb
+	extu r10,r11,16	; r11 = hi(prod_0)
+	addu r10,r10,r12	;
+	bcnd.n ne0,r4,SLoop
+	extu r2,r10,16	; r2 = new carry limb
+
+	jmp.n r1
+	st r8,r6[r4]
diff --git a/gmp-6.3.0/mpn/m88k/sub_n.s b/gmp-6.3.0/mpn/m88k/sub_n.s
new file mode 100644
index 0000000..2bd8f09
--- /dev/null
+++ b/gmp-6.3.0/mpn/m88k/sub_n.s
@@ -0,0 +1,115 @@
+; mc88100 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+;
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of either:
+;
+;   * the GNU Lesser General Public License as published by the Free
+;     Software Foundation; either version 3 of the License, or (at your
+;     option) any later version.
+;
+; or
+;
+;   * the GNU General Public License as published by the Free Software
+;     Foundation; either version 2 of the License, or (at your option) any
+;     later version.
+; +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +; for more details. +; +; You should have received copies of the GNU General Public License and the +; GNU Lesser General Public License along with the GNU MP Library. If not, +; see https://www.gnu.org/licenses/. + + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; s2_ptr r4 +; size r5 + +; This code has been optimized to run one instruction per clock, avoiding +; load stalls and writeback contention. As a result, the instruction +; order is not always natural. + +; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100, +; but on the 88110, it seems to run much slower, 6.6 clocks/limb. + + text + align 16 + global ___gmpn_sub_n +___gmpn_sub_n: + ld r6,r3,0 ; read first limb from s1_ptr + extu r10,r5,3 + ld r7,r4,0 ; read first limb from s2_ptr + + subu r5,r0,r5 + mak r5,r5,3<4> + bcnd.n eq0,r5,Lzero + subu.co r0,r0,r0 ; initialize carry + + or r12,r0,lo16(Lbase) + or.u r12,r12,hi16(Lbase) + addu r12,r12,r5 ; r12 is address for entering in loop + + extu r5,r5,2 ; divide by 4 + subu r2,r2,r5 ; adjust res_ptr + subu r3,r3,r5 ; adjust s1_ptr + subu r4,r4,r5 ; adjust s2_ptr + + or r8,r6,r0 + + jmp.n r12 + or r9,r7,r0 + +Loop: addu r3,r3,32 + st r8,r2,28 + addu r4,r4,32 + ld r6,r3,0 + addu r2,r2,32 + ld r7,r4,0 +Lzero: subu r10,r10,1 ; subtract 0 + 8r limbs (adj loop cnt) +Lbase: ld r8,r3,4 + subu.cio r6,r6,r7 + ld r9,r4,4 + st r6,r2,0 + ld r6,r3,8 ; subtract 7 + 8r limbs + subu.cio r8,r8,r9 + ld r7,r4,8 + st r8,r2,4 + ld r8,r3,12 ; subtract 6 + 8r limbs + subu.cio r6,r6,r7 + ld r9,r4,12 + st r6,r2,8 + ld r6,r3,16 ; subtract 5 + 8r limbs + subu.cio r8,r8,r9 + ld r7,r4,16 + st r8,r2,12 + ld r8,r3,20 ; subtract 4 + 8r limbs + subu.cio r6,r6,r7 + ld r9,r4,20 + st r6,r2,16 + ld r6,r3,24 ; subtract 3 + 8r limbs + subu.cio r8,r8,r9 + ld r7,r4,24 + st r8,r2,20 + ld r8,r3,28 ; subtract 2 + 8r limbs + subu.cio r6,r6,r7 + ld r9,r4,28 + st r6,r2,24 + bcnd.n ne0,r10,Loop ; subtract 1 + 8r limbs + subu.cio r8,r8,r9 + + st r8,r2,28 ; store most significant limb + + addu.ci r2,r0,r0 ; return carry-out from most sign. limb + jmp.n r1 + xor r2,r2,1 diff --git a/gmp-6.3.0/mpn/matrix22_mul.c b/gmp-6.3.0/mpn/matrix22_mul.c new file mode 120000 index 0000000..df36303 --- /dev/null +++ b/gmp-6.3.0/mpn/matrix22_mul.c @@ -0,0 +1 @@ +../mpn/generic/matrix22_mul.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/matrix22_mul1_inverse_vector.c b/gmp-6.3.0/mpn/matrix22_mul1_inverse_vector.c new file mode 120000 index 0000000..225610b --- /dev/null +++ b/gmp-6.3.0/mpn/matrix22_mul1_inverse_vector.c @@ -0,0 +1 @@ +../mpn/generic/matrix22_mul1_inverse_vector.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/minithres/gmp-mparam.h b/gmp-6.3.0/mpn/minithres/gmp-mparam.h new file mode 100644 index 0000000..35fcb77 --- /dev/null +++ b/gmp-6.3.0/mpn/minithres/gmp-mparam.h @@ -0,0 +1,113 @@ +/* Minimal values gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000, 2006, 2008-2010, 2012 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* The values in this file are not currently minimal. + Trimming them further would be good. */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 3 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 4 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 1 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 3 + +#define MUL_TOOM22_THRESHOLD 8 +#define MUL_TOOM33_THRESHOLD 20 +#define MUL_TOOM44_THRESHOLD 24 +#define MUL_TOOM6H_THRESHOLD 70 /* FIXME */ +#define MUL_TOOM8H_THRESHOLD 86 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 50 /* FIXME */ +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 50 /* FIXME */ +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 50 /* FIXME */ +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 50 /* FIXME */ + +#define SQR_BASECASE_THRESHOLD 0 +#define SQR_TOOM2_THRESHOLD 8 +#define SQR_TOOM3_THRESHOLD 20 +#define SQR_TOOM4_THRESHOLD 24 +#define SQR_TOOM6H_THRESHOLD 70 /* FIXME */ +#define SQR_TOOM8H_THRESHOLD 86 + +#define MULMOD_BNM1_THRESHOLD 10 +#define SQRMOD_BNM1_THRESHOLD 10 + +#define MUL_FFT_TABLE {64, 256, 1024, 4096, 8192, 65536, 0} +#define MUL_FFT_MODF_THRESHOLD 65 +#define MUL_FFT_THRESHOLD 200 + +#define SQR_FFT_TABLE {64, 256, 1024, 4096, 8192, 65536, 0} +#define SQR_FFT_MODF_THRESHOLD 65 +#define SQR_FFT_THRESHOLD 200 + +#define MULLO_BASECASE_THRESHOLD 0 +#define MULLO_DC_THRESHOLD 2 +#define MULLO_MUL_N_THRESHOLD 4 +#define SQRLO_BASECASE_THRESHOLD 0 +#define SQRLO_DC_THRESHOLD 2 +#define SQRLO_SQR_THRESHOLD 4 + + +#define DC_DIV_QR_THRESHOLD 6 +#define DC_DIVAPPR_Q_THRESHOLD 6 +#define DC_BDIV_QR_THRESHOLD 4 +#define DC_BDIV_Q_THRESHOLD 4 + +#define INV_MULMOD_BNM1_THRESHOLD 2 +#define INV_NEWTON_THRESHOLD 6 +#define INV_APPR_THRESHOLD 4 + +#define BINV_NEWTON_THRESHOLD 6 +#define REDC_1_TO_REDC_N_THRESHOLD 9 + +#define MU_DIV_QR_THRESHOLD 8 +#define MU_DIVAPPR_Q_THRESHOLD 8 +#define MUPI_DIV_QR_THRESHOLD 8 +#define MU_BDIV_QR_THRESHOLD 8 +#define MU_BDIV_Q_THRESHOLD 8 + +#define MATRIX22_STRASSEN_THRESHOLD 2 +#define HGCD_THRESHOLD 10 +#define GCD_DC_THRESHOLD 20 +#define GCDEXT_SCHOENHAGE_THRESHOLD 20 +#define JACOBI_BASE_METHOD 1 + +#define GET_STR_DC_THRESHOLD 4 +#define GET_STR_PRECOMPUTE_THRESHOLD 10 +#define SET_STR_THRESHOLD 64 +#define SET_STR_PRECOMPUTE_THRESHOLD 100 + +#define FAC_ODD_THRESHOLD 0 /* always */ +#define FAC_DSC_THRESHOLD 70 diff --git a/gmp-6.3.0/mpn/mips32/add_n.asm b/gmp-6.3.0/mpn/mips32/add_n.asm new file mode 100644 index 0000000..e7d4c48 --- /dev/null +++ b/gmp-6.3.0/mpn/mips32/add_n.asm @@ -0,0 +1,124 @@ +dnl MIPS32 mpn_add_n -- Add two limb vectors of the same length > 0 and store +dnl sum in a third limb vector. + +dnl Copyright 1995, 2000, 2002 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr $4 +C s1_ptr $5 +C s2_ptr $6 +C size $7 + +ASM_START() +PROLOGUE(mpn_add_n) + + lw $10,0($5) + lw $11,0($6) + + addiu $7,$7,-1 + and $9,$7,4-1 C number of limbs in first loop + beq $9,$0,.L0 C if multiple of 4 limbs, skip first loop + move $2,$0 + + subu $7,$7,$9 + +.Loop0: addiu $9,$9,-1 + lw $12,4($5) + addu $11,$11,$2 + lw $13,4($6) + sltu $8,$11,$2 + addu $11,$10,$11 + sltu $2,$11,$10 + sw $11,0($4) + or $2,$2,$8 + + addiu $5,$5,4 + addiu $6,$6,4 + move $10,$12 + move $11,$13 + bne $9,$0,.Loop0 + addiu $4,$4,4 + +.L0: beq $7,$0,.Lend + nop + +.Loop: addiu $7,$7,-4 + + lw $12,4($5) + addu $11,$11,$2 + lw $13,4($6) + sltu $8,$11,$2 + addu $11,$10,$11 + sltu $2,$11,$10 + sw $11,0($4) + or $2,$2,$8 + + lw $10,8($5) + addu $13,$13,$2 + lw $11,8($6) + sltu $8,$13,$2 + addu $13,$12,$13 + sltu $2,$13,$12 + sw $13,4($4) + or $2,$2,$8 + + lw $12,12($5) + addu $11,$11,$2 + lw $13,12($6) + sltu $8,$11,$2 + addu $11,$10,$11 + sltu $2,$11,$10 + sw $11,8($4) + or $2,$2,$8 + + lw $10,16($5) + addu $13,$13,$2 + lw $11,16($6) + sltu $8,$13,$2 + addu $13,$12,$13 + sltu $2,$13,$12 + sw $13,12($4) + or $2,$2,$8 + + addiu $5,$5,16 + addiu $6,$6,16 + + bne $7,$0,.Loop + addiu $4,$4,16 + +.Lend: addu $11,$11,$2 + sltu $8,$11,$2 + addu $11,$10,$11 + sltu $2,$11,$10 + sw $11,0($4) + j $31 + or $2,$2,$8 +EPILOGUE(mpn_add_n) diff --git a/gmp-6.3.0/mpn/mips32/addmul_1.asm b/gmp-6.3.0/mpn/mips32/addmul_1.asm new file mode 100644 index 0000000..9aa9e16 --- /dev/null +++ b/gmp-6.3.0/mpn/mips32/addmul_1.asm @@ -0,0 +1,101 @@ +dnl MIPS32 mpn_addmul_1 -- Multiply a limb vector with a single limb and add +dnl the product to a second limb vector. + +dnl Copyright 1992, 1994, 1996, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr $4 +C s1_ptr $5 +C size $6 +C s2_limb $7 + +ASM_START() +PROLOGUE(mpn_addmul_1) + +C feed-in phase 0 + lw $8,0($5) + +C feed-in phase 1 + addiu $5,$5,4 + multu $8,$7 + + addiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 C zero cy2 + + addiu $6,$6,-1 + beq $6,$0,$LC1 + lw $8,0($5) C load new s1 limb as early as possible + +Loop: lw $10,0($4) + mflo $3 + mfhi $9 + addiu $5,$5,4 + addu $3,$3,$2 C add old carry limb to low product limb + multu $8,$7 + lw $8,0($5) C load new s1 limb as early as possible + addiu $6,$6,-1 C decrement loop counter + sltu $2,$3,$2 C carry from previous addition -> $2 + addu $3,$10,$3 + sltu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + addiu $4,$4,4 + bne $6,$0,Loop + addu $2,$9,$2 C add high product limb and carry from addition + +C wind-down phase 1 +$LC1: lw $10,0($4) + mflo $3 + mfhi $9 + addu $3,$3,$2 + sltu $2,$3,$2 + multu $8,$7 + addu $3,$10,$3 + sltu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + addiu $4,$4,4 + addu $2,$9,$2 C add high product limb and carry from addition + +C wind-down phase 0 +$LC0: lw $10,0($4) + mflo $3 + mfhi $9 + addu $3,$3,$2 + sltu $2,$3,$2 + addu $3,$10,$3 + sltu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + j $31 + addu $2,$9,$2 C add high product limb and carry from addition +EPILOGUE(mpn_addmul_1) diff --git a/gmp-6.3.0/mpn/mips32/gmp-mparam.h b/gmp-6.3.0/mpn/mips32/gmp-mparam.h new file mode 100644 index 0000000..986135d --- /dev/null +++ b/gmp-6.3.0/mpn/mips32/gmp-mparam.h @@ -0,0 +1,72 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + + +/* Generated by tuneup.c, 2002-02-20, gcc 2.95 (R3000) */ + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 50 + +#define SQR_BASECASE_THRESHOLD 7 +#define SQR_TOOM2_THRESHOLD 57 +#define SQR_TOOM3_THRESHOLD 78 + +#define DIV_SB_PREINV_THRESHOLD 0 /* always */ +#define DIV_DC_THRESHOLD 57 +#define POWM_THRESHOLD 78 + +#define GCD_ACCEL_THRESHOLD 3 +#define JACOBI_BASE_METHOD 2 + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define USE_PREINV_DIVREM_1 1 +#define USE_PREINV_MOD_1 1 +#define DIVREM_2_THRESHOLD 0 /* always */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define MODEXACT_1_ODD_THRESHOLD 0 /* always */ + +#define GET_STR_DC_THRESHOLD 19 +#define GET_STR_PRECOMPUTE_THRESHOLD 25 +#define SET_STR_THRESHOLD 309 + +#define MUL_FFT_TABLE { 496, 1056, 2176, 5632, 14336, 40960, 0 } +#define MUL_FFT_MODF_THRESHOLD 624 +#define MUL_FFT_THRESHOLD 5888 + +#define SQR_FFT_TABLE { 496, 1184, 2176, 5632, 14336, 40960, 0 } +#define SQR_FFT_MODF_THRESHOLD 560 +#define SQR_FFT_THRESHOLD 5376 diff --git a/gmp-6.3.0/mpn/mips32/lshift.asm b/gmp-6.3.0/mpn/mips32/lshift.asm new file mode 100644 index 0000000..6a58bb4 --- /dev/null +++ b/gmp-6.3.0/mpn/mips32/lshift.asm @@ -0,0 +1,99 @@ +dnl MIPS32 mpn_lshift -- Left shift. + +dnl Copyright 1995, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
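
The shift routine that follows returns, in $2, the bits shifted out of the most
significant limb, and walks the vector from the top downward.  As a reading aid
only (not part of the patch), here is a minimal C model of the same computation;
the typedef, the name lshift_model, and the precondition 0 < cnt < 32 are
assumptions of the sketch:

  typedef unsigned int mp_limb_t;   /* assumption: 32-bit limbs, as on mips32 */

  mp_limb_t
  lshift_model (mp_limb_t *rp, const mp_limb_t *up, long n, unsigned cnt)
  {
    unsigned tnc = 32 - cnt;            /* the asm gets this from $13 = -cnt */
    mp_limb_t high = up[n - 1];
    mp_limb_t retval = high >> tnc;     /* bits shifted out at the top ($2) */
    long i;

    for (i = n - 1; i > 0; i--)
      {
        mp_limb_t low = up[i - 1];
        rp[i] = (high << cnt) | (low >> tnc);   /* the sll/srl/or of the loop */
        high = low;
      }
    rp[0] = high << cnt;                /* final store at .Lend */
    return retval;
  }
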
+ +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr $4 +C src_ptr $5 +C size $6 +C cnt $7 + +ASM_START() +PROLOGUE(mpn_lshift) + sll $2,$6,2 + addu $5,$5,$2 C make r5 point at end of src + lw $10,-4($5) C load first limb + subu $13,$0,$7 + addu $4,$4,$2 C make r4 point at end of res + addiu $6,$6,-1 + and $9,$6,4-1 C number of limbs in first loop + beq $9,$0,.L0 C if multiple of 4 limbs, skip first loop + srl $2,$10,$13 C compute function result + + subu $6,$6,$9 + +.Loop0: lw $3,-8($5) + addiu $4,$4,-4 + addiu $5,$5,-4 + addiu $9,$9,-1 + sll $11,$10,$7 + srl $12,$3,$13 + move $10,$3 + or $8,$11,$12 + bne $9,$0,.Loop0 + sw $8,0($4) + +.L0: beq $6,$0,.Lend + nop + +.Loop: lw $3,-8($5) + addiu $4,$4,-16 + addiu $6,$6,-4 + sll $11,$10,$7 + srl $12,$3,$13 + + lw $10,-12($5) + sll $14,$3,$7 + or $8,$11,$12 + sw $8,12($4) + srl $9,$10,$13 + + lw $3,-16($5) + sll $11,$10,$7 + or $8,$14,$9 + sw $8,8($4) + srl $12,$3,$13 + + lw $10,-20($5) + sll $14,$3,$7 + or $8,$11,$12 + sw $8,4($4) + srl $9,$10,$13 + + addiu $5,$5,-16 + or $8,$14,$9 + bgtz $6,.Loop + sw $8,0($4) + +.Lend: sll $8,$10,$7 + j $31 + sw $8,-4($4) +EPILOGUE(mpn_lshift) diff --git a/gmp-6.3.0/mpn/mips32/mips-defs.m4 b/gmp-6.3.0/mpn/mips32/mips-defs.m4 new file mode 100644 index 0000000..5fa89ec --- /dev/null +++ b/gmp-6.3.0/mpn/mips32/mips-defs.m4 @@ -0,0 +1,80 @@ +divert(-1) + +dnl m4 macros for MIPS assembly code (both 32-bit and 64-bit). + + +dnl Copyright 2000-2002 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl Usage: ASM_START() +define(`ASM_START', +m4_assert_numargs(0) +` .set noreorder + .set nomacro') + +dnl Usage: X(value) +define(`X', +m4_assert_numargs(1) +`0x$1') + +dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo) +dnl EPILOGUE_cpu(GSYM_PREFIX`'foo) + +define(`PROLOGUE_cpu', +m4_assert_numargs(1) +` .text + .align 4 + .globl $1 + .ent $1 +$1:') + +define(`EPILOGUE_cpu', +m4_assert_numargs(1) +` .end $1') + + +dnl Usage: r0 ... r31 +dnl f0 ... f31 +dnl +dnl Map register names r0 to $0, and f0 to $f0, etc. +dnl +dnl defreg() is used to protect the $ in $0 (otherwise it would represent a +dnl macro argument). Double quoting is used to protect the f0 in $f0 +dnl (otherwise it would be an infinite recursion). 
+ +forloop(i,0,31,`defreg(`r'i,$i)') +forloop(i,0,31,`deflit(`f'i,``$f''i)') + + +dnl Usage: ASM_END() +define(`ASM_END', +m4_assert_numargs(0) +) + +divert diff --git a/gmp-6.3.0/mpn/mips32/mips.m4 b/gmp-6.3.0/mpn/mips32/mips.m4 new file mode 100644 index 0000000..8b49e57 --- /dev/null +++ b/gmp-6.3.0/mpn/mips32/mips.m4 @@ -0,0 +1,80 @@ +divert(-1) + +dnl m4 macros for MIPS assembly code. + + +dnl Copyright 2000-2002 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl Usage: ASM_START() +define(`ASM_START', +m4_assert_numargs(0) +` .set noreorder + .set nomacro') + +dnl Usage: X(value) +define(`X', +m4_assert_numargs(1) +`0x$1') + +dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo) +dnl EPILOGUE_cpu(GSYM_PREFIX`'foo) + +define(`PROLOGUE_cpu', +m4_assert_numargs(1) +` .text + .align 4 + .globl $1 + .ent $1 +$1:') + +define(`EPILOGUE_cpu', +m4_assert_numargs(1) +` .end $1') + + +dnl Usage: r0 ... r31 +dnl f0 ... f31 +dnl +dnl Map register names r0 to $0, and f0 to $f0, etc. +dnl +dnl defreg() is used to protect the $ in $0 (otherwise it would represent a +dnl macro argument). Double quoting is used to protect the f0 in $f0 +dnl (otherwise it would be an infinite recursion). + +forloop(i,0,31,`defreg(`r'i,$i)') +forloop(i,0,31,`deflit(`f'i,``$f''i)') + + +dnl Usage: ASM_END() +define(`ASM_END', +m4_assert_numargs(0) +) + +divert diff --git a/gmp-6.3.0/mpn/mips32/mul_1.asm b/gmp-6.3.0/mpn/mips32/mul_1.asm new file mode 100644 index 0000000..4337bc2 --- /dev/null +++ b/gmp-6.3.0/mpn/mips32/mul_1.asm @@ -0,0 +1,89 @@ +dnl MIPS32 mpn_mul_1 -- Multiply a limb vector with a single limb and store +dnl the product in a second limb vector. + +dnl Copyright 1992, 1994, 1996, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr $4 +C s1_ptr $5 +C size $6 +C s2_limb $7 + +ASM_START() +PROLOGUE(mpn_mul_1) + +C feed-in phase 0 + lw $8,0($5) + +C feed-in phase 1 + addiu $5,$5,4 + multu $8,$7 + + addiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 C zero cy2 + + addiu $6,$6,-1 + beq $6,$0,$LC1 + lw $8,0($5) C load new s1 limb as early as possible + +Loop: mflo $10 + mfhi $9 + addiu $5,$5,4 + addu $10,$10,$2 C add old carry limb to low product limb + multu $8,$7 + lw $8,0($5) C load new s1 limb as early as possible + addiu $6,$6,-1 C decrement loop counter + sltu $2,$10,$2 C carry from previous addition -> $2 + sw $10,0($4) + addiu $4,$4,4 + bne $6,$0,Loop + addu $2,$9,$2 C add high product limb and carry from addition + +C wind-down phase 1 +$LC1: mflo $10 + mfhi $9 + addu $10,$10,$2 + sltu $2,$10,$2 + multu $8,$7 + sw $10,0($4) + addiu $4,$4,4 + addu $2,$9,$2 C add high product limb and carry from addition + +C wind-down phase 0 +$LC0: mflo $10 + mfhi $9 + addu $10,$10,$2 + sltu $2,$10,$2 + sw $10,0($4) + j $31 + addu $2,$9,$2 C add high product limb and carry from addition +EPILOGUE(mpn_mul_1) diff --git a/gmp-6.3.0/mpn/mips32/rshift.asm b/gmp-6.3.0/mpn/mips32/rshift.asm new file mode 100644 index 0000000..4b54510 --- /dev/null +++ b/gmp-6.3.0/mpn/mips32/rshift.asm @@ -0,0 +1,96 @@ +dnl MIPS32 mpn_rshift -- Right shift. + +dnl Copyright 1995, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
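
Mirroring the lshift model given earlier, the right-shift routine below walks
the vector from the bottom upward and returns the bits shifted out of the least
significant limb.  A C sketch under the same assumptions (illustrative only;
the typedef and the name rshift_model are invented here, and 0 < cnt < 32):

  typedef unsigned int mp_limb_t;   /* assumption: 32-bit limbs */

  mp_limb_t
  rshift_model (mp_limb_t *rp, const mp_limb_t *up, long n, unsigned cnt)
  {
    unsigned tnc = 32 - cnt;
    mp_limb_t low = up[0];
    mp_limb_t retval = low << tnc;      /* bits shifted out at the bottom ($2) */
    long i;

    for (i = 0; i < n - 1; i++)
      {
        mp_limb_t high = up[i + 1];
        rp[i] = (low >> cnt) | (high << tnc);
        low = high;
      }
    rp[n - 1] = low >> cnt;
    return retval;
  }
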
+ +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr $4 +C src_ptr $5 +C size $6 +C cnt $7 + +ASM_START() +PROLOGUE(mpn_rshift) + lw $10,0($5) C load first limb + subu $13,$0,$7 + addiu $6,$6,-1 + and $9,$6,4-1 C number of limbs in first loop + beq $9,$0,.L0 C if multiple of 4 limbs, skip first loop + sll $2,$10,$13 C compute function result + + subu $6,$6,$9 + +.Loop0: lw $3,4($5) + addiu $4,$4,4 + addiu $5,$5,4 + addiu $9,$9,-1 + srl $11,$10,$7 + sll $12,$3,$13 + move $10,$3 + or $8,$11,$12 + bne $9,$0,.Loop0 + sw $8,-4($4) + +.L0: beq $6,$0,.Lend + nop + +.Loop: lw $3,4($5) + addiu $4,$4,16 + addiu $6,$6,-4 + srl $11,$10,$7 + sll $12,$3,$13 + + lw $10,8($5) + srl $14,$3,$7 + or $8,$11,$12 + sw $8,-16($4) + sll $9,$10,$13 + + lw $3,12($5) + srl $11,$10,$7 + or $8,$14,$9 + sw $8,-12($4) + sll $12,$3,$13 + + lw $10,16($5) + srl $14,$3,$7 + or $8,$11,$12 + sw $8,-8($4) + sll $9,$10,$13 + + addiu $5,$5,16 + or $8,$14,$9 + bgtz $6,.Loop + sw $8,-4($4) + +.Lend: srl $8,$10,$7 + j $31 + sw $8,0($4) +EPILOGUE(mpn_rshift) diff --git a/gmp-6.3.0/mpn/mips32/sub_n.asm b/gmp-6.3.0/mpn/mips32/sub_n.asm new file mode 100644 index 0000000..a962ce1 --- /dev/null +++ b/gmp-6.3.0/mpn/mips32/sub_n.asm @@ -0,0 +1,123 @@ +dnl MIPS32 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +dnl store difference in a third limb vector. + +dnl Copyright 1995, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
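
The loop bodies below compute each borrow with two sltu comparisons: one for
the wrap of s2_limb + borrow, one for the underflow of the subtraction itself,
OR-ing the two results.  A C rendering of that recurrence, offered purely as a
reading aid (the typedef and the name sub_n_model are assumptions):

  typedef unsigned int mp_limb_t;   /* assumption: 32-bit limbs */

  mp_limb_t
  sub_n_model (mp_limb_t *rp, const mp_limb_t *s1, const mp_limb_t *s2, long n)
  {
    mp_limb_t cy = 0;                /* borrow from the previous limb */
    long i;

    for (i = 0; i < n; i++)
      {
        mp_limb_t a = s1[i];
        mp_limb_t b = s2[i] + cy;    /* addu: fold the borrow into s2's limb */
        mp_limb_t c1 = b < cy;       /* sltu: did b + cy wrap around? */
        mp_limb_t d = a - b;         /* subu */
        mp_limb_t c2 = a < d;        /* sltu: did a - b underflow? */
        rp[i] = d;
        cy = c1 | c2;                /* at most one of c1, c2 can be set */
      }
    return cy;                       /* borrow out of the most significant limb */
  }
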
+ +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr $4 +C s1_ptr $5 +C s2_ptr $6 +C size $7 + +ASM_START() +PROLOGUE(mpn_sub_n) + lw $10,0($5) + lw $11,0($6) + + addiu $7,$7,-1 + and $9,$7,4-1 C number of limbs in first loop + beq $9,$0,.L0 C if multiple of 4 limbs, skip first loop + move $2,$0 + + subu $7,$7,$9 + +.Loop0: addiu $9,$9,-1 + lw $12,4($5) + addu $11,$11,$2 + lw $13,4($6) + sltu $8,$11,$2 + subu $11,$10,$11 + sltu $2,$10,$11 + sw $11,0($4) + or $2,$2,$8 + + addiu $5,$5,4 + addiu $6,$6,4 + move $10,$12 + move $11,$13 + bne $9,$0,.Loop0 + addiu $4,$4,4 + +.L0: beq $7,$0,.Lend + nop + +.Loop: addiu $7,$7,-4 + + lw $12,4($5) + addu $11,$11,$2 + lw $13,4($6) + sltu $8,$11,$2 + subu $11,$10,$11 + sltu $2,$10,$11 + sw $11,0($4) + or $2,$2,$8 + + lw $10,8($5) + addu $13,$13,$2 + lw $11,8($6) + sltu $8,$13,$2 + subu $13,$12,$13 + sltu $2,$12,$13 + sw $13,4($4) + or $2,$2,$8 + + lw $12,12($5) + addu $11,$11,$2 + lw $13,12($6) + sltu $8,$11,$2 + subu $11,$10,$11 + sltu $2,$10,$11 + sw $11,8($4) + or $2,$2,$8 + + lw $10,16($5) + addu $13,$13,$2 + lw $11,16($6) + sltu $8,$13,$2 + subu $13,$12,$13 + sltu $2,$12,$13 + sw $13,12($4) + or $2,$2,$8 + + addiu $5,$5,16 + addiu $6,$6,16 + + bne $7,$0,.Loop + addiu $4,$4,16 + +.Lend: addu $11,$11,$2 + sltu $8,$11,$2 + subu $11,$10,$11 + sltu $2,$10,$11 + sw $11,0($4) + j $31 + or $2,$2,$8 +EPILOGUE(mpn_sub_n) diff --git a/gmp-6.3.0/mpn/mips32/submul_1.asm b/gmp-6.3.0/mpn/mips32/submul_1.asm new file mode 100644 index 0000000..335722b --- /dev/null +++ b/gmp-6.3.0/mpn/mips32/submul_1.asm @@ -0,0 +1,101 @@ +dnl MIPS32 mpn_submul_1 -- Multiply a limb vector with a single limb and +dnl subtract the product from a second limb vector. + +dnl Copyright 1992, 1994, 1996, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
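
In C terms, mpn_submul_1 multiplies each source limb by s2_limb, forming the
64-bit product that multu leaves in hi/lo, and subtracts the low 32 bits plus
the running carry from the destination limb; whatever cannot be absorbed there
is carried into the next iteration.  A sketch under the same assumptions as the
earlier models (the unsigned long long intermediate stands in for the hi/lo
registers the real code reads directly):

  typedef unsigned int mp_limb_t;   /* assumption: 32-bit limbs */

  mp_limb_t
  submul_1_model (mp_limb_t *rp, const mp_limb_t *s1, long n, mp_limb_t v)
  {
    mp_limb_t cy = 0;
    long i;

    for (i = 0; i < n; i++)
      {
        unsigned long long p = (unsigned long long) s1[i] * v;  /* multu */
        mp_limb_t plo = (mp_limb_t) p;           /* mflo */
        mp_limb_t phi = (mp_limb_t) (p >> 32);   /* mfhi */
        mp_limb_t r;

        plo += cy;                   /* add old carry to the low limb */
        phi += plo < cy;             /* carry from that addition (sltu) */
        r = rp[i];
        rp[i] = r - plo;             /* subu */
        phi += r < plo;              /* borrow from the subtraction (sgtu) */
        cy = phi;
      }
    return cy;
  }
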
+ +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr $4 +C s1_ptr $5 +C size $6 +C s2_limb $7 + +ASM_START() +PROLOGUE(mpn_submul_1) + +C feed-in phase 0 + lw $8,0($5) + +C feed-in phase 1 + addiu $5,$5,4 + multu $8,$7 + + addiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 C zero cy2 + + addiu $6,$6,-1 + beq $6,$0,$LC1 + lw $8,0($5) C load new s1 limb as early as possible + +Loop: lw $10,0($4) + mflo $3 + mfhi $9 + addiu $5,$5,4 + addu $3,$3,$2 C add old carry limb to low product limb + multu $8,$7 + lw $8,0($5) C load new s1 limb as early as possible + addiu $6,$6,-1 C decrement loop counter + sltu $2,$3,$2 C carry from previous addition -> $2 + subu $3,$10,$3 + sgtu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + addiu $4,$4,4 + bne $6,$0,Loop + addu $2,$9,$2 C add high product limb and carry from addition + +C wind-down phase 1 +$LC1: lw $10,0($4) + mflo $3 + mfhi $9 + addu $3,$3,$2 + sltu $2,$3,$2 + multu $8,$7 + subu $3,$10,$3 + sgtu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + addiu $4,$4,4 + addu $2,$9,$2 C add high product limb and carry from addition + +C wind-down phase 0 +$LC0: lw $10,0($4) + mflo $3 + mfhi $9 + addu $3,$3,$2 + sltu $2,$3,$2 + subu $3,$10,$3 + sgtu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + j $31 + addu $2,$9,$2 C add high product limb and carry from addition +EPILOGUE(mpn_submul_1) diff --git a/gmp-6.3.0/mpn/mips32/umul.asm b/gmp-6.3.0/mpn/mips32/umul.asm new file mode 100644 index 0000000..1ced0eb --- /dev/null +++ b/gmp-6.3.0/mpn/mips32/umul.asm @@ -0,0 +1,45 @@ +dnl MIPS32 umul_ppmm -- longlong.h support. + +dnl Copyright 1999, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C plp $4 +C u $5 +C v $6 + +ASM_START() +PROLOGUE(mpn_umul_ppmm) + multu $5,$6 + mflo $3 + mfhi $2 + j $31 + sw $3,0($4) +EPILOGUE(mpn_umul_ppmm) diff --git a/gmp-6.3.0/mpn/mips64/README b/gmp-6.3.0/mpn/mips64/README new file mode 100644 index 0000000..7ddd0e5 --- /dev/null +++ b/gmp-6.3.0/mpn/mips64/README @@ -0,0 +1,60 @@ +Copyright 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+This directory contains mpn functions optimized for MIPS3.  Examples of
+processors that implement MIPS3 are R4000, R4400, R4600, R4700, and R8000.
+
+RELEVANT OPTIMIZATION ISSUES
+
+1. On the R4000 and R4400, branches, both the plain and the "likely" ones,
+   take 3 cycles to execute.  (The fastest possible loop will take 4 cycles,
+   because of the delay insn.)
+
+   On the R4600, branches take a single cycle.
+
+   On the R8000, branches often take no noticeable cycles, as they are
+   executed in a separate functional unit.
+
+2. The R4000 and R4400 have a load latency of 4 cycles.
+
+3. On the R4000 and R4400, multiplies take a data-dependent number of
+   cycles, contrary to the SGI documentation.  There seem to be 3 or 4
+   possible latencies.
+
+4. The R1x000 processors can issue one floating-point operation, two integer
+   operations, and one memory operation per cycle.  The FPU has very short
+   latencies, while the integer multiply unit is non-pipelined.  We should
+   therefore write FP-based mpn_Xmul_1.
+
+STATUS
+
+Good...
diff --git a/gmp-6.3.0/mpn/mips64/add_n.asm b/gmp-6.3.0/mpn/mips64/add_n.asm
new file mode 100644
index 0000000..6856407
--- /dev/null
+++ b/gmp-6.3.0/mpn/mips64/add_n.asm
@@ -0,0 +1,134 @@
+dnl MIPS64 mpn_add_n -- Add two limb vectors of the same length > 0 and store
+dnl sum in a third limb vector.
+
+dnl Copyright 1995, 2000-2002, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
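
This file exports two entry points: mpn_add_nc, which accepts an incoming carry
in $8, and mpn_add_n, which enters the same loops with a zero carry.  A C model
of the common recurrence, given only as a reading aid (the 64-bit typedef and
the name add_nc_model are assumptions of the sketch):

  typedef unsigned long long mp_limb_t;   /* assumption: 64-bit limbs */

  mp_limb_t
  add_nc_model (mp_limb_t *rp, const mp_limb_t *s1, const mp_limb_t *s2,
                long n, mp_limb_t cy)     /* cy is 0 or 1 on entry */
  {
    long i;

    for (i = 0; i < n; i++)
      {
        mp_limb_t a = s1[i];
        mp_limb_t b = s2[i] + cy;   /* daddu: fold carry into one operand */
        mp_limb_t c1 = b < cy;      /* sltu: did b + cy wrap? */
        mp_limb_t s = a + b;        /* daddu */
        mp_limb_t c2 = s < a;       /* sltu: did a + b wrap? */
        rp[i] = s;
        cy = c1 | c2;
      }
    return cy;                      /* mpn_add_n is the cy == 0 case */
  }
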
+ +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr $4 +C s1_ptr $5 +C s2_ptr $6 +C size $7 + +ASM_START() +PROLOGUE(mpn_add_nc) + ld $10,0($5) + ld $11,0($6) + + daddiu $7,$7,-1 + and $9,$7,4-1 C number of limbs in first loop + beq $9,$0,.L0 C if multiple of 4 limbs, skip first loop + move $2,$8 + b .Loop0 + dsubu $7,$7,$9 +EPILOGUE() +PROLOGUE(mpn_add_n) + ld $10,0($5) + ld $11,0($6) + + daddiu $7,$7,-1 + and $9,$7,4-1 C number of limbs in first loop + beq $9,$0,.L0 C if multiple of 4 limbs, skip first loop + move $2,$0 + + dsubu $7,$7,$9 + +.Loop0: daddiu $9,$9,-1 + ld $12,8($5) + daddu $11,$11,$2 + ld $13,8($6) + sltu $8,$11,$2 + daddu $11,$10,$11 + sltu $2,$11,$10 + sd $11,0($4) + or $2,$2,$8 + + daddiu $5,$5,8 + daddiu $6,$6,8 + move $10,$12 + move $11,$13 + bne $9,$0,.Loop0 + daddiu $4,$4,8 + +.L0: beq $7,$0,.Lend + nop + +.Loop: daddiu $7,$7,-4 + + ld $12,8($5) + daddu $11,$11,$10 + ld $13,8($6) + sltu $8,$11,$10 + daddu $11,$11,$2 + sltu $2,$11,$2 + sd $11,0($4) + or $2,$2,$8 + + ld $10,16($5) + daddu $13,$13,$12 + ld $11,16($6) + sltu $8,$13,$12 + daddu $13,$13,$2 + sltu $2,$13,$2 + sd $13,8($4) + or $2,$2,$8 + + ld $12,24($5) + daddu $11,$11,$10 + ld $13,24($6) + sltu $8,$11,$10 + daddu $11,$11,$2 + sltu $2,$11,$2 + sd $11,16($4) + or $2,$2,$8 + + ld $10,32($5) + daddu $13,$13,$12 + ld $11,32($6) + sltu $8,$13,$12 + daddu $13,$13,$2 + sltu $2,$13,$2 + sd $13,24($4) + or $2,$2,$8 + + daddiu $5,$5,32 + daddiu $6,$6,32 + + bne $7,$0,.Loop + daddiu $4,$4,32 + +.Lend: daddu $11,$11,$2 + sltu $8,$11,$2 + daddu $11,$10,$11 + sltu $2,$11,$10 + sd $11,0($4) + j $31 + or $2,$2,$8 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/mips64/gmp-mparam.h b/gmp-6.3.0/mpn/mips64/gmp-mparam.h new file mode 100644 index 0000000..b7fcf24 --- /dev/null +++ b/gmp-6.3.0/mpn/mips64/gmp-mparam.h @@ -0,0 +1,72 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + + +/* Generated by tuneup.c, 2004-02-10, gcc 3.2 & MIPSpro C 7.2.1 (R1x000) */ + +#define MUL_TOOM22_THRESHOLD 16 +#define MUL_TOOM33_THRESHOLD 89 + +#define SQR_BASECASE_THRESHOLD 6 +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 98 + +#define DIV_SB_PREINV_THRESHOLD 0 /* always */ +#define DIV_DC_THRESHOLD 53 +#define POWM_THRESHOLD 61 + +#define HGCD_THRESHOLD 116 +#define GCD_ACCEL_THRESHOLD 3 +#define GCD_DC_THRESHOLD 492 +#define JACOBI_BASE_METHOD 2 + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define USE_PREINV_DIVREM_1 1 +#define USE_PREINV_MOD_1 1 +#define DIVREM_2_THRESHOLD 0 /* always */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define MODEXACT_1_ODD_THRESHOLD 0 /* always */ + +#define GET_STR_DC_THRESHOLD 21 +#define GET_STR_PRECOMPUTE_THRESHOLD 26 +#define SET_STR_THRESHOLD 3962 + +#define MUL_FFT_TABLE { 368, 736, 1600, 3328, 7168, 20480, 49152, 0 } +#define MUL_FFT_MODF_THRESHOLD 264 +#define MUL_FFT_THRESHOLD 1920 + +#define SQR_FFT_TABLE { 368, 736, 1856, 3328, 7168, 20480, 49152, 0 } +#define SQR_FFT_MODF_THRESHOLD 280 +#define SQR_FFT_THRESHOLD 1920 diff --git a/gmp-6.3.0/mpn/mips64/hilo/addmul_1.asm b/gmp-6.3.0/mpn/mips64/hilo/addmul_1.asm new file mode 100644 index 0000000..8ff0976 --- /dev/null +++ b/gmp-6.3.0/mpn/mips64/hilo/addmul_1.asm @@ -0,0 +1,101 @@ +dnl MIPS64 mpn_addmul_1 -- Multiply a limb vector with a single limb and add +dnl the product to a second limb vector. + +dnl Copyright 1992, 1994, 1995, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
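
Here dmultu forms the full 128-bit product of two 64-bit limbs, read back with
mflo/mfhi.  The recurrence the loop implements can be written compactly in C
with GCC's unsigned __int128 extension; that extension, the typedef, and the
function name are all assumptions of this sketch, which is a reading aid rather
than part of the patch:

  typedef unsigned long long mp_limb_t;   /* assumption: 64-bit limbs */

  mp_limb_t
  addmul_1_model (mp_limb_t *rp, const mp_limb_t *s1, long n, mp_limb_t v)
  {
    mp_limb_t cy = 0;
    long i;

    for (i = 0; i < n; i++)
      {
        unsigned __int128 p = (unsigned __int128) s1[i] * v;  /* dmultu */
        mp_limb_t plo = (mp_limb_t) p;           /* mflo */
        mp_limb_t phi = (mp_limb_t) (p >> 64);   /* mfhi */
        mp_limb_t r;

        plo += cy;                  /* add old carry limb */
        phi += plo < cy;
        r = rp[i];
        plo += r;                   /* add the destination limb */
        phi += plo < r;
        rp[i] = plo;
        cy = phi;                   /* new carry limb */
      }
    return cy;
  }
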
+ +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr $4 +C s1_ptr $5 +C size $6 +C s2_limb $7 + +ASM_START() +PROLOGUE(mpn_addmul_1) + +C feed-in phase 0 + ld $8,0($5) + +C feed-in phase 1 + daddiu $5,$5,8 + dmultu $8,$7 + + daddiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 C zero cy2 + + daddiu $6,$6,-1 + beq $6,$0,$LC1 + ld $8,0($5) C load new s1 limb as early as possible + +Loop: ld $10,0($4) + mflo $3 + mfhi $9 + daddiu $5,$5,8 + daddu $3,$3,$2 C add old carry limb to low product limb + dmultu $8,$7 + ld $8,0($5) C load new s1 limb as early as possible + daddiu $6,$6,-1 C decrement loop counter + sltu $2,$3,$2 C carry from previous addition -> $2 + daddu $3,$10,$3 + sltu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + daddiu $4,$4,8 + bne $6,$0,Loop + daddu $2,$9,$2 C add high product limb and carry from addition + +C wind-down phase 1 +$LC1: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + dmultu $8,$7 + daddu $3,$10,$3 + sltu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + daddiu $4,$4,8 + daddu $2,$9,$2 C add high product limb and carry from addition + +C wind-down phase 0 +$LC0: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + daddu $3,$10,$3 + sltu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + j $31 + daddu $2,$9,$2 C add high product limb and carry from addition +EPILOGUE(mpn_addmul_1) diff --git a/gmp-6.3.0/mpn/mips64/hilo/mul_1.asm b/gmp-6.3.0/mpn/mips64/hilo/mul_1.asm new file mode 100644 index 0000000..77acf0a --- /dev/null +++ b/gmp-6.3.0/mpn/mips64/hilo/mul_1.asm @@ -0,0 +1,92 @@ +dnl MIPS64 mpn_mul_1 -- Multiply a limb vector with a single limb and store +dnl the product in a second limb vector. + +dnl Copyright 1992, 1994, 1995, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
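
mpn_mul_1 is the addmul_1 recurrence sketched above without the read-modify-write
of the destination.  The corresponding C sketch, under the same assumptions
(including the __int128 extension):

  typedef unsigned long long mp_limb_t;   /* assumption: 64-bit limbs */

  mp_limb_t
  mul_1_model (mp_limb_t *rp, const mp_limb_t *s1, long n, mp_limb_t v)
  {
    mp_limb_t cy = 0;
    long i;

    for (i = 0; i < n; i++)
      {
        unsigned __int128 p = (unsigned __int128) s1[i] * v;   /* dmultu */
        mp_limb_t plo = (mp_limb_t) p + cy;      /* mflo plus old carry */
        cy = (mp_limb_t) (p >> 64)               /* mfhi ... */
             + (plo < (mp_limb_t) p);            /* ... plus carry-out */
        rp[i] = plo;
      }
    return cy;
  }
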
+ +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr $4 +C s1_ptr $5 +C size $6 +C s2_limb $7 + +ASM_START() +PROLOGUE(mpn_mul_1) + +C feed-in phase 0 + ld $8,0($5) + +C feed-in phase 1 + daddiu $5,$5,8 + dmultu $8,$7 + + daddiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 C zero cy2 + + daddiu $6,$6,-1 + beq $6,$0,$LC1 + ld $8,0($5) C load new s1 limb as early as possible + +Loop: nop + mflo $10 + mfhi $9 + daddiu $5,$5,8 + daddu $10,$10,$2 C add old carry limb to low product limb + dmultu $8,$7 + ld $8,0($5) C load new s1 limb as early as possible + daddiu $6,$6,-1 C decrement loop counter + sltu $2,$10,$2 C carry from previous addition -> $2 + nop + nop + sd $10,0($4) + daddiu $4,$4,8 + bne $6,$0,Loop + daddu $2,$9,$2 C add high product limb and carry from addition + +C wind-down phase 1 +$LC1: mflo $10 + mfhi $9 + daddu $10,$10,$2 + sltu $2,$10,$2 + dmultu $8,$7 + sd $10,0($4) + daddiu $4,$4,8 + daddu $2,$9,$2 C add high product limb and carry from addition + +C wind-down phase 0 +$LC0: mflo $10 + mfhi $9 + daddu $10,$10,$2 + sltu $2,$10,$2 + sd $10,0($4) + j $31 + daddu $2,$9,$2 C add high product limb and carry from addition +EPILOGUE(mpn_mul_1) diff --git a/gmp-6.3.0/mpn/mips64/hilo/sqr_diagonal.asm b/gmp-6.3.0/mpn/mips64/hilo/sqr_diagonal.asm new file mode 100644 index 0000000..dcb87dc --- /dev/null +++ b/gmp-6.3.0/mpn/mips64/hilo/sqr_diagonal.asm @@ -0,0 +1,77 @@ +dnl MIPS64 mpn_sqr_diagonal. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl INPUT PARAMETERS +dnl rp $4 +dnl up $5 +dnl n $6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_sqr_diagonal) + ld r8,0(r5) + daddiu r6,r6,-2 + dmultu r8,r8 + bltz r6,$Lend1 + nop + ld r8,8(r5) + beq r6,r0,$Lend2 + nop + +$Loop: mflo r10 + mfhi r9 + daddiu r6,r6,-1 + sd r10,0(r4) + sd r9,8(r4) + dmultu r8,r8 + ld r8,16(r5) + daddiu r5,r5,8 + bne r6,r0,$Loop + daddiu r4,r4,16 + +$Lend2: mflo r10 + mfhi r9 + sd r10,0(r4) + sd r9,8(r4) + dmultu r8,r8 + mflo r10 + mfhi r9 + sd r10,16(r4) + j r31 + sd r9,24(r4) + +$Lend1: mflo r10 + mfhi r9 + sd r10,0(r4) + j r31 + sd r9,8(r4) +EPILOGUE(mpn_sqr_diagonal) diff --git a/gmp-6.3.0/mpn/mips64/hilo/submul_1.asm b/gmp-6.3.0/mpn/mips64/hilo/submul_1.asm new file mode 100644 index 0000000..089589c --- /dev/null +++ b/gmp-6.3.0/mpn/mips64/hilo/submul_1.asm @@ -0,0 +1,101 @@ +dnl MIPS64 mpn_submul_1 -- Multiply a limb vector with a single limb and +dnl subtract the product from a second limb vector. 
+ +dnl Copyright 1992, 1994, 1995, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr $4 +C s1_ptr $5 +C size $6 +C s2_limb $7 + +ASM_START() +PROLOGUE(mpn_submul_1) + +C feed-in phase 0 + ld $8,0($5) + +C feed-in phase 1 + daddiu $5,$5,8 + dmultu $8,$7 + + daddiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 C zero cy2 + + daddiu $6,$6,-1 + beq $6,$0,$LC1 + ld $8,0($5) C load new s1 limb as early as possible + +Loop: ld $10,0($4) + mflo $3 + mfhi $9 + daddiu $5,$5,8 + daddu $3,$3,$2 C add old carry limb to low product limb + dmultu $8,$7 + ld $8,0($5) C load new s1 limb as early as possible + daddiu $6,$6,-1 C decrement loop counter + sltu $2,$3,$2 C carry from previous addition -> $2 + dsubu $3,$10,$3 + sgtu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + daddiu $4,$4,8 + bne $6,$0,Loop + daddu $2,$9,$2 C add high product limb and carry from addition + +C wind-down phase 1 +$LC1: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + dmultu $8,$7 + dsubu $3,$10,$3 + sgtu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + daddiu $4,$4,8 + daddu $2,$9,$2 C add high product limb and carry from addition + +C wind-down phase 0 +$LC0: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + dsubu $3,$10,$3 + sgtu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + j $31 + daddu $2,$9,$2 C add high product limb and carry from addition +EPILOGUE(mpn_submul_1) diff --git a/gmp-6.3.0/mpn/mips64/hilo/umul.asm b/gmp-6.3.0/mpn/mips64/hilo/umul.asm new file mode 100644 index 0000000..b9aac57 --- /dev/null +++ b/gmp-6.3.0/mpn/mips64/hilo/umul.asm @@ -0,0 +1,45 @@ +dnl MIPS64 umul_ppmm -- longlong.h support. + +dnl Copyright 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C plp $4 +C u $5 +C v $6 + +ASM_START() +PROLOGUE(mpn_umul_ppmm) + dmultu $5,$6 + mflo $3 + mfhi $2 + j $31 + sd $3,0($4) +EPILOGUE(mpn_umul_ppmm) diff --git a/gmp-6.3.0/mpn/mips64/lshift.asm b/gmp-6.3.0/mpn/mips64/lshift.asm new file mode 100644 index 0000000..3440eaf --- /dev/null +++ b/gmp-6.3.0/mpn/mips64/lshift.asm @@ -0,0 +1,99 @@ +dnl MIPS64 mpn_lshift -- Left shift. + +dnl Copyright 1995, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr $4 +C src_ptr $5 +C size $6 +C cnt $7 + +ASM_START() +PROLOGUE(mpn_lshift) + dsll $2,$6,3 + daddu $5,$5,$2 C make r5 point at end of src + ld $10,-8($5) C load first limb + dsubu $13,$0,$7 + daddu $4,$4,$2 C make r4 point at end of res + daddiu $6,$6,-1 + and $9,$6,4-1 C number of limbs in first loop + beq $9,$0,.L0 C if multiple of 4 limbs, skip first loop + dsrl $2,$10,$13 C compute function result + + dsubu $6,$6,$9 + +.Loop0: ld $3,-16($5) + daddiu $4,$4,-8 + daddiu $5,$5,-8 + daddiu $9,$9,-1 + dsll $11,$10,$7 + dsrl $12,$3,$13 + move $10,$3 + or $8,$11,$12 + bne $9,$0,.Loop0 + sd $8,0($4) + +.L0: beq $6,$0,.Lend + nop + +.Loop: ld $3,-16($5) + daddiu $4,$4,-32 + daddiu $6,$6,-4 + dsll $11,$10,$7 + dsrl $12,$3,$13 + + ld $10,-24($5) + dsll $14,$3,$7 + or $8,$11,$12 + sd $8,24($4) + dsrl $9,$10,$13 + + ld $3,-32($5) + dsll $11,$10,$7 + or $8,$14,$9 + sd $8,16($4) + dsrl $12,$3,$13 + + ld $10,-40($5) + dsll $14,$3,$7 + or $8,$11,$12 + sd $8,8($4) + dsrl $9,$10,$13 + + daddiu $5,$5,-32 + or $8,$14,$9 + bgtz $6,.Loop + sd $8,0($4) + +.Lend: dsll $8,$10,$7 + j $31 + sd $8,-8($4) +EPILOGUE(mpn_lshift) diff --git a/gmp-6.3.0/mpn/mips64/rshift.asm b/gmp-6.3.0/mpn/mips64/rshift.asm new file mode 100644 index 0000000..9253cb5 --- /dev/null +++ b/gmp-6.3.0/mpn/mips64/rshift.asm @@ -0,0 +1,96 @@ +dnl MIPS64 mpn_rshift -- Right shift. + +dnl Copyright 1995, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr $4 +C src_ptr $5 +C size $6 +C cnt $7 + +ASM_START() +PROLOGUE(mpn_rshift) + ld $10,0($5) C load first limb + dsubu $13,$0,$7 + daddiu $6,$6,-1 + and $9,$6,4-1 C number of limbs in first loop + beq $9,$0,.L0 C if multiple of 4 limbs, skip first loop + dsll $2,$10,$13 C compute function result + + dsubu $6,$6,$9 + +.Loop0: ld $3,8($5) + daddiu $4,$4,8 + daddiu $5,$5,8 + daddiu $9,$9,-1 + dsrl $11,$10,$7 + dsll $12,$3,$13 + move $10,$3 + or $8,$11,$12 + bne $9,$0,.Loop0 + sd $8,-8($4) + +.L0: beq $6,$0,.Lend + nop + +.Loop: ld $3,8($5) + daddiu $4,$4,32 + daddiu $6,$6,-4 + dsrl $11,$10,$7 + dsll $12,$3,$13 + + ld $10,16($5) + dsrl $14,$3,$7 + or $8,$11,$12 + sd $8,-32($4) + dsll $9,$10,$13 + + ld $3,24($5) + dsrl $11,$10,$7 + or $8,$14,$9 + sd $8,-24($4) + dsll $12,$3,$13 + + ld $10,32($5) + dsrl $14,$3,$7 + or $8,$11,$12 + sd $8,-16($4) + dsll $9,$10,$13 + + daddiu $5,$5,32 + or $8,$14,$9 + bgtz $6,.Loop + sd $8,-8($4) + +.Lend: dsrl $8,$10,$7 + j $31 + sd $8,0($4) +EPILOGUE(mpn_rshift) diff --git a/gmp-6.3.0/mpn/mips64/sub_n.asm b/gmp-6.3.0/mpn/mips64/sub_n.asm new file mode 100644 index 0000000..6a69897 --- /dev/null +++ b/gmp-6.3.0/mpn/mips64/sub_n.asm @@ -0,0 +1,134 @@ +dnl MIPS64 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +dnl store difference in a third limb vector. + +dnl Copyright 1995, 2000-2002, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
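
As with mpn_add_nc above, the only difference between mpn_sub_nc and mpn_sub_n
is the initial borrow.  In the unrolled loop the borrow is folded in after the
subtraction rather than before it; a C sketch of that ordering (the typedef and
the name sub_nc_model are assumptions, and cy is 0 or 1 on entry):

  typedef unsigned long long mp_limb_t;   /* assumption: 64-bit limbs */

  mp_limb_t
  sub_nc_model (mp_limb_t *rp, const mp_limb_t *s1, const mp_limb_t *s2,
                long n, mp_limb_t cy)
  {
    long i;

    for (i = 0; i < n; i++)
      {
        mp_limb_t a = s1[i];
        mp_limb_t b = s2[i];
        mp_limb_t c1 = a < b;       /* borrow out of a - b */
        mp_limb_t d = a - b;        /* dsubu */
        mp_limb_t c2 = d < cy;      /* borrow out of (a - b) - cy */
        rp[i] = d - cy;
        cy = c1 | c2;
      }
    return cy;                      /* mpn_sub_n is the cy == 0 case */
  }
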
+ +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr $4 +C s1_ptr $5 +C s2_ptr $6 +C size $7 + +ASM_START() +PROLOGUE(mpn_sub_nc) + ld $10,0($5) + ld $11,0($6) + + daddiu $7,$7,-1 + and $9,$7,4-1 C number of limbs in first loop + beq $9,$0,.L0 C if multiple of 4 limbs, skip first loop + move $2,$8 + b .Loop0 + dsubu $7,$7,$9 +EPILOGUE() +PROLOGUE(mpn_sub_n) + ld $10,0($5) + ld $11,0($6) + + daddiu $7,$7,-1 + and $9,$7,4-1 C number of limbs in first loop + beq $9,$0,.L0 C if multiple of 4 limbs, skip first loop + move $2,$0 + + dsubu $7,$7,$9 + +.Loop0: daddiu $9,$9,-1 + ld $12,8($5) + daddu $11,$11,$2 + ld $13,8($6) + sltu $8,$11,$2 + dsubu $11,$10,$11 + sltu $2,$10,$11 + sd $11,0($4) + or $2,$2,$8 + + daddiu $5,$5,8 + daddiu $6,$6,8 + move $10,$12 + move $11,$13 + bne $9,$0,.Loop0 + daddiu $4,$4,8 + +.L0: beq $7,$0,.Lend + nop + +.Loop: daddiu $7,$7,-4 + + ld $12,8($5) + dsubu $11,$10,$11 + ld $13,8($6) + sltu $8,$10,$11 + dsubu $14,$11,$2 + sltu $2,$11,$14 + sd $14,0($4) + or $2,$2,$8 + + ld $10,16($5) + dsubu $13,$12,$13 + ld $11,16($6) + sltu $8,$12,$13 + dsubu $14,$13,$2 + sltu $2,$13,$14 + sd $14,8($4) + or $2,$2,$8 + + ld $12,24($5) + dsubu $11,$10,$11 + ld $13,24($6) + sltu $8,$10,$11 + dsubu $14,$11,$2 + sltu $2,$11,$14 + sd $14,16($4) + or $2,$2,$8 + + ld $10,32($5) + dsubu $13,$12,$13 + ld $11,32($6) + sltu $8,$12,$13 + dsubu $14,$13,$2 + sltu $2,$13,$14 + sd $14,24($4) + or $2,$2,$8 + + daddiu $5,$5,32 + daddiu $6,$6,32 + + bne $7,$0,.Loop + daddiu $4,$4,32 + +.Lend: daddu $11,$11,$2 + sltu $8,$11,$2 + dsubu $11,$10,$11 + sltu $2,$10,$11 + sd $11,0($4) + j $31 + or $2,$2,$8 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/mod_1.c b/gmp-6.3.0/mpn/mod_1.c new file mode 120000 index 0000000..de3e5a1 --- /dev/null +++ b/gmp-6.3.0/mpn/mod_1.c @@ -0,0 +1 @@ +../mpn/generic/mod_1.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mod_1_1.asm b/gmp-6.3.0/mpn/mod_1_1.asm new file mode 120000 index 0000000..c4cc9aa --- /dev/null +++ b/gmp-6.3.0/mpn/mod_1_1.asm @@ -0,0 +1 @@ +../mpn/x86/p6/sse2/mod_1_1.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mod_1_2.c b/gmp-6.3.0/mpn/mod_1_2.c new file mode 120000 index 0000000..2c5b8c4 --- /dev/null +++ b/gmp-6.3.0/mpn/mod_1_2.c @@ -0,0 +1 @@ +../mpn/generic/mod_1_2.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mod_1_3.c b/gmp-6.3.0/mpn/mod_1_3.c new file mode 120000 index 0000000..c742d14 --- /dev/null +++ b/gmp-6.3.0/mpn/mod_1_3.c @@ -0,0 +1 @@ +../mpn/generic/mod_1_3.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mod_1_4.asm b/gmp-6.3.0/mpn/mod_1_4.asm new file mode 120000 index 0000000..f4a02e5 --- /dev/null +++ b/gmp-6.3.0/mpn/mod_1_4.asm @@ -0,0 +1 @@ +../mpn/x86/p6/sse2/mod_1_4.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mod_34lsub1.asm b/gmp-6.3.0/mpn/mod_34lsub1.asm new file mode 120000 index 0000000..f6e9823 --- /dev/null +++ b/gmp-6.3.0/mpn/mod_34lsub1.asm @@ -0,0 +1 @@ +../mpn/x86/p6/mod_34lsub1.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mode1o.asm b/gmp-6.3.0/mpn/mode1o.asm new file mode 120000 index 0000000..e7ae75e --- /dev/null +++ b/gmp-6.3.0/mpn/mode1o.asm @@ -0,0 +1 @@ +../mpn/x86/p6/mode1o.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mp_bases.c b/gmp-6.3.0/mpn/mp_bases.c new file mode 100644 index 0000000..ad1279a --- /dev/null +++ b/gmp-6.3.0/mpn/mp_bases.c @@ -0,0 +1,268 @@ +/* This file generated by gen-bases.c - DO NOT EDIT. 
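+ + Field summary (editor's note, not from the gen-bases.c output; the scale + factors are inferred from the surrounding values, so treat this as a + sketch). The five columns appear to correspond to struct bases in + gmp-impl.h: chars_per_limb (how many base-b digits fit in a 32-bit limb), + logb2 (roughly log(2)/log(b) scaled by 2^32), log2b (roughly + log(b)/log(2) scaled by 2^29), big_base (for power-of-2 bases the bits per + digit, otherwise b^chars_per_limb, e.g. 10^9 = 0x3b9aca00 for base 10), and + big_base_inverted (a fixed-point reciprocal of big_base used to avoid + divisions during radix conversion).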
*/ + +#include "gmp-impl.h" + +#if GMP_NUMB_BITS != 32 +Error, error, this data is for 32 bits +#endif + +const struct bases mp_bases[257] = +{ + /* 0 */ { 0, 0, 0, 0, 0 }, + /* 1 */ { 0, 0, 0, 0, 0 }, + /* 2 */ { 32, CNST_LIMB(0xffffffff), CNST_LIMB(0x1fffffff), CNST_LIMB(0x1), CNST_LIMB(0x0) }, + /* 3 */ { 20, CNST_LIMB(0xa1849cc1), CNST_LIMB(0x32b80347), CNST_LIMB(0xcfd41b91), CNST_LIMB(0x3b563c24) }, + /* 4 */ { 16, CNST_LIMB(0x7fffffff), CNST_LIMB(0x3fffffff), CNST_LIMB(0x2), CNST_LIMB(0x0) }, + /* 5 */ { 13, CNST_LIMB(0x6e40d1a4), CNST_LIMB(0x4a4d3c25), CNST_LIMB(0x48c27395), CNST_LIMB(0xc25c2684) }, + /* 6 */ { 12, CNST_LIMB(0x6308c91b), CNST_LIMB(0x52b80347), CNST_LIMB(0x81bf1000), CNST_LIMB(0xf91bd1b6) }, + /* 7 */ { 11, CNST_LIMB(0x5b3064eb), CNST_LIMB(0x59d5d9fd), CNST_LIMB(0x75db9c97), CNST_LIMB(0x1607a2cb) }, + /* 8 */ { 10, CNST_LIMB(0x55555555), CNST_LIMB(0x5fffffff), CNST_LIMB(0x3), CNST_LIMB(0x0) }, + /* 9 */ { 10, CNST_LIMB(0x50c24e60), CNST_LIMB(0x6570068e), CNST_LIMB(0xcfd41b91), CNST_LIMB(0x3b563c24) }, + /* 10 */ { 9, CNST_LIMB(0x4d104d42), CNST_LIMB(0x6a4d3c25), CNST_LIMB(0x3b9aca00), CNST_LIMB(0x12e0be82) }, + /* 11 */ { 9, CNST_LIMB(0x4a002707), CNST_LIMB(0x6eb3a9f0), CNST_LIMB(0x8c8b6d2b), CNST_LIMB(0xd24cde04) }, + /* 12 */ { 8, CNST_LIMB(0x4768ce0d), CNST_LIMB(0x72b80347), CNST_LIMB(0x19a10000), CNST_LIMB(0x3fa39ab5) }, + /* 13 */ { 8, CNST_LIMB(0x452e53e3), CNST_LIMB(0x766a008e), CNST_LIMB(0x309f1021), CNST_LIMB(0x50f8ac5f) }, + /* 14 */ { 8, CNST_LIMB(0x433cfffb), CNST_LIMB(0x79d5d9fd), CNST_LIMB(0x57f6c100), CNST_LIMB(0x74843b1e) }, + /* 15 */ { 8, CNST_LIMB(0x41867711), CNST_LIMB(0x7d053f6d), CNST_LIMB(0x98c29b81), CNST_LIMB(0xad0326c2) }, + /* 16 */ { 8, CNST_LIMB(0x3fffffff), CNST_LIMB(0x7fffffff), CNST_LIMB(0x4), CNST_LIMB(0x0) }, + /* 17 */ { 7, CNST_LIMB(0x3ea16afd), CNST_LIMB(0x82cc7edf), CNST_LIMB(0x18754571), CNST_LIMB(0x4ef0b6bd) }, + /* 18 */ { 7, CNST_LIMB(0x3d64598d), CNST_LIMB(0x8570068e), CNST_LIMB(0x247dbc80), CNST_LIMB(0xc0fc48a1) }, + /* 19 */ { 7, CNST_LIMB(0x3c43c230), CNST_LIMB(0x87ef05ae), CNST_LIMB(0x3547667b), CNST_LIMB(0x33838942) }, + /* 20 */ { 7, CNST_LIMB(0x3b3b9a42), CNST_LIMB(0x8a4d3c25), CNST_LIMB(0x4c4b4000), CNST_LIMB(0xad7f29ab) }, + /* 21 */ { 7, CNST_LIMB(0x3a4898f0), CNST_LIMB(0x8c8ddd44), CNST_LIMB(0x6b5a6e1d), CNST_LIMB(0x313c3d15) }, + /* 22 */ { 7, CNST_LIMB(0x39680b13), CNST_LIMB(0x8eb3a9f0), CNST_LIMB(0x94ace180), CNST_LIMB(0xb8cca9e0) }, + /* 23 */ { 7, CNST_LIMB(0x3897b2b7), CNST_LIMB(0x90c10500), CNST_LIMB(0xcaf18367), CNST_LIMB(0x42ed6de9) }, + /* 24 */ { 6, CNST_LIMB(0x37d5aed1), CNST_LIMB(0x92b80347), CNST_LIMB(0xb640000), CNST_LIMB(0x67980e0b) }, + /* 25 */ { 6, CNST_LIMB(0x372068d2), CNST_LIMB(0x949a784b), CNST_LIMB(0xe8d4a51), CNST_LIMB(0x19799812) }, + /* 26 */ { 6, CNST_LIMB(0x3676867e), CNST_LIMB(0x966a008e), CNST_LIMB(0x1269ae40), CNST_LIMB(0xbce85396) }, + /* 27 */ { 6, CNST_LIMB(0x35d6deeb), CNST_LIMB(0x982809d5), CNST_LIMB(0x17179149), CNST_LIMB(0x62c103a9) }, + /* 28 */ { 6, CNST_LIMB(0x354071d6), CNST_LIMB(0x99d5d9fd), CNST_LIMB(0x1cb91000), CNST_LIMB(0x1d353d43) }, + /* 29 */ { 6, CNST_LIMB(0x34b260c5), CNST_LIMB(0x9b74948f), CNST_LIMB(0x23744899), CNST_LIMB(0xce1decea) }, + /* 30 */ { 6, CNST_LIMB(0x342be986), CNST_LIMB(0x9d053f6d), CNST_LIMB(0x2b73a840), CNST_LIMB(0x790fc511) }, + /* 31 */ { 6, CNST_LIMB(0x33ac61b9), CNST_LIMB(0x9e88c6b3), CNST_LIMB(0x34e63b41), CNST_LIMB(0x35b865a0) }, + /* 32 */ { 6, CNST_LIMB(0x33333333), CNST_LIMB(0x9fffffff), CNST_LIMB(0x5), CNST_LIMB(0x0) }, + /* 33 */ { 
6, CNST_LIMB(0x32bfd901), CNST_LIMB(0xa16bad37), CNST_LIMB(0x4cfa3cc1), CNST_LIMB(0xa9aed1b3) }, + /* 34 */ { 6, CNST_LIMB(0x3251dcf6), CNST_LIMB(0xa2cc7edf), CNST_LIMB(0x5c13d840), CNST_LIMB(0x63dfc229) }, + /* 35 */ { 6, CNST_LIMB(0x31e8d59f), CNST_LIMB(0xa4231623), CNST_LIMB(0x6d91b519), CNST_LIMB(0x2b0fee30) }, + /* 36 */ { 6, CNST_LIMB(0x3184648d), CNST_LIMB(0xa570068e), CNST_LIMB(0x81bf1000), CNST_LIMB(0xf91bd1b6) }, + /* 37 */ { 6, CNST_LIMB(0x312434e8), CNST_LIMB(0xa6b3d78b), CNST_LIMB(0x98ede0c9), CNST_LIMB(0xac89c3a9) }, + /* 38 */ { 6, CNST_LIMB(0x30c7fa34), CNST_LIMB(0xa7ef05ae), CNST_LIMB(0xb3773e40), CNST_LIMB(0x6d2c32fe) }, + /* 39 */ { 6, CNST_LIMB(0x306f6f4c), CNST_LIMB(0xa92203d5), CNST_LIMB(0xd1bbc4d1), CNST_LIMB(0x387907c9) }, + /* 40 */ { 6, CNST_LIMB(0x301a557f), CNST_LIMB(0xaa4d3c25), CNST_LIMB(0xf4240000), CNST_LIMB(0xc6f7a0b) }, + /* 41 */ { 5, CNST_LIMB(0x2fc873d1), CNST_LIMB(0xab7110e6), CNST_LIMB(0x6e7d349), CNST_LIMB(0x28928154) }, + /* 42 */ { 5, CNST_LIMB(0x2f799652), CNST_LIMB(0xac8ddd44), CNST_LIMB(0x7ca30a0), CNST_LIMB(0x6e8629d) }, + /* 43 */ { 5, CNST_LIMB(0x2f2d8d8f), CNST_LIMB(0xada3f5fb), CNST_LIMB(0x8c32bbb), CNST_LIMB(0xd373dca0) }, + /* 44 */ { 5, CNST_LIMB(0x2ee42e16), CNST_LIMB(0xaeb3a9f0), CNST_LIMB(0x9d46c00), CNST_LIMB(0xa0b17895) }, + /* 45 */ { 5, CNST_LIMB(0x2e9d5009), CNST_LIMB(0xafbd42b4), CNST_LIMB(0xaffacfd), CNST_LIMB(0x746811a5) }, + /* 46 */ { 5, CNST_LIMB(0x2e58cec0), CNST_LIMB(0xb0c10500), CNST_LIMB(0xc46bee0), CNST_LIMB(0x4da6500f) }, + /* 47 */ { 5, CNST_LIMB(0x2e168874), CNST_LIMB(0xb1bf311e), CNST_LIMB(0xdab86ef), CNST_LIMB(0x2ba23582) }, + /* 48 */ { 5, CNST_LIMB(0x2dd65df7), CNST_LIMB(0xb2b80347), CNST_LIMB(0xf300000), CNST_LIMB(0xdb20a88) }, + /* 49 */ { 5, CNST_LIMB(0x2d983275), CNST_LIMB(0xb3abb3fa), CNST_LIMB(0x10d63af1), CNST_LIMB(0xe68d5ce4) }, + /* 50 */ { 5, CNST_LIMB(0x2d5beb38), CNST_LIMB(0xb49a784b), CNST_LIMB(0x12a05f20), CNST_LIMB(0xb7cdfd9d) }, + /* 51 */ { 5, CNST_LIMB(0x2d216f79), CNST_LIMB(0xb5848226), CNST_LIMB(0x1490aae3), CNST_LIMB(0x8e583933) }, + /* 52 */ { 5, CNST_LIMB(0x2ce8a82e), CNST_LIMB(0xb66a008e), CNST_LIMB(0x16a97400), CNST_LIMB(0x697cc3ea) }, + /* 53 */ { 5, CNST_LIMB(0x2cb17fea), CNST_LIMB(0xb74b1fd6), CNST_LIMB(0x18ed2825), CNST_LIMB(0x48a5ca6c) }, + /* 54 */ { 5, CNST_LIMB(0x2c7be2b0), CNST_LIMB(0xb82809d5), CNST_LIMB(0x1b5e4d60), CNST_LIMB(0x2b52db16) }, + /* 55 */ { 5, CNST_LIMB(0x2c47bddb), CNST_LIMB(0xb900e615), CNST_LIMB(0x1dff8297), CNST_LIMB(0x111586a6) }, + /* 56 */ { 5, CNST_LIMB(0x2c14fffc), CNST_LIMB(0xb9d5d9fd), CNST_LIMB(0x20d38000), CNST_LIMB(0xf31d2b36) }, + /* 57 */ { 5, CNST_LIMB(0x2be398c3), CNST_LIMB(0xbaa708f5), CNST_LIMB(0x23dd1799), CNST_LIMB(0xc8d76d19) }, + /* 58 */ { 5, CNST_LIMB(0x2bb378e7), CNST_LIMB(0xbb74948f), CNST_LIMB(0x271f35a0), CNST_LIMB(0xa2cb1eb4) }, + /* 59 */ { 5, CNST_LIMB(0x2b849210), CNST_LIMB(0xbc3e9ca2), CNST_LIMB(0x2a9ce10b), CNST_LIMB(0x807c3ec3) }, + /* 60 */ { 5, CNST_LIMB(0x2b56d6c7), CNST_LIMB(0xbd053f6d), CNST_LIMB(0x2e593c00), CNST_LIMB(0x617ec8bf) }, + /* 61 */ { 5, CNST_LIMB(0x2b2a3a60), CNST_LIMB(0xbdc899ab), CNST_LIMB(0x3257844d), CNST_LIMB(0x45746cbe) }, + /* 62 */ { 5, CNST_LIMB(0x2afeb0f1), CNST_LIMB(0xbe88c6b3), CNST_LIMB(0x369b13e0), CNST_LIMB(0x2c0aa273) }, + /* 63 */ { 5, CNST_LIMB(0x2ad42f3c), CNST_LIMB(0xbf45e08b), CNST_LIMB(0x3b27613f), CNST_LIMB(0x14f90805) }, + /* 64 */ { 5, CNST_LIMB(0x2aaaaaaa), CNST_LIMB(0xbfffffff), CNST_LIMB(0x6), CNST_LIMB(0x0) }, + /* 65 */ { 5, CNST_LIMB(0x2a82193a), CNST_LIMB(0xc0b73cb4), 
CNST_LIMB(0x4528a141), CNST_LIMB(0xd9cf0829) }, + /* 66 */ { 5, CNST_LIMB(0x2a5a7176), CNST_LIMB(0xc16bad37), CNST_LIMB(0x4aa51420), CNST_LIMB(0xb6fc4841) }, + /* 67 */ { 5, CNST_LIMB(0x2a33aa6e), CNST_LIMB(0xc21d6713), CNST_LIMB(0x50794633), CNST_LIMB(0x973054cb) }, + /* 68 */ { 5, CNST_LIMB(0x2a0dbbaa), CNST_LIMB(0xc2cc7edf), CNST_LIMB(0x56a94400), CNST_LIMB(0x7a1dbe4b) }, + /* 69 */ { 5, CNST_LIMB(0x29e89d24), CNST_LIMB(0xc3790848), CNST_LIMB(0x5d393975), CNST_LIMB(0x5f7fcd7f) }, + /* 70 */ { 5, CNST_LIMB(0x29c44740), CNST_LIMB(0xc4231623), CNST_LIMB(0x642d7260), CNST_LIMB(0x47196c84) }, + /* 71 */ { 5, CNST_LIMB(0x29a0b2c7), CNST_LIMB(0xc4caba78), CNST_LIMB(0x6b8a5ae7), CNST_LIMB(0x30b43635) }, + /* 72 */ { 5, CNST_LIMB(0x297dd8db), CNST_LIMB(0xc570068e), CNST_LIMB(0x73548000), CNST_LIMB(0x1c1fa5f6) }, + /* 73 */ { 5, CNST_LIMB(0x295bb2f9), CNST_LIMB(0xc6130af4), CNST_LIMB(0x7b908fe9), CNST_LIMB(0x930634a) }, + /* 74 */ { 5, CNST_LIMB(0x293a3aeb), CNST_LIMB(0xc6b3d78b), CNST_LIMB(0x84435aa0), CNST_LIMB(0xef7f4a3c) }, + /* 75 */ { 5, CNST_LIMB(0x29196acc), CNST_LIMB(0xc7527b93), CNST_LIMB(0x8d71d25b), CNST_LIMB(0xcf5552d2) }, + /* 76 */ { 5, CNST_LIMB(0x28f93cfb), CNST_LIMB(0xc7ef05ae), CNST_LIMB(0x97210c00), CNST_LIMB(0xb1a47c8e) }, + /* 77 */ { 5, CNST_LIMB(0x28d9ac1b), CNST_LIMB(0xc88983ed), CNST_LIMB(0xa1563f9d), CNST_LIMB(0x9634b43e) }, + /* 78 */ { 5, CNST_LIMB(0x28bab310), CNST_LIMB(0xc92203d5), CNST_LIMB(0xac16c8e0), CNST_LIMB(0x7cd3817d) }, + /* 79 */ { 5, CNST_LIMB(0x289c4cf8), CNST_LIMB(0xc9b89267), CNST_LIMB(0xb768278f), CNST_LIMB(0x65536761) }, + /* 80 */ { 5, CNST_LIMB(0x287e7529), CNST_LIMB(0xca4d3c25), CNST_LIMB(0xc3500000), CNST_LIMB(0x4f8b588e) }, + /* 81 */ { 5, CNST_LIMB(0x28612730), CNST_LIMB(0xcae00d1c), CNST_LIMB(0xcfd41b91), CNST_LIMB(0x3b563c24) }, + /* 82 */ { 5, CNST_LIMB(0x28445ec9), CNST_LIMB(0xcb7110e6), CNST_LIMB(0xdcfa6920), CNST_LIMB(0x28928154) }, + /* 83 */ { 5, CNST_LIMB(0x282817e1), CNST_LIMB(0xcc0052b1), CNST_LIMB(0xeac8fd83), CNST_LIMB(0x1721bfb0) }, + /* 84 */ { 5, CNST_LIMB(0x280c4e90), CNST_LIMB(0xcc8ddd44), CNST_LIMB(0xf9461400), CNST_LIMB(0x6e8629d) }, + /* 85 */ { 4, CNST_LIMB(0x27f0ff1b), CNST_LIMB(0xcd19bb05), CNST_LIMB(0x31c84b1), CNST_LIMB(0x491cc17c) }, + /* 86 */ { 4, CNST_LIMB(0x27d625ec), CNST_LIMB(0xcda3f5fb), CNST_LIMB(0x342ab10), CNST_LIMB(0x3a11d83b) }, + /* 87 */ { 4, CNST_LIMB(0x27bbbf95), CNST_LIMB(0xce2c97d6), CNST_LIMB(0x36a2c21), CNST_LIMB(0x2be074cd) }, + /* 88 */ { 4, CNST_LIMB(0x27a1c8c8), CNST_LIMB(0xceb3a9f0), CNST_LIMB(0x3931000), CNST_LIMB(0x1e7a02e7) }, + /* 89 */ { 4, CNST_LIMB(0x27883e5e), CNST_LIMB(0xcf393550), CNST_LIMB(0x3bd5ee1), CNST_LIMB(0x11d10edd) }, + /* 90 */ { 4, CNST_LIMB(0x276f1d4c), CNST_LIMB(0xcfbd42b4), CNST_LIMB(0x3e92110), CNST_LIMB(0x5d92c68) }, + /* 91 */ { 4, CNST_LIMB(0x275662a8), CNST_LIMB(0xd03fda8b), CNST_LIMB(0x4165ef1), CNST_LIMB(0xf50dbfb2) }, + /* 92 */ { 4, CNST_LIMB(0x273e0ba3), CNST_LIMB(0xd0c10500), CNST_LIMB(0x4452100), CNST_LIMB(0xdf9f1316) }, + /* 93 */ { 4, CNST_LIMB(0x2726158c), CNST_LIMB(0xd140c9fa), CNST_LIMB(0x4756fd1), CNST_LIMB(0xcb52a684) }, + /* 94 */ { 4, CNST_LIMB(0x270e7dc9), CNST_LIMB(0xd1bf311e), CNST_LIMB(0x4a75410), CNST_LIMB(0xb8163e97) }, + /* 95 */ { 4, CNST_LIMB(0x26f741dd), CNST_LIMB(0xd23c41d4), CNST_LIMB(0x4dad681), CNST_LIMB(0xa5d8f269) }, + /* 96 */ { 4, CNST_LIMB(0x26e05f5f), CNST_LIMB(0xd2b80347), CNST_LIMB(0x5100000), CNST_LIMB(0x948b0fcd) }, + /* 97 */ { 4, CNST_LIMB(0x26c9d3fe), CNST_LIMB(0xd3327c6a), CNST_LIMB(0x546d981), CNST_LIMB(0x841e0215) }, + 
/* 98 */ { 4, CNST_LIMB(0x26b39d7f), CNST_LIMB(0xd3abb3fa), CNST_LIMB(0x57f6c10), CNST_LIMB(0x74843b1e) }, + /* 99 */ { 4, CNST_LIMB(0x269db9bc), CNST_LIMB(0xd423b07e), CNST_LIMB(0x5b9c0d1), CNST_LIMB(0x65b11e6e) }, + /* 100 */ { 4, CNST_LIMB(0x268826a1), CNST_LIMB(0xd49a784b), CNST_LIMB(0x5f5e100), CNST_LIMB(0x5798ee23) }, + /* 101 */ { 4, CNST_LIMB(0x2672e22d), CNST_LIMB(0xd5101187), CNST_LIMB(0x633d5f1), CNST_LIMB(0x4a30b99b) }, + /* 102 */ { 4, CNST_LIMB(0x265dea72), CNST_LIMB(0xd5848226), CNST_LIMB(0x673a910), CNST_LIMB(0x3d6e4d94) }, + /* 103 */ { 4, CNST_LIMB(0x26493d93), CNST_LIMB(0xd5f7cff4), CNST_LIMB(0x6b563e1), CNST_LIMB(0x314825b0) }, + /* 104 */ { 4, CNST_LIMB(0x2634d9c2), CNST_LIMB(0xd66a008e), CNST_LIMB(0x6f91000), CNST_LIMB(0x25b55f2e) }, + /* 105 */ { 4, CNST_LIMB(0x2620bd41), CNST_LIMB(0xd6db196a), CNST_LIMB(0x73eb721), CNST_LIMB(0x1aadaccb) }, + /* 106 */ { 4, CNST_LIMB(0x260ce662), CNST_LIMB(0xd74b1fd6), CNST_LIMB(0x7866310), CNST_LIMB(0x10294ba2) }, + /* 107 */ { 4, CNST_LIMB(0x25f95385), CNST_LIMB(0xd7ba18f9), CNST_LIMB(0x7d01db1), CNST_LIMB(0x620f8f6) }, + /* 108 */ { 4, CNST_LIMB(0x25e60316), CNST_LIMB(0xd82809d5), CNST_LIMB(0x81bf100), CNST_LIMB(0xf91bd1b6) }, + /* 109 */ { 4, CNST_LIMB(0x25d2f390), CNST_LIMB(0xd894f74b), CNST_LIMB(0x869e711), CNST_LIMB(0xe6d37b2a) }, + /* 110 */ { 4, CNST_LIMB(0x25c02379), CNST_LIMB(0xd900e615), CNST_LIMB(0x8ba0a10), CNST_LIMB(0xd55cff6e) }, + /* 111 */ { 4, CNST_LIMB(0x25ad9165), CNST_LIMB(0xd96bdad2), CNST_LIMB(0x90c6441), CNST_LIMB(0xc4ad2db2) }, + /* 112 */ { 4, CNST_LIMB(0x259b3bf3), CNST_LIMB(0xd9d5d9fd), CNST_LIMB(0x9610000), CNST_LIMB(0xb4b985cf) }, + /* 113 */ { 4, CNST_LIMB(0x258921cb), CNST_LIMB(0xda3ee7f3), CNST_LIMB(0x9b7e7c1), CNST_LIMB(0xa5782bef) }, + /* 114 */ { 4, CNST_LIMB(0x257741a2), CNST_LIMB(0xdaa708f5), CNST_LIMB(0xa112610), CNST_LIMB(0x96dfdd2a) }, + /* 115 */ { 4, CNST_LIMB(0x25659a37), CNST_LIMB(0xdb0e4126), CNST_LIMB(0xa6cc591), CNST_LIMB(0x88e7e509) }, + /* 116 */ { 4, CNST_LIMB(0x25542a50), CNST_LIMB(0xdb74948f), CNST_LIMB(0xacad100), CNST_LIMB(0x7b8813d3) }, + /* 117 */ { 4, CNST_LIMB(0x2542f0c2), CNST_LIMB(0xdbda071c), CNST_LIMB(0xb2b5331), CNST_LIMB(0x6eb8b595) }, + /* 118 */ { 4, CNST_LIMB(0x2531ec64), CNST_LIMB(0xdc3e9ca2), CNST_LIMB(0xb8e5710), CNST_LIMB(0x627289db) }, + /* 119 */ { 4, CNST_LIMB(0x25211c1c), CNST_LIMB(0xdca258dc), CNST_LIMB(0xbf3e7a1), CNST_LIMB(0x56aebc07) }, + /* 120 */ { 4, CNST_LIMB(0x25107ed5), CNST_LIMB(0xdd053f6d), CNST_LIMB(0xc5c1000), CNST_LIMB(0x4b66dc33) }, + /* 121 */ { 4, CNST_LIMB(0x25001383), CNST_LIMB(0xdd6753e0), CNST_LIMB(0xcc6db61), CNST_LIMB(0x4094d8a3) }, + /* 122 */ { 4, CNST_LIMB(0x24efd921), CNST_LIMB(0xddc899ab), CNST_LIMB(0xd345510), CNST_LIMB(0x3632f7a5) }, + /* 123 */ { 4, CNST_LIMB(0x24dfceb3), CNST_LIMB(0xde29142e), CNST_LIMB(0xda48871), CNST_LIMB(0x2c3bd1f0) }, + /* 124 */ { 4, CNST_LIMB(0x24cff343), CNST_LIMB(0xde88c6b3), CNST_LIMB(0xe178100), CNST_LIMB(0x22aa4d5f) }, + /* 125 */ { 4, CNST_LIMB(0x24c045e1), CNST_LIMB(0xdee7b471), CNST_LIMB(0xe8d4a51), CNST_LIMB(0x19799812) }, + /* 126 */ { 4, CNST_LIMB(0x24b0c5a6), CNST_LIMB(0xdf45e08b), CNST_LIMB(0xf05f010), CNST_LIMB(0x10a523e5) }, + /* 127 */ { 4, CNST_LIMB(0x24a171b0), CNST_LIMB(0xdfa34e11), CNST_LIMB(0xf817e01), CNST_LIMB(0x828a237) }, + /* 128 */ { 4, CNST_LIMB(0x24924924), CNST_LIMB(0xdfffffff), CNST_LIMB(0x7), CNST_LIMB(0x0) }, + /* 129 */ { 4, CNST_LIMB(0x24834b2c), CNST_LIMB(0xe05bf942), CNST_LIMB(0x10818201), CNST_LIMB(0xf04ec452) }, + /* 130 */ { 4, CNST_LIMB(0x247476f9), 
CNST_LIMB(0xe0b73cb4), CNST_LIMB(0x11061010), CNST_LIMB(0xe136444a) }, + /* 131 */ { 4, CNST_LIMB(0x2465cbc0), CNST_LIMB(0xe111cd1d), CNST_LIMB(0x118db651), CNST_LIMB(0xd2af9589) }, + /* 132 */ { 4, CNST_LIMB(0x245748bc), CNST_LIMB(0xe16bad37), CNST_LIMB(0x12188100), CNST_LIMB(0xc4b42a83) }, + /* 133 */ { 4, CNST_LIMB(0x2448ed2f), CNST_LIMB(0xe1c4dfab), CNST_LIMB(0x12a67c71), CNST_LIMB(0xb73dccf5) }, + /* 134 */ { 4, CNST_LIMB(0x243ab85d), CNST_LIMB(0xe21d6713), CNST_LIMB(0x1337b510), CNST_LIMB(0xaa4698c5) }, + /* 135 */ { 4, CNST_LIMB(0x242ca992), CNST_LIMB(0xe27545fb), CNST_LIMB(0x13cc3761), CNST_LIMB(0x9dc8f729) }, + /* 136 */ { 4, CNST_LIMB(0x241ec01b), CNST_LIMB(0xe2cc7edf), CNST_LIMB(0x14641000), CNST_LIMB(0x91bf9a30) }, + /* 137 */ { 4, CNST_LIMB(0x2410fb4d), CNST_LIMB(0xe323142d), CNST_LIMB(0x14ff4ba1), CNST_LIMB(0x86257887) }, + /* 138 */ { 4, CNST_LIMB(0x24035a80), CNST_LIMB(0xe3790848), CNST_LIMB(0x159df710), CNST_LIMB(0x7af5c98c) }, + /* 139 */ { 4, CNST_LIMB(0x23f5dd10), CNST_LIMB(0xe3ce5d82), CNST_LIMB(0x16401f31), CNST_LIMB(0x702c01a0) }, + /* 140 */ { 4, CNST_LIMB(0x23e8825d), CNST_LIMB(0xe4231623), CNST_LIMB(0x16e5d100), CNST_LIMB(0x65c3ceb1) }, + /* 141 */ { 4, CNST_LIMB(0x23db49cc), CNST_LIMB(0xe4773465), CNST_LIMB(0x178f1991), CNST_LIMB(0x5bb91502) }, + /* 142 */ { 4, CNST_LIMB(0x23ce32c4), CNST_LIMB(0xe4caba78), CNST_LIMB(0x183c0610), CNST_LIMB(0x5207ec23) }, + /* 143 */ { 4, CNST_LIMB(0x23c13cb3), CNST_LIMB(0xe51daa7e), CNST_LIMB(0x18eca3c1), CNST_LIMB(0x48ac9c19) }, + /* 144 */ { 4, CNST_LIMB(0x23b46706), CNST_LIMB(0xe570068e), CNST_LIMB(0x19a10000), CNST_LIMB(0x3fa39ab5) }, + /* 145 */ { 4, CNST_LIMB(0x23a7b132), CNST_LIMB(0xe5c1d0b5), CNST_LIMB(0x1a592841), CNST_LIMB(0x36e98912) }, + /* 146 */ { 4, CNST_LIMB(0x239b1aac), CNST_LIMB(0xe6130af4), CNST_LIMB(0x1b152a10), CNST_LIMB(0x2e7b3140) }, + /* 147 */ { 4, CNST_LIMB(0x238ea2ef), CNST_LIMB(0xe663b741), CNST_LIMB(0x1bd51311), CNST_LIMB(0x2655840b) }, + /* 148 */ { 4, CNST_LIMB(0x23824976), CNST_LIMB(0xe6b3d78b), CNST_LIMB(0x1c98f100), CNST_LIMB(0x1e7596ea) }, + /* 149 */ { 4, CNST_LIMB(0x23760dc3), CNST_LIMB(0xe7036db3), CNST_LIMB(0x1d60d1b1), CNST_LIMB(0x16d8a20d) }, + /* 150 */ { 4, CNST_LIMB(0x2369ef58), CNST_LIMB(0xe7527b93), CNST_LIMB(0x1e2cc310), CNST_LIMB(0xf7bfe87) }, + /* 151 */ { 4, CNST_LIMB(0x235dedbb), CNST_LIMB(0xe7a102f9), CNST_LIMB(0x1efcd321), CNST_LIMB(0x85d2492) }, + /* 152 */ { 4, CNST_LIMB(0x23520874), CNST_LIMB(0xe7ef05ae), CNST_LIMB(0x1fd11000), CNST_LIMB(0x179a9f4) }, + /* 153 */ { 4, CNST_LIMB(0x23463f10), CNST_LIMB(0xe83c856d), CNST_LIMB(0x20a987e1), CNST_LIMB(0xf59e80eb) }, + /* 154 */ { 4, CNST_LIMB(0x233a911b), CNST_LIMB(0xe88983ed), CNST_LIMB(0x21864910), CNST_LIMB(0xe8b768db) }, + /* 155 */ { 4, CNST_LIMB(0x232efe26), CNST_LIMB(0xe8d602d9), CNST_LIMB(0x226761f1), CNST_LIMB(0xdc39d6d5) }, + /* 156 */ { 4, CNST_LIMB(0x232385c6), CNST_LIMB(0xe92203d5), CNST_LIMB(0x234ce100), CNST_LIMB(0xd021c5d1) }, + /* 157 */ { 4, CNST_LIMB(0x2318278e), CNST_LIMB(0xe96d887e), CNST_LIMB(0x2436d4d1), CNST_LIMB(0xc46b5e37) }, + /* 158 */ { 4, CNST_LIMB(0x230ce318), CNST_LIMB(0xe9b89267), CNST_LIMB(0x25254c10), CNST_LIMB(0xb912f39c) }, + /* 159 */ { 4, CNST_LIMB(0x2301b7fd), CNST_LIMB(0xea03231d), CNST_LIMB(0x26185581), CNST_LIMB(0xae150294) }, + /* 160 */ { 4, CNST_LIMB(0x22f6a5d9), CNST_LIMB(0xea4d3c25), CNST_LIMB(0x27100000), CNST_LIMB(0xa36e2eb1) }, + /* 161 */ { 4, CNST_LIMB(0x22ebac4c), CNST_LIMB(0xea96defe), CNST_LIMB(0x280c5a81), CNST_LIMB(0x991b4094) }, + /* 162 */ { 4, CNST_LIMB(0x22e0caf6), 
CNST_LIMB(0xeae00d1c), CNST_LIMB(0x290d7410), CNST_LIMB(0x8f19241e) }, + /* 163 */ { 4, CNST_LIMB(0x22d60179), CNST_LIMB(0xeb28c7f2), CNST_LIMB(0x2a135bd1), CNST_LIMB(0x8564e6b7) }, + /* 164 */ { 4, CNST_LIMB(0x22cb4f7a), CNST_LIMB(0xeb7110e6), CNST_LIMB(0x2b1e2100), CNST_LIMB(0x7bfbb5b4) }, + /* 165 */ { 4, CNST_LIMB(0x22c0b4a1), CNST_LIMB(0xebb8e95d), CNST_LIMB(0x2c2dd2f1), CNST_LIMB(0x72dadcc8) }, + /* 166 */ { 4, CNST_LIMB(0x22b63095), CNST_LIMB(0xec0052b1), CNST_LIMB(0x2d428110), CNST_LIMB(0x69ffc498) }, + /* 167 */ { 4, CNST_LIMB(0x22abc300), CNST_LIMB(0xec474e39), CNST_LIMB(0x2e5c3ae1), CNST_LIMB(0x6167f154) }, + /* 168 */ { 4, CNST_LIMB(0x22a16b90), CNST_LIMB(0xec8ddd44), CNST_LIMB(0x2f7b1000), CNST_LIMB(0x5911016e) }, + /* 169 */ { 4, CNST_LIMB(0x229729f1), CNST_LIMB(0xecd4011c), CNST_LIMB(0x309f1021), CNST_LIMB(0x50f8ac5f) }, + /* 170 */ { 4, CNST_LIMB(0x228cfdd4), CNST_LIMB(0xed19bb05), CNST_LIMB(0x31c84b10), CNST_LIMB(0x491cc17c) }, + /* 171 */ { 4, CNST_LIMB(0x2282e6e9), CNST_LIMB(0xed5f0c3c), CNST_LIMB(0x32f6d0b1), CNST_LIMB(0x417b26d8) }, + /* 172 */ { 4, CNST_LIMB(0x2278e4e3), CNST_LIMB(0xeda3f5fb), CNST_LIMB(0x342ab100), CNST_LIMB(0x3a11d83b) }, + /* 173 */ { 4, CNST_LIMB(0x226ef777), CNST_LIMB(0xede87974), CNST_LIMB(0x3563fc11), CNST_LIMB(0x32dee622) }, + /* 174 */ { 4, CNST_LIMB(0x22651e5a), CNST_LIMB(0xee2c97d6), CNST_LIMB(0x36a2c210), CNST_LIMB(0x2be074cd) }, + /* 175 */ { 4, CNST_LIMB(0x225b5944), CNST_LIMB(0xee705249), CNST_LIMB(0x37e71341), CNST_LIMB(0x2514bb58) }, + /* 176 */ { 4, CNST_LIMB(0x2251a7ee), CNST_LIMB(0xeeb3a9f0), CNST_LIMB(0x39310000), CNST_LIMB(0x1e7a02e7) }, + /* 177 */ { 4, CNST_LIMB(0x22480a11), CNST_LIMB(0xeef69fea), CNST_LIMB(0x3a8098c1), CNST_LIMB(0x180ea5d0) }, + /* 178 */ { 4, CNST_LIMB(0x223e7f69), CNST_LIMB(0xef393550), CNST_LIMB(0x3bd5ee10), CNST_LIMB(0x11d10edd) }, + /* 179 */ { 4, CNST_LIMB(0x223507b4), CNST_LIMB(0xef7b6b39), CNST_LIMB(0x3d311091), CNST_LIMB(0xbbfb88e) }, + /* 180 */ { 4, CNST_LIMB(0x222ba2af), CNST_LIMB(0xefbd42b4), CNST_LIMB(0x3e921100), CNST_LIMB(0x5d92c68) }, + /* 181 */ { 4, CNST_LIMB(0x22225019), CNST_LIMB(0xeffebccd), CNST_LIMB(0x3ff90031), CNST_LIMB(0x1c024c) }, + /* 182 */ { 4, CNST_LIMB(0x22190fb4), CNST_LIMB(0xf03fda8b), CNST_LIMB(0x4165ef10), CNST_LIMB(0xf50dbfb2) }, + /* 183 */ { 4, CNST_LIMB(0x220fe141), CNST_LIMB(0xf0809cf2), CNST_LIMB(0x42d8eea1), CNST_LIMB(0xea30efa3) }, + /* 184 */ { 4, CNST_LIMB(0x2206c483), CNST_LIMB(0xf0c10500), CNST_LIMB(0x44521000), CNST_LIMB(0xdf9f1316) }, + /* 185 */ { 4, CNST_LIMB(0x21fdb93f), CNST_LIMB(0xf10113b1), CNST_LIMB(0x45d16461), CNST_LIMB(0xd555c0c9) }, + /* 186 */ { 4, CNST_LIMB(0x21f4bf3a), CNST_LIMB(0xf140c9fa), CNST_LIMB(0x4756fd10), CNST_LIMB(0xcb52a684) }, + /* 187 */ { 4, CNST_LIMB(0x21ebd639), CNST_LIMB(0xf18028cf), CNST_LIMB(0x48e2eb71), CNST_LIMB(0xc193881f) }, + /* 188 */ { 4, CNST_LIMB(0x21e2fe06), CNST_LIMB(0xf1bf311e), CNST_LIMB(0x4a754100), CNST_LIMB(0xb8163e97) }, + /* 189 */ { 4, CNST_LIMB(0x21da3667), CNST_LIMB(0xf1fde3d3), CNST_LIMB(0x4c0e0f51), CNST_LIMB(0xaed8b724) }, + /* 190 */ { 4, CNST_LIMB(0x21d17f28), CNST_LIMB(0xf23c41d4), CNST_LIMB(0x4dad6810), CNST_LIMB(0xa5d8f269) }, + /* 191 */ { 4, CNST_LIMB(0x21c8d811), CNST_LIMB(0xf27a4c05), CNST_LIMB(0x4f535d01), CNST_LIMB(0x9d15039d) }, + /* 192 */ { 4, CNST_LIMB(0x21c040ef), CNST_LIMB(0xf2b80347), CNST_LIMB(0x51000000), CNST_LIMB(0x948b0fcd) }, + /* 193 */ { 4, CNST_LIMB(0x21b7b98f), CNST_LIMB(0xf2f56875), CNST_LIMB(0x52b36301), CNST_LIMB(0x8c394d1d) }, + /* 194 */ { 4, CNST_LIMB(0x21af41bc), 
CNST_LIMB(0xf3327c6a), CNST_LIMB(0x546d9810), CNST_LIMB(0x841e0215) }, + /* 195 */ { 4, CNST_LIMB(0x21a6d947), CNST_LIMB(0xf36f3ffb), CNST_LIMB(0x562eb151), CNST_LIMB(0x7c3784f8) }, + /* 196 */ { 4, CNST_LIMB(0x219e7ffd), CNST_LIMB(0xf3abb3fa), CNST_LIMB(0x57f6c100), CNST_LIMB(0x74843b1e) }, + /* 197 */ { 4, CNST_LIMB(0x219635af), CNST_LIMB(0xf3e7d937), CNST_LIMB(0x59c5d971), CNST_LIMB(0x6d02985d) }, + /* 198 */ { 4, CNST_LIMB(0x218dfa2e), CNST_LIMB(0xf423b07e), CNST_LIMB(0x5b9c0d10), CNST_LIMB(0x65b11e6e) }, + /* 199 */ { 4, CNST_LIMB(0x2185cd4c), CNST_LIMB(0xf45f3a98), CNST_LIMB(0x5d796e61), CNST_LIMB(0x5e8e5c64) }, + /* 200 */ { 4, CNST_LIMB(0x217daeda), CNST_LIMB(0xf49a784b), CNST_LIMB(0x5f5e1000), CNST_LIMB(0x5798ee23) }, + /* 201 */ { 4, CNST_LIMB(0x21759eac), CNST_LIMB(0xf4d56a5b), CNST_LIMB(0x614a04a1), CNST_LIMB(0x50cf7bde) }, + /* 202 */ { 4, CNST_LIMB(0x216d9c96), CNST_LIMB(0xf5101187), CNST_LIMB(0x633d5f10), CNST_LIMB(0x4a30b99b) }, + /* 203 */ { 4, CNST_LIMB(0x2165a86e), CNST_LIMB(0xf54a6e8c), CNST_LIMB(0x65383231), CNST_LIMB(0x43bb66bd) }, + /* 204 */ { 4, CNST_LIMB(0x215dc207), CNST_LIMB(0xf5848226), CNST_LIMB(0x673a9100), CNST_LIMB(0x3d6e4d94) }, + /* 205 */ { 4, CNST_LIMB(0x2155e939), CNST_LIMB(0xf5be4d0c), CNST_LIMB(0x69448e91), CNST_LIMB(0x374842ee) }, + /* 206 */ { 4, CNST_LIMB(0x214e1ddb), CNST_LIMB(0xf5f7cff4), CNST_LIMB(0x6b563e10), CNST_LIMB(0x314825b0) }, + /* 207 */ { 4, CNST_LIMB(0x21465fc4), CNST_LIMB(0xf6310b8f), CNST_LIMB(0x6d6fb2c1), CNST_LIMB(0x2b6cde75) }, + /* 208 */ { 4, CNST_LIMB(0x213eaecd), CNST_LIMB(0xf66a008e), CNST_LIMB(0x6f910000), CNST_LIMB(0x25b55f2e) }, + /* 209 */ { 4, CNST_LIMB(0x21370ace), CNST_LIMB(0xf6a2af9e), CNST_LIMB(0x71ba3941), CNST_LIMB(0x2020a2c5) }, + /* 210 */ { 4, CNST_LIMB(0x212f73a0), CNST_LIMB(0xf6db196a), CNST_LIMB(0x73eb7210), CNST_LIMB(0x1aadaccb) }, + /* 211 */ { 4, CNST_LIMB(0x2127e920), CNST_LIMB(0xf7133e9b), CNST_LIMB(0x7624be11), CNST_LIMB(0x155b891f) }, + /* 212 */ { 4, CNST_LIMB(0x21206b26), CNST_LIMB(0xf74b1fd6), CNST_LIMB(0x78663100), CNST_LIMB(0x10294ba2) }, + /* 213 */ { 4, CNST_LIMB(0x2118f98f), CNST_LIMB(0xf782bdbf), CNST_LIMB(0x7aafdeb1), CNST_LIMB(0xb160fe9) }, + /* 214 */ { 4, CNST_LIMB(0x21119436), CNST_LIMB(0xf7ba18f9), CNST_LIMB(0x7d01db10), CNST_LIMB(0x620f8f6) }, + /* 215 */ { 4, CNST_LIMB(0x210a3af8), CNST_LIMB(0xf7f13221), CNST_LIMB(0x7f5c3a21), CNST_LIMB(0x14930ef) }, + /* 216 */ { 4, CNST_LIMB(0x2102edb3), CNST_LIMB(0xf82809d5), CNST_LIMB(0x81bf1000), CNST_LIMB(0xf91bd1b6) }, + /* 217 */ { 4, CNST_LIMB(0x20fbac44), CNST_LIMB(0xf85ea0b0), CNST_LIMB(0x842a70e1), CNST_LIMB(0xefdcb0c7) }, + /* 218 */ { 4, CNST_LIMB(0x20f4768a), CNST_LIMB(0xf894f74b), CNST_LIMB(0x869e7110), CNST_LIMB(0xe6d37b2a) }, + /* 219 */ { 4, CNST_LIMB(0x20ed4c62), CNST_LIMB(0xf8cb0e3b), CNST_LIMB(0x891b24f1), CNST_LIMB(0xddfeb94a) }, + /* 220 */ { 4, CNST_LIMB(0x20e62dae), CNST_LIMB(0xf900e615), CNST_LIMB(0x8ba0a100), CNST_LIMB(0xd55cff6e) }, + /* 221 */ { 4, CNST_LIMB(0x20df1a4b), CNST_LIMB(0xf9367f6d), CNST_LIMB(0x8e2ef9d1), CNST_LIMB(0xcceced50) }, + /* 222 */ { 4, CNST_LIMB(0x20d8121c), CNST_LIMB(0xf96bdad2), CNST_LIMB(0x90c64410), CNST_LIMB(0xc4ad2db2) }, + /* 223 */ { 4, CNST_LIMB(0x20d11500), CNST_LIMB(0xf9a0f8d3), CNST_LIMB(0x93669481), CNST_LIMB(0xbc9c75f9) }, + /* 224 */ { 4, CNST_LIMB(0x20ca22d9), CNST_LIMB(0xf9d5d9fd), CNST_LIMB(0x96100000), CNST_LIMB(0xb4b985cf) }, + /* 225 */ { 4, CNST_LIMB(0x20c33b88), CNST_LIMB(0xfa0a7eda), CNST_LIMB(0x98c29b81), CNST_LIMB(0xad0326c2) }, + /* 226 */ { 4, CNST_LIMB(0x20bc5ef1), 
CNST_LIMB(0xfa3ee7f3), CNST_LIMB(0x9b7e7c10), CNST_LIMB(0xa5782bef) }, + /* 227 */ { 4, CNST_LIMB(0x20b58cf5), CNST_LIMB(0xfa7315d0), CNST_LIMB(0x9e43b6d1), CNST_LIMB(0x9e1771a9) }, + /* 228 */ { 4, CNST_LIMB(0x20aec579), CNST_LIMB(0xfaa708f5), CNST_LIMB(0xa1126100), CNST_LIMB(0x96dfdd2a) }, + /* 229 */ { 4, CNST_LIMB(0x20a8085e), CNST_LIMB(0xfadac1e7), CNST_LIMB(0xa3ea8ff1), CNST_LIMB(0x8fd05c41) }, + /* 230 */ { 4, CNST_LIMB(0x20a1558b), CNST_LIMB(0xfb0e4126), CNST_LIMB(0xa6cc5910), CNST_LIMB(0x88e7e509) }, + /* 231 */ { 4, CNST_LIMB(0x209aace2), CNST_LIMB(0xfb418734), CNST_LIMB(0xa9b7d1e1), CNST_LIMB(0x8225759d) }, + /* 232 */ { 4, CNST_LIMB(0x20940e49), CNST_LIMB(0xfb74948f), CNST_LIMB(0xacad1000), CNST_LIMB(0x7b8813d3) }, + /* 233 */ { 4, CNST_LIMB(0x208d79a5), CNST_LIMB(0xfba769b3), CNST_LIMB(0xafac2921), CNST_LIMB(0x750eccf9) }, + /* 234 */ { 4, CNST_LIMB(0x2086eedb), CNST_LIMB(0xfbda071c), CNST_LIMB(0xb2b53310), CNST_LIMB(0x6eb8b595) }, + /* 235 */ { 4, CNST_LIMB(0x20806dd2), CNST_LIMB(0xfc0c6d44), CNST_LIMB(0xb5c843b1), CNST_LIMB(0x6884e923) }, + /* 236 */ { 4, CNST_LIMB(0x2079f671), CNST_LIMB(0xfc3e9ca2), CNST_LIMB(0xb8e57100), CNST_LIMB(0x627289db) }, + /* 237 */ { 4, CNST_LIMB(0x2073889d), CNST_LIMB(0xfc7095ae), CNST_LIMB(0xbc0cd111), CNST_LIMB(0x5c80c07b) }, + /* 238 */ { 4, CNST_LIMB(0x206d243e), CNST_LIMB(0xfca258dc), CNST_LIMB(0xbf3e7a10), CNST_LIMB(0x56aebc07) }, + /* 239 */ { 4, CNST_LIMB(0x2066c93c), CNST_LIMB(0xfcd3e6a0), CNST_LIMB(0xc27a8241), CNST_LIMB(0x50fbb19b) }, + /* 240 */ { 4, CNST_LIMB(0x2060777e), CNST_LIMB(0xfd053f6d), CNST_LIMB(0xc5c10000), CNST_LIMB(0x4b66dc33) }, + /* 241 */ { 4, CNST_LIMB(0x205a2eed), CNST_LIMB(0xfd3663b2), CNST_LIMB(0xc91209c1), CNST_LIMB(0x45ef7c7c) }, + /* 242 */ { 4, CNST_LIMB(0x2053ef71), CNST_LIMB(0xfd6753e0), CNST_LIMB(0xcc6db610), CNST_LIMB(0x4094d8a3) }, + /* 243 */ { 4, CNST_LIMB(0x204db8f3), CNST_LIMB(0xfd981064), CNST_LIMB(0xcfd41b91), CNST_LIMB(0x3b563c24) }, + /* 244 */ { 4, CNST_LIMB(0x20478b5c), CNST_LIMB(0xfdc899ab), CNST_LIMB(0xd3455100), CNST_LIMB(0x3632f7a5) }, + /* 245 */ { 4, CNST_LIMB(0x20416696), CNST_LIMB(0xfdf8f020), CNST_LIMB(0xd6c16d31), CNST_LIMB(0x312a60c3) }, + /* 246 */ { 4, CNST_LIMB(0x203b4a8b), CNST_LIMB(0xfe29142e), CNST_LIMB(0xda488710), CNST_LIMB(0x2c3bd1f0) }, + /* 247 */ { 4, CNST_LIMB(0x20353725), CNST_LIMB(0xfe59063c), CNST_LIMB(0xdddab5a1), CNST_LIMB(0x2766aa45) }, + /* 248 */ { 4, CNST_LIMB(0x202f2c4e), CNST_LIMB(0xfe88c6b3), CNST_LIMB(0xe1781000), CNST_LIMB(0x22aa4d5f) }, + /* 249 */ { 4, CNST_LIMB(0x202929f0), CNST_LIMB(0xfeb855f8), CNST_LIMB(0xe520ad61), CNST_LIMB(0x1e06233c) }, + /* 250 */ { 4, CNST_LIMB(0x20232ff8), CNST_LIMB(0xfee7b471), CNST_LIMB(0xe8d4a510), CNST_LIMB(0x19799812) }, + /* 251 */ { 4, CNST_LIMB(0x201d3e50), CNST_LIMB(0xff16e281), CNST_LIMB(0xec940e71), CNST_LIMB(0x15041c33) }, + /* 252 */ { 4, CNST_LIMB(0x201754e5), CNST_LIMB(0xff45e08b), CNST_LIMB(0xf05f0100), CNST_LIMB(0x10a523e5) }, + /* 253 */ { 4, CNST_LIMB(0x201173a1), CNST_LIMB(0xff74aef0), CNST_LIMB(0xf4359451), CNST_LIMB(0xc5c2749) }, + /* 254 */ { 4, CNST_LIMB(0x200b9a71), CNST_LIMB(0xffa34e11), CNST_LIMB(0xf817e010), CNST_LIMB(0x828a237) }, + /* 255 */ { 4, CNST_LIMB(0x2005c942), CNST_LIMB(0xffd1be4c), CNST_LIMB(0xfc05fc01), CNST_LIMB(0x40a1423) }, + /* 256 */ { 4, CNST_LIMB(0x1fffffff), CNST_LIMB(0xffffffff), CNST_LIMB(0x8), CNST_LIMB(0x0) }, +}; diff --git a/gmp-6.3.0/mpn/mu_bdiv_q.c b/gmp-6.3.0/mpn/mu_bdiv_q.c new file mode 120000 index 0000000..9b04f38 --- /dev/null +++ b/gmp-6.3.0/mpn/mu_bdiv_q.c @@ -0,0 
+1 @@ +../mpn/generic/mu_bdiv_q.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mu_bdiv_qr.c b/gmp-6.3.0/mpn/mu_bdiv_qr.c new file mode 120000 index 0000000..e6f2ed3 --- /dev/null +++ b/gmp-6.3.0/mpn/mu_bdiv_qr.c @@ -0,0 +1 @@ +../mpn/generic/mu_bdiv_qr.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mu_div_q.c b/gmp-6.3.0/mpn/mu_div_q.c new file mode 120000 index 0000000..470e705 --- /dev/null +++ b/gmp-6.3.0/mpn/mu_div_q.c @@ -0,0 +1 @@ +../mpn/generic/mu_div_q.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mu_div_qr.c b/gmp-6.3.0/mpn/mu_div_qr.c new file mode 120000 index 0000000..4566196 --- /dev/null +++ b/gmp-6.3.0/mpn/mu_div_qr.c @@ -0,0 +1 @@ +../mpn/generic/mu_div_qr.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mu_divappr_q.c b/gmp-6.3.0/mpn/mu_divappr_q.c new file mode 120000 index 0000000..a06e1cd --- /dev/null +++ b/gmp-6.3.0/mpn/mu_divappr_q.c @@ -0,0 +1 @@ +../mpn/generic/mu_divappr_q.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mul.c b/gmp-6.3.0/mpn/mul.c new file mode 120000 index 0000000..f7ede4f --- /dev/null +++ b/gmp-6.3.0/mpn/mul.c @@ -0,0 +1 @@ +../mpn/generic/mul.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mul_1.asm b/gmp-6.3.0/mpn/mul_1.asm new file mode 120000 index 0000000..6062cf8 --- /dev/null +++ b/gmp-6.3.0/mpn/mul_1.asm @@ -0,0 +1 @@ +../mpn/x86/p6/sse2/mul_1.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mul_basecase.asm b/gmp-6.3.0/mpn/mul_basecase.asm new file mode 120000 index 0000000..ecc410e --- /dev/null +++ b/gmp-6.3.0/mpn/mul_basecase.asm @@ -0,0 +1 @@ +../mpn/x86/p6/sse2/mul_basecase.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mul_fft.c b/gmp-6.3.0/mpn/mul_fft.c new file mode 120000 index 0000000..b064cd2 --- /dev/null +++ b/gmp-6.3.0/mpn/mul_fft.c @@ -0,0 +1 @@ +../mpn/generic/mul_fft.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mul_n.c b/gmp-6.3.0/mpn/mul_n.c new file mode 120000 index 0000000..de41b2e --- /dev/null +++ b/gmp-6.3.0/mpn/mul_n.c @@ -0,0 +1 @@ +../mpn/generic/mul_n.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mullo_basecase.c b/gmp-6.3.0/mpn/mullo_basecase.c new file mode 120000 index 0000000..948d1f3 --- /dev/null +++ b/gmp-6.3.0/mpn/mullo_basecase.c @@ -0,0 +1 @@ +../mpn/generic/mullo_basecase.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mullo_n.c b/gmp-6.3.0/mpn/mullo_n.c new file mode 120000 index 0000000..010baa8 --- /dev/null +++ b/gmp-6.3.0/mpn/mullo_n.c @@ -0,0 +1 @@ +../mpn/generic/mullo_n.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mulmid.c b/gmp-6.3.0/mpn/mulmid.c new file mode 120000 index 0000000..d90ce25 --- /dev/null +++ b/gmp-6.3.0/mpn/mulmid.c @@ -0,0 +1 @@ +../mpn/generic/mulmid.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mulmid_basecase.c b/gmp-6.3.0/mpn/mulmid_basecase.c new file mode 120000 index 0000000..eb3bc95 --- /dev/null +++ b/gmp-6.3.0/mpn/mulmid_basecase.c @@ -0,0 +1 @@ +../mpn/generic/mulmid_basecase.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mulmid_n.c b/gmp-6.3.0/mpn/mulmid_n.c new file mode 120000 index 0000000..5465264 --- /dev/null +++ b/gmp-6.3.0/mpn/mulmid_n.c @@ -0,0 +1 @@ +../mpn/generic/mulmid_n.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mulmod_bknp1.c b/gmp-6.3.0/mpn/mulmod_bknp1.c new file mode 120000 index 0000000..07e3a74 --- /dev/null +++ b/gmp-6.3.0/mpn/mulmod_bknp1.c @@ -0,0 +1 @@ +../mpn/generic/mulmod_bknp1.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/mulmod_bnm1.c b/gmp-6.3.0/mpn/mulmod_bnm1.c 
new file mode 120000 index 0000000..397e0ef --- /dev/null +++ b/gmp-6.3.0/mpn/mulmod_bnm1.c @@ -0,0 +1 @@ +../mpn/generic/mulmod_bnm1.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/nand_n.c b/gmp-6.3.0/mpn/nand_n.c new file mode 120000 index 0000000..0a553d9 --- /dev/null +++ b/gmp-6.3.0/mpn/nand_n.c @@ -0,0 +1 @@ +../mpn/generic/logops_n.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/neg.c b/gmp-6.3.0/mpn/neg.c new file mode 120000 index 0000000..8774bf5 --- /dev/null +++ b/gmp-6.3.0/mpn/neg.c @@ -0,0 +1 @@ +../mpn/generic/neg.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/nior_n.c b/gmp-6.3.0/mpn/nior_n.c new file mode 120000 index 0000000..0a553d9 --- /dev/null +++ b/gmp-6.3.0/mpn/nior_n.c @@ -0,0 +1 @@ +../mpn/generic/logops_n.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/nussbaumer_mul.c b/gmp-6.3.0/mpn/nussbaumer_mul.c new file mode 120000 index 0000000..519e30c --- /dev/null +++ b/gmp-6.3.0/mpn/nussbaumer_mul.c @@ -0,0 +1 @@ +../mpn/generic/nussbaumer_mul.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/pa32/README b/gmp-6.3.0/mpn/pa32/README new file mode 100644 index 0000000..4323390 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/README @@ -0,0 +1,162 @@ +Copyright 1996, 1999, 2001, 2002, 2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + + +This directory contains mpn functions for various HP PA-RISC chips. Code +that runs faster on the PA7100 and later implementations is in the pa7100 +directory. + +RELEVANT OPTIMIZATION ISSUES + + Load and Store timing + +On the PA7000, no memory instructions can issue in the two cycles after a +store. For the PA7100, this is reduced to one cycle. + +The PA7100 has a lockup-free cache, so it helps to schedule loads and the +dependent instruction really far from each other. + +STATUS + +1. mpn_mul_1 could be improved to 6.5 cycles/limb on the PA7100, using the + instructions below (but some sw pipelining is needed to avoid the + xmpyu-fstds delay): + + fldds s1_ptr + + xmpyu + fstds N(%r30) + xmpyu + fstds N(%r30) + + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + + addc + stws res_ptr + addc + stws res_ptr + + addib Loop + +2. mpn_addmul_1 could be improved from the current 10 to 7.5 cycles/limb + (asymptotically) on the PA7100, using the instructions below. With proper + sw pipelining and the unrolling level below, the speed becomes 8 + cycles/limb.
+ + fldds s1_ptr + fldds s1_ptr + + xmpyu + fstds N(%r30) + xmpyu + fstds N(%r30) + xmpyu + fstds N(%r30) + xmpyu + fstds N(%r30) + + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + addc + addc + addc + addc + addc %r0,%r0,cy-limb + + ldws res_ptr + ldws res_ptr + ldws res_ptr + ldws res_ptr + add + stws res_ptr + addc + stws res_ptr + addc + stws res_ptr + addc + stws res_ptr + + addib + +3. For the PA8000 we have to stick to using 32-bit limbs until compiler + support emerges. But we want to use 64-bit operations whenever possible, + in particular for loads and stores. It is possible to handle mpn_add_n + efficiently by rotating (when s1/s2 are aligned), or by masking and + bit-field inserting (when they are not). The speed should double compared + to the code used today. + + + + +LABEL SYNTAX + +The HP-UX assembler takes labels starting in column 0 with no colon, + + L$loop ldws,mb -4(0,%r25),%r22 + +Gas on hppa GNU/Linux however requires a colon, + + L$loop: ldws,mb -4(0,%r25),%r22 + +This is covered by using LDEF() from asm-defs.m4. An alternative would be +to use ".label" which is accepted by both, + + .label L$loop + ldws,mb -4(0,%r25),%r22 + +but that's not as nice to look at, at least not if you're used to assembler +code having labels in column 0. + + + + +REFERENCES + +Hewlett Packard, "HP Assembler Reference Manual", 9th edition, June 1998, +part number 92432-90012. + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/gmp-6.3.0/mpn/pa32/add_n.asm b/gmp-6.3.0/mpn/pa32/add_n.asm new file mode 100644 index 0000000..46f3937 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/add_n.asm @@ -0,0 +1,63 @@ +dnl HP-PA mpn_add_n -- Add two limb vectors of the same length > 0 and store +dnl sum in a third limb vector. + +dnl Copyright 1992, 1994, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr gr26 +C s1_ptr gr25 +C s2_ptr gr24 +C size gr23 + +C One might want to unroll this as for other processors, but it turns out that +C the data cache contention after a store makes such unrolling useless. We +C can't come under 5 cycles/limb anyway.
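+ +C For readability, the loop below computes the same thing as this C sketch +C (an editor's note, not from the GMP sources; cy is the carry returned in +C %r28): +C +C mp_limb_t cy = 0; +C for (i = 0; i < size; i++) +C { +C mp_limb_t s = s1_ptr[i] + s2_ptr[i]; +C mp_limb_t r = s + cy; +C cy = (s < s1_ptr[i]) | (r < s); +C res_ptr[i] = r; +C } +C return cy;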
+ +ASM_START() +PROLOGUE(mpn_add_n) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,= -1,%r23,L(end) C check for (SIZE == 1) + add %r20,%r19,%r28 C add first limbs ignoring cy + +LDEF(loop) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,<> -1,%r23,L(loop) + addc %r20,%r19,%r28 + +LDEF(end) + stws %r28,0(0,%r26) + bv 0(%r2) + addc %r0,%r0,%r28 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/gmp-mparam.h b/gmp-6.3.0/mpn/pa32/gmp-mparam.h new file mode 100644 index 0000000..377efcb --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/gmp-mparam.h @@ -0,0 +1,61 @@ +/* HP-PA 1.0 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* These values are for the PA7100 using GCC. */ +/* Generated by tuneup.c, 2000-10-27. */ + +#ifndef MUL_TOOM22_THRESHOLD +#define MUL_TOOM22_THRESHOLD 30 +#endif +#ifndef MUL_TOOM33_THRESHOLD +#define MUL_TOOM33_THRESHOLD 141 +#endif + +#ifndef SQR_TOOM2_THRESHOLD +#define SQR_TOOM2_THRESHOLD 59 +#endif +#ifndef SQR_TOOM3_THRESHOLD +#define SQR_TOOM3_THRESHOLD 177 +#endif + +#ifndef DIV_DC_THRESHOLD +#define DIV_DC_THRESHOLD 108 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 18 +#endif + +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 33 +#endif diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/addmul_1.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/addmul_1.asm new file mode 100644 index 0000000..ec2f219 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/addmul_1.asm @@ -0,0 +1,106 @@ +dnl HP-PA 1.1 mpn_addmul_1 -- Multiply a limb vector with a limb and add the +dnl result to a second limb vector. + +dnl Copyright 1992-1994, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r26 +C s1_ptr r25 +C size r24 +C s2_limb r23 + +C This runs at 11 cycles/limb on a PA7000. With the instructions used, it +C cannot become faster, due to data cache contention after a store. On the +C PA7100 it runs at 10 cycles/limb. + +C There are some ideas described in mul_1.asm that apply to this code too. + +ASM_START() +PROLOGUE(mpn_addmul_1) +C .callinfo frame=64,no_calls + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) C move s2_limb ... + addib,= -1,%r24,L(just_one_limb) + fldws -16(%r30),%fr4 C ... into fr4 + add %r0,%r0,%r0 C clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 C least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L(end) + ldw -12(%r30),%r1 + +C Main loop +LDEF(loop) + ldws 0(%r26),%r29 + fldws,ma 4(%r25),%fr5 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addc %r0,%r28,%r28 + addib,<> -1,%r24,L(loop) + ldw -12(%r30),%r1 + +LDEF(end) + ldw 0(%r26),%r29 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + ldws 0(%r26),%r29 + addc %r0,%r28,%r28 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + +LDEF(just_one_limb) + xmpyu %fr4,%fr5,%fr6 + ldw 0(%r26),%r29 + fstds %fr6,-16(%r30) + ldw -12(%r30),%r1 + ldw -16(%r30),%r28 + add %r29,%r1,%r19 + stw %r19,0(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/gmp-mparam.h b/gmp-6.3.0/mpn/pa32/hppa1_1/gmp-mparam.h new file mode 100644 index 0000000..1261b24 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/gmp-mparam.h @@ -0,0 +1,72 @@ +/* HP-PA 1.1 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/.
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* Generated by tuneup.c, 2004-02-07, gcc 2.8 (pa7100/100MHz) */ + +#define MUL_TOOM22_THRESHOLD 30 +#define MUL_TOOM33_THRESHOLD 89 + +#define SQR_BASECASE_THRESHOLD 4 +#define SQR_TOOM2_THRESHOLD 55 +#define SQR_TOOM3_THRESHOLD 101 + +#define DIV_SB_PREINV_THRESHOLD 0 /* always */ +#define DIV_DC_THRESHOLD 84 +#define POWM_THRESHOLD 166 + +#define HGCD_THRESHOLD 231 +#define GCD_ACCEL_THRESHOLD 3 +#define GCD_DC_THRESHOLD 823 +#define JACOBI_BASE_METHOD 2 + +#define DIVREM_1_NORM_THRESHOLD 5 +#define DIVREM_1_UNNORM_THRESHOLD 11 +#define MOD_1_NORM_THRESHOLD 5 +#define MOD_1_UNNORM_THRESHOLD 10 +#define USE_PREINV_DIVREM_1 1 +#define USE_PREINV_MOD_1 1 +#define DIVREM_2_THRESHOLD 0 /* always */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define MODEXACT_1_ODD_THRESHOLD 0 /* always */ + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 23 +#define SET_STR_THRESHOLD 6589 + +#define MUL_FFT_TABLE { 464, 928, 1920, 4608, 14336, 40960, 0 } +#define MUL_FFT_MODF_THRESHOLD 480 +#define MUL_FFT_THRESHOLD 3328 + +#define SQR_FFT_TABLE { 528, 1184, 2176, 5632, 14336, 40960, 0 } +#define SQR_FFT_MODF_THRESHOLD 520 +#define SQR_FFT_THRESHOLD 3328 diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/mul_1.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/mul_1.asm new file mode 100644 index 0000000..6e60c2f --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/mul_1.asm @@ -0,0 +1,102 @@ +dnl HP-PA 1.1 mpn_mul_1 -- Multiply a limb vector with a limb and store the +dnl result in a second limb vector. + +dnl Copyright 1992-1994, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r26 +C s1_ptr r25 +C size r24 +C s2_limb r23 + +C This runs at 9 cycles/limb on a PA7000. With the instructions used, it +C cannot become faster, due to data cache contention after a store. On the +C PA7100 it runs at 7 cycles/limb. + +C We could use fldds to read two limbs at a time from the S1 array, and that +C could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and +C PA7100, respectively. We don't do that since it does not seem worth the +C (alignment) troubles... + +C At least the PA7100 is rumored to be able to deal with cache-misses without +C stalling instruction issue. If this is true, and the cache is actually also +C lockup-free, we should use a deeper software pipeline, and load from S1 very +C early! (The loads and stores to -12(sp) will surely be in the cache.)
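+ +C (Editor's note, added for readability.) The multiply itself is done on the +C FPU: s2_limb is staged through the stack into %fr4 (there is no direct +C integer-to-FP move on this chip), each xmpyu computes a full 32x32->64 +C product into a double FP register, fstds spills it to -16(%r30), and the +C integer unit then picks up the high half at -16(%r30) and the low half at +C -12(%r30) and propagates carries with addc.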
+ +ASM_START() +PROLOGUE(mpn_mul_1) +C .callinfo frame=64,no_calls + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) C move s2_limb ... + addib,= -1,%r24,L(just_one_limb) + fldws -16(%r30),%fr4 C ... into fr4 + add %r0,%r0,%r0 C clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 C least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L(end) + ldw -12(%r30),%r1 + +C Main loop +LDEF(loop) + fldws,ma 4(%r25),%fr5 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addib,<> -1,%r24,L(loop) + ldw -12(%r30),%r1 + +LDEF(end) + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + stws,ma %r19,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + +LDEF(just_one_limb) + xmpyu %fr4,%fr5,%fr6 + fstds %fr6,-16(%r30) + ldw -16(%r30),%r28 + ldo -64(%r30),%r30 + bv 0(%r2) + fstws %fr6R,0(%r26) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/add_n.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/add_n.asm new file mode 100644 index 0000000..b96d403 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/add_n.asm @@ -0,0 +1,83 @@ +dnl HP-PA mpn_add_n -- Add two limb vectors of the same length > 0 and store +dnl sum in a third limb vector. Optimized for the PA7100, where it runs at +dnl 4.25 cycles/limb. + +dnl Copyright 1992, 1994, 2000-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/.
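+ +dnl (Editor's note, added for readability.) The main loop below is unrolled +dnl four ways: each iteration issues four load/load/store/addc groups with +dnl auto-incrementing ldws,ma and stws,ma, keeping the carry alive in the +dnl status register across the whole vector; any remaining 1-3 limbs are +dnl handled one at a time in the eloop tail.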
+ +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r26 +C s1_ptr r25 +C s2_ptr r24 +C size r23 + +ASM_START() +PROLOGUE(mpn_add_n) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,<= -5,%r23,L(rest) + add %r20,%r19,%r28 C add first limbs ignoring cy + +LDEF(loop) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addc %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addc %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addc %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -4,%r23,L(loop) + addc %r20,%r19,%r28 + +LDEF(rest) + addib,= 4,%r23,L(end) + nop + +LDEF(eloop) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -1,%r23,L(eloop) + addc %r20,%r19,%r28 + +LDEF(end) + stws %r28,0(0,%r26) + bv 0(%r2) + addc %r0,%r0,%r28 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/addmul_1.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/addmul_1.asm new file mode 100644 index 0000000..fb16100 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/addmul_1.asm @@ -0,0 +1,201 @@ +dnl HP-PA 7100/7200 mpn_addmul_1 -- Multiply a limb vector with a limb and +dnl add the result to a second limb vector. + +dnl Copyright 1995, 2000-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
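+ +dnl (Editor's note, added for readability.) This routine names its registers +dnl with m4 define()s and runs a four-way software pipeline: two fldds bring +dnl in four limbs at a time, the four xmpyu multiplies of the L/R halves +dnl overlap with the stores, reloads and addc chains of the previous group, +dnl products are spilled through a scratch area addressed via %r31, and the +dnl callee-saved registers %r3-%r7 are preserved on the stack.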
+ +include(`../config.m4') + +C INPUT PARAMETERS +define(`res_ptr',`%r26') +define(`s1_ptr',`%r25') +define(`size_param',`%r24') +define(`s2_limb',`%r23') + +define(`cylimb',`%r28') +define(`s0',`%r19') +define(`s1',`%r20') +define(`s2',`%r3') +define(`s3',`%r4') +define(`lo0',`%r21') +define(`lo1',`%r5') +define(`lo2',`%r6') +define(`lo3',`%r7') +define(`hi0',`%r22') +define(`hi1',`%r23') C safe to reuse +define(`hi2',`%r29') +define(`hi3',`%r1') + +ASM_START() +PROLOGUE(mpn_addmul_1) +C .callinfo frame=128,no_calls + + ldo 128(%r30),%r30 + stws s2_limb,-16(%r30) + add %r0,%r0,cylimb C clear cy and cylimb + addib,< -4,size_param,L(few_limbs) + fldws -16(%r30),%fr31R + + ldo -112(%r30),%r31 + stw %r3,-96(%r30) + stw %r4,-92(%r30) + stw %r5,-88(%r30) + stw %r6,-84(%r30) + stw %r7,-80(%r30) + + bb,>=,n s1_ptr,29,L(0) + + fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r31) + ldws -16(%r31),cylimb + ldws -12(%r31),lo0 + add s0,lo0,s0 + addib,< -1,size_param,L(few_limbs) + stws,ma s0,4(res_ptr) + +C start software pipeline ---------------------------------------------------- +LDEF(0) + fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + xmpyu %fr4L,%fr31R,%fr5 + xmpyu %fr4R,%fr31R,%fr6 + xmpyu %fr8L,%fr31R,%fr9 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + fstds %fr6,-8(%r31) + fstds %fr9,0(%r31) + fstds %fr10,8(%r31) + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + addc lo1,hi0,lo1 + addc lo2,hi1,lo2 + addc lo3,hi2,lo3 + + addib,< -4,size_param,L(end) + addc %r0,hi3,cylimb C propagate carry into cylimb +C main loop ------------------------------------------------------------------ +LDEF(loop) + fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + ldws 0(res_ptr),s0 + xmpyu %fr4L,%fr31R,%fr5 + ldws 4(res_ptr),s1 + xmpyu %fr4R,%fr31R,%fr6 + ldws 8(res_ptr),s2 + xmpyu %fr8L,%fr31R,%fr9 + ldws 12(res_ptr),s3 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + add s0,lo0,s0 + fstds %fr6,-8(%r31) + addc s1,lo1,s1 + fstds %fr9,0(%r31) + addc s2,lo2,s2 + fstds %fr10,8(%r31) + addc s3,lo3,s3 + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + stws,ma s0,4(res_ptr) + addc lo1,hi0,lo1 + stws,ma s1,4(res_ptr) + addc lo2,hi1,lo2 + stws,ma s2,4(res_ptr) + addc lo3,hi2,lo3 + stws,ma s3,4(res_ptr) + + addib,>= -4,size_param,L(loop) + addc %r0,hi3,cylimb C propagate carry into cylimb +C finish software pipeline --------------------------------------------------- +LDEF(end) + ldws 0(res_ptr),s0 + ldws 4(res_ptr),s1 + ldws 8(res_ptr),s2 + ldws 12(res_ptr),s3 + + add s0,lo0,s0 + stws,ma s0,4(res_ptr) + addc s1,lo1,s1 + stws,ma s1,4(res_ptr) + addc s2,lo2,s2 + stws,ma s2,4(res_ptr) + addc s3,lo3,s3 + stws,ma s3,4(res_ptr) + +C restore callee-saves registers --------------------------------------------- + ldw -96(%r30),%r3 + ldw -92(%r30),%r4 + ldw -88(%r30),%r5 + ldw -84(%r30),%r6 + ldw -80(%r30),%r7 + +LDEF(few_limbs) + addib,=,n 4,size_param,L(ret) + +LDEF(loop2) + fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r30) + ldws -16(%r30),hi0 + ldws -12(%r30),lo0 + addc lo0,cylimb,lo0 + addc %r0,hi0,cylimb + add s0,lo0,s0 + stws,ma s0,4(res_ptr) + addib,<> -1,size_param,L(loop2) + nop + +LDEF(ret) + addc %r0,cylimb,cylimb + bv 0(%r2) + ldo -128(%r30),%r30 
+EPILOGUE(mpn_addmul_1)
diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/lshift.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/lshift.asm
new file mode 100644
index 0000000..d65db2a
--- /dev/null
+++ b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/lshift.asm
@@ -0,0 +1,95 @@
+dnl HP-PA mpn_lshift -- Shift a number left.
+dnl Optimized for the PA7100, where it runs at 3.25 cycles/limb.
+
+dnl Copyright 1992, 1994, 2000-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr r26
+C s_ptr r25
+C size r24
+C cnt r23
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ sh2add %r24,%r25,%r25
+ sh2add %r24,%r26,%r26
+ ldws,mb -4(0,%r25),%r22
+ subi 32,%r23,%r1
+ mtsar %r1
+ addib,= -1,%r24,L(0004)
+ vshd %r0,%r22,%r28 C compute carry out limb
+ ldws,mb -4(0,%r25),%r29
+ addib,<= -5,%r24,L(rest)
+ vshd %r22,%r29,%r20
+
+LDEF(loop)
+ ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ addib,> -4,%r24,L(loop)
+ vshd %r22,%r29,%r20
+
+LDEF(rest)
+ addib,= 4,%r24,L(end1)
+ nop
+
+LDEF(eloop)
+ ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ addib,<= -1,%r24,L(end2)
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ addib,> -1,%r24,L(eloop)
+ vshd %r22,%r29,%r20
+
+LDEF(end1)
+ stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+
+LDEF(end2)
+ stws,mb %r20,-4(0,%r26)
+
+LDEF(0004)
+ vshd %r22,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/rshift.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/rshift.asm
new file mode 100644
index 0000000..f7896fc
--- /dev/null
+++ b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/rshift.asm
@@ -0,0 +1,92 @@
+dnl HP-PA mpn_rshift -- Shift a number right.
+dnl Optimized for the PA7100, where it runs at 3.25 cycles/limb.
+
+dnl Copyright 1992, 1994, 2000-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr r26
+C s_ptr r25
+C size r24
+C cnt r23
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ ldws,ma 4(0,%r25),%r22
+ mtsar %r23
+ addib,= -1,%r24,L(0004)
+ vshd %r22,%r0,%r28 C compute carry out limb
+ ldws,ma 4(0,%r25),%r29
+ addib,<= -5,%r24,L(rest)
+ vshd %r29,%r22,%r20
+
+LDEF(loop)
+ ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ addib,> -4,%r24,L(loop)
+ vshd %r29,%r22,%r20
+
+LDEF(rest)
+ addib,= 4,%r24,L(end1)
+ nop
+
+LDEF(eloop)
+ ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ addib,<= -1,%r24,L(end2)
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ addib,> -1,%r24,L(eloop)
+ vshd %r29,%r22,%r20
+
+LDEF(end1)
+ stws,ma %r20,4(0,%r26)
+ vshd %r0,%r29,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+
+LDEF(end2)
+ stws,ma %r20,4(0,%r26)
+
+LDEF(0004)
+ vshd %r0,%r22,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/sub_n.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/sub_n.asm
new file mode 100644
index 0000000..df3f6e8
--- /dev/null
+++ b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/sub_n.asm
@@ -0,0 +1,84 @@
+dnl HP-PA mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl store difference in a third limb vector. Optimized for the PA7100, where
+dnl it runs at 4.25 cycles/limb.
+
+dnl Copyright 1992, 1994, 2000-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
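What mpn_sub_n computes, as a C sketch (illustrative only; the ref_ name is
made up). Note that the asm below ends with "subi 1,%r28,%r28" because
PA-RISC's carry after SUB/SUBB is the inverse of a borrow, while the function
returns the borrow proper:

  #include <stdint.h>

  uint32_t ref_sub_n (uint32_t *res, const uint32_t *s1,
                      const uint32_t *s2, long n)
  {
    uint32_t bw = 0;                     /* borrow in */
    for (long i = 0; i < n; i++)
      {
        uint64_t d = (uint64_t) s1[i] - s2[i] - bw;
        res[i] = (uint32_t) d;
        bw = (uint32_t) (d >> 32) & 1;   /* 1 when the subtract wrapped */
      }
    return bw;                           /* borrow out of the top limb */
  }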
+ +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r26 +C s1_ptr r25 +C s2_ptr r24 +C size r23 + +ASM_START() +PROLOGUE(mpn_sub_n) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,<= -5,%r23,L(rest) + sub %r20,%r19,%r28 C subtract first limbs ignoring cy + +LDEF(loop) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + subb %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + subb %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + subb %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -4,%r23,L(loop) + subb %r20,%r19,%r28 + +LDEF(rest) + addib,= 4,%r23,L(end) + nop + +LDEF(eloop) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -1,%r23,L(eloop) + subb %r20,%r19,%r28 + +LDEF(end) + stws %r28,0(0,%r26) + addc %r0,%r0,%r28 + bv 0(%r2) + subi 1,%r28,%r28 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/submul_1.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/submul_1.asm new file mode 100644 index 0000000..5ea08cb --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/submul_1.asm @@ -0,0 +1,207 @@ +dnl HP-PA 7100/7200 mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright 1995, 2000-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
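The mpn_submul_1 contract as a C sketch (illustrative only; the ref_ name is
made up): res -= s1 * s2_limb over n limbs, returning the most significant
limb borrowed out. The "invert cy" instruction pairs in the asm exist because
PA-RISC subtraction produces a carry that is the complement of this borrow:

  #include <stdint.h>

  uint32_t ref_submul_1 (uint32_t *res, const uint32_t *s1,
                         long n, uint32_t s2_limb)
  {
    uint32_t cy = 0;                     /* limbs borrowed so far */
    for (long i = 0; i < n; i++)
      {
        uint64_t prod = (uint64_t) s1[i] * s2_limb + cy;
        uint32_t lo = (uint32_t) prod;
        cy = (uint32_t) (prod >> 32) + (res[i] < lo);  /* hi + borrow */
        res[i] -= lo;
      }
    return cy;
  }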
+ +include(`../config.m4') + +C INPUT PARAMETERS +define(`res_ptr',`%r26') +define(`s1_ptr',`%r25') +define(`size_param',`%r24') +define(`s2_limb',`%r23') + +define(`cylimb',`%r28') +define(`s0',`%r19') +define(`s1',`%r20') +define(`s2',`%r3') +define(`s3',`%r4') +define(`lo0',`%r21') +define(`lo1',`%r5') +define(`lo2',`%r6') +define(`lo3',`%r7') +define(`hi0',`%r22') +define(`hi1',`%r23') C safe to reuse +define(`hi2',`%r29') +define(`hi3',`%r1') + +ASM_START() +PROLOGUE(mpn_submul_1) +C .callinfo frame=128,no_calls + + ldo 128(%r30),%r30 + stws s2_limb,-16(%r30) + add %r0,%r0,cylimb C clear cy and cylimb + addib,< -4,size_param,L(few_limbs) + fldws -16(%r30),%fr31R + + ldo -112(%r30),%r31 + stw %r3,-96(%r30) + stw %r4,-92(%r30) + stw %r5,-88(%r30) + stw %r6,-84(%r30) + stw %r7,-80(%r30) + + bb,>=,n s1_ptr,29,L(0) + + fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r31) + ldws -16(%r31),cylimb + ldws -12(%r31),lo0 + sub s0,lo0,s0 + add s0,lo0,%r0 C invert cy + addib,< -1,size_param,L(few_limbs) + stws,ma s0,4(res_ptr) + +C start software pipeline ---------------------------------------------------- +LDEF(0) + fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + xmpyu %fr4L,%fr31R,%fr5 + xmpyu %fr4R,%fr31R,%fr6 + xmpyu %fr8L,%fr31R,%fr9 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + fstds %fr6,-8(%r31) + fstds %fr9,0(%r31) + fstds %fr10,8(%r31) + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + addc lo1,hi0,lo1 + addc lo2,hi1,lo2 + addc lo3,hi2,lo3 + + addib,< -4,size_param,L(end) + addc %r0,hi3,cylimb C propagate carry into cylimb +C main loop ------------------------------------------------------------------ +LDEF(loop) + fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + ldws 0(res_ptr),s0 + xmpyu %fr4L,%fr31R,%fr5 + ldws 4(res_ptr),s1 + xmpyu %fr4R,%fr31R,%fr6 + ldws 8(res_ptr),s2 + xmpyu %fr8L,%fr31R,%fr9 + ldws 12(res_ptr),s3 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + sub s0,lo0,s0 + fstds %fr6,-8(%r31) + subb s1,lo1,s1 + fstds %fr9,0(%r31) + subb s2,lo2,s2 + fstds %fr10,8(%r31) + subb s3,lo3,s3 + subb %r0,%r0,lo0 C these two insns ... + add lo0,lo0,%r0 C ... just invert cy + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + stws,ma s0,4(res_ptr) + addc lo1,hi0,lo1 + stws,ma s1,4(res_ptr) + addc lo2,hi1,lo2 + stws,ma s2,4(res_ptr) + addc lo3,hi2,lo3 + stws,ma s3,4(res_ptr) + + addib,>= -4,size_param,L(loop) + addc %r0,hi3,cylimb C propagate carry into cylimb +C finish software pipeline --------------------------------------------------- +LDEF(end) + ldws 0(res_ptr),s0 + ldws 4(res_ptr),s1 + ldws 8(res_ptr),s2 + ldws 12(res_ptr),s3 + + sub s0,lo0,s0 + stws,ma s0,4(res_ptr) + subb s1,lo1,s1 + stws,ma s1,4(res_ptr) + subb s2,lo2,s2 + stws,ma s2,4(res_ptr) + subb s3,lo3,s3 + stws,ma s3,4(res_ptr) + subb %r0,%r0,lo0 C these two insns ... + add lo0,lo0,%r0 C ... 
invert cy + +C restore callee-saves registers --------------------------------------------- + ldw -96(%r30),%r3 + ldw -92(%r30),%r4 + ldw -88(%r30),%r5 + ldw -84(%r30),%r6 + ldw -80(%r30),%r7 + +LDEF(few_limbs) + addib,=,n 4,size_param,L(ret) + +LDEF(loop2) + fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r30) + ldws -16(%r30),hi0 + ldws -12(%r30),lo0 + addc lo0,cylimb,lo0 + addc %r0,hi0,cylimb + sub s0,lo0,s0 + add s0,lo0,%r0 C invert cy + stws,ma s0,4(res_ptr) + addib,<> -1,size_param,L(loop2) + nop + +LDEF(ret) + addc %r0,cylimb,cylimb + bv 0(%r2) + ldo -128(%r30),%r30 +EPILOGUE(mpn_submul_1) diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/sqr_diagonal.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/sqr_diagonal.asm new file mode 100644 index 0000000..1c7a18e --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/sqr_diagonal.asm @@ -0,0 +1,60 @@ +dnl HP-PA 1.1 32-bit mpn_sqr_diagonal. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C This code runs at 6 cycles/limb on the PA7100 and 2.5 cycles/limb on PA8x00. +C 2-way unrolling wouldn't help the PA7100; it could however bring times down +C to 2.0 cycles/limb for the PA8x00. + +C INPUT PARAMETERS +define(`rp',`%r26') +define(`up',`%r25') +define(`n',`%r24') + +ASM_START() +PROLOGUE(mpn_sqr_diagonal) + ldo 4(rp),rp + fldws,ma 4(up),%fr4r + addib,= -1,n,L(exit) + xmpyu %fr4r,%fr4r,%fr5 + +LDEF(loop) + fldws,ma 4(up),%fr4r + fstws %fr5r,-4(rp) + fstws,ma %fr5l,8(rp) + addib,<> -1,n,L(loop) + xmpyu %fr4r,%fr4r,%fr5 + +LDEF(exit) + fstws %fr5r,-4(rp) + bv 0(%r2) + fstws %fr5l,0(rp) +EPILOGUE(mpn_sqr_diagonal) diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/submul_1.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/submul_1.asm new file mode 100644 index 0000000..a9b11d2 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/submul_1.asm @@ -0,0 +1,115 @@ +dnl HP-PA 1.1 mpn_submul_1 -- Multiply a limb vector with a limb and subtract +dnl the result from a second limb vector. + +dnl Copyright 1992-1994, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr r26
+C s1_ptr r25
+C size r24
+C s2_limb r23
+
+C This runs at 12 cycles/limb on a PA7000. With the instructions used, it
+C cannot become faster due to data cache contention after a store. On the
+C PA7100 it runs at 11 cycles/limb.
+
+C There are some ideas described in mul_1.asm that apply to this code too.
+
+C It seems possible to make this run as fast as mpn_addmul_1, if we use
+C sub,>>= %r29,%r19,%r22
+C addi 1,%r28,%r28
+C but that requires reworking the hairy software pipeline...
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+C .callinfo frame=64,no_calls
+
+ ldo 64(%r30),%r30
+ fldws,ma 4(%r25),%fr5
+ stw %r23,-16(%r30) C move s2_limb ...
+ addib,= -1,%r24,L(just_one_limb)
+ fldws -16(%r30),%fr4 C ... into fr4
+ add %r0,%r0,%r0 C clear carry
+ xmpyu %fr4,%fr5,%fr6
+ fldws,ma 4(%r25),%fr7
+ fstds %fr6,-16(%r30)
+ xmpyu %fr4,%fr7,%fr8
+ ldw -12(%r30),%r19 C least significant limb in product
+ ldw -16(%r30),%r28
+
+ fstds %fr8,-16(%r30)
+ addib,= -1,%r24,L(end)
+ ldw -12(%r30),%r1
+
+C Main loop
+LDEF(loop)
+ ldws 0(%r26),%r29
+ fldws,ma 4(%r25),%fr5
+ sub %r29,%r19,%r22
+ add %r22,%r19,%r0
+ stws,ma %r22,4(%r26)
+ addc %r28,%r1,%r19
+ xmpyu %fr4,%fr5,%fr6
+ ldw -16(%r30),%r28
+ fstds %fr6,-16(%r30)
+ addc %r0,%r28,%r28
+ addib,<> -1,%r24,L(loop)
+ ldw -12(%r30),%r1
+
+LDEF(end)
+ ldw 0(%r26),%r29
+ sub %r29,%r19,%r22
+ add %r22,%r19,%r0
+ stws,ma %r22,4(%r26)
+ addc %r28,%r1,%r19
+ ldw -16(%r30),%r28
+ ldws 0(%r26),%r29
+ addc %r0,%r28,%r28
+ sub %r29,%r19,%r22
+ add %r22,%r19,%r0
+ stws,ma %r22,4(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+LDEF(just_one_limb)
+ xmpyu %fr4,%fr5,%fr6
+ ldw 0(%r26),%r29
+ fstds %fr6,-16(%r30)
+ ldw -12(%r30),%r1
+ ldw -16(%r30),%r28
+ sub %r29,%r1,%r22
+ add %r22,%r1,%r0
+ stw %r22,0(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/udiv.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/udiv.asm
new file mode 100644
index 0000000..626ecd2
--- /dev/null
+++ b/gmp-6.3.0/mpn/pa32/hppa1_1/udiv.asm
@@ -0,0 +1,102 @@
+dnl HP-PA __udiv_qrnnd division support, used from longlong.h.
+dnl This version runs fast on PA 7000 and later.
+
+dnl Copyright 1993, 1994, 2000, 2001, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr gr26 +C n1 gr25 +C n0 gr24 +C d gr23 + +C This file has caused a lot of trouble, since it demands PIC reference to +C static data, which triggers bugs in gas (at least version 2.7 through +C 2.11.2). When the bug is triggered, many bogus relocs are generated. The +C current solution is to stuff data right into the code, and refer it using +C absolute offsets. Fragile to be sure, but nothing else seems to work. + +ASM_START() +ifdef(`PIC',`', +` RODATA + INT64(0000, 0x43f00000, 0x0) C 2^64 +') + +PROLOGUE(mpn_udiv_qrnnd) +C .callinfo frame=64,no_calls + + ldo 64(%r30),%r30 + + stws %r25,-16(0,%r30) C n_hi + stws %r24,-12(0,%r30) C n_lo + +ifdef(`PIC', +` bl .+20,%r31 + dep %r0,31,2,%r31 + .word 0x0 C padding for alignment + .word 0x43f00000, 0x0 C 2^64 + ldo 4(%r31),%r31', +` ldil `L'%L(0000),%r31 + ldo R%L(0000)(%r31),%r31') + + fldds -16(0,%r30),%fr5 + stws %r23,-12(0,%r30) + comib,<= 0,%r25,L(1) + fcnvxf,dbl,dbl %fr5,%fr5 + fldds 0(0,%r31),%fr4 + fadd,dbl %fr4,%fr5,%fr5 + +LDEF(1) + fcpy,sgl %fr0,%fr6L + fldws -12(0,%r30),%fr6R + fcnvxf,dbl,dbl %fr6,%fr4 + + fdiv,dbl %fr5,%fr4,%fr5 + + fcnvfx,dbl,dbl %fr5,%fr4 + fstws %fr4R,-16(%r30) + xmpyu %fr4R,%fr6R,%fr6 + ldws -16(%r30),%r28 + fstds %fr6,-16(0,%r30) + ldws -12(0,%r30),%r21 + ldws -16(0,%r30),%r20 + sub %r24,%r21,%r22 + subb %r25,%r20,%r20 + comib,= 0,%r20,L(2) + ldo -64(%r30),%r30 + + add %r22,%r23,%r22 + ldo -1(%r28),%r28 + +LDEF(2) + bv 0(%r2) + stws %r22,0(0,%r26) + +EPILOGUE(mpn_udiv_qrnnd) diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/umul.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/umul.asm new file mode 100644 index 0000000..18b923c --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa1_1/umul.asm @@ -0,0 +1,47 @@ +dnl Copyright 1999, 2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
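The routine below, mpn_umul_ppmm, presumably backs the umul_ppmm macro in
longlong.h, like the udiv code above. Its contract as a C sketch
(illustrative only; the ref_ name is made up, and the 64-bit product is what
a single xmpyu delivers):

  #include <stdint.h>

  uint32_t ref_umul_ppmm (uint32_t *lo_ptr, uint32_t m1, uint32_t m2)
  {
    uint64_t p = (uint64_t) m1 * m2;
    *lo_ptr = (uint32_t) p;          /* low half through the pointer */
    return (uint32_t) (p >> 32);     /* high half as the return value */
  }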
+ +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_umul_ppmm) +C .callinfo frame=64,no_calls + + ldo 64(%r30),%r30 + stw %r25,-16(0,%r30) + fldws -16(0,%r30),%fr22R + stw %r24,-16(0,%r30) + fldws -16(0,%r30),%fr22L + xmpyu %fr22R,%fr22L,%fr22 + fstds %fr22,-16(0,%r30) + ldw -16(0,%r30),%r28 + ldw -12(0,%r30),%r29 + stw %r29,0(0,%r26) + bv 0(%r2) + ldo -64(%r30),%r30 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/hppa2_0/add_n.asm b/gmp-6.3.0/mpn/pa32/hppa2_0/add_n.asm new file mode 100644 index 0000000..8d881b8 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa2_0/add_n.asm @@ -0,0 +1,107 @@ +dnl HP-PA 2.0 32-bit mpn_add_n -- Add two limb vectors of the same length > 0 +dnl and store sum in a third limb vector. + +dnl Copyright 1997, 1998, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr gr26 +C s1_ptr gr25 +C s2_ptr gr24 +C size gr23 + +C This runs at 2 cycles/limb on PA8000. + +ASM_START() +PROLOGUE(mpn_add_n) + sub %r0,%r23,%r22 + zdep %r22,30,3,%r28 C r28 = 2 * (-n & 7) + zdep %r22,29,3,%r22 C r22 = 4 * (-n & 7) + sub %r25,%r22,%r25 C offset s1_ptr + sub %r24,%r22,%r24 C offset s2_ptr + sub %r26,%r22,%r26 C offset res_ptr + blr %r28,%r0 C branch into loop + add %r0,%r0,%r0 C reset carry + +LDEF(loop) + ldw 0(%r25),%r20 + ldw 0(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,0(%r26) + +LDEF(7) + ldw 4(%r25),%r21 + ldw 4(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,4(%r26) + +LDEF(6) + ldw 8(%r25),%r20 + ldw 8(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,8(%r26) + +LDEF(5) + ldw 12(%r25),%r21 + ldw 12(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,12(%r26) + +LDEF(4) + ldw 16(%r25),%r20 + ldw 16(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,16(%r26) + +LDEF(3) + ldw 20(%r25),%r21 + ldw 20(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,20(%r26) + +LDEF(2) + ldw 24(%r25),%r20 + ldw 24(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,24(%r26) + +LDEF(1) + ldw 28(%r25),%r21 + ldo 32(%r25),%r25 + ldw 28(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,28(%r26) + ldo 32(%r24),%r24 + addib,> -8,%r23,L(loop) + ldo 32(%r26),%r26 + + bv (%r2) + addc %r0,%r0,%r28 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/hppa2_0/gmp-mparam.h b/gmp-6.3.0/mpn/pa32/hppa2_0/gmp-mparam.h new file mode 100644 index 0000000..6016274 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa2_0/gmp-mparam.h @@ -0,0 +1,167 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2009, 2010 Free Software Foundation, +Inc. 
+ +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 552 MHz PA8600 (gcc61.fsffrance.org) */ + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 3 +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 11 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 28 +#define USE_PREINV_DIVREM_1 1 +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 36 + +#define MUL_TOOM22_THRESHOLD 18 +#define MUL_TOOM33_THRESHOLD 65 +#define MUL_TOOM44_THRESHOLD 166 +#define MUL_TOOM6H_THRESHOLD 202 +#define MUL_TOOM8H_THRESHOLD 333 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 105 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 102 + +#define SQR_BASECASE_THRESHOLD 7 +#define SQR_TOOM2_THRESHOLD 55 +#define SQR_TOOM3_THRESHOLD 93 +#define SQR_TOOM4_THRESHOLD 250 +#define SQR_TOOM6_THRESHOLD 306 +#define SQR_TOOM8_THRESHOLD 527 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 15 + +#define MUL_FFT_MODF_THRESHOLD 244 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 244, 5}, { 8, 4}, { 17, 5}, { 13, 6}, \ + { 7, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 13, 7}, { 7, 6}, { 17, 7}, { 9, 6}, \ + { 19, 7}, { 11, 6}, { 24, 7}, { 13, 8}, \ + { 7, 7}, { 19, 8}, { 11, 7}, { 25, 8}, \ + { 15, 7}, { 33, 8}, { 23, 9}, { 15, 8}, \ + { 39, 9}, { 23,10}, { 15, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47,10}, \ + { 31, 9}, { 71, 8}, { 143, 9}, { 79,10}, \ + { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 135, 8}, { 271, 9}, { 143,10}, \ + { 79, 9}, { 159, 8}, { 319, 9}, { 175, 8}, \ + { 351,10}, { 95, 9}, { 191, 8}, { 383, 9}, \ + { 207,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \ + { 575,10}, { 159, 9}, { 319,10}, { 175, 9}, \ + { 351,11}, { 95,10}, { 191, 9}, { 383,10}, \ + { 207, 9}, { 415,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \ + { 1087,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 351, 9}, { 703, 8}, { 1407,11}, \ + { 191,10}, { 415, 9}, { 831,11}, { 223, 9}, \ + { 895,10}, { 479,12}, { 127,11}, { 255,10}, \ + { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \ + { 1215,11}, { 351,10}, { 703, 9}, { 1407,12}, \ + { 191,11}, { 415,10}, { 831,11}, { 479,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 107 +#define MUL_FFT_THRESHOLD 2112 + +#define 
SQR_FFT_MODF_THRESHOLD 240 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 240, 5}, { 8, 4}, { 17, 5}, { 19, 6}, \ + { 17, 7}, { 9, 6}, { 20, 7}, { 11, 6}, \ + { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \ + { 11, 7}, { 25, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 23, 9}, { 15, 8}, \ + { 39, 9}, { 23,10}, { 15, 9}, { 31, 8}, \ + { 63, 9}, { 47,10}, { 31, 9}, { 63, 8}, \ + { 127, 9}, { 71, 8}, { 143, 9}, { 79,10}, \ + { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 7}, { 511, 9}, { 135, 8}, { 271, 9}, \ + { 143,10}, { 79, 9}, { 159, 8}, { 319, 9}, \ + { 175, 8}, { 351, 7}, { 703,10}, { 95, 9}, \ + { 191, 8}, { 383, 9}, { 207,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \ + { 143, 9}, { 287, 8}, { 575,10}, { 159, 9}, \ + { 319,10}, { 175, 9}, { 351, 8}, { 703,11}, \ + { 95,10}, { 191, 9}, { 383,10}, { 207, 9}, \ + { 415,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543, 8}, { 1087,10}, \ + { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 351, 9}, { 703, 8}, { 1407,11}, \ + { 191,10}, { 415, 9}, { 831,11}, { 223, 8}, \ + { 1791,10}, { 479, 9}, { 959,12}, { 127,11}, \ + { 255,10}, { 543,11}, { 287,10}, { 607,11}, \ + { 319,10}, { 639,11}, { 351,10}, { 703, 9}, \ + { 1407,12}, { 191,11}, { 415,10}, { 831,11}, \ + { 479,10}, { 959,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 109 +#define SQR_FFT_THRESHOLD 1600 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 116 +#define MULLO_MUL_N_THRESHOLD 3574 + +#define DC_DIV_QR_THRESHOLD 100 +#define DC_DIVAPPR_Q_THRESHOLD 348 +#define DC_BDIV_QR_THRESHOLD 109 +#define DC_BDIV_Q_THRESHOLD 254 + +#define INV_MULMOD_BNM1_THRESHOLD 34 +#define INV_NEWTON_THRESHOLD 276 +#define INV_APPR_THRESHOLD 276 + +#define BINV_NEWTON_THRESHOLD 278 +#define REDC_1_TO_REDC_N_THRESHOLD 78 + +#define MU_DIV_QR_THRESHOLD 979 +#define MU_DIVAPPR_Q_THRESHOLD 263 +#define MUPI_DIV_QR_THRESHOLD 102 +#define MU_BDIV_QR_THRESHOLD 807 +#define MU_BDIV_Q_THRESHOLD 1187 + +#define MATRIX22_STRASSEN_THRESHOLD 11 +#define HGCD_THRESHOLD 100 +#define GCD_DC_THRESHOLD 379 +#define GCDEXT_DC_THRESHOLD 249 +#define JACOBI_BASE_METHOD 2 + +#define GET_STR_DC_THRESHOLD 7 +#define GET_STR_PRECOMPUTE_THRESHOLD 16 +#define SET_STR_DC_THRESHOLD 270 +#define SET_STR_PRECOMPUTE_THRESHOLD 782 diff --git a/gmp-6.3.0/mpn/pa32/hppa2_0/sqr_diagonal.asm b/gmp-6.3.0/mpn/pa32/hppa2_0/sqr_diagonal.asm new file mode 100644 index 0000000..c55112f --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa2_0/sqr_diagonal.asm @@ -0,0 +1,112 @@ +dnl HP-PA 32-bit mpn_sqr_diagonal optimized for the PA8x00. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C This code runs at 6 cycles/limb on the PA7100 and 2 cycles/limb on PA8x00. +C The 2-way unrolling is actually not helping the PA7100. + +C INPUT PARAMETERS +define(`rp',`%r26') +define(`up',`%r25') +define(`n',`%r24') + +ASM_START() +PROLOGUE(mpn_sqr_diagonal) + + fldws,ma 4(up),%fr4r + addib,= -1,n,L(end1) + ldo 4(rp),rp + + fldws,ma 4(up),%fr6r + addib,= -1,n,L(end2) + xmpyu %fr4r,%fr4r,%fr5 + + fldws,ma 4(up),%fr4r + addib,= -1,n,L(end3) + xmpyu %fr6r,%fr6r,%fr7 + + +LDEF(loop) + fldws,ma 4(up),%fr6r + fstws %fr5r,-4(rp) + fstws,ma %fr5l,8(rp) + addib,= -1,n,L(exite) + xmpyu %fr4r,%fr4r,%fr5 + fldws,ma 4(up),%fr4r + fstws %fr7r,-4(rp) + fstws,ma %fr7l,8(rp) + addib,<> -1,n,L(loop) + xmpyu %fr6r,%fr6r,%fr7 + +LDEF(exito) + fstws %fr5r,-4(rp) + fstws %fr5l,0(rp) + xmpyu %fr4r,%fr4r,%fr5 + fstws %fr7r,4(rp) + fstws %fr7l,8(rp) + fstws,mb %fr5r,12(rp) + bv 0(%r2) + fstws %fr5l,4(rp) + +LDEF(exite) + fstws %fr7r,-4(rp) + fstws %fr7l,0(rp) + xmpyu %fr6r,%fr6r,%fr7 + fstws %fr5r,4(rp) + fstws %fr5l,8(rp) + fstws,mb %fr7r,12(rp) + bv 0(%r2) + fstws %fr7l,4(rp) + +LDEF(end1) + xmpyu %fr4r,%fr4r,%fr5 + fstws %fr5r,-4(rp) + bv 0(%r2) + fstws,ma %fr5l,8(rp) + +LDEF(end2) + xmpyu %fr6r,%fr6r,%fr7 + fstws %fr5r,-4(rp) + fstws %fr5l,0(rp) + fstws %fr7r,4(rp) + bv 0(%r2) + fstws %fr7l,8(rp) + +LDEF(end3) + fstws %fr5r,-4(rp) + fstws %fr5l,0(rp) + xmpyu %fr4r,%fr4r,%fr5 + fstws %fr7r,4(rp) + fstws %fr7l,8(rp) + fstws,mb %fr5r,12(rp) + bv 0(%r2) + fstws %fr5l,4(rp) +EPILOGUE(mpn_sqr_diagonal) diff --git a/gmp-6.3.0/mpn/pa32/hppa2_0/sub_n.asm b/gmp-6.3.0/mpn/pa32/hppa2_0/sub_n.asm new file mode 100644 index 0000000..47b3163 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/hppa2_0/sub_n.asm @@ -0,0 +1,107 @@ +dnl HP-PA 2.0 32-bit mpn_sub_n -- Subtract two limb vectors of the same +dnl length > 0 and store difference in a third limb vector. + +dnl Copyright 1997, 1998, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr gr26 +C s1_ptr gr25 +C s2_ptr gr24 +C size gr23 + +C This runs at 2 cycles/limb on PA8000. 
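The zdep/blr sequence that follows is a computed entry into an 8-way unrolled
loop, the same idea as Duff's device in C. A sketch of the control flow
(illustrative only, assuming n >= 1; the names sub_n_sketch and step are made
up, and step stands for one load/subb/store group):

  #include <stdint.h>

  static uint32_t step (uint32_t a, uint32_t b, uint32_t *bw)
  {
    uint64_t d = (uint64_t) a - b - *bw;
    *bw = (uint32_t) (d >> 32) & 1;      /* borrow to the next limb */
    return (uint32_t) d;
  }

  uint32_t sub_n_sketch (uint32_t *rp, const uint32_t *up,
                         const uint32_t *vp, long n)
  {
    long groups = (n + 7) >> 3;          /* trips through the unrolled body */
    long i = 0;
    uint32_t bw = 0;
    switch (n & 7)                       /* the blr computed branch */
      {
      case 0: do { rp[i] = step (up[i], vp[i], &bw); i++;
      case 7:      rp[i] = step (up[i], vp[i], &bw); i++;
      case 6:      rp[i] = step (up[i], vp[i], &bw); i++;
      case 5:      rp[i] = step (up[i], vp[i], &bw); i++;
      case 4:      rp[i] = step (up[i], vp[i], &bw); i++;
      case 3:      rp[i] = step (up[i], vp[i], &bw); i++;
      case 2:      rp[i] = step (up[i], vp[i], &bw); i++;
      case 1:      rp[i] = step (up[i], vp[i], &bw); i++;
              } while (--groups > 0);
      }
    return bw;
  }

The asm achieves the same effect by biasing the three pointers backwards by
4 * (-n & 7) bytes, so the partial first trip still ends on a group boundary.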
+ +ASM_START() +PROLOGUE(mpn_sub_n) + sub %r0,%r23,%r22 + zdep %r22,30,3,%r28 C r28 = 2 * (-n & 7) + zdep %r22,29,3,%r22 C r22 = 4 * (-n & 7) + sub %r25,%r22,%r25 C offset s1_ptr + sub %r24,%r22,%r24 C offset s2_ptr + blr %r28,%r0 C branch into loop + sub %r26,%r22,%r26 C offset res_ptr and set carry + +LDEF(loop) + ldw 0(%r25),%r20 + ldw 0(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,0(%r26) + +LDEF(7) + ldw 4(%r25),%r21 + ldw 4(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,4(%r26) + +LDEF(6) + ldw 8(%r25),%r20 + ldw 8(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,8(%r26) + +LDEF(5) + ldw 12(%r25),%r21 + ldw 12(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,12(%r26) + +LDEF(4) + ldw 16(%r25),%r20 + ldw 16(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,16(%r26) + +LDEF(3) + ldw 20(%r25),%r21 + ldw 20(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,20(%r26) + +LDEF(2) + ldw 24(%r25),%r20 + ldw 24(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,24(%r26) + +LDEF(1) + ldw 28(%r25),%r21 + ldo 32(%r25),%r25 + ldw 28(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,28(%r26) + ldo 32(%r24),%r24 + addib,> -8,%r23,L(loop) + ldo 32(%r26),%r26 + + addc %r0,%r0,%r28 + bv (%r2) + subi 1,%r28,%r28 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/lshift.asm b/gmp-6.3.0/mpn/pa32/lshift.asm new file mode 100644 index 0000000..5ea497c --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/lshift.asm @@ -0,0 +1,75 @@ +dnl HP-PA mpn_lshift -- Shift a number left. + +dnl Copyright 1992, 1994, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
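The mpn_lshift contract as a C sketch (illustrative only; the ref_ name is
made up): shift an n-limb number left by cnt bits, 1 <= cnt <= 31, and return
the bits shifted out of the top limb, which is the value the first vshd
computes into %r28 below. The companion rshift.asm is the mirror image.

  #include <stdint.h>

  uint32_t ref_lshift (uint32_t *rp, const uint32_t *sp,
                       long n, unsigned cnt)
  {
    unsigned tnc = 32 - cnt;             /* the "subi 32,%r23,%r1" */
    uint32_t retval = sp[n - 1] >> tnc;  /* carry-out limb */
    for (long i = n - 1; i > 0; i--)
      rp[i] = (sp[i] << cnt) | (sp[i - 1] >> tnc);
    rp[0] = sp[0] << cnt;
    return retval;
  }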
+ +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr gr26 +C s_ptr gr25 +C size gr24 +C cnt gr23 + +ASM_START() +PROLOGUE(mpn_lshift) + sh2add %r24,%r25,%r25 + sh2add %r24,%r26,%r26 + ldws,mb -4(0,%r25),%r22 + subi 32,%r23,%r1 + mtsar %r1 + addib,= -1,%r24,L(0004) + vshd %r0,%r22,%r28 C compute carry out limb + ldws,mb -4(0,%r25),%r29 + addib,= -1,%r24,L(0002) + vshd %r22,%r29,%r20 + +LDEF(loop) + ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + addib,= -1,%r24,L(0003) + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + addib,<> -1,%r24,L(loop) + vshd %r22,%r29,%r20 + +LDEF(0002) + stws,mb %r20,-4(0,%r26) + vshd %r29,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) + +LDEF(0003) + stws,mb %r20,-4(0,%r26) + +LDEF(0004) + vshd %r22,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/pa-defs.m4 b/gmp-6.3.0/mpn/pa32/pa-defs.m4 new file mode 100644 index 0000000..b26e715 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/pa-defs.m4 @@ -0,0 +1,64 @@ +divert(-1) + +dnl m4 macros for HPPA assembler. + +dnl Copyright 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl hppa assembler comments are introduced with ";". +dnl +dnl For cooperation with cpp, apparently lines "# 123" set the line number, +dnl and other lines starting with a "#" are ignored. + +changecom(;) + + +dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo) +dnl EPILOGUE_cpu(GSYM_PREFIX`'foo) +dnl +dnl These are the same as the basic PROLOGUE_cpu and EPILOGUE_cpu in +dnl mpn/asm-defs.m4, but using .proc / .procend. These are standard and on +dnl an ELF system they do what .type and .size normally do. + +define(`PROLOGUE_cpu', +m4_assert_numargs(1) + `.code + ALIGN(8) + .export `$1',entry +`$1'LABEL_SUFFIX' + .proc + .callinfo) dnl This is really bogus, but allows us to compile + dnl again on hppa machines. + + +define(`EPILOGUE_cpu', +m4_assert_numargs(1) +` .procend') + +divert diff --git a/gmp-6.3.0/mpn/pa32/rshift.asm b/gmp-6.3.0/mpn/pa32/rshift.asm new file mode 100644 index 0000000..c5eac83 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/rshift.asm @@ -0,0 +1,72 @@ +dnl HP-PA mpn_rshift -- Shift a number right. + +dnl Copyright 1992, 1994, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr gr26 +C s_ptr gr25 +C size gr24 +C cnt gr23 + +ASM_START() +PROLOGUE(mpn_rshift) + ldws,ma 4(0,%r25),%r22 + mtsar %r23 + addib,= -1,%r24,L(0004) + vshd %r22,%r0,%r28 C compute carry out limb + ldws,ma 4(0,%r25),%r29 + addib,= -1,%r24,L(0002) + vshd %r29,%r22,%r20 + +LDEF(loop) + ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + addib,= -1,%r24,L(0003) + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + addib,<> -1,%r24,L(loop) + vshd %r29,%r22,%r20 + +LDEF(0002) + stws,ma %r20,4(0,%r26) + vshd %r0,%r29,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) + +LDEF(0003) + stws,ma %r20,4(0,%r26) + +LDEF(0004) + vshd %r0,%r22,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/sub_n.asm b/gmp-6.3.0/mpn/pa32/sub_n.asm new file mode 100644 index 0000000..9c71655 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/sub_n.asm @@ -0,0 +1,64 @@ +dnl HP-PA mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +dnl store difference in a third limb vector. + +dnl Copyright 1992, 1994, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr gr26 +C s1_ptr gr25 +C s2_ptr gr24 +C size gr23 + +C One might want to unroll this as for other processors, but it turns out that +C the data cache contention after a store makes such unrolling useless. We +C can't come under 5 cycles/limb anyway. 
+ +ASM_START() +PROLOGUE(mpn_sub_n) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,= -1,%r23,L(end) C check for (SIZE == 1) + sub %r20,%r19,%r28 C subtract first limbs ignoring cy + +LDEF(loop) + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,<> -1,%r23,L(loop) + subb %r20,%r19,%r28 + +LDEF(end) + stws %r28,0(0,%r26) + addc %r0,%r0,%r28 + bv 0(%r2) + subi 1,%r28,%r28 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa32/udiv.asm b/gmp-6.3.0/mpn/pa32/udiv.asm new file mode 100644 index 0000000..addbf41 --- /dev/null +++ b/gmp-6.3.0/mpn/pa32/udiv.asm @@ -0,0 +1,291 @@ +dnl HP-PA __udiv_qrnnd division support, used from longlong.h. +dnl This version runs fast on pre-PA7000 CPUs. + +dnl Copyright 1993, 1994, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr gr26 +C n1 gr25 +C n0 gr24 +C d gr23 + +C The code size is a bit excessive. We could merge the last two ds;addc +C sequences by simply moving the "bb,< Odd" instruction down. The only +C trouble is the FFFFFFFF code that would need some hacking. 
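For reference, each ds/addc pair below retires one bit of a classic
shift-and-subtract division. The overall contract of __udiv_qrnnd, as a C
sketch (illustrative only; the ref_ name is made up, and the usual
precondition n1 < d is assumed so the quotient fits in one limb):

  #include <stdint.h>

  uint32_t ref_udiv_qrnnd (uint32_t *rem_ptr, uint32_t n1,
                           uint32_t n0, uint32_t d)
  {
    uint32_t q = 0;
    for (int i = 0; i < 32; i++)         /* one ds/addc pair per bit */
      {
        uint32_t msb = n1 >> 31;
        n1 = (n1 << 1) | (n0 >> 31);     /* shift the dividend left */
        n0 <<= 1;
        if (msb | (n1 >= d))             /* subtract d where it fits */
          {
            n1 -= d;                     /* wraps correctly when msb = 1 */
            q = (q << 1) | 1;
          }
        else
          q <<= 1;
      }
    *rem_ptr = n1;                       /* remainder via the pointer */
    return q;                            /* single-limb quotient */
  }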
+ +ASM_START() +PROLOGUE(mpn_udiv_qrnnd) + comb,< %r23,0,L(largedivisor) + sub %r0,%r23,%r1 C clear cy as side-effect + ds %r0,%r1,%r0 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r28 + ds %r25,%r23,%r25 + comclr,>= %r25,%r0,%r0 + addl %r25,%r23,%r25 + stws %r25,0(0,%r26) + bv 0(%r2) + addc %r28,%r28,%r28 + +LDEF(largedivisor) + extru %r24,31,1,%r19 C r19 = n0 & 1 + bb,< %r23,31,L(odd) + extru %r23,30,31,%r22 C r22 = d >> 1 + shd %r25,%r24,1,%r24 C r24 = new n0 + extru %r25,30,31,%r25 C r25 = new n1 + sub %r0,%r22,%r21 + ds %r0,%r21,%r0 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + comclr,>= %r25,%r0,%r0 + addl %r25,%r22,%r25 + sh1addl %r25,%r19,%r25 + stws %r25,0(0,%r26) + bv 0(%r2) + addc %r24,%r24,%r28 + +LDEF(odd) + addib,sv,n 1,%r22,L(FFFFFFFF) C r22 = (d / 2 + 1) + shd %r25,%r24,1,%r24 C r24 = new n0 + extru %r25,30,31,%r25 C r25 = new n1 + sub %r0,%r22,%r21 + ds %r0,%r21,%r0 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 
+ addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r28 + comclr,>= %r25,%r0,%r0 + addl %r25,%r22,%r25 + sh1addl %r25,%r19,%r25 +C We have computed (n1,,n0) / (d + 1), q' = r28, r' = r25 + add,nuv %r28,%r25,%r25 + addl %r25,%r1,%r25 + addc %r0,%r28,%r28 + sub,<< %r25,%r23,%r0 + addl %r25,%r1,%r25 + stws %r25,0(0,%r26) + bv 0(%r2) + addc %r0,%r28,%r28 + +C This is just a special case of the code above. +C We come here when d == 0xFFFFFFFF +LDEF(FFFFFFFF) + add,uv %r25,%r24,%r24 + sub,<< %r24,%r23,%r0 + ldo 1(%r24),%r24 + stws %r24,0(0,%r26) + bv 0(%r2) + addc %r0,%r25,%r28 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa64/README b/gmp-6.3.0/mpn/pa64/README new file mode 100644 index 0000000..a51ce02 --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/README @@ -0,0 +1,78 @@ +Copyright 1999, 2001, 2002, 2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + +This directory contains mpn functions for 64-bit PA-RISC 2.0. + +PIPELINE SUMMARY + +The PA8x00 processors have an orthogonal 4-way out-of-order pipeline. Each +cycle two ALU operations and two MEM operations can issue, but just one of the +MEM operations may be a store. The two ALU operations can be almost any +combination of non-memory operations. Unlike every other processor, integer +and fp operations are completely equal here; they both count as just ALU +operations. 
+
+Unfortunately, some operations cause hiccups in the pipeline.  Combining
+carry-consuming operations like ADD,DC with operations that do not set carry,
+like ADD,L, causes long delays.  Skip operations also seem to cause hiccups.
+If several ADD,DC are issued consecutively, or if a plain carry-generating
+ADD feeds ADD,DC, stalling does not occur.  We can effectively issue two
+ADD,DC operations/cycle.
+
+Latency scheduling is not as important as making sure to have a mix of ALU and
+MEM operations, but for full pipeline utilization, it is still a good idea to
+do some amount of latency scheduling.
+
+As with all other processors, RAW memory scheduling is critically important.
+Since integer multiplication takes place in the floating-point unit, the GMP
+code needs to handle this problem frequently.
+
+STATUS
+
+* mpn_lshift and mpn_rshift run at 1.5 cycles/limb on PA8000 and at 1.0
+  cycles/limb on PA8500.  With latency scheduling, the numbers could
+  probably be improved to 1.0 cycles/limb for all PA8x00 chips.
+
+* mpn_add_n and mpn_sub_n run at 2.0 cycles/limb on PA8000 and at about
+  1.6875 cycles/limb on PA8500.  With latency scheduling, this could
+  probably be improved to get close to 1.5 cycles/limb.  A problem is the
+  stalling of carry-inputting instructions after instructions that do not
+  write to carry.
+
+* mpn_mul_1, mpn_addmul_1, and mpn_submul_1 run at between 5.625 and 6.375
+  cycles/limb on PA8500 and later, and about a cycle/limb slower on older
+  chips.  The code uses ADD,DC for adjacent limbs, and relies heavily on
+  reordering.
+
+
+REFERENCES
+
+Hewlett Packard, "64-Bit Runtime Architecture for PA-RISC 2.0", version 3.3,
+October 1997.
diff --git a/gmp-6.3.0/mpn/pa64/addmul_1.asm b/gmp-6.3.0/mpn/pa64/addmul_1.asm
new file mode 100644
index 0000000..2cb9af9
--- /dev/null
+++ b/gmp-6.3.0/mpn/pa64/addmul_1.asm
@@ -0,0 +1,693 @@
+dnl  HP-PA 2.0 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
+dnl  add the result to a second limb vector.
+
+dnl  Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C 8000,8200:		7
+C 8500,8600,8700:	6.375
+
+C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
+C  could be saved there per call.
+
+C DESCRIPTION:
+C  The main loop "BIG" is 4-way unrolled, mainly to allow
+C  effective use of ADD,DC.
+C  Delays in moving data via the cache from the FP
+C  registers to the IU registers have demanded a deep software pipeline, and
+C  a lot of stack slots for partial products in flight.
+C
+C CODE STRUCTURE:
+C  save-some-registers
+C  do 0, 1, 2, or 3 limbs
+C  if done, restore-some-regs and return
+C  save-many-regs
+C  do 4, 8, ... limbs
+C  restore-all-regs
+
+C STACK LAYOUT:
+C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
+C  slots marked FREE, as well as some slots in the caller's "frame marker".
+C
+C -00 <- r30
+C -08  FREE
+C -10  tmp
+C -18  tmp
+C -20  tmp
+C -28  tmp
+C -30  tmp
+C -38  tmp
+C -40  tmp
+C -48  tmp
+C -50  tmp
+C -58  tmp
+C -60  tmp
+C -68  tmp
+C -70  tmp
+C -78  tmp
+C -80  tmp
+C -88  tmp
+C -90  FREE
+C -98  FREE
+C -a0  FREE
+C -a8  FREE
+C -b0  r13
+C -b8  r12
+C -c0  r11
+C -c8  r10
+C -d0  r9
+C -d8  r8
+C -e0  r7
+C -e8  r6
+C -f0  r5
+C -f8  r4
+C -100 r3
+C Previous frame:
+C [unused area]
+C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS:
+define(`rp',`%r26')	C
+define(`up',`%r25')	C
+define(`n',`%r24')	C
+define(`vlimb',`%r23')	C
+
+define(`climb',`%r23')	C
+
+ifdef(`HAVE_ABI_2_0w',
+`	.level	2.0w
+',`	.level	2.0
+')
+PROLOGUE(mpn_addmul_1)
+
+ifdef(`HAVE_ABI_2_0w',
+`	std	vlimb, -0x38(%r30)	C store vlimb into "home" slot
+')
+	std,ma	%r3, 0x100(%r30)
+	std	%r4, -0xf8(%r30)
+	std	%r5, -0xf0(%r30)
+	ldo	0(%r0), climb		C clear climb
+	fldd	-0x138(%r30), %fr8	C put vlimb in fp register
+
+define(`p032a1',`%r1')	C
+define(`p032a2',`%r19')	C
+
+define(`m032',`%r20')	C
+define(`m096',`%r21')	C
+
+define(`p000a',`%r22')	C
+define(`p064a',`%r29')	C
+
+define(`s000',`%r31')	C
+
+define(`ma000',`%r4')	C
+define(`ma064',`%r20')	C
+
+define(`r000',`%r3')	C
+
+	extrd,u	n, 63, 2, %r5
+	cmpb,=	%r5, %r0, L(BIG)
+	nop
+
+	fldd	0(up), %fr4
+	ldo	8(up), up
+	xmpyu	%fr8R, %fr4L, %fr22
+	xmpyu	%fr8L, %fr4R, %fr23
+	fstd	%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu	%fr8R, %fr4R, %fr24
+	xmpyu	%fr8L, %fr4L, %fr25
+	fstd	%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	fstd	%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+	addib,<>	-1, %r5, L(two_or_more)
+	fstd	%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+LDEF(one)
+	ldd	-0x78(%r30), p032a1
+	ldd	-0x70(%r30), p032a2
+	ldd	-0x80(%r30), p000a
+	b	L(0_one_out)
+	ldd	-0x68(%r30), p064a
+
+LDEF(two_or_more)
+	fldd	0(up), %fr4
+	ldo	8(up), up
+	xmpyu	%fr8R, %fr4L, %fr22
+	xmpyu	%fr8L, %fr4R, %fr23
+	ldd	-0x78(%r30), p032a1
+	fstd	%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu	%fr8R, %fr4R, %fr24
+	xmpyu	%fr8L, %fr4L, %fr25
+	ldd	-0x70(%r30), p032a2
+	fstd	%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	ldd	-0x80(%r30), p000a
+	fstd	%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+	ldd	-0x68(%r30), p064a
+	addib,<>	-1, %r5, L(three_or_more)
+	fstd	%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+LDEF(two)
+	add	p032a1, p032a2, m032
+	add,dc	%r0, %r0, m096
+	depd,z	m032, 31, 32, ma000
+	extrd,u	m032, 31, 32, ma064
+	ldd	0(rp), r000
+	b	L(0_two_out)
+	depd	m096, 31, 32, ma064
+
+LDEF(three_or_more)
+	fldd	0(up), %fr4
+	add	p032a1, p032a2, m032
+	add,dc	%r0, %r0, m096
+	depd,z	m032, 31, 32, ma000
+	extrd,u	m032, 31, 32, ma064
+	ldd	0(rp), r000
+C	addib,=	-1, %r5, L(0_out)
+	depd	m096, 31, 32, ma064
+LDEF(loop0)
+C	xmpyu	%fr8R, %fr4L, %fr22
+C	xmpyu	%fr8L, %fr4R, %fr23
+C	ldd	-0x78(%r30), p032a1
+C	fstd	%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+C
+C	xmpyu	%fr8R, %fr4R, %fr24
+C	xmpyu	%fr8L, %fr4L, %fr25
+C	ldd	-0x70(%r30), p032a2
+C
fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 +C +C ldo 8(rp), rp +C add climb, p000a, s000 +C ldd -0x80(%r30), p000a +C fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 +C +C add,dc p064a, %r0, climb +C ldo 8(up), up +C ldd -0x68(%r30), p064a +C fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 +C +C add ma000, s000, s000 +C add,dc ma064, climb, climb +C fldd 0(up), %fr4 +C +C add r000, s000, s000 +C add,dc %r0, climb, climb +C std s000, -8(rp) +C +C add p032a1, p032a2, m032 +C add,dc %r0, %r0, m096 +C +C depd,z m032, 31, 32, ma000 +C extrd,u m032, 31, 32, ma064 +C ldd 0(rp), r000 +C addib,<> -1, %r5, L(loop0) +C depd m096, 31, 32, ma064 +LDEF(0_out) + ldo 8(up), up + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + ldd -0x78(%r30), p032a1 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr4R, %fr24 + xmpyu %fr8L, %fr4L, %fr25 + ldd -0x70(%r30), p032a2 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + ldo 8(rp), rp + add climb, p000a, s000 + ldd -0x80(%r30), p000a + fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 + add,dc p064a, %r0, climb + ldd -0x68(%r30), p064a + fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 + add ma000, s000, s000 + add,dc ma064, climb, climb + add r000, s000, s000 + add,dc %r0, climb, climb + std s000, -8(rp) + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 + ldd 0(rp), r000 + depd m096, 31, 32, ma064 +LDEF(0_two_out) + ldd -0x78(%r30), p032a1 + ldd -0x70(%r30), p032a2 + ldo 8(rp), rp + add climb, p000a, s000 + ldd -0x80(%r30), p000a + add,dc p064a, %r0, climb + ldd -0x68(%r30), p064a + add ma000, s000, s000 + add,dc ma064, climb, climb + add r000, s000, s000 + add,dc %r0, climb, climb + std s000, -8(rp) +LDEF(0_one_out) + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 + ldd 0(rp), r000 + depd m096, 31, 32, ma064 + + add climb, p000a, s000 + add,dc p064a, %r0, climb + add ma000, s000, s000 + add,dc ma064, climb, climb + add r000, s000, s000 + add,dc %r0, climb, climb + std s000, 0(rp) + + cmpib,>= 4, n, L(done) + ldo 8(rp), rp + +C 4-way unrolled code. 
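+
+C  A sketch of the scheme used here (uh/ul and vh/vl are illustrative names
+C  for the high/low 32-bit halves of a limb, not names used in this file):
+C  xmpyu forms the four 32x32->64 partial products in the FPU, using
+C
+C      u*v = 2^64*(uh*vh) + 2^32*(uh*vl + ul*vh) + ul*vl
+C
+C  The mid products are summed pairwise (m032 ... m288, with carries), then
+C  re-split into 64-bit-aligned words (ma000 ... ma256) so that everything
+C  can be accumulated with plain ADD/ADD,DC carry chains.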
+ +LDEF(BIG) + +define(`p032a1',`%r1') C +define(`p032a2',`%r19') C +define(`p096b1',`%r20') C +define(`p096b2',`%r21') C +define(`p160c1',`%r22') C +define(`p160c2',`%r29') C +define(`p224d1',`%r31') C +define(`p224d2',`%r3') C + C +define(`m032',`%r4') C +define(`m096',`%r5') C +define(`m160',`%r6') C +define(`m224',`%r7') C +define(`m288',`%r8') C + C +define(`p000a',`%r1') C +define(`p064a',`%r19') C +define(`p064b',`%r20') C +define(`p128b',`%r21') C +define(`p128c',`%r22') C +define(`p192c',`%r29') C +define(`p192d',`%r31') C +define(`p256d',`%r3') C + C +define(`s000',`%r10') C +define(`s064',`%r11') C +define(`s128',`%r12') C +define(`s192',`%r13') C + C +define(`ma000',`%r9') C +define(`ma064',`%r4') C +define(`ma128',`%r5') C +define(`ma192',`%r6') C +define(`ma256',`%r7') C + C +define(`r000',`%r1') C +define(`r064',`%r19') C +define(`r128',`%r20') C +define(`r192',`%r21') C + + std %r6, -0xe8(%r30) + std %r7, -0xe0(%r30) + std %r8, -0xd8(%r30) + std %r9, -0xd0(%r30) + std %r10, -0xc8(%r30) + std %r11, -0xc0(%r30) + std %r12, -0xb8(%r30) + std %r13, -0xb0(%r30) + +ifdef(`HAVE_ABI_2_0w', +` extrd,u n, 61, 62, n C right shift 2 +',` extrd,u n, 61, 30, n C right shift 2, zero extend +') + +LDEF(4_or_more) + fldd 0(up), %fr4 + fldd 8(up), %fr5 + fldd 16(up), %fr6 + fldd 24(up), %fr7 + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + xmpyu %fr8R, %fr5L, %fr24 + xmpyu %fr8L, %fr5R, %fr25 + xmpyu %fr8R, %fr6L, %fr26 + xmpyu %fr8L, %fr6R, %fr27 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr7L, %fr28 + xmpyu %fr8L, %fr7R, %fr29 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + xmpyu %fr8R, %fr4R, %fr30 + xmpyu %fr8L, %fr4L, %fr31 + fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31 + xmpyu %fr8R, %fr5R, %fr22 + xmpyu %fr8L, %fr5L, %fr23 + fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29 + xmpyu %fr8R, %fr6R, %fr24 + xmpyu %fr8L, %fr6L, %fr25 + fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51 + xmpyu %fr8R, %fr7R, %fr26 + fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49 + addib,<> -1, n, L(8_or_more) + xmpyu %fr8L, %fr7L, %fr27 + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + ldd -0x78(%r30), p032a1 + ldd -0x70(%r30), p032a2 + ldd -0x38(%r30), p096b1 + ldd -0x30(%r30), p096b2 + ldd -0x58(%r30), p160c1 + ldd -0x50(%r30), p160c2 + ldd -0x18(%r30), p224d1 + ldd -0x10(%r30), p224d2 + b L(end1) + nop + +LDEF(8_or_more) + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + ldo 32(up), up + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + fldd 0(up), %fr4 + fldd 8(up), %fr5 
+ fldd 16(up), %fr6 + fldd 24(up), %fr7 + xmpyu %fr8R, %fr4L, %fr22 + ldd -0x78(%r30), p032a1 + xmpyu %fr8L, %fr4R, %fr23 + xmpyu %fr8R, %fr5L, %fr24 + ldd -0x70(%r30), p032a2 + xmpyu %fr8L, %fr5R, %fr25 + xmpyu %fr8R, %fr6L, %fr26 + ldd -0x38(%r30), p096b1 + xmpyu %fr8L, %fr6R, %fr27 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr7L, %fr28 + ldd -0x30(%r30), p096b2 + xmpyu %fr8L, %fr7R, %fr29 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + xmpyu %fr8R, %fr4R, %fr30 + ldd -0x58(%r30), p160c1 + xmpyu %fr8L, %fr4L, %fr31 + fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31 + xmpyu %fr8R, %fr5R, %fr22 + ldd -0x50(%r30), p160c2 + xmpyu %fr8L, %fr5L, %fr23 + fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29 + xmpyu %fr8R, %fr6R, %fr24 + ldd -0x18(%r30), p224d1 + xmpyu %fr8L, %fr6L, %fr25 + fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51 + xmpyu %fr8R, %fr7R, %fr26 + ldd -0x10(%r30), p224d2 + fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49 + addib,= -1, n, L(end2) + xmpyu %fr8L, %fr7L, %fr27 +LDEF(loop) + add p032a1, p032a2, m032 + ldd -0x80(%r30), p000a + add,dc p096b1, p096b2, m096 + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + + add,dc p160c1, p160c2, m160 + ldd -0x68(%r30), p064a + add,dc p224d1, p224d2, m224 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + + add,dc %r0, %r0, m288 + ldd -0x40(%r30), p064b + ldo 32(up), up + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + + depd,z m032, 31, 32, ma000 + ldd -0x28(%r30), p128b + extrd,u m032, 31, 32, ma064 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + + depd m096, 31, 32, ma064 + ldd -0x60(%r30), p128c + extrd,u m096, 31, 32, ma128 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + + depd m160, 31, 32, ma128 + ldd -0x48(%r30), p192c + extrd,u m160, 31, 32, ma192 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + + depd m224, 31, 32, ma192 + ldd -0x20(%r30), p192d + extrd,u m224, 31, 32, ma256 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + + depd m288, 31, 32, ma256 + ldd -0x88(%r30), p256d + add climb, p000a, s000 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + + add,dc p064a, p064b, s064 + ldd 0(rp), r000 + add,dc p128b, p128c, s128 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + + add,dc p192c, p192d, s192 + ldd 8(rp), r064 + add,dc p256d, %r0, climb + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + + ldd 16(rp), r128 + add ma000, s000, s000 C accum mid 0 + ldd 24(rp), r192 + add,dc ma064, s064, s064 C accum mid 1 + + add,dc ma128, s128, s128 C accum mid 2 + fldd 0(up), %fr4 + add,dc ma192, s192, s192 C accum mid 3 + fldd 8(up), %fr5 + + add,dc ma256, climb, climb + fldd 16(up), %fr6 + add r000, s000, s000 C accum rlimb 0 + fldd 24(up), %fr7 + + add,dc r064, s064, s064 C accum rlimb 1 + add,dc r128, s128, s128 C accum rlimb 2 + std s000, 0(rp) + + add,dc r192, s192, s192 C accum rlimb 3 + add,dc %r0, climb, climb + std s064, 8(rp) + + xmpyu %fr8R, %fr4L, %fr22 + ldd -0x78(%r30), p032a1 + xmpyu %fr8L, %fr4R, %fr23 + std s128, 16(rp) + + xmpyu %fr8R, %fr5L, %fr24 + ldd -0x70(%r30), p032a2 + xmpyu %fr8L, %fr5R, %fr25 + std s192, 24(rp) + + xmpyu %fr8R, %fr6L, %fr26 + ldd -0x38(%r30), p096b1 + xmpyu %fr8L, %fr6R, %fr27 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + + xmpyu %fr8R, %fr7L, %fr28 + ldd -0x30(%r30), p096b2 + xmpyu %fr8L, %fr7R, %fr29 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + + xmpyu %fr8R, %fr4R, %fr30 + ldd -0x58(%r30), p160c1 + xmpyu %fr8L, %fr4L, %fr31 + 
fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31 + + xmpyu %fr8R, %fr5R, %fr22 + ldd -0x50(%r30), p160c2 + xmpyu %fr8L, %fr5L, %fr23 + fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29 + + xmpyu %fr8R, %fr6R, %fr24 + ldd -0x18(%r30), p224d1 + xmpyu %fr8L, %fr6L, %fr25 + fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51 + + xmpyu %fr8R, %fr7R, %fr26 + ldd -0x10(%r30), p224d2 + fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49 + xmpyu %fr8L, %fr7L, %fr27 + + addib,<> -1, n, L(loop) + ldo 32(rp), rp + +LDEF(end2) + add p032a1, p032a2, m032 + ldd -0x80(%r30), p000a + add,dc p096b1, p096b2, m096 + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + add,dc p160c1, p160c2, m160 + ldd -0x68(%r30), p064a + add,dc p224d1, p224d2, m224 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + add,dc %r0, %r0, m288 + ldd -0x40(%r30), p064b + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + depd,z m032, 31, 32, ma000 + ldd -0x28(%r30), p128b + extrd,u m032, 31, 32, ma064 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + depd m096, 31, 32, ma064 + ldd -0x60(%r30), p128c + extrd,u m096, 31, 32, ma128 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + depd m160, 31, 32, ma128 + ldd -0x48(%r30), p192c + extrd,u m160, 31, 32, ma192 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + depd m224, 31, 32, ma192 + ldd -0x20(%r30), p192d + extrd,u m224, 31, 32, ma256 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + depd m288, 31, 32, ma256 + ldd -0x88(%r30), p256d + add climb, p000a, s000 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + add,dc p064a, p064b, s064 + ldd 0(rp), r000 + add,dc p128b, p128c, s128 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + add,dc p192c, p192d, s192 + ldd 8(rp), r064 + add,dc p256d, %r0, climb + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + ldd 16(rp), r128 + add ma000, s000, s000 C accum mid 0 + ldd 24(rp), r192 + add,dc ma064, s064, s064 C accum mid 1 + add,dc ma128, s128, s128 C accum mid 2 + add,dc ma192, s192, s192 C accum mid 3 + add,dc ma256, climb, climb + add r000, s000, s000 C accum rlimb 0 + add,dc r064, s064, s064 C accum rlimb 1 + add,dc r128, s128, s128 C accum rlimb 2 + std s000, 0(rp) + add,dc r192, s192, s192 C accum rlimb 3 + add,dc %r0, climb, climb + std s064, 8(rp) + ldd -0x78(%r30), p032a1 + std s128, 16(rp) + ldd -0x70(%r30), p032a2 + std s192, 24(rp) + ldd -0x38(%r30), p096b1 + ldd -0x30(%r30), p096b2 + ldd -0x58(%r30), p160c1 + ldd -0x50(%r30), p160c2 + ldd -0x18(%r30), p224d1 + ldd -0x10(%r30), p224d2 + ldo 32(rp), rp + +LDEF(end1) + add p032a1, p032a2, m032 + ldd -0x80(%r30), p000a + add,dc p096b1, p096b2, m096 + add,dc p160c1, p160c2, m160 + ldd -0x68(%r30), p064a + add,dc p224d1, p224d2, m224 + add,dc %r0, %r0, m288 + ldd -0x40(%r30), p064b + depd,z m032, 31, 32, ma000 + ldd -0x28(%r30), p128b + extrd,u m032, 31, 32, ma064 + depd m096, 31, 32, ma064 + ldd -0x60(%r30), p128c + extrd,u m096, 31, 32, ma128 + depd m160, 31, 32, ma128 + ldd -0x48(%r30), p192c + extrd,u m160, 31, 32, ma192 + depd m224, 31, 32, ma192 + ldd -0x20(%r30), p192d + extrd,u m224, 31, 32, ma256 + depd m288, 31, 32, ma256 + ldd -0x88(%r30), p256d + add climb, p000a, s000 + add,dc p064a, p064b, s064 + ldd 0(rp), r000 + add,dc p128b, p128c, s128 + add,dc p192c, p192d, s192 + ldd 8(rp), r064 + add,dc p256d, %r0, climb + ldd 16(rp), r128 + add ma000, s000, s000 C accum mid 0 + ldd 24(rp), r192 + add,dc ma064, s064, s064 C accum mid 1 + add,dc ma128, s128, s128 C accum mid 2 + add,dc ma192, s192, s192 
C accum mid 3 + add,dc ma256, climb, climb + add r000, s000, s000 C accum rlimb 0 + add,dc r064, s064, s064 C accum rlimb 1 + add,dc r128, s128, s128 C accum rlimb 2 + std s000, 0(rp) + add,dc r192, s192, s192 C accum rlimb 3 + add,dc %r0, climb, climb + std s064, 8(rp) + std s128, 16(rp) + std s192, 24(rp) + + ldd -0xb0(%r30), %r13 + ldd -0xb8(%r30), %r12 + ldd -0xc0(%r30), %r11 + ldd -0xc8(%r30), %r10 + ldd -0xd0(%r30), %r9 + ldd -0xd8(%r30), %r8 + ldd -0xe0(%r30), %r7 + ldd -0xe8(%r30), %r6 +LDEF(done) +ifdef(`HAVE_ABI_2_0w', +` copy climb, %r28 +',` extrd,u climb, 63, 32, %r29 + extrd,u climb, 31, 32, %r28 +') + ldd -0xf0(%r30), %r5 + ldd -0xf8(%r30), %r4 + bve (%r2) + ldd,mb -0x100(%r30), %r3 +EPILOGUE(mpn_addmul_1) diff --git a/gmp-6.3.0/mpn/pa64/aors_n.asm b/gmp-6.3.0/mpn/pa64/aors_n.asm new file mode 100644 index 0000000..ab4536f --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/aors_n.asm @@ -0,0 +1,130 @@ +dnl HP-PA 2.0 mpn_add_n, mpn_sub_n + +dnl Copyright 1997, 2000, 2002, 2003, 2009, 2010 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl This runs at 2 cycles/limb on PA8000 and 1.6875 cycles/limb on PA8500. It +dnl should be possible to reach the cache bandwidth 1.5 cycles/limb at least +dnl with PA8500. The problem now is stalling of the first ADD,DC after LDO, +dnl where the processor gets confused about where carry comes from. 
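+dnl
+dnl  For reference, the limb-level operation implemented below, as a minimal
+dnl  C sketch (a stand-in, not GMP's generic code; the ref_add_n name and the
+dnl  typedef are illustrative):
+dnl
+dnl    typedef unsigned long long mp_limb_t;   /* 64-bit limb stand-in */
+dnl
+dnl    mp_limb_t
+dnl    ref_add_n (mp_limb_t *rp, const mp_limb_t *up,
+dnl               const mp_limb_t *vp, long n)
+dnl    {
+dnl      mp_limb_t cy = 0;                 /* carry between limbs, 0 or 1 */
+dnl      for (long i = 0; i < n; i++)
+dnl        {
+dnl          mp_limb_t u = up[i];
+dnl          mp_limb_t s = u + vp[i] + cy; /* the ADD / ADD,DC step */
+dnl          cy = (s < u) | (cy & (s == u));  /* carry out of this limb */
+dnl          rp[i] = s;
+dnl        }
+dnl      return cy;                        /* the final carry limb */
+dnl    }
+dnl
+dnl  mpn_sub_n is the same with borrow instead of carry (SUB,DB).  The code
+dnl  below amortizes loop overhead by 8-way unrolling, using BLR to branch
+dnl  into the unrolled body at an entry point computed from -n mod 8.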
+ +include(`../config.m4') + +dnl INPUT PARAMETERS +define(`rp',`%r26') +define(`up',`%r25') +define(`vp',`%r24') +define(`n',`%r23') + +ifdef(`OPERATION_add_n', ` + define(ADCSBC, `add,dc') + define(INITCY, `addi -1,%r22,%r0') + define(func, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADCSBC, `sub,db') + define(INITCY, `subi 0,%r22,%r0') + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ifdef(`HAVE_ABI_2_0w', +` .level 2.0w +',` .level 2.0 +') +PROLOGUE(func_nc) +ifdef(`HAVE_ABI_2_0w', +` b L(com) + nop +',` b L(com) + ldw -52(%r30), %r22 +') +EPILOGUE() +PROLOGUE(func) + ldi 0, %r22 +LDEF(com) + sub %r0, n, %r21 + depw,z %r21, 30, 3, %r28 C r28 = 2 * (-n & 7) + depw,z %r21, 28, 3, %r21 C r21 = 8 * (-n & 7) + sub up, %r21, up C offset up + sub vp, %r21, vp C offset vp + sub rp, %r21, rp C offset rp + blr %r28, %r0 C branch into loop + INITCY + +LDEF(loop) + ldd 0(up), %r20 + ldd 0(vp), %r31 + ADCSBC %r20, %r31, %r20 + std %r20, 0(rp) +LDEF(7) ldd 8(up), %r21 + ldd 8(vp), %r19 + ADCSBC %r21, %r19, %r21 + std %r21, 8(rp) +LDEF(6) ldd 16(up), %r20 + ldd 16(vp), %r31 + ADCSBC %r20, %r31, %r20 + std %r20, 16(rp) +LDEF(5) ldd 24(up), %r21 + ldd 24(vp), %r19 + ADCSBC %r21, %r19, %r21 + std %r21, 24(rp) +LDEF(4) ldd 32(up), %r20 + ldd 32(vp), %r31 + ADCSBC %r20, %r31, %r20 + std %r20, 32(rp) +LDEF(3) ldd 40(up), %r21 + ldd 40(vp), %r19 + ADCSBC %r21, %r19, %r21 + std %r21, 40(rp) +LDEF(2) ldd 48(up), %r20 + ldd 48(vp), %r31 + ADCSBC %r20, %r31, %r20 + std %r20, 48(rp) +LDEF(1) ldd 56(up), %r21 + ldd 56(vp), %r19 + ADCSBC %r21, %r19, %r21 + ldo 64(up), up + std %r21, 56(rp) + ldo 64(vp), vp + addib,> -8, n, L(loop) + ldo 64(rp), rp + + add,dc %r0, %r0, %r29 +ifdef(`OPERATION_sub_n',` + subi 1, %r29, %r29 +') + bve (%r2) +ifdef(`HAVE_ABI_2_0w', +` copy %r29, %r28 +',` ldi 0, %r28 +') +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa64/aorslsh1_n.asm b/gmp-6.3.0/mpn/pa64/aorslsh1_n.asm new file mode 100644 index 0000000..2a55dde --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/aorslsh1_n.asm @@ -0,0 +1,228 @@ +dnl PA64 mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1). + +dnl Copyright 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 8000,8200: 2 +C 8500,8600,8700: 1.75 + +C TODO +C * Write special feed-in code for each (n mod 8). (See the ia64 code.) +C * Try to make this run at closer to 1.5 c/l. +C * Set up register aliases (define(`u0',`%r19')). 
+C * Explicitly align loop. + +dnl INPUT PARAMETERS +define(`rp',`%r26') +define(`up',`%r25') +define(`vp',`%r24') +define(`n',`%r23') + +ifdef(`OPERATION_addlsh1_n',` + define(ADCSBC, `add,dc') + define(INITC, `ldi 0,') + define(func, mpn_addlsh1_n) +') +ifdef(`OPERATION_sublsh1_n',` + define(ADCSBC, `sub,db') + define(INITC, `ldi 1,') + define(func, mpn_sublsh1_n) +') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n) + +ifdef(`HAVE_ABI_2_0w',` + define(LEVEL, `.level 2.0w') + define(RETREG, `%r28') + define(CLRRET1, `dnl') +') +ifdef(`HAVE_ABI_2_0n',` + define(LEVEL, `.level 2.0') + define(RETREG, `%r29') + define(CLRRET1, `ldi 0, %r28') +') + + LEVEL +PROLOGUE(func) + std,ma %r3, 0x100(%r30) C save reg + + INITC %r1 C init saved cy + +C Primitive code for the first (n mod 8) limbs: + extrd,u n, 63, 3, %r22 C count for loop0 + comib,= 0, %r22, L(unrolled) C skip loop0? + copy %r0, %r28 +LDEF(loop0) + ldd 0(vp), %r21 + ldo 8(vp), vp + ldd 0(up), %r19 + ldo 8(up), up + shrpd %r21, %r28, 63, %r31 + addi -1, %r1, %r0 C restore cy + ADCSBC %r19, %r31, %r29 + std %r29, 0(rp) + add,dc %r0, %r0, %r1 C save cy + copy %r21, %r28 + addib,> -1, %r22, L(loop0) + ldo 8(rp), rp + + addib,>= -8, n, L(unrolled) + addi -1, %r1, %r0 C restore cy + + shrpd %r0, %r28, 63, %r28 + ADCSBC %r0, %r28, RETREG +ifdef(`OPERATION_sublsh1_n', +` sub %r0, RETREG, RETREG') + CLRRET1 + + bve (%r2) + ldd,mb -0x100(%r30), %r3 + + +LDEF(unrolled) + std %r4, -0xf8(%r30) C save reg + ldd 0(vp), %r4 + std %r5, -0xf0(%r30) C save reg + ldd 8(vp), %r5 + std %r6, -0xe8(%r30) C save reg + ldd 16(vp), %r6 + std %r7, -0xe0(%r30) C save reg + + ldd 24(vp), %r7 + shrpd %r4, %r28, 63, %r31 + std %r8, -0xd8(%r30) C save reg + ldd 32(vp), %r8 + shrpd %r5, %r4, 63, %r4 + std %r9, -0xd0(%r30) C save reg + ldd 40(vp), %r9 + shrpd %r6, %r5, 63, %r5 + ldd 48(vp), %r3 + shrpd %r7, %r6, 63, %r6 + ldd 56(vp), %r28 + shrpd %r8, %r7, 63, %r7 + ldd 0(up), %r19 + shrpd %r9, %r8, 63, %r8 + ldd 8(up), %r20 + shrpd %r3, %r9, 63, %r9 + ldd 16(up), %r21 + shrpd %r28, %r3, 63, %r3 + ldd 24(up), %r22 + + nop C alignment FIXME + addib,<= -8, n, L(end) + addi -1, %r1, %r0 C restore cy +LDEF(loop) + ADCSBC %r19, %r31, %r29 + ldd 32(up), %r19 + std %r29, 0(rp) + ADCSBC %r20, %r4, %r29 + ldd 40(up), %r20 + std %r29, 8(rp) + ADCSBC %r21, %r5, %r29 + ldd 48(up), %r21 + std %r29, 16(rp) + ADCSBC %r22, %r6, %r29 + ldd 56(up), %r22 + std %r29, 24(rp) + ADCSBC %r19, %r7, %r29 + ldd 64(vp), %r4 + std %r29, 32(rp) + ADCSBC %r20, %r8, %r29 + ldd 72(vp), %r5 + std %r29, 40(rp) + ADCSBC %r21, %r9, %r29 + ldd 80(vp), %r6 + std %r29, 48(rp) + ADCSBC %r22, %r3, %r29 + std %r29, 56(rp) + + add,dc %r0, %r0, %r1 C save cy + + ldd 88(vp), %r7 + shrpd %r4, %r28, 63, %r31 + ldd 96(vp), %r8 + shrpd %r5, %r4, 63, %r4 + ldd 104(vp), %r9 + shrpd %r6, %r5, 63, %r5 + ldd 112(vp), %r3 + shrpd %r7, %r6, 63, %r6 + ldd 120(vp), %r28 + shrpd %r8, %r7, 63, %r7 + ldd 64(up), %r19 + shrpd %r9, %r8, 63, %r8 + ldd 72(up), %r20 + shrpd %r3, %r9, 63, %r9 + ldd 80(up), %r21 + shrpd %r28, %r3, 63, %r3 + ldd 88(up), %r22 + + ldo 64(vp), vp + ldo 64(rp), rp + ldo 64(up), up + addib,> -8, n, L(loop) + addi -1, %r1, %r0 C restore cy +LDEF(end) + ADCSBC %r19, %r31, %r29 + ldd 32(up), %r19 + std %r29, 0(rp) + ADCSBC %r20, %r4, %r29 + ldd 40(up), %r20 + std %r29, 8(rp) + ADCSBC %r21, %r5, %r29 + ldd 48(up), %r21 + std %r29, 16(rp) + ADCSBC %r22, %r6, %r29 + ldd 56(up), %r22 + std %r29, 24(rp) + ADCSBC %r19, %r7, %r29 + ldd -0xf8(%r30), %r4 C restore reg + std %r29, 32(rp) + ADCSBC %r20, %r8, %r29 + ldd 
-0xf0(%r30), %r5 C restore reg + std %r29, 40(rp) + ADCSBC %r21, %r9, %r29 + ldd -0xe8(%r30), %r6 C restore reg + std %r29, 48(rp) + ADCSBC %r22, %r3, %r29 + ldd -0xe0(%r30), %r7 C restore reg + std %r29, 56(rp) + + shrpd %r0, %r28, 63, %r28 + ldd -0xd8(%r30), %r8 C restore reg + ADCSBC %r0, %r28, RETREG +ifdef(`OPERATION_sublsh1_n', +` sub %r0, RETREG, RETREG') + CLRRET1 + + ldd -0xd0(%r30), %r9 C restore reg + bve (%r2) + ldd,mb -0x100(%r30), %r3 C restore reg +EPILOGUE() diff --git a/gmp-6.3.0/mpn/pa64/gmp-mparam.h b/gmp-6.3.0/mpn/pa64/gmp-mparam.h new file mode 100644 index 0000000..c2719c3 --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/gmp-mparam.h @@ -0,0 +1,247 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2004, 2008-2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 440MHz PA8200 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 10 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 14 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD 21 +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 31 +#define MUL_TOOM33_THRESHOLD 114 +#define MUL_TOOM44_THRESHOLD 179 +#define MUL_TOOM6H_THRESHOLD 222 +#define MUL_TOOM8H_THRESHOLD 296 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 130 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 229 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 129 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 54 + +#define SQR_BASECASE_THRESHOLD 5 +#define SQR_TOOM2_THRESHOLD 58 +#define SQR_TOOM3_THRESHOLD 153 +#define SQR_TOOM4_THRESHOLD 278 +#define SQR_TOOM6_THRESHOLD 0 /* always */ +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 56 + +#define MULMOD_BNM1_THRESHOLD 15 +#define SQRMOD_BNM1_THRESHOLD 19 + +#define POWM_SEC_TABLE 2,23,228,1084 + +#define MUL_FFT_MODF_THRESHOLD 336 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 336, 5}, { 11, 4}, { 23, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ + { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 19, 7}, { 39, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 
9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,10}, \ + { 31, 9}, { 67,10}, { 39, 9}, { 79,10}, \ + { 47, 9}, { 95,10}, { 55,11}, { 31,10}, \ + { 63, 9}, { 127,10}, { 71, 8}, { 287,10}, \ + { 79,11}, { 47,10}, { 95, 9}, { 191, 8}, \ + { 383, 7}, { 767,10}, { 103, 9}, { 207, 8}, \ + { 415, 7}, { 831,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 543, 7}, { 1087, 6}, \ + { 2175,10}, { 143, 9}, { 287, 8}, { 575,11}, \ + { 79, 9}, { 319, 8}, { 639, 7}, { 1279, 9}, \ + { 335, 8}, { 671,10}, { 175, 9}, { 351, 8}, \ + { 703,11}, { 95,10}, { 191, 9}, { 383, 8}, \ + { 767,10}, { 207, 9}, { 415, 8}, { 831, 7}, \ + { 1663,11}, { 111,10}, { 223, 9}, { 447, 8}, \ + { 895,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 543, 8}, { 1087, 7}, { 2175,10}, { 287, 9}, \ + { 575, 8}, { 1215, 7}, { 2431,10}, { 319, 9}, \ + { 639, 8}, { 1279,10}, { 335, 9}, { 671, 8}, \ + { 1343, 9}, { 703, 8}, { 1407,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207, 9}, { 831, 8}, \ + { 1663,11}, { 223,10}, { 447, 9}, { 959,13}, \ + { 63,12}, { 127,11}, { 255, 8}, { 2047,11}, \ + { 271,10}, { 543, 9}, { 1087, 8}, { 2175,11}, \ + { 287,10}, { 575, 9}, { 1215, 8}, { 2431,11}, \ + { 319,10}, { 671, 9}, { 1343, 8}, { 2687,11}, \ + { 351,10}, { 703, 9}, { 1471, 8}, { 2943,12}, \ + { 191,11}, { 383, 8}, { 3071,11}, { 415,10}, \ + { 831, 9}, { 1663,11}, { 479,10}, { 959, 9}, \ + { 1919, 8}, { 3839,13}, { 127,12}, { 255,11}, \ + { 543,10}, { 1087, 9}, { 2175,12}, { 287,11}, \ + { 607,10}, { 1215, 9}, { 2431, 8}, { 4863,12}, \ + { 319,11}, { 671,10}, { 1343,13}, { 191, 9}, \ + { 3071,12}, { 415,11}, { 831,10}, { 1663, 8}, \ + { 6655, 9}, { 3455,12}, { 447, 9}, { 3583,13}, \ + { 255,12}, { 511,11}, { 1023,10}, { 2175,13}, \ + { 319,11}, { 1279,12}, { 671,10}, { 2815,12}, \ + { 735,10}, { 2943, 9}, { 5887,13}, { 383,12}, \ + { 767,11}, { 1535,10}, { 3071,13}, { 447,10}, \ + { 3583,12}, { 959,13}, { 511,12}, { 1087,13}, \ + { 639,12}, { 1343,13}, { 767,11}, { 3071,13}, \ + { 831,12}, { 1663,11}, { 3455,10}, { 6911,13}, \ + { 895,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1087,12}, { 2303,13}, { 1215,12}, { 2431,14}, \ + { 639,13}, { 1279,12}, { 2559,13}, { 1343,12}, \ + { 2687,11}, { 5375,13}, { 1407,12}, { 2815,11}, \ + { 5631,12}, { 2943,13}, { 1535,12}, { 3199,13}, \ + { 1663,12}, { 3327,13}, { 1727,14}, { 895,13}, \ + { 1791,12}, { 3583,13}, { 1919,15}, { 511,14}, \ + { 1023,13}, { 2047,12}, { 4095,14}, { 1151,13}, \ + { 2431,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2815,12}, { 5631,15}, { 767,14}, { 1535,13}, \ + { 3071,14}, { 1663,13}, { 3327,14}, { 1791,13}, \ + { 3583,14}, { 1919,15}, { 1023,14}, { 2303,13}, \ + { 4607,14}, { 2431,13}, { 4863,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 252 +#define MUL_FFT_THRESHOLD 2368 + +#define SQR_FFT_MODF_THRESHOLD 284 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 284, 5}, { 9, 4}, { 21, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 25, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 63, 8}, { 255, 7}, { 511,10}, \ + { 71, 8}, { 287, 7}, { 575,10}, { 79,11}, \ + { 47,10}, { 95, 9}, { 191, 8}, { 383, 7}, \ + { 767,10}, { 103, 9}, { 207, 8}, { 415,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 543, 7}, { 1087, 8}, { 
575, 7}, { 1151,11}, \ + { 79, 8}, { 639, 7}, { 1279, 9}, { 335, 8}, \ + { 671, 7}, { 1343,10}, { 175, 8}, { 703, 7}, \ + { 1407,11}, { 95,10}, { 191, 9}, { 383, 8}, \ + { 767,10}, { 207, 9}, { 415, 8}, { 831, 7}, \ + { 1663, 9}, { 447, 8}, { 895,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 543, 8}, { 1087, 7}, \ + { 2175, 9}, { 575, 8}, { 1151,10}, { 303, 9}, \ + { 607, 8}, { 1215, 7}, { 2431,10}, { 319, 9}, \ + { 639, 8}, { 1279, 9}, { 671, 8}, { 1343, 7}, \ + { 2687,10}, { 351, 9}, { 703, 8}, { 1407,12}, \ + { 95,11}, { 191,10}, { 383, 9}, { 767,11}, \ + { 207,10}, { 415, 9}, { 831, 8}, { 1663,11}, \ + { 223,10}, { 447, 9}, { 895,13}, { 63,11}, \ + { 255,10}, { 543, 8}, { 2175,11}, { 287,10}, \ + { 575, 9}, { 1151,10}, { 607, 9}, { 1215, 8}, \ + { 2431,11}, { 319, 9}, { 1279,10}, { 671, 9}, \ + { 1343, 8}, { 2687,11}, { 351,10}, { 703, 9}, \ + { 1407,10}, { 735,12}, { 191,11}, { 383,10}, \ + { 831, 9}, { 1663,12}, { 223,11}, { 447,10}, \ + { 895,11}, { 479, 9}, { 1919, 8}, { 3839,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087, 9}, { 2175,12}, { 287,11}, { 575,10}, \ + { 1151,11}, { 607,10}, { 1215, 9}, { 2431, 8}, \ + { 4863,10}, { 1279,11}, { 671,10}, { 1343, 9}, \ + { 2687,12}, { 351,11}, { 703,10}, { 1407,11}, \ + { 735,13}, { 191, 9}, { 3071, 7}, { 12287,11}, \ + { 799,12}, { 415,11}, { 831,10}, { 1663,12}, \ + { 447, 8}, { 7167,12}, { 479, 9}, { 3839,14}, \ + { 127,13}, { 255,12}, { 511,11}, { 1023,12}, \ + { 543,10}, { 2175, 9}, { 4607,11}, { 1215,10}, \ + { 2431,11}, { 1279,10}, { 2559,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 799,10}, { 3199, 9}, \ + { 6399,12}, { 895,13}, { 511,12}, { 1023,11}, \ + { 2047,12}, { 1087,13}, { 575,12}, { 1151,10}, \ + { 4607,13}, { 639,12}, { 1279,11}, { 2687,14}, \ + { 383,13}, { 767,11}, { 3071,12}, { 1599,13}, \ + { 895,12}, { 1791,11}, { 3583,13}, { 959,15}, \ + { 255,12}, { 2175,13}, { 1215,14}, { 639,13}, \ + { 1279,12}, { 2559,13}, { 1343,12}, { 2687,13}, \ + { 1471,11}, { 5887,14}, { 767,13}, { 1535,12}, \ + { 3071,13}, { 1599,12}, { 3199,13}, { 1663,12}, \ + { 3327,13}, { 1727,14}, { 895,13}, { 1791,12}, \ + { 3583,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,12}, { 4607,13}, { 2431,14}, { 1279,13}, \ + { 2687,14}, { 1407,13}, { 2815,15}, { 767,13}, \ + { 3199,14}, { 1663,13}, { 3327,14}, { 1791,13}, \ + { 3583,14}, { 1919,15}, { 1023,14}, { 2047,13}, \ + { 4095,14}, { 2303,13}, { 4607,14}, { 2431,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 257 +#define SQR_FFT_THRESHOLD 1856 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 113 +#define MULLO_MUL_N_THRESHOLD 4658 + +#define DC_DIV_QR_THRESHOLD 123 +#define DC_DIVAPPR_Q_THRESHOLD 372 +#define DC_BDIV_QR_THRESHOLD 142 +#define DC_BDIV_Q_THRESHOLD 312 + +#define INV_MULMOD_BNM1_THRESHOLD 58 +#define INV_NEWTON_THRESHOLD 315 +#define INV_APPR_THRESHOLD 315 + +#define BINV_NEWTON_THRESHOLD 360 +#define REDC_1_TO_REDC_N_THRESHOLD 101 + +#define MU_DIV_QR_THRESHOLD 979 +#define MU_DIVAPPR_Q_THRESHOLD 1142 +#define MUPI_DIV_QR_THRESHOLD 93 +#define MU_BDIV_QR_THRESHOLD 889 +#define MU_BDIV_Q_THRESHOLD 1187 + +#define MATRIX22_STRASSEN_THRESHOLD 9 +#define HGCD_THRESHOLD 234 +#define HGCD_APPR_THRESHOLD 300 +#define HGCD_REDUCE_THRESHOLD 1553 +#define GCD_DC_THRESHOLD 684 +#define GCDEXT_DC_THRESHOLD 525 +#define JACOBI_BASE_METHOD 2 + +#define GET_STR_DC_THRESHOLD 21 +#define GET_STR_PRECOMPUTE_THRESHOLD 24 +#define 
SET_STR_DC_THRESHOLD 1951 +#define SET_STR_PRECOMPUTE_THRESHOLD 4034 diff --git a/gmp-6.3.0/mpn/pa64/lshift.asm b/gmp-6.3.0/mpn/pa64/lshift.asm new file mode 100644 index 0000000..c0fc292 --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/lshift.asm @@ -0,0 +1,114 @@ +dnl HP-PA 2.0 mpn_lshift -- Left shift. + +dnl Copyright 1997, 2000, 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl This runs at 1.5 cycles/limb on PA8000 and 1.0 cycles/limb on PA8500. + +include(`../config.m4') + +dnl INPUT PARAMETERS +define(`rp',`%r26') +define(`up',`%r25') +define(`n',`%r24') +define(`cnt',`%r23') + +ifdef(`HAVE_ABI_2_0w', +` .level 2.0w +',` .level 2.0 +') +PROLOGUE(mpn_lshift) + shladd n, 3, up, up + shladd n, 3, rp, rp + subi 64, cnt, cnt + mtsar cnt + ldd -8(up), %r21 + addib,= -1, n, L(end) + shrpd %r0, %r21, %sar, %r29 C compute carry out limb + depw,z n, 31, 3, %r28 C r28 = (size & 7) + sub %r0, n, %r22 + depw,z %r22, 28, 3, %r22 C r22 = 8 * (-size & 7) + add up, %r22, up C offset up + blr %r28, %r0 C branch into jump table + add rp, %r22, rp C offset rp + b L(0) + nop + b L(1) + copy %r21, %r20 + b L(2) + nop + b L(3) + copy %r21, %r20 + b L(4) + nop + b L(5) + copy %r21, %r20 + b L(6) + nop + b L(7) + copy %r21, %r20 + +LDEF(loop) +LDEF(0) ldd -16(up), %r20 + shrpd %r21, %r20, %sar, %r21 + std %r21, -8(rp) +LDEF(7) ldd -24(up), %r21 + shrpd %r20, %r21, %sar, %r20 + std %r20, -16(rp) +LDEF(6) ldd -32(up), %r20 + shrpd %r21, %r20, %sar, %r21 + std %r21, -24(rp) +LDEF(5) ldd -40(up), %r21 + shrpd %r20, %r21, %sar, %r20 + std %r20, -32(rp) +LDEF(4) ldd -48(up), %r20 + shrpd %r21, %r20, %sar, %r21 + std %r21, -40(rp) +LDEF(3) ldd -56(up), %r21 + shrpd %r20, %r21, %sar, %r20 + std %r20, -48(rp) +LDEF(2) ldd -64(up), %r20 + shrpd %r21, %r20, %sar, %r21 + std %r21, -56(rp) +LDEF(1) ldd -72(up), %r21 + ldo -64(up), up + shrpd %r20, %r21, %sar, %r20 + std %r20, -64(rp) + addib,> -8, n, L(loop) + ldo -64(rp), rp + +LDEF(end) + shrpd %r21, %r0, %sar, %r21 + std %r21, -8(rp) + bve (%r2) +ifdef(`HAVE_ABI_2_0w', +` copy %r29,%r28 +',` extrd,u %r29, 31, 32, %r28 +') +EPILOGUE(mpn_lshift) diff --git a/gmp-6.3.0/mpn/pa64/mul_1.asm b/gmp-6.3.0/mpn/pa64/mul_1.asm new file mode 100644 index 0000000..6935c23 --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/mul_1.asm @@ -0,0 +1,646 @@ +dnl HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C 8000,8200:		6.5
+C 8500,8600,8700:	5.625
+
+C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
+C  could be saved there per call.
+
+C DESCRIPTION:
+C  The main loop "BIG" is 4-way unrolled, mainly to allow
+C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
+C  registers to the IU registers have demanded a deep software pipeline, and
+C  a lot of stack slots for partial products in flight.
+C
+C CODE STRUCTURE:
+C  save-some-registers
+C  do 0, 1, 2, or 3 limbs
+C  if done, restore-some-regs and return
+C  save-many-regs
+C  do 4, 8, ... limbs
+C  restore-all-regs
+
+C STACK LAYOUT:
+C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
+C  slots marked FREE, as well as some slots in the caller's "frame marker".
+C
+C -00 <- r30
+C -08  FREE
+C -10  tmp
+C -18  tmp
+C -20  tmp
+C -28  tmp
+C -30  tmp
+C -38  tmp
+C -40  tmp
+C -48  tmp
+C -50  tmp
+C -58  tmp
+C -60  tmp
+C -68  tmp
+C -70  tmp
+C -78  tmp
+C -80  tmp
+C -88  tmp
+C -90  FREE
+C -98  FREE
+C -a0  FREE
+C -a8  FREE
+C -b0  r13
+C -b8  r12
+C -c0  r11
+C -c8  r10
+C -d0  r9
+C -d8  r8
+C -e0  r7
+C -e8  r6
+C -f0  r5
+C -f8  r4
+C -100 r3
+C Previous frame:
+C [unused area]
+C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS:
+define(`rp',`%r26')	C
+define(`up',`%r25')	C
+define(`n',`%r24')	C
+define(`vlimb',`%r23')	C
+
+define(`climb',`%r23')	C
+
+ifdef(`HAVE_ABI_2_0w',
+`	.level	2.0w
+',`	.level	2.0
+')
+PROLOGUE(mpn_mul_1)
+
+ifdef(`HAVE_ABI_2_0w',
+`	std	vlimb, -0x38(%r30)	C store vlimb into "home" slot
+')
+	std,ma	%r3, 0x100(%r30)
+	std	%r4, -0xf8(%r30)
+	std	%r5, -0xf0(%r30)
+	ldo	0(%r0), climb		C clear climb
+	fldd	-0x138(%r30), %fr8	C put vlimb in fp register
+
+define(`p032a1',`%r1')	C
+define(`p032a2',`%r19')	C
+
+define(`m032',`%r20')	C
+define(`m096',`%r21')	C
+
+define(`p000a',`%r22')	C
+define(`p064a',`%r29')	C
+
+define(`s000',`%r31')	C
+
+define(`ma000',`%r4')	C
+define(`ma064',`%r20')	C
+
+C define(`r000',`%r3')	C FIXME don't save r3 for n < 4.
+ + extrd,u n, 63, 2, %r5 + cmpb,= %r5, %r0, L(BIG) + nop + + fldd 0(up), %fr4 + ldo 8(up), up + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr4R, %fr24 + xmpyu %fr8L, %fr4L, %fr25 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 + addib,<> -1, %r5, L(two_or_more) + fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 +LDEF(one) + ldd -0x78(%r30), p032a1 + ldd -0x70(%r30), p032a2 + ldd -0x80(%r30), p000a + b L(0_one_out) + ldd -0x68(%r30), p064a + +LDEF(two_or_more) + fldd 0(up), %fr4 + ldo 8(up), up + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + ldd -0x78(%r30), p032a1 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr4R, %fr24 + xmpyu %fr8L, %fr4L, %fr25 + ldd -0x70(%r30), p032a2 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + ldd -0x80(%r30), p000a + fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 + ldd -0x68(%r30), p064a + addib,<> -1, %r5, L(three_or_more) + fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 +LDEF(two) + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 + b L(0_two_out) + depd m096, 31, 32, ma064 + +LDEF(three_or_more) + fldd 0(up), %fr4 + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 +C addib,= -1, %r5, L(0_out) + depd m096, 31, 32, ma064 +LDEF(loop0) +C xmpyu %fr8R, %fr4L, %fr22 +C xmpyu %fr8L, %fr4R, %fr23 +C ldd -0x78(%r30), p032a1 +C fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 +C +C xmpyu %fr8R, %fr4R, %fr24 +C xmpyu %fr8L, %fr4L, %fr25 +C ldd -0x70(%r30), p032a2 +C fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 +C +C ldo 8(rp), rp +C add climb, p000a, s000 +C ldd -0x80(%r30), p000a +C fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 +C +C add,dc p064a, %r0, climb +C ldo 8(up), up +C ldd -0x68(%r30), p064a +C fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 +C +C add ma000, s000, s000 +C add,dc ma064, climb, climb +C fldd 0(up), %fr4 +C +C std s000, -8(rp) +C +C add p032a1, p032a2, m032 +C add,dc %r0, %r0, m096 +C +C depd,z m032, 31, 32, ma000 +C extrd,u m032, 31, 32, ma064 +C addib,<> -1, %r5, L(loop0) +C depd m096, 31, 32, ma064 +LDEF(0_out) + ldo 8(up), up + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + ldd -0x78(%r30), p032a1 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr4R, %fr24 + xmpyu %fr8L, %fr4L, %fr25 + ldd -0x70(%r30), p032a2 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + ldo 8(rp), rp + add climb, p000a, s000 + ldd -0x80(%r30), p000a + fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 + add,dc p064a, %r0, climb + ldd -0x68(%r30), p064a + fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 + add ma000, s000, s000 + add,dc ma064, climb, climb + std s000, -8(rp) + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 + depd m096, 31, 32, ma064 +LDEF(0_two_out) + ldd -0x78(%r30), p032a1 + ldd -0x70(%r30), p032a2 + ldo 8(rp), rp + add climb, p000a, s000 + ldd -0x80(%r30), p000a + add,dc p064a, %r0, climb + ldd -0x68(%r30), p064a + add ma000, s000, s000 + add,dc ma064, climb, climb + std s000, -8(rp) +LDEF(0_one_out) + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 + depd m096, 31, 32, ma064 + + add climb, p000a, s000 + add,dc p064a, %r0, climb 
+ add ma000, s000, s000 + add,dc ma064, climb, climb + std s000, 0(rp) + + cmpib,>= 4, n, L(done) + ldo 8(rp), rp + +C 4-way unrolled code. + +LDEF(BIG) + +define(`p032a1',`%r1') C +define(`p032a2',`%r19') C +define(`p096b1',`%r20') C +define(`p096b2',`%r21') C +define(`p160c1',`%r22') C +define(`p160c2',`%r29') C +define(`p224d1',`%r31') C +define(`p224d2',`%r3') C + C +define(`m032',`%r4') C +define(`m096',`%r5') C +define(`m160',`%r6') C +define(`m224',`%r7') C +define(`m288',`%r8') C + C +define(`p000a',`%r1') C +define(`p064a',`%r19') C +define(`p064b',`%r20') C +define(`p128b',`%r21') C +define(`p128c',`%r22') C +define(`p192c',`%r29') C +define(`p192d',`%r31') C +define(`p256d',`%r3') C + C +define(`s000',`%r10') C +define(`s064',`%r11') C +define(`s128',`%r12') C +define(`s192',`%r13') C + C +define(`ma000',`%r9') C +define(`ma064',`%r4') C +define(`ma128',`%r5') C +define(`ma192',`%r6') C +define(`ma256',`%r7') C + + std %r6, -0xe8(%r30) + std %r7, -0xe0(%r30) + std %r8, -0xd8(%r30) + std %r9, -0xd0(%r30) + std %r10, -0xc8(%r30) + std %r11, -0xc0(%r30) + std %r12, -0xb8(%r30) + std %r13, -0xb0(%r30) + +ifdef(`HAVE_ABI_2_0w', +` extrd,u n, 61, 62, n C right shift 2 +',` extrd,u n, 61, 30, n C right shift 2, zero extend +') + +LDEF(4_or_more) + fldd 0(up), %fr4 + fldd 8(up), %fr5 + fldd 16(up), %fr6 + fldd 24(up), %fr7 + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + xmpyu %fr8R, %fr5L, %fr24 + xmpyu %fr8L, %fr5R, %fr25 + xmpyu %fr8R, %fr6L, %fr26 + xmpyu %fr8L, %fr6R, %fr27 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr7L, %fr28 + xmpyu %fr8L, %fr7R, %fr29 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + xmpyu %fr8R, %fr4R, %fr30 + xmpyu %fr8L, %fr4L, %fr31 + fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31 + xmpyu %fr8R, %fr5R, %fr22 + xmpyu %fr8L, %fr5L, %fr23 + fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29 + xmpyu %fr8R, %fr6R, %fr24 + xmpyu %fr8L, %fr6L, %fr25 + fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51 + xmpyu %fr8R, %fr7R, %fr26 + fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49 + addib,<> -1, n, L(8_or_more) + xmpyu %fr8L, %fr7L, %fr27 + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + ldd -0x78(%r30), p032a1 + ldd -0x70(%r30), p032a2 + ldd -0x38(%r30), p096b1 + ldd -0x30(%r30), p096b2 + ldd -0x58(%r30), p160c1 + ldd -0x50(%r30), p160c2 + ldd -0x18(%r30), p224d1 + ldd -0x10(%r30), p224d2 + b L(end1) + nop + +LDEF(8_or_more) + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + ldo 32(up), up + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + 
fldd 0(up), %fr4 + fldd 8(up), %fr5 + fldd 16(up), %fr6 + fldd 24(up), %fr7 + xmpyu %fr8R, %fr4L, %fr22 + ldd -0x78(%r30), p032a1 + xmpyu %fr8L, %fr4R, %fr23 + xmpyu %fr8R, %fr5L, %fr24 + ldd -0x70(%r30), p032a2 + xmpyu %fr8L, %fr5R, %fr25 + xmpyu %fr8R, %fr6L, %fr26 + ldd -0x38(%r30), p096b1 + xmpyu %fr8L, %fr6R, %fr27 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr7L, %fr28 + ldd -0x30(%r30), p096b2 + xmpyu %fr8L, %fr7R, %fr29 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + xmpyu %fr8R, %fr4R, %fr30 + ldd -0x58(%r30), p160c1 + xmpyu %fr8L, %fr4L, %fr31 + fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31 + xmpyu %fr8R, %fr5R, %fr22 + ldd -0x50(%r30), p160c2 + xmpyu %fr8L, %fr5L, %fr23 + fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29 + xmpyu %fr8R, %fr6R, %fr24 + ldd -0x18(%r30), p224d1 + xmpyu %fr8L, %fr6L, %fr25 + fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51 + xmpyu %fr8R, %fr7R, %fr26 + ldd -0x10(%r30), p224d2 + fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49 + addib,= -1, n, L(end2) + xmpyu %fr8L, %fr7L, %fr27 +LDEF(loop) + add p032a1, p032a2, m032 + ldd -0x80(%r30), p000a + add,dc p096b1, p096b2, m096 + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + + add,dc p160c1, p160c2, m160 + ldd -0x68(%r30), p064a + add,dc p224d1, p224d2, m224 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + + add,dc %r0, %r0, m288 + ldd -0x40(%r30), p064b + ldo 32(up), up + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + + depd,z m032, 31, 32, ma000 + ldd -0x28(%r30), p128b + extrd,u m032, 31, 32, ma064 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + + depd m096, 31, 32, ma064 + ldd -0x60(%r30), p128c + extrd,u m096, 31, 32, ma128 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + + depd m160, 31, 32, ma128 + ldd -0x48(%r30), p192c + extrd,u m160, 31, 32, ma192 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + + depd m224, 31, 32, ma192 + ldd -0x20(%r30), p192d + extrd,u m224, 31, 32, ma256 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + + depd m288, 31, 32, ma256 + ldd -0x88(%r30), p256d + add climb, p000a, s000 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + + add,dc p064a, p064b, s064 + add,dc p128b, p128c, s128 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + + add,dc p192c, p192d, s192 + add,dc p256d, %r0, climb + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + + add ma000, s000, s000 C accum mid 0 + fldd 0(up), %fr4 + add,dc ma064, s064, s064 C accum mid 1 + std s000, 0(rp) + + add,dc ma128, s128, s128 C accum mid 2 + fldd 8(up), %fr5 + add,dc ma192, s192, s192 C accum mid 3 + std s064, 8(rp) + + add,dc ma256, climb, climb + fldd 16(up), %fr6 + std s128, 16(rp) + + xmpyu %fr8R, %fr4L, %fr22 + ldd -0x78(%r30), p032a1 + xmpyu %fr8L, %fr4R, %fr23 + fldd 24(up), %fr7 + + xmpyu %fr8R, %fr5L, %fr24 + ldd -0x70(%r30), p032a2 + xmpyu %fr8L, %fr5R, %fr25 + std s192, 24(rp) + + xmpyu %fr8R, %fr6L, %fr26 + ldd -0x38(%r30), p096b1 + xmpyu %fr8L, %fr6R, %fr27 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + + xmpyu %fr8R, %fr7L, %fr28 + ldd -0x30(%r30), p096b2 + xmpyu %fr8L, %fr7R, %fr29 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + + xmpyu %fr8R, %fr4R, %fr30 + ldd -0x58(%r30), p160c1 + xmpyu %fr8L, %fr4L, %fr31 + fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31 + + xmpyu %fr8R, %fr5R, %fr22 + ldd -0x50(%r30), p160c2 + xmpyu %fr8L, %fr5L, %fr23 + fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29 + + xmpyu %fr8R, %fr6R, %fr24 + ldd 
-0x18(%r30), p224d1 + xmpyu %fr8L, %fr6L, %fr25 + fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51 + + xmpyu %fr8R, %fr7R, %fr26 + ldd -0x10(%r30), p224d2 + fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49 + xmpyu %fr8L, %fr7L, %fr27 + + addib,<> -1, n, L(loop) + ldo 32(rp), rp + +LDEF(end2) + add p032a1, p032a2, m032 + ldd -0x80(%r30), p000a + add,dc p096b1, p096b2, m096 + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + add,dc p160c1, p160c2, m160 + ldd -0x68(%r30), p064a + add,dc p224d1, p224d2, m224 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + add,dc %r0, %r0, m288 + ldd -0x40(%r30), p064b + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + depd,z m032, 31, 32, ma000 + ldd -0x28(%r30), p128b + extrd,u m032, 31, 32, ma064 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + depd m096, 31, 32, ma064 + ldd -0x60(%r30), p128c + extrd,u m096, 31, 32, ma128 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + depd m160, 31, 32, ma128 + ldd -0x48(%r30), p192c + extrd,u m160, 31, 32, ma192 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + depd m224, 31, 32, ma192 + ldd -0x20(%r30), p192d + extrd,u m224, 31, 32, ma256 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + depd m288, 31, 32, ma256 + ldd -0x88(%r30), p256d + add climb, p000a, s000 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + add,dc p064a, p064b, s064 + add,dc p128b, p128c, s128 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + add,dc p192c, p192d, s192 + add,dc p256d, %r0, climb + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + add ma000, s000, s000 C accum mid 0 + add,dc ma064, s064, s064 C accum mid 1 + add,dc ma128, s128, s128 C accum mid 2 + add,dc ma192, s192, s192 C accum mid 3 + add,dc ma256, climb, climb + std s000, 0(rp) + std s064, 8(rp) + ldd -0x78(%r30), p032a1 + std s128, 16(rp) + ldd -0x70(%r30), p032a2 + std s192, 24(rp) + ldd -0x38(%r30), p096b1 + ldd -0x30(%r30), p096b2 + ldd -0x58(%r30), p160c1 + ldd -0x50(%r30), p160c2 + ldd -0x18(%r30), p224d1 + ldd -0x10(%r30), p224d2 + ldo 32(rp), rp + +LDEF(end1) + add p032a1, p032a2, m032 + ldd -0x80(%r30), p000a + add,dc p096b1, p096b2, m096 + add,dc p160c1, p160c2, m160 + ldd -0x68(%r30), p064a + add,dc p224d1, p224d2, m224 + add,dc %r0, %r0, m288 + ldd -0x40(%r30), p064b + depd,z m032, 31, 32, ma000 + ldd -0x28(%r30), p128b + extrd,u m032, 31, 32, ma064 + depd m096, 31, 32, ma064 + ldd -0x60(%r30), p128c + extrd,u m096, 31, 32, ma128 + depd m160, 31, 32, ma128 + ldd -0x48(%r30), p192c + extrd,u m160, 31, 32, ma192 + depd m224, 31, 32, ma192 + ldd -0x20(%r30), p192d + extrd,u m224, 31, 32, ma256 + depd m288, 31, 32, ma256 + ldd -0x88(%r30), p256d + add climb, p000a, s000 + add,dc p064a, p064b, s064 + add,dc p128b, p128c, s128 + add,dc p192c, p192d, s192 + add,dc p256d, %r0, climb + add ma000, s000, s000 C accum mid 0 + add,dc ma064, s064, s064 C accum mid 1 + add,dc ma128, s128, s128 C accum mid 2 + add,dc ma192, s192, s192 C accum mid 3 + add,dc ma256, climb, climb + std s000, 0(rp) + std s064, 8(rp) + std s128, 16(rp) + std s192, 24(rp) + + ldd -0xb0(%r30), %r13 + ldd -0xb8(%r30), %r12 + ldd -0xc0(%r30), %r11 + ldd -0xc8(%r30), %r10 + ldd -0xd0(%r30), %r9 + ldd -0xd8(%r30), %r8 + ldd -0xe0(%r30), %r7 + ldd -0xe8(%r30), %r6 +LDEF(done) +ifdef(`HAVE_ABI_2_0w', +` copy climb, %r28 +',` extrd,u climb, 63, 32, %r29 + extrd,u climb, 31, 32, %r28 +') + ldd -0xf0(%r30), %r5 + ldd -0xf8(%r30), %r4 + bve (%r2) + ldd,mb -0x100(%r30), %r3 +EPILOGUE(mpn_mul_1) diff --git 
a/gmp-6.3.0/mpn/pa64/rshift.asm b/gmp-6.3.0/mpn/pa64/rshift.asm new file mode 100644 index 0000000..cfc242e --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/rshift.asm @@ -0,0 +1,111 @@ +dnl HP-PA 2.0 mpn_rshift -- Right shift. + +dnl Copyright 1997, 2000, 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl This runs at 1.5 cycles/limb on PA8000 and 1.0 cycles/limb on PA8500. + +include(`../config.m4') + +dnl INPUT PARAMETERS +define(`rp',`%r26') +define(`up',`%r25') +define(`n',`%r24') +define(`cnt',`%r23') + +ifdef(`HAVE_ABI_2_0w', +` .level 2.0w +',` .level 2.0 +') +PROLOGUE(mpn_rshift) + mtsar cnt + ldd 0(up), %r21 + addib,= -1, n, L(end) + shrpd %r21, %r0, %sar, %r29 C compute carry out limb + depw,z n, 31, 3, %r28 C r28 = (size & 7) + sub %r0, n, %r22 + depw,z %r22, 28, 3, %r22 C r22 = 8 * (-size & 7) + sub up, %r22, up C offset up + blr %r28, %r0 C branch into jump table + sub rp, %r22, rp C offset rp + b L(0) + nop + b L(1) + copy %r21, %r20 + b L(2) + nop + b L(3) + copy %r21, %r20 + b L(4) + nop + b L(5) + copy %r21, %r20 + b L(6) + nop + b L(7) + copy %r21, %r20 + +LDEF(loop) +LDEF(0) ldd 8(up), %r20 + shrpd %r20, %r21, %sar, %r21 + std %r21, 0(rp) +LDEF(7) ldd 16(up), %r21 + shrpd %r21, %r20, %sar, %r20 + std %r20, 8(rp) +LDEF(6) ldd 24(up), %r20 + shrpd %r20, %r21, %sar, %r21 + std %r21, 16(rp) +LDEF(5) ldd 32(up), %r21 + shrpd %r21, %r20, %sar, %r20 + std %r20, 24(rp) +LDEF(4) ldd 40(up), %r20 + shrpd %r20, %r21, %sar, %r21 + std %r21, 32(rp) +LDEF(3) ldd 48(up), %r21 + shrpd %r21, %r20, %sar, %r20 + std %r20, 40(rp) +LDEF(2) ldd 56(up), %r20 + shrpd %r20, %r21, %sar, %r21 + std %r21, 48(rp) +LDEF(1) ldd 64(up), %r21 + ldo 64(up), up + shrpd %r21, %r20, %sar, %r20 + std %r20, 56(rp) + addib,> -8, n, L(loop) + ldo 64(rp), rp + +LDEF(end) + shrpd %r0, %r21, %sar, %r21 + std %r21, 0(rp) + bve (%r2) +ifdef(`HAVE_ABI_2_0w', +` copy %r29,%r28 +',` extrd,u %r29, 31, 32, %r28 +') +EPILOGUE(mpn_rshift) diff --git a/gmp-6.3.0/mpn/pa64/sqr_diagonal.asm b/gmp-6.3.0/mpn/pa64/sqr_diagonal.asm new file mode 100644 index 0000000..f6fadc9 --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/sqr_diagonal.asm @@ -0,0 +1,191 @@ +dnl HP-PA 2.0 64-bit mpn_sqr_diagonal. + +dnl Copyright 2001-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl This code runs at 7.25 cycles/limb on PA8000 and 7.75 cycles/limb on +dnl PA8500. The cache would saturate at 5 cycles/limb, so there is some room +dnl for optimization. + +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp',`%r26') +define(`up',`%r25') +define(`n',`%r24') + +define(`p00',`%r28') +define(`p32',`%r29') +define(`p64',`%r31') +define(`t0',`%r19') +define(`t1',`%r20') + +ifdef(`HAVE_ABI_2_0w', +` .level 2.0w +',` .level 2.0 +') +PROLOGUE(mpn_sqr_diagonal) + ldo 128(%r30),%r30 + + fldds,ma 8(up),%fr8 + addib,= -1,n,L(end1) + nop + fldds,ma 8(up),%fr4 + xmpyu %fr8l,%fr8r,%fr10 + fstd %fr10,-120(%r30) + xmpyu %fr8r,%fr8r,%fr9 + fstd %fr9,0(rp) + xmpyu %fr8l,%fr8l,%fr11 + fstd %fr11,8(rp) + addib,= -1,n,L(end2) + ldo 16(rp),rp + +LDEF(loop) + fldds,ma 8(up),%fr8 C load next up limb + xmpyu %fr4l,%fr4r,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr4r,%fr4r,%fr5 C multiply in fp regs + fstd %fr5,0(rp) + xmpyu %fr4l,%fr4l,%fr7 + fstd %fr7,8(rp) + ldd -120(%r30),p32 + ldd -16(rp),p00 C accumulate in int regs + ldd -8(rp),p64 + depd,z p32,30,31,t0 + add t0,p00,p00 + std p00,-16(rp) + extrd,u p32,32,33,t1 + add,dc t1,p64,p64 + std p64,-8(rp) + addib,= -1,n,L(exit) + ldo 16(rp),rp + + fldds,ma 8(up),%fr4 + xmpyu %fr8l,%fr8r,%fr10 + fstd %fr10,-120(%r30) + xmpyu %fr8r,%fr8r,%fr9 + fstd %fr9,0(rp) + xmpyu %fr8l,%fr8l,%fr11 + fstd %fr11,8(rp) + ldd -128(%r30),p32 + ldd -16(rp),p00 + ldd -8(rp),p64 + depd,z p32,30,31,t0 + add t0,p00,p00 + std p00,-16(rp) + extrd,u p32,32,33,t1 + add,dc t1,p64,p64 + std p64,-8(rp) + addib,<> -1,n,L(loop) + ldo 16(rp),rp + +LDEF(end2) + xmpyu %fr4l,%fr4r,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr4r,%fr4r,%fr5 + fstd %fr5,0(rp) + xmpyu %fr4l,%fr4l,%fr7 + fstd %fr7,8(rp) + ldd -120(%r30),p32 + ldd -16(rp),p00 + ldd -8(rp),p64 + depd,z p32,30,31,t0 + add t0,p00,p00 + std p00,-16(rp) + extrd,u p32,32,33,t1 + add,dc t1,p64,p64 + std p64,-8(rp) + ldo 16(rp),rp + ldd -128(%r30),p32 + ldd -16(rp),p00 + ldd -8(rp),p64 + depd,z p32,30,31,t0 + add t0,p00,p00 + std p00,-16(rp) + extrd,u p32,32,33,t1 + add,dc t1,p64,p64 + std p64,-8(rp) + bve (%r2) + ldo -128(%r30),%r30 + +LDEF(exit) + xmpyu %fr8l,%fr8r,%fr10 + fstd %fr10,-120(%r30) + xmpyu %fr8r,%fr8r,%fr9 + fstd %fr9,0(rp) + xmpyu %fr8l,%fr8l,%fr11 + fstd %fr11,8(rp) + ldd -128(%r30),p32 + ldd -16(rp),p00 + ldd -8(rp),p64 + depd,z p32,31,32,t0 + add t0,p00,p00 + extrd,u p32,31,32,t1 + add,dc t1,p64,p64 + add t0,p00,p00 + add,dc t1,p64,p64 + std p00,-16(rp) + std p64,-8(rp) + ldo 16(rp),rp + ldd -120(%r30),p32 + ldd -16(rp),p00 + ldd -8(rp),p64 + 
depd,z p32,31,32,t0 + add t0,p00,p00 + extrd,u p32,31,32,t1 + add,dc t1,p64,p64 + add t0,p00,p00 + add,dc t1,p64,p64 + std p00,-16(rp) + std p64,-8(rp) + bve (%r2) + ldo -128(%r30),%r30 + +LDEF(end1) + xmpyu %fr8l,%fr8r,%fr10 + fstd %fr10,-128(%r30) + xmpyu %fr8r,%fr8r,%fr9 + fstd %fr9,0(rp) + xmpyu %fr8l,%fr8l,%fr11 + fstd %fr11,8(rp) + ldo 16(rp),rp + ldd -128(%r30),p32 + ldd -16(rp),p00 + ldd -8(rp),p64 + depd,z p32,31,32,t0 + add t0,p00,p00 + extrd,u p32,31,32,t1 + add,dc t1,p64,p64 + add t0,p00,p00 + add,dc t1,p64,p64 + std p00,-16(rp) + std p64,-8(rp) + bve (%r2) + ldo -128(%r30),%r30 +EPILOGUE(mpn_sqr_diagonal) diff --git a/gmp-6.3.0/mpn/pa64/submul_1.asm b/gmp-6.3.0/mpn/pa64/submul_1.asm new file mode 100644 index 0000000..f8a1968 --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/submul_1.asm @@ -0,0 +1,700 @@ +dnl HP-PA 2.0 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 8000,8200: 7 +C 8500,8600,8700: 6.5 + +C The feed-in and wind-down code has not yet been scheduled. Many cycles +C could be saved there per call. + +C DESCRIPTION: +C The main loop "BIG" is 4-way unrolled, mainly to allow +C effective use of ADD,DC. Delays in moving data via the cache from the FP +C registers to the IU registers, have demanded a deep software pipeline, and +C a lot of stack slots for partial products in flight. +C +C CODE STRUCTURE: +C save-some-registers +C do 0, 1, 2, or 3 limbs +C if done, restore-some-regs and return +C save-many-regs +C do 4, 8, ... limb +C restore-all-regs + +C STACK LAYOUT: +C HP-PA stack grows upwards. We could allocate 8 fewer slots by using the +C slots marked FREE, as well as some slots in the caller's "frame marker". +C +C -00 <- r30 +C -08 FREE +C -10 tmp +C -18 tmp +C -20 tmp +C -28 tmp +C -30 tmp +C -38 tmp +C -40 tmp +C -48 tmp +C -50 tmp +C -58 tmp +C -60 tmp +C -68 tmp +C -70 tmp +C -78 tmp +C -80 tmp +C -88 tmp +C -90 FREE +C -98 FREE +C -a0 FREE +C -a8 FREE +C -b0 r13 +C -b8 r12 +C -c0 r11 +C -c8 r10 +C -d0 r8 +C -d8 r8 +C -e0 r7 +C -e8 r6 +C -f0 r5 +C -f8 r4 +C -100 r3 +C Previous frame: +C [unused area] +C -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here. 
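As an illustrative aside (not part of this patch or of GMP): the software pipeline described above is easier to follow against a scalar model. The minimal C sketch below performs one limb step of mpn_submul_1 using the same 32x32-bit half-product split that the xmpyu instructions produce; the name submul_1_step and its layout are hypothetical.

#include <stdint.h>

static uint64_t
submul_1_step (uint64_t *rp, uint64_t u, uint64_t v, uint64_t climb)
{
  uint64_t ul = u & 0xffffffff, uh = u >> 32;
  uint64_t vl = v & 0xffffffff, vh = v >> 32;

  uint64_t lo   = ul * vl;            /* low product                      */
  uint64_t mid1 = uh * vl;            /* the two mid products             */
  uint64_t mid2 = ul * vh;
  uint64_t hi   = uh * vh;            /* high product                     */

  uint64_t m = mid1 + mid2;           /* cf. m032; a carry out of this    */
  hi += (uint64_t) (m < mid1) << 32;  /* sum has weight 2^96 = 2^32 in hi */

  uint64_t s = lo + (m << 32);        /* fold mid into the low limb       */
  hi += (m >> 32) + (s < lo);

  s += climb;                         /* carry in from the previous limb  */
  hi += s < climb;

  uint64_t r = *rp;
  *rp = r - s;                        /* subtract from the result limb    */
  return hi + (r < s);                /* next climb = high limb + borrow  */
}

Iterating this over the limb vector and returning the final climb matches the routine's contract: rp[] -= up[] * vlimb, with the high limb of the last product plus the propagated borrow as the return value.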
+ + +include(`../config.m4') + +C INPUT PARAMETERS: +define(`rp',`%r26') C +define(`up',`%r25') C +define(`n',`%r24') C +define(`vlimb',`%r23') C + +define(`climb',`%r23') C + +ifdef(`HAVE_ABI_2_0w', +` .level 2.0w +',` .level 2.0 +') +PROLOGUE(mpn_submul_1) + +ifdef(`HAVE_ABI_2_0w', +` std vlimb, -0x38(%r30) C store vlimb into "home" slot +') + std,ma %r3, 0x100(%r30) + std %r4, -0xf8(%r30) + std %r5, -0xf0(%r30) + ldo 0(%r0), climb C clear climb + fldd -0x138(%r30), %fr8 C put vlimb in fp register + +define(`p032a1',`%r1') C +define(`p032a2',`%r19') C + +define(`m032',`%r20') C +define(`m096',`%r21') C + +define(`p000a',`%r22') C +define(`p064a',`%r29') C + +define(`s000',`%r31') C + +define(`ma000',`%r4') C +define(`ma064',`%r20') C + +define(`r000',`%r3') C + + extrd,u n, 63, 2, %r5 + cmpb,= %r5, %r0, L(BIG) + nop + + fldd 0(up), %fr4 + ldo 8(up), up + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr4R, %fr24 + xmpyu %fr8L, %fr4L, %fr25 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 + addib,<> -1, %r5, L(two_or_more) + fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 +LDEF(one) + ldd -0x78(%r30), p032a1 + ldd -0x70(%r30), p032a2 + ldd -0x80(%r30), p000a + b L(0_one_out) + ldd -0x68(%r30), p064a + +LDEF(two_or_more) + fldd 0(up), %fr4 + ldo 8(up), up + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + ldd -0x78(%r30), p032a1 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr4R, %fr24 + xmpyu %fr8L, %fr4L, %fr25 + ldd -0x70(%r30), p032a2 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + ldd -0x80(%r30), p000a + fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 + ldd -0x68(%r30), p064a + addib,<> -1, %r5, L(three_or_more) + fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 +LDEF(two) + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 + ldd 0(rp), r000 + b L(0_two_out) + depd m096, 31, 32, ma064 + +LDEF(three_or_more) + fldd 0(up), %fr4 + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 + ldd 0(rp), r000 +C addib,= -1, %r5, L(0_out) + depd m096, 31, 32, ma064 +LDEF(loop0) +C xmpyu %fr8R, %fr4L, %fr22 +C xmpyu %fr8L, %fr4R, %fr23 +C ldd -0x78(%r30), p032a1 +C fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 +C +C xmpyu %fr8R, %fr4R, %fr24 +C xmpyu %fr8L, %fr4L, %fr25 +C ldd -0x70(%r30), p032a2 +C fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 +C +C ldo 8(rp), rp +C add climb, p000a, s000 +C ldd -0x80(%r30), p000a +C fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 +C +C add,dc p064a, %r0, climb +C ldo 8(up), up +C ldd -0x68(%r30), p064a +C fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 +C +C add ma000, s000, s000 +C add,dc ma064, climb, climb +C fldd 0(up), %fr4 +C +C sub r000, s000, s000 +C sub,db %r0, climb, climb +C sub %r0, climb, climb +C std s000, -8(rp) +C +C add p032a1, p032a2, m032 +C add,dc %r0, %r0, m096 +C +C depd,z m032, 31, 32, ma000 +C extrd,u m032, 31, 32, ma064 +C ldd 0(rp), r000 +C addib,<> -1, %r5, L(loop0) +C depd m096, 31, 32, ma064 +LDEF(0_out) + ldo 8(up), up + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + ldd -0x78(%r30), p032a1 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr4R, %fr24 + xmpyu %fr8L, %fr4L, %fr25 + ldd -0x70(%r30), p032a2 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + 
ldo 8(rp), rp + add climb, p000a, s000 + ldd -0x80(%r30), p000a + fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79 + add,dc p064a, %r0, climb + ldd -0x68(%r30), p064a + fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61 + add ma000, s000, s000 + add,dc ma064, climb, climb + sub r000, s000, s000 + sub,db %r0, climb, climb + sub %r0, climb, climb + std s000, -8(rp) + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 + ldd 0(rp), r000 + depd m096, 31, 32, ma064 +LDEF(0_two_out) + ldd -0x78(%r30), p032a1 + ldd -0x70(%r30), p032a2 + ldo 8(rp), rp + add climb, p000a, s000 + ldd -0x80(%r30), p000a + add,dc p064a, %r0, climb + ldd -0x68(%r30), p064a + add ma000, s000, s000 + add,dc ma064, climb, climb + sub r000, s000, s000 + sub,db %r0, climb, climb + sub %r0, climb, climb + std s000, -8(rp) +LDEF(0_one_out) + add p032a1, p032a2, m032 + add,dc %r0, %r0, m096 + depd,z m032, 31, 32, ma000 + extrd,u m032, 31, 32, ma064 + ldd 0(rp), r000 + depd m096, 31, 32, ma064 + + add climb, p000a, s000 + add,dc p064a, %r0, climb + add ma000, s000, s000 + add,dc ma064, climb, climb + sub r000, s000, s000 + sub,db %r0, climb, climb + sub %r0, climb, climb + std s000, 0(rp) + + cmpib,>= 4, n, L(done) + ldo 8(rp), rp + +C 4-way unrolled code. + +LDEF(BIG) + +define(`p032a1',`%r1') C +define(`p032a2',`%r19') C +define(`p096b1',`%r20') C +define(`p096b2',`%r21') C +define(`p160c1',`%r22') C +define(`p160c2',`%r29') C +define(`p224d1',`%r31') C +define(`p224d2',`%r3') C + C +define(`m032',`%r4') C +define(`m096',`%r5') C +define(`m160',`%r6') C +define(`m224',`%r7') C +define(`m288',`%r8') C + C +define(`p000a',`%r1') C +define(`p064a',`%r19') C +define(`p064b',`%r20') C +define(`p128b',`%r21') C +define(`p128c',`%r22') C +define(`p192c',`%r29') C +define(`p192d',`%r31') C +define(`p256d',`%r3') C + C +define(`s000',`%r10') C +define(`s064',`%r11') C +define(`s128',`%r12') C +define(`s192',`%r13') C + C +define(`ma000',`%r9') C +define(`ma064',`%r4') C +define(`ma128',`%r5') C +define(`ma192',`%r6') C +define(`ma256',`%r7') C + C +define(`r000',`%r1') C +define(`r064',`%r19') C +define(`r128',`%r20') C +define(`r192',`%r21') C + + std %r6, -0xe8(%r30) + std %r7, -0xe0(%r30) + std %r8, -0xd8(%r30) + std %r9, -0xd0(%r30) + std %r10, -0xc8(%r30) + std %r11, -0xc0(%r30) + std %r12, -0xb8(%r30) + std %r13, -0xb0(%r30) + +ifdef(`HAVE_ABI_2_0w', +` extrd,u n, 61, 62, n C right shift 2 +',` extrd,u n, 61, 30, n C right shift 2, zero extend +') + +LDEF(4_or_more) + fldd 0(up), %fr4 + fldd 8(up), %fr5 + fldd 16(up), %fr6 + fldd 24(up), %fr7 + xmpyu %fr8R, %fr4L, %fr22 + xmpyu %fr8L, %fr4R, %fr23 + xmpyu %fr8R, %fr5L, %fr24 + xmpyu %fr8L, %fr5R, %fr25 + xmpyu %fr8R, %fr6L, %fr26 + xmpyu %fr8L, %fr6R, %fr27 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr7L, %fr28 + xmpyu %fr8L, %fr7R, %fr29 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + xmpyu %fr8R, %fr4R, %fr30 + xmpyu %fr8L, %fr4L, %fr31 + fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31 + xmpyu %fr8R, %fr5R, %fr22 + xmpyu %fr8L, %fr5L, %fr23 + fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29 + xmpyu %fr8R, %fr6R, %fr24 + xmpyu %fr8L, %fr6L, %fr25 + fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51 + xmpyu %fr8R, %fr7R, %fr26 + fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49 + addib,<> -1, n, L(8_or_more) + xmpyu %fr8L, %fr7L, %fr27 + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + fstd %fr30, 
-0x80(%r30) C low product to -0x80..-0x79 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + ldd -0x78(%r30), p032a1 + ldd -0x70(%r30), p032a2 + ldd -0x38(%r30), p096b1 + ldd -0x30(%r30), p096b2 + ldd -0x58(%r30), p160c1 + ldd -0x50(%r30), p160c2 + ldd -0x18(%r30), p224d1 + ldd -0x10(%r30), p224d2 + b L(end1) + nop + +LDEF(8_or_more) + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + ldo 32(up), up + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + fldd 0(up), %fr4 + fldd 8(up), %fr5 + fldd 16(up), %fr6 + fldd 24(up), %fr7 + xmpyu %fr8R, %fr4L, %fr22 + ldd -0x78(%r30), p032a1 + xmpyu %fr8L, %fr4R, %fr23 + xmpyu %fr8R, %fr5L, %fr24 + ldd -0x70(%r30), p032a2 + xmpyu %fr8L, %fr5R, %fr25 + xmpyu %fr8R, %fr6L, %fr26 + ldd -0x38(%r30), p096b1 + xmpyu %fr8L, %fr6R, %fr27 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + xmpyu %fr8R, %fr7L, %fr28 + ldd -0x30(%r30), p096b2 + xmpyu %fr8L, %fr7R, %fr29 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + xmpyu %fr8R, %fr4R, %fr30 + ldd -0x58(%r30), p160c1 + xmpyu %fr8L, %fr4L, %fr31 + fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31 + xmpyu %fr8R, %fr5R, %fr22 + ldd -0x50(%r30), p160c2 + xmpyu %fr8L, %fr5L, %fr23 + fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29 + xmpyu %fr8R, %fr6R, %fr24 + ldd -0x18(%r30), p224d1 + xmpyu %fr8L, %fr6L, %fr25 + fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51 + xmpyu %fr8R, %fr7R, %fr26 + ldd -0x10(%r30), p224d2 + fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49 + addib,= -1, n, L(end2) + xmpyu %fr8L, %fr7L, %fr27 +LDEF(loop) + add p032a1, p032a2, m032 + ldd -0x80(%r30), p000a + add,dc p096b1, p096b2, m096 + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + + add,dc p160c1, p160c2, m160 + ldd -0x68(%r30), p064a + add,dc p224d1, p224d2, m224 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + + add,dc %r0, %r0, m288 + ldd -0x40(%r30), p064b + ldo 32(up), up + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + + depd,z m032, 31, 32, ma000 + ldd -0x28(%r30), p128b + extrd,u m032, 31, 32, ma064 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + + depd m096, 31, 32, ma064 + ldd -0x60(%r30), p128c + extrd,u m096, 31, 32, ma128 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + + depd m160, 31, 32, ma128 + ldd -0x48(%r30), p192c + extrd,u m160, 31, 32, ma192 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + + depd m224, 31, 32, ma192 + ldd -0x20(%r30), p192d + extrd,u m224, 31, 32, ma256 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + + depd m288, 31, 32, ma256 + ldd -0x88(%r30), p256d + add climb, p000a, s000 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + + add,dc p064a, p064b, s064 + ldd 0(rp), r000 + add,dc p128b, p128c, s128 + 
fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + + add,dc p192c, p192d, s192 + ldd 8(rp), r064 + add,dc p256d, %r0, climb + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + + ldd 16(rp), r128 + add ma000, s000, s000 C accum mid 0 + ldd 24(rp), r192 + add,dc ma064, s064, s064 C accum mid 1 + + add,dc ma128, s128, s128 C accum mid 2 + fldd 0(up), %fr4 + add,dc ma192, s192, s192 C accum mid 3 + fldd 8(up), %fr5 + + add,dc ma256, climb, climb + fldd 16(up), %fr6 + sub r000, s000, s000 C accum rlimb 0 + fldd 24(up), %fr7 + + sub,db r064, s064, s064 C accum rlimb 1 + sub,db r128, s128, s128 C accum rlimb 2 + std s000, 0(rp) + + sub,db r192, s192, s192 C accum rlimb 3 + sub,db %r0, climb, climb + sub %r0, climb, climb + std s064, 8(rp) + + xmpyu %fr8R, %fr4L, %fr22 + ldd -0x78(%r30), p032a1 + xmpyu %fr8L, %fr4R, %fr23 + std s128, 16(rp) + + xmpyu %fr8R, %fr5L, %fr24 + ldd -0x70(%r30), p032a2 + xmpyu %fr8L, %fr5R, %fr25 + std s192, 24(rp) + + xmpyu %fr8R, %fr6L, %fr26 + ldd -0x38(%r30), p096b1 + xmpyu %fr8L, %fr6R, %fr27 + fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71 + + xmpyu %fr8R, %fr7L, %fr28 + ldd -0x30(%r30), p096b2 + xmpyu %fr8L, %fr7R, %fr29 + fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69 + + xmpyu %fr8R, %fr4R, %fr30 + ldd -0x58(%r30), p160c1 + xmpyu %fr8L, %fr4L, %fr31 + fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31 + + xmpyu %fr8R, %fr5R, %fr22 + ldd -0x50(%r30), p160c2 + xmpyu %fr8L, %fr5L, %fr23 + fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29 + + xmpyu %fr8R, %fr6R, %fr24 + ldd -0x18(%r30), p224d1 + xmpyu %fr8L, %fr6L, %fr25 + fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51 + + xmpyu %fr8R, %fr7R, %fr26 + ldd -0x10(%r30), p224d2 + fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49 + xmpyu %fr8L, %fr7L, %fr27 + + addib,<> -1, n, L(loop) + ldo 32(rp), rp + +LDEF(end2) + add p032a1, p032a2, m032 + ldd -0x80(%r30), p000a + add,dc p096b1, p096b2, m096 + fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11 + add,dc p160c1, p160c2, m160 + ldd -0x68(%r30), p064a + add,dc p224d1, p224d2, m224 + fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09 + add,dc %r0, %r0, m288 + ldd -0x40(%r30), p064b + fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79 + depd,z m032, 31, 32, ma000 + ldd -0x28(%r30), p128b + extrd,u m032, 31, 32, ma064 + fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61 + depd m096, 31, 32, ma064 + ldd -0x60(%r30), p128c + extrd,u m096, 31, 32, ma128 + fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39 + depd m160, 31, 32, ma128 + ldd -0x48(%r30), p192c + extrd,u m160, 31, 32, ma192 + fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21 + depd m224, 31, 32, ma192 + ldd -0x20(%r30), p192d + extrd,u m224, 31, 32, ma256 + fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59 + depd m288, 31, 32, ma256 + ldd -0x88(%r30), p256d + add climb, p000a, s000 + fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41 + add,dc p064a, p064b, s064 + ldd 0(rp), r000 + add,dc p128b, p128c, s128 + fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19 + add,dc p192c, p192d, s192 + ldd 8(rp), r064 + add,dc p256d, %r0, climb + fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81 + ldd 16(rp), r128 + add ma000, s000, s000 C accum mid 0 + ldd 24(rp), r192 + add,dc ma064, s064, s064 C accum mid 1 + add,dc ma128, s128, s128 C accum mid 2 + add,dc ma192, s192, s192 C accum mid 3 + add,dc ma256, climb, climb + sub r000, s000, s000 C accum rlimb 0 + sub,db r064, s064, s064 C accum rlimb 1 + sub,db r128, s128, s128 C accum rlimb 2 + std 
s000, 0(rp) + sub,db r192, s192, s192 C accum rlimb 3 + sub,db %r0, climb, climb + sub %r0, climb, climb + std s064, 8(rp) + ldd -0x78(%r30), p032a1 + std s128, 16(rp) + ldd -0x70(%r30), p032a2 + std s192, 24(rp) + ldd -0x38(%r30), p096b1 + ldd -0x30(%r30), p096b2 + ldd -0x58(%r30), p160c1 + ldd -0x50(%r30), p160c2 + ldd -0x18(%r30), p224d1 + ldd -0x10(%r30), p224d2 + ldo 32(rp), rp + +LDEF(end1) + add p032a1, p032a2, m032 + ldd -0x80(%r30), p000a + add,dc p096b1, p096b2, m096 + add,dc p160c1, p160c2, m160 + ldd -0x68(%r30), p064a + add,dc p224d1, p224d2, m224 + add,dc %r0, %r0, m288 + ldd -0x40(%r30), p064b + depd,z m032, 31, 32, ma000 + ldd -0x28(%r30), p128b + extrd,u m032, 31, 32, ma064 + depd m096, 31, 32, ma064 + ldd -0x60(%r30), p128c + extrd,u m096, 31, 32, ma128 + depd m160, 31, 32, ma128 + ldd -0x48(%r30), p192c + extrd,u m160, 31, 32, ma192 + depd m224, 31, 32, ma192 + ldd -0x20(%r30), p192d + extrd,u m224, 31, 32, ma256 + depd m288, 31, 32, ma256 + ldd -0x88(%r30), p256d + add climb, p000a, s000 + add,dc p064a, p064b, s064 + ldd 0(rp), r000 + add,dc p128b, p128c, s128 + add,dc p192c, p192d, s192 + ldd 8(rp), r064 + add,dc p256d, %r0, climb + ldd 16(rp), r128 + add ma000, s000, s000 C accum mid 0 + ldd 24(rp), r192 + add,dc ma064, s064, s064 C accum mid 1 + add,dc ma128, s128, s128 C accum mid 2 + add,dc ma192, s192, s192 C accum mid 3 + add,dc ma256, climb, climb + sub r000, s000, s000 C accum rlimb 0 + sub,db r064, s064, s064 C accum rlimb 1 + sub,db r128, s128, s128 C accum rlimb 2 + std s000, 0(rp) + sub,db r192, s192, s192 C accum rlimb 3 + sub,db %r0, climb, climb + sub %r0, climb, climb + std s064, 8(rp) + std s128, 16(rp) + std s192, 24(rp) + + ldd -0xb0(%r30), %r13 + ldd -0xb8(%r30), %r12 + ldd -0xc0(%r30), %r11 + ldd -0xc8(%r30), %r10 + ldd -0xd0(%r30), %r9 + ldd -0xd8(%r30), %r8 + ldd -0xe0(%r30), %r7 + ldd -0xe8(%r30), %r6 +LDEF(done) +ifdef(`HAVE_ABI_2_0w', +` copy climb, %r28 +',` extrd,u climb, 63, 32, %r29 + extrd,u climb, 31, 32, %r28 +') + ldd -0xf0(%r30), %r5 + ldd -0xf8(%r30), %r4 + bve (%r2) + ldd,mb -0x100(%r30), %r3 +EPILOGUE(mpn_submul_1) diff --git a/gmp-6.3.0/mpn/pa64/udiv.asm b/gmp-6.3.0/mpn/pa64/udiv.asm new file mode 100644 index 0000000..1380a85 --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/udiv.asm @@ -0,0 +1,125 @@ +dnl HP-PA 2.0 64-bit mpn_udiv_qrnnd_r. + +dnl Copyright 2001-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
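A hedged reference for the shift-subtract kernel that follows: each expansion of the divstep macro defined below develops one quotient bit of the 128-bit dividend (n1,n0); eight expansions per loop pass times eight passes give all 64 bits. This C sketch (illustrative only, not part of this patch; it assumes the loop invariant n1 < d, which the normalization paths establish) shows the intended effect of a single step.

#include <stdint.h>

static void
divstep_model (uint64_t *n1, uint64_t *n0, uint64_t *q, uint64_t d)
{
  uint64_t t = *n0 >> 63;      /* bit moving between the two words     */
  *n0 <<= 1;
  *n1 = (*n1 << 1) | t;        /* (n1,n0) <<= 1                        */

  uint64_t bit = *n1 >= d;     /* next quotient bit                    */
  *q = (*q << 1) | bit;
  if (bit)                     /* the cmpclr/copy pair commits the     */
    *n1 -= d;                  /* subtraction only when it fits        */
}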
+ +include(`../config.m4') + +C This runs at about 280 cycles on both PA8000 and PA8500, corresponding to a +C bit more than 4 cycles/bit. + +C INPUT PARAMETERS +define(`n1',`%r26') +define(`n0',`%r25') +define(`d',`%r24') +define(`remptr',`%r23') + +define(`q',`%r28') +define(`dn',`%r29') + +define(`old_divstep', + `add,dc n0,n0,n0 + add,dc n1,n1,n1 + sub,*<< n1,d,%r22 + copy %r22,n1') + +define(`divstep', + `add n0,n0,n0 + add,dc n1,n1,n1 + sub n1,d,%r1 + add,dc q,q,q + cmpclr,*<< n1,d,%r0 + copy %r1,n1 +') + +ifdef(`HAVE_ABI_2_0w', +` .level 2.0w +',` .level 2.0 +') +PROLOGUE(mpn_udiv_qrnnd_r) +ifdef(`HAVE_ABI_2_0n', +` depd %r25,31,32,%r26 + depd %r23,31,32,%r24 + copy %r24,%r25 + ldd -56(%r30),%r24 + ldw -60(%r30),%r23 +') + ldi 0,q + cmpib,*>= 0,d,L(large_divisor) + ldi 8,%r31 C setup loop counter + + sub %r0,d,dn +LDEF(Loop) + divstep divstep divstep divstep divstep divstep divstep divstep + addib,<> -1,%r31,L(Loop) + nop + +ifdef(`HAVE_ABI_2_0n', +` copy %r28,%r29 + extrd,u %r28,31,32,%r28 +') + bve (%r2) + std n1,0(remptr) C store remainder + +LDEF(large_divisor) + extrd,u n0,63,1,%r19 C save lsb of dividend + shrpd n1,n0,1,n0 C n0 = lo(n1n0 >> 1) + shrpd %r0,n1,1,n1 C n1 = hi(n1n0 >> 1) + extrd,u d,63,1,%r20 C save lsb of divisor + shrpd %r0,d,1,d C d = floor(orig_d / 2) + add,l %r20,d,d C d = ceil(orig_d / 2) + + sub %r0,d,dn +LDEF(Loop2) + divstep divstep divstep divstep divstep divstep divstep divstep + addib,<> -1,%r31,L(Loop2) + nop + + cmpib,*= 0,%r20,L(even_divisor) + shladd n1,1,%r19,n1 C shift in omitted dividend lsb + + add d,d,d C restore orig... + sub d,%r20,d C ...d value + sub %r0,d,dn C r21 = -d + + add,*nuv n1,q,n1 C fix remainder for omitted divisor lsb + add,l n1,dn,n1 C adjust remainder if rem. fix carried + add,dc %r0,q,q C adjust quotient accordingly + + sub,*<< n1,d,%r0 C remainder >= divisor? + add,l n1,dn,n1 C adjust remainder + add,dc %r0,q,q C adjust quotient + +LDEF(even_divisor) +ifdef(`HAVE_ABI_2_0n', +` copy %r28,%r29 + extrd,u %r28,31,32,%r28 +') + bve (%r2) + std n1,0(remptr) C store remainder +EPILOGUE(mpn_udiv_qrnnd_r) diff --git a/gmp-6.3.0/mpn/pa64/umul.asm b/gmp-6.3.0/mpn/pa64/umul.asm new file mode 100644 index 0000000..bd5a71f --- /dev/null +++ b/gmp-6.3.0/mpn/pa64/umul.asm @@ -0,0 +1,97 @@ +dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ + +dnl Optimizations: +dnl * Avoid skip instructions +dnl * Put carry-generating and carry-consuming insns consecutively +dnl * Don't allocate any stack, "home" positions for parameters could be used. + +include(`../config.m4') + +define(`p0',`%r28') +define(`p1',`%r29') +define(`t32',`%r19') +define(`t0',`%r20') +define(`t1',`%r21') +define(`x',`%r22') +define(`m0',`%r23') +define(`m1',`%r24') + +ifdef(`HAVE_ABI_2_0w', +` .level 2.0w +',` .level 2.0 +') +PROLOGUE(mpn_umul_ppmm_r) + ldo 128(%r30),%r30 +ifdef(`HAVE_ABI_2_0w', +` std %r26,-64(%r30) + std %r25,-56(%r30) + copy %r24,%r31 +',` + depd %r25,31,32,%r26 + std %r26,-64(%r30) + depd %r23,31,32,%r24 + std %r24,-56(%r30) + ldw -180(%r30),%r31 +') + + fldd -64(%r30),%fr4 + fldd -56(%r30),%fr5 + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + + depdi,z 1,31,1,t32 C t32 = 2^32 + + ldd -128(%r30),p0 C lo = low 64 bit of product + ldd -120(%r30),m0 C m0 = mid0 64 bit of product + ldd -112(%r30),m1 C m1 = mid1 64 bit of product + ldd -104(%r30),p1 C hi = high 64 bit of product + + add,l,*nuv m0,m1,x C x = m1+m0 + add,l t32,p1,p1 C propagate carry to mid of p1 + depd,z x,31,32,t0 C lo32(m1+m0) + add t0,p0,p0 + extrd,u x,31,32,t1 C hi32(m1+m0) + add,dc t1,p1,p1 + + std p0,0(%r31) C store low half of product +ifdef(`HAVE_ABI_2_0w', +` copy p1,%r28 C return val in %r28 +',` extrd,u p1,31,32,%r28 C return val in %r28,%r29 +') + bve (%r2) + ldo -128(%r30),%r30 +EPILOGUE(mpn_umul_ppmm_r) diff --git a/gmp-6.3.0/mpn/perfpow.c b/gmp-6.3.0/mpn/perfpow.c new file mode 120000 index 0000000..c896043 --- /dev/null +++ b/gmp-6.3.0/mpn/perfpow.c @@ -0,0 +1 @@ +../mpn/generic/perfpow.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/perfsqr.c b/gmp-6.3.0/mpn/perfsqr.c new file mode 120000 index 0000000..4478749 --- /dev/null +++ b/gmp-6.3.0/mpn/perfsqr.c @@ -0,0 +1 @@ +../mpn/generic/perfsqr.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/perfsqr.h b/gmp-6.3.0/mpn/perfsqr.h new file mode 100644 index 0000000..af9a40e --- /dev/null +++ b/gmp-6.3.0/mpn/perfsqr.h @@ -0,0 +1,50 @@ +/* This file generated by gen-psqr.c - DO NOT EDIT. */ + +#if GMP_LIMB_BITS != 32 || GMP_NAIL_BITS != 0 +Error, error, this data is for 32 bit limb and 0 bit nail +#endif + +/* Non-zero bit indicates a quadratic residue mod 0x100. + This test identifies 82.81% as non-squares (212/256). */ +static const mp_limb_t +sq_res_0x100[8] = { + CNST_LIMB(0x2030213), + CNST_LIMB(0x2020212), + CNST_LIMB(0x2020213), + CNST_LIMB(0x2020212), + CNST_LIMB(0x2030212), + CNST_LIMB(0x2020212), + CNST_LIMB(0x2020212), + CNST_LIMB(0x2020212), +}; + +/* 2^24-1 = 3^2 * 5 * 7 * 13 * 17 ... */ +#define PERFSQR_MOD_BITS 25 + +/* This test identifies 95.66% as non-squares. */ +#define PERFSQR_MOD_TEST(up, usize) \ + do { \ + mp_limb_t r; \ + PERFSQR_MOD_34 (r, up, usize); \ + \ + /* 73.33% */ \ + PERFSQR_MOD_2 (r, CNST_LIMB(45), CNST_LIMB(0xfa4fa5), \ + CNST_LIMB(0x920), CNST_LIMB(0x1a442481)); \ + \ + /* 47.06% */ \ + PERFSQR_MOD_1 (r, CNST_LIMB(17), CNST_LIMB(0xf0f0f1), \ + CNST_LIMB(0x1a317)); \ + \ + /* 46.15% */ \ + PERFSQR_MOD_1 (r, CNST_LIMB(13), CNST_LIMB(0xec4ec5), \ + CNST_LIMB(0x9e5)); \ + \ + /* 42.86% */ \ + PERFSQR_MOD_1 (r, CNST_LIMB( 7), CNST_LIMB(0xdb6db7), \ + CNST_LIMB(0x69)); \ + } while (0) + +/* Grand total sq_res_0x100 and PERFSQR_MOD_TEST, 99.25% non-squares. 
*/ + +/* helper for tests/mpz/t-perfsqr.c */ +#define PERFSQR_DIVISORS { 256, 45, 17, 13, 7, } diff --git a/gmp-6.3.0/mpn/popcount.asm b/gmp-6.3.0/mpn/popcount.asm new file mode 120000 index 0000000..984e0da --- /dev/null +++ b/gmp-6.3.0/mpn/popcount.asm @@ -0,0 +1 @@ +../mpn/x86/p6/sse2/popcount.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/pow_1.c b/gmp-6.3.0/mpn/pow_1.c new file mode 120000 index 0000000..55ffecf --- /dev/null +++ b/gmp-6.3.0/mpn/pow_1.c @@ -0,0 +1 @@ +../mpn/generic/pow_1.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/power/add_n.asm b/gmp-6.3.0/mpn/power/add_n.asm new file mode 100644 index 0000000..6d6ca73 --- /dev/null +++ b/gmp-6.3.0/mpn/power/add_n.asm @@ -0,0 +1,83 @@ +dnl IBM POWER mpn_add_n -- Add two limb vectors of equal, non-zero length. + +dnl Copyright 1992, 1994-1996, 1999-2001, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl s2_ptr r5 +dnl size r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_add_n) + andil. 10,6,1 C odd or even number of limbs? + l 8,0(4) C load least significant s1 limb + l 0,0(5) C load least significant s2 limb + cal 3,-4(3) C offset res_ptr, it's updated before it's used + sri 10,6,1 C count for unrolled loop + a 7,0,8 C add least significant limbs, set cy + mtctr 10 C copy count into CTR + beq 0,Leven C branch if even # of limbs (# of limbs >= 2) + +C We have an odd # of limbs. Add the first limbs separately. + cmpi 1,10,0 C is count for unrolled loop zero? + bc 4,6,L1 C bne cr1,L1 (misassembled by gas) + st 7,4(3) + aze 3,10 C use the fact that r10 is zero... + br C return + +C We added least significant limbs. Now reload the next limbs to enter loop. +L1: lu 8,4(4) C load s1 limb and update s1_ptr + lu 0,4(5) C load s2 limb and update s2_ptr + stu 7,4(3) + ae 7,0,8 C add limbs, set cy +Leven: lu 9,4(4) C load s1 limb and update s1_ptr + lu 10,4(5) C load s2 limb and update s2_ptr + bdz Lend C If done, skip loop + +Loop: lu 8,4(4) C load s1 limb and update s1_ptr + lu 0,4(5) C load s2 limb and update s2_ptr + ae 11,10,9 C add previous limbs with cy, set cy + stu 7,4(3) C + lu 9,4(4) C load s1 limb and update s1_ptr + lu 10,4(5) C load s2 limb and update s2_ptr + ae 7,0,8 C add previous limbs with cy, set cy + stu 11,4(3) C + bdn Loop C decrement CTR and loop back + +Lend: ae 11,10,9 C add limbs with cy, set cy + st 7,4(3) C + st 11,8(3) C + lil 3,0 C load cy into ... + aze 3,3 C ... 
return value register + br +EPILOGUE(mpn_add_n) diff --git a/gmp-6.3.0/mpn/power/addmul_1.asm b/gmp-6.3.0/mpn/power/addmul_1.asm new file mode 100644 index 0000000..76d8df3 --- /dev/null +++ b/gmp-6.3.0/mpn/power/addmul_1.asm @@ -0,0 +1,126 @@ +dnl IBM POWER mpn_addmul_1 -- Multiply a limb vector with a limb and add the +dnl result to a second limb vector. + +dnl Copyright 1992, 1994, 1999-2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl size r5 +dnl s2_limb r6 + +dnl The POWER architecture has no unsigned 32x32->64 bit multiplication +dnl instruction. To obtain that operation, we have to use the 32x32->64 +dnl signed multiplication instruction, and add the appropriate compensation to +dnl the high limb of the result. We add the multiplicand if the multiplier +dnl has its most significant bit set, and we add the multiplier if the +dnl multiplicand has its most significant bit set. We need to preserve the +dnl carry flag between each iteration, so we have to compute the compensation +dnl carefully (the natural, srai+and doesn't work). Since all POWER can +dnl branch in zero cycles, we use conditional branches for the compensation. 
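The compensation argument above can be checked in portable C. Writing u = u_s + 2^32*m_u and v = v_s + 2^32*m_v (with m_u, m_v the sign bits), u*v = u_s*v_s + 2^32*(m_u*v + m_v*u) mod 2^64, so only the high word of the signed product needs fixing. A hypothetical checker (illustrative only, not part of this patch):

#include <stdint.h>

static uint32_t
umulhi_via_signed (uint32_t u, uint32_t v)
{
  int64_t  sprod = (int64_t) (int32_t) u * (int32_t) v; /* like mul + mfmq */
  uint32_t hi = (uint32_t) ((uint64_t) sprod >> 32);
  if ((int32_t) u < 0) hi += v;  /* MSB of u set: add the other operand   */
  if ((int32_t) v < 0) hi += u;  /* MSB of v set: likewise                */
  return hi;  /* == (uint32_t) (((uint64_t) u * v) >> 32)                 */
}

For example, umulhi_via_signed (0x80000000u, 3) gives 0xfffffffe + 3 = 1 (mod 2^32), the true high word of 0x180000000.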
+ +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_addmul_1) + cal 3,-4(3) + l 0,0(4) + cmpi 0,6,0 + mtctr 5 + mul 9,0,6 + srai 7,0,31 + and 7,7,6 + mfmq 8 + cax 9,9,7 + l 7,4(3) + a 8,8,7 C add res_limb + blt Lneg +Lpos: bdz Lend + +Lploop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 0 + ae 8,0,9 C low limb + old_cy_limb + old cy + l 7,4(3) + aze 10,10 C propagate cy to new cy_limb + a 8,8,7 C add res_limb + bge Lp0 + cax 10,10,6 C adjust high limb for negative limb from s1 +Lp0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 0 + ae 8,0,10 + l 7,4(3) + aze 9,9 + a 8,8,7 + bge Lp1 + cax 9,9,6 C adjust high limb for negative limb from s1 +Lp1: bdn Lploop + + b Lend + +Lneg: cax 9,9,0 + bdz Lend +Lnloop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 7 + ae 8,7,9 + l 7,4(3) + ae 10,10,0 C propagate cy to new cy_limb + a 8,8,7 C add res_limb + bge Ln0 + cax 10,10,6 C adjust high limb for negative limb from s1 +Ln0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 7 + ae 8,7,10 + l 7,4(3) + ae 9,9,0 C propagate cy to new cy_limb + a 8,8,7 C add res_limb + bge Ln1 + cax 9,9,6 C adjust high limb for negative limb from s1 +Ln1: bdn Lnloop + b Lend + +Lend0: cal 9,0(10) +Lend: st 8,4(3) + aze 3,9 + br +EPILOGUE(mpn_addmul_1) diff --git a/gmp-6.3.0/mpn/power/gmp-mparam.h b/gmp-6.3.0/mpn/power/gmp-mparam.h new file mode 100644 index 0000000..7cb36f9 --- /dev/null +++ b/gmp-6.3.0/mpn/power/gmp-mparam.h @@ -0,0 +1,69 @@ +/* POWER gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2002-2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +/* Generated by tuneup.c, 2003-02-10, gcc 3.2, POWER2 66.7MHz */ + +#define MUL_TOOM22_THRESHOLD 12 +#define MUL_TOOM33_THRESHOLD 75 + +#define SQR_BASECASE_THRESHOLD 7 +#define SQR_TOOM2_THRESHOLD 28 +#define SQR_TOOM3_THRESHOLD 86 + +#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_DC_THRESHOLD 36 +#define POWM_THRESHOLD 69 + +#define HGCD_THRESHOLD 97 +#define GCD_ACCEL_THRESHOLD 3 +#define GCD_DC_THRESHOLD 590 +#define JACOBI_BASE_METHOD 2 + +#define DIVREM_1_NORM_THRESHOLD 12 +#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1_NORM_THRESHOLD 10 +#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define USE_PREINV_DIVREM_1 0 +#define USE_PREINV_MOD_1 1 +#define DIVREM_2_THRESHOLD 11 +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define MODEXACT_1_ODD_THRESHOLD 0 /* always */ + +#define GET_STR_DC_THRESHOLD 10 +#define GET_STR_PRECOMPUTE_THRESHOLD 20 +#define SET_STR_THRESHOLD 2899 + +#define MUL_FFT_TABLE { 336, 800, 1408, 3584, 10240, 24576, 0 } +#define MUL_FFT_MODF_THRESHOLD 296 +#define MUL_FFT_THRESHOLD 2304 + +#define SQR_FFT_TABLE { 336, 800, 1408, 3584, 10240, 24576, 0 } +#define SQR_FFT_MODF_THRESHOLD 296 +#define SQR_FFT_THRESHOLD 2304 diff --git a/gmp-6.3.0/mpn/power/lshift.asm b/gmp-6.3.0/mpn/power/lshift.asm new file mode 100644 index 0000000..efa2105 --- /dev/null +++ b/gmp-6.3.0/mpn/power/lshift.asm @@ -0,0 +1,61 @@ +dnl IBM POWER mpn_lshift -- Shift a number left. + +dnl Copyright 1992, 1994, 1999-2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
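The lshift code below walks from the most significant limb downward, funneling each pair of adjacent limbs together and returning the bits shifted out of the top. As a portable reference model (illustrative only, not part of this patch; 32-bit limbs, 0 < cnt < 32):

#include <stdint.h>

static uint32_t
ref_lshift (uint32_t *rp, const uint32_t *up, long n, unsigned cnt)
{
  uint32_t retval = up[n - 1] >> (32 - cnt);          /* carry-out limb */
  for (long i = n - 1; i > 0; i--)
    rp[i] = (up[i] << cnt) | (up[i - 1] >> (32 - cnt));
  rp[0] = up[0] << cnt;
  return retval;
}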
+ + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s_ptr r4 +dnl size r5 +dnl cnt r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_lshift) + sli 0,5,2 + cax 9,3,0 + cax 4,4,0 + sfi 8,6,32 + mtctr 5 C put limb count in CTR loop register + lu 0,-4(4) C read most significant limb + sre 3,0,8 C compute carry out limb, and init MQ register + bdz Lend2 C if just one limb, skip loop + lu 0,-4(4) C read 2:nd most significant limb + sreq 7,0,8 C compute most significant limb of result + bdz Lend C if just two limb, skip loop +Loop: lu 0,-4(4) C load next lower limb + stu 7,-4(9) C store previous result during read latency + sreq 7,0,8 C compute result limb + bdn Loop C loop back until CTR is zero +Lend: stu 7,-4(9) C store 2:nd least significant limb +Lend2: sle 7,0,6 C compute least significant limb + st 7,-4(9) C store it + br +EPILOGUE(mpn_lshift) diff --git a/gmp-6.3.0/mpn/power/mul_1.asm b/gmp-6.3.0/mpn/power/mul_1.asm new file mode 100644 index 0000000..38b7b66 --- /dev/null +++ b/gmp-6.3.0/mpn/power/mul_1.asm @@ -0,0 +1,113 @@ +dnl IBM POWER mpn_mul_1 -- Multiply a limb vector with a limb and store the +dnl result in a second limb vector. + +dnl Copyright 1992, 1994, 1999-2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl size r5 +dnl s2_limb r6 + +dnl The POWER architecture has no unsigned 32x32->64 bit multiplication +dnl instruction. To obtain that operation, we have to use the 32x32->64 +dnl signed multiplication instruction, and add the appropriate compensation to +dnl the high limb of the result. We add the multiplicand if the multiplier +dnl has its most significant bit set, and we add the multiplier if the +dnl multiplicand has its most significant bit set. We need to preserve the +dnl carry flag between each iteration, so we have to compute the compensation +dnl carefully (the natural, srai+and doesn't work). Since all POWER can +dnl branch in zero cycles, we use conditional branches for the compensation. 
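Under the same compensation scheme, the contract of mpn_mul_1 below is compactly stated in C. A reference model (illustrative only, not part of this patch; 32-bit limbs):

#include <stdint.h>

static uint32_t
ref_mul_1 (uint32_t *rp, const uint32_t *up, long n, uint32_t vl)
{
  uint32_t carry = 0;
  for (long i = 0; i < n; i++)
    {
      uint64_t prod = (uint64_t) up[i] * vl + carry;  /* cannot overflow */
      rp[i] = (uint32_t) prod;           /* low half to the result       */
      carry = (uint32_t) (prod >> 32);   /* high half feeds the next     */
    }
  return carry;
}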
+ +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_mul_1) + cal 3,-4(3) + l 0,0(4) + cmpi 0,6,0 + mtctr 5 + mul 9,0,6 + srai 7,0,31 + and 7,7,6 + mfmq 8 + ai 0,0,0 C reset carry + cax 9,9,7 + blt Lneg +Lpos: bdz Lend +Lploop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 0 + ae 8,0,9 + bge Lp0 + cax 10,10,6 C adjust high limb for negative limb from s1 +Lp0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 0 + ae 8,0,10 + bge Lp1 + cax 9,9,6 C adjust high limb for negative limb from s1 +Lp1: bdn Lploop + b Lend + +Lneg: cax 9,9,0 + bdz Lend +Lnloop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + cax 10,10,0 C adjust high limb for negative s2_limb + mfmq 0 + ae 8,0,9 + bge Ln0 + cax 10,10,6 C adjust high limb for negative limb from s1 +Ln0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + cax 9,9,0 C adjust high limb for negative s2_limb + mfmq 0 + ae 8,0,10 + bge Ln1 + cax 9,9,6 C adjust high limb for negative limb from s1 +Ln1: bdn Lnloop + b Lend + +Lend0: cal 9,0(10) +Lend: st 8,4(3) + aze 3,9 + br +EPILOGUE(mpn_mul_1) diff --git a/gmp-6.3.0/mpn/power/rshift.asm b/gmp-6.3.0/mpn/power/rshift.asm new file mode 100644 index 0000000..1d1815c --- /dev/null +++ b/gmp-6.3.0/mpn/power/rshift.asm @@ -0,0 +1,59 @@ +dnl IBM POWER mpn_rshift -- Shift a number right. + +dnl Copyright 1992, 1994, 1999-2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s_ptr r4 +dnl size r5 +dnl cnt r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_rshift) + sfi 8,6,32 + mtctr 5 C put limb count in CTR loop register + l 0,0(4) C read least significant limb + ai 9,3,-4 C adjust res_ptr since it's offset in the stu:s + sle 3,0,8 C compute carry limb, and init MQ register + bdz Lend2 C if just one limb, skip loop + lu 0,4(4) C read 2:nd least significant limb + sleq 7,0,8 C compute least significant limb of result + bdz Lend C if just two limb, skip loop +Loop: lu 0,4(4) C load next higher limb + stu 7,4(9) C store previous result during read latency + sleq 7,0,8 C compute result limb + bdn Loop C loop back until CTR is zero +Lend: stu 7,4(9) C store 2:nd most significant limb +Lend2: sre 7,0,6 C compute most significant limb + st 7,4(9) C store it + br +EPILOGUE(mpn_rshift) diff --git a/gmp-6.3.0/mpn/power/sdiv.asm b/gmp-6.3.0/mpn/power/sdiv.asm new file mode 100644 index 0000000..4a9ed14 --- /dev/null +++ b/gmp-6.3.0/mpn/power/sdiv.asm @@ -0,0 +1,39 @@ +dnl Copyright 1999, 2001 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_sdiv_qrnnd) + mtmq 5 + div 0,4,6 + mfmq 9 + st 9,0(3) + mr 3,0 + br +EPILOGUE(mpn_sdiv_qrnnd) diff --git a/gmp-6.3.0/mpn/power/sub_n.asm b/gmp-6.3.0/mpn/power/sub_n.asm new file mode 100644 index 0000000..390c802 --- /dev/null +++ b/gmp-6.3.0/mpn/power/sub_n.asm @@ -0,0 +1,85 @@ +dnl IBM POWER mpn_sub_n -- Subtract two limb vectors of equal, non-zero +dnl length. + +dnl Copyright 1992, 1994-1996, 1999-2001, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl s2_ptr r5 +dnl size r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_sub_n) + andil. 10,6,1 C odd or even number of limbs? + l 8,0(4) C load least significant s1 limb + l 0,0(5) C load least significant s2 limb + cal 3,-4(3) C offset res_ptr, it's updated before it's used + sri 10,6,1 C count for unrolled loop + sf 7,0,8 C subtract least significant limbs, set cy + mtctr 10 C copy count into CTR + beq 0,Leven C branch if even # of limbs (# of limbs >= 2) + +C We have an odd # of limbs. Add the first limbs separately. + cmpi 1,10,0 C is count for unrolled loop zero? + bc 4,6,L1 C bne cr1,L1 (misassembled by gas) + st 7,4(3) + sfe 3,0,0 C load !cy into ... + sfi 3,3,0 C ... return value register + br C return + +C We added least significant limbs. Now reload the next limbs to enter loop. 
+L1: lu 8,4(4) C load s1 limb and update s1_ptr
+ lu 0,4(5) C load s2 limb and update s2_ptr
+ stu 7,4(3)
+ sfe 7,0,8 C subtract limbs, set cy
+Leven: lu 9,4(4) C load s1 limb and update s1_ptr
+ lu 10,4(5) C load s2 limb and update s2_ptr
+ bdz Lend C If done, skip loop
+
+Loop: lu 8,4(4) C load s1 limb and update s1_ptr
+ lu 0,4(5) C load s2 limb and update s2_ptr
+ sfe 11,10,9 C subtract previous limbs with cy, set cy
+ stu 7,4(3) C
+ lu 9,4(4) C load s1 limb and update s1_ptr
+ lu 10,4(5) C load s2 limb and update s2_ptr
+ sfe 7,0,8 C subtract previous limbs with cy, set cy
+ stu 11,4(3) C
+ bdn Loop C decrement CTR and loop back
+
+Lend: sfe 11,10,9 C subtract limbs with cy, set cy
+ st 7,4(3) C
+ st 11,8(3) C
+ sfe 3,0,0 C load !cy into ...
+ sfi 3,3,0 C ... return value register
+ br
+EPILOGUE(mpn_sub_n)
diff --git a/gmp-6.3.0/mpn/power/submul_1.asm b/gmp-6.3.0/mpn/power/submul_1.asm
new file mode 100644
index 0000000..1788e0d
--- /dev/null
+++ b/gmp-6.3.0/mpn/power/submul_1.asm
@@ -0,0 +1,131 @@
+dnl IBM POWER mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+dnl the result from a second limb vector.
+
+dnl Copyright 1992, 1994, 1999-2001 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl size r5
+dnl s2_limb r6
+
+dnl The POWER architecture has no unsigned 32x32->64 bit multiplication
+dnl instruction. To obtain that operation, we have to use the 32x32->64
+dnl signed multiplication instruction, and add the appropriate compensation to
+dnl the high limb of the result. We add the multiplicand if the multiplier
+dnl has its most significant bit set, and we add the multiplier if the
+dnl multiplicand has its most significant bit set. We need to preserve the
+dnl carry flag between iterations, so we have to compute the compensation
+dnl carefully (the natural srai+and approach doesn't work). Since all POWER
+dnl processors can branch in zero cycles, we use conditional branches for the
+dnl compensation.
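+dnl
+dnl As an illustrative C sketch (not from the GMP sources; u and v stand for
+dnl a 32-bit s1 limb and s2_limb), the compensation recovers the unsigned
+dnl high word from the signed product, using the identity
+dnl u*v == (int32)u * (int32)v + 2^32*(u*[v msb set] + v*[u msb set]) mod 2^64:
+dnl
+dnl   uint32_t hi = (uint32_t) (((int64_t) (int32_t) u * (int32_t) v) >> 32);
+dnl   if ((int32_t) v < 0) hi += u;   /* multiplier MSB set */
+dnl   if ((int32_t) u < 0) hi += v;   /* multiplicand MSB set */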
+ +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_submul_1) + cal 3,-4(3) + l 0,0(4) + cmpi 0,6,0 + mtctr 5 + mul 9,0,6 + srai 7,0,31 + and 7,7,6 + mfmq 11 + cax 9,9,7 + l 7,4(3) + sf 8,11,7 C add res_limb + a 11,8,11 C invert cy (r11 is junk) + blt Lneg +Lpos: bdz Lend + +Lploop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 0 + ae 11,0,9 C low limb + old_cy_limb + old cy + l 7,4(3) + aze 10,10 C propagate cy to new cy_limb + sf 8,11,7 C add res_limb + a 11,8,11 C invert cy (r11 is junk) + bge Lp0 + cax 10,10,6 C adjust high limb for negative limb from s1 +Lp0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 0 + ae 11,0,10 + l 7,4(3) + aze 9,9 + sf 8,11,7 + a 11,8,11 C invert cy (r11 is junk) + bge Lp1 + cax 9,9,6 C adjust high limb for negative limb from s1 +Lp1: bdn Lploop + + b Lend + +Lneg: cax 9,9,0 + bdz Lend +Lnloop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 7 + ae 11,7,9 + l 7,4(3) + ae 10,10,0 C propagate cy to new cy_limb + sf 8,11,7 C add res_limb + a 11,8,11 C invert cy (r11 is junk) + bge Ln0 + cax 10,10,6 C adjust high limb for negative limb from s1 +Ln0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 7 + ae 11,7,10 + l 7,4(3) + ae 9,9,0 C propagate cy to new cy_limb + sf 8,11,7 C add res_limb + a 11,8,11 C invert cy (r11 is junk) + bge Ln1 + cax 9,9,6 C adjust high limb for negative limb from s1 +Ln1: bdn Lnloop + b Lend + +Lend0: cal 9,0(10) +Lend: st 8,4(3) + aze 3,9 + br +EPILOGUE(mpn_submul_1) diff --git a/gmp-6.3.0/mpn/power/umul.asm b/gmp-6.3.0/mpn/power/umul.asm new file mode 100644 index 0000000..5a0599e --- /dev/null +++ b/gmp-6.3.0/mpn/power/umul.asm @@ -0,0 +1,43 @@ +dnl Copyright 1999, 2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_umul_ppmm) + mul 9,4,5 + srai 0,4,31 + and 0,0,5 + srai 5,5,31 + and 5,5,4 + cax 0,0,5 + mfmq 11 + st 11,0(3) + cax 3,9,0 + br +EPILOGUE(mpn_umul_ppmm) diff --git a/gmp-6.3.0/mpn/powerpc32/750/com.asm b/gmp-6.3.0/mpn/powerpc32/750/com.asm new file mode 100644 index 0000000..1b8b574 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/750/com.asm @@ -0,0 +1,79 @@ +dnl PowerPC 750 mpn_com -- mpn bitwise one's complement + +dnl Copyright 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C 603e: ?
+C 604e: 3.0
+C 75x (G3): 2.0
+C 7400,7410 (G4): 2.0
+C 744x,745x (G4+): 3.0
+
+C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C This loop form is necessary for the claimed speed.
+
+ASM_START()
+PROLOGUE(mpn_com)
+
+ C r3 dst
+ C r4 src
+ C r5 size
+
+ mtctr r5 C size
+ lwz r5, 0(r4) C src low limb
+
+ sub r4, r4, r3 C src-dst
+ subi r3, r3, 4 C dst-4
+
+ addi r4, r4, 8 C src-dst+8
+ bdz L(one)
+
+L(top):
+ C r3 &dst[i-1]
+ C r4 src-dst
+ C r5 src[i]
+ C r6 scratch
+
+ not r6, r5 C ~src[i]
+ lwzx r5, r4,r3 C src[i+1]
+
+ stwu r6, 4(r3) C dst[i]
+ bdnz L(top)
+
+L(one):
+ not r6, r5
+
+ stw r6, 4(r3) C dst[size-1]
+ blr
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc32/750/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc32/750/gmp-mparam.h
new file mode 100644
index 0000000..3667e85
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc32/750/gmp-mparam.h
@@ -0,0 +1,192 @@
+/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2002, 2004, 2009, 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* This file is used for 75x (G3) and for 7400/7410 (G4), both of which have
+ much slower multiply instructions.
*/ + +/* 450 MHz PPC 7400 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 11 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 38 +#define USE_PREINV_DIVREM_1 1 +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 10 +#define MUL_TOOM33_THRESHOLD 38 +#define MUL_TOOM44_THRESHOLD 99 +#define MUL_TOOM6H_THRESHOLD 141 +#define MUL_TOOM8H_THRESHOLD 212 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 65 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 69 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 65 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 66 + +#define SQR_BASECASE_THRESHOLD 4 +#define SQR_TOOM2_THRESHOLD 18 +#define SQR_TOOM3_THRESHOLD 57 +#define SQR_TOOM4_THRESHOLD 142 +#define SQR_TOOM6_THRESHOLD 173 +#define SQR_TOOM8_THRESHOLD 309 + +#define MULMOD_BNM1_THRESHOLD 9 +#define SQRMOD_BNM1_THRESHOLD 11 + +#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 220, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 8, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 13, 7}, { 7, 6}, { 17, 7}, { 9, 6}, \ + { 19, 7}, { 11, 6}, { 23, 7}, { 13, 8}, \ + { 7, 7}, { 19, 8}, { 11, 7}, { 23, 9}, \ + { 7, 8}, { 15, 7}, { 33, 8}, { 19, 7}, \ + { 39, 8}, { 23, 9}, { 15, 8}, { 39, 9}, \ + { 23, 8}, { 47,10}, { 15, 9}, { 31, 8}, \ + { 67, 9}, { 55,10}, { 31, 9}, { 63, 8}, \ + { 127, 7}, { 255, 9}, { 71, 8}, { 143, 7}, \ + { 287, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 127, 8}, { 255, 9}, \ + { 143, 8}, { 287,10}, { 79, 9}, { 159, 8}, \ + { 319, 9}, { 175, 8}, { 351, 7}, { 703,10}, \ + { 95, 9}, { 191, 8}, { 383, 9}, { 207,10}, \ + { 111,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 143, 9}, { 287, 8}, { 575,10}, { 159, 9}, \ + { 319,10}, { 175, 9}, { 351, 8}, { 703,11}, \ + { 95,10}, { 191, 9}, { 383,10}, { 207, 9}, \ + { 415, 8}, { 831,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,11}, { 159,10}, { 351, 9}, \ + { 703, 8}, { 1407,11}, { 191,10}, { 415, 9}, \ + { 831,11}, { 223,10}, { 447, 9}, { 895,12}, \ + { 127,11}, { 255,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 351,10}, { 703, 9}, { 1407,12}, \ + { 191,11}, { 415,10}, { 831,11}, { 447,10}, \ + { 895,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,11}, { 575,12}, { 319,11}, { 703,10}, \ + { 1407,12}, { 383,11}, { 831,12}, { 447,11}, \ + { 895,10}, { 1791,11}, { 959,13}, { 255,12}, \ + { 511,11}, { 1087,12}, { 575,11}, { 1215,12}, \ + { 703,11}, { 1407,13}, { 383,12}, { 895,11}, \ + { 1791,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1215,13}, { 639,12}, { 1407,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1151,12}, { 2303,13}, { 1407,14}, { 767,13}, \ + { 1919,10}, { 15359,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 154 +#define MUL_FFT_THRESHOLD 2688 + +#define SQR_FFT_MODF_THRESHOLD 184 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 184, 5}, { 6, 4}, { 13, 5}, { 13, 6}, \ + { 7, 5}, { 15, 6}, { 13, 7}, { 7, 6}, \ + { 16, 7}, { 9, 6}, { 19, 7}, { 11, 6}, \ + { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \ + { 11, 7}, { 25, 9}, { 7, 8}, { 15, 7}, \ + { 31, 8}, { 19, 7}, { 39, 8}, { 27, 9}, \ + { 15, 8}, { 39, 9}, { 23,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \ + { 
47, 8}, { 95,10}, { 31, 9}, { 63, 8}, \ + { 127, 7}, { 255, 9}, { 71, 8}, { 143, 7}, \ + { 287, 9}, { 79, 8}, { 159,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 143, 8}, { 287, 7}, { 575,10}, \ + { 79, 9}, { 159, 8}, { 319, 9}, { 175, 8}, \ + { 351,10}, { 95, 9}, { 191, 8}, { 383, 9}, \ + { 207,10}, { 111,11}, { 63,10}, { 127, 9}, \ + { 255,10}, { 143, 9}, { 287, 8}, { 575,10}, \ + { 159, 9}, { 319,10}, { 175, 9}, { 351,11}, \ + { 95,10}, { 191, 9}, { 383,10}, { 207, 9}, \ + { 415, 8}, { 831,10}, { 223,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 287, 9}, \ + { 575,11}, { 159,10}, { 351, 9}, { 703,11}, \ + { 191,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447, 9}, { 895,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 287,10}, { 575,11}, { 319,10}, \ + { 639,11}, { 351,10}, { 703, 9}, { 1407,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,11}, { 447,10}, { 895,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 575,12}, \ + { 319,11}, { 703,10}, { 1407,12}, { 383,11}, \ + { 831,12}, { 447,11}, { 895,10}, { 1791,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1023,12}, \ + { 575,11}, { 1215,12}, { 703,11}, { 1407,13}, \ + { 383,12}, { 895,11}, { 1791,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1215,13}, { 639,12}, \ + { 1471,13}, { 767,12}, { 1535,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1151,12}, { 2431,13}, \ + { 1407,14}, { 767,13}, { 1919,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 152 +#define SQR_FFT_THRESHOLD 1728 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 33 +#define MULLO_MUL_N_THRESHOLD 5240 + +#define DC_DIV_QR_THRESHOLD 31 +#define DC_DIVAPPR_Q_THRESHOLD 108 +#define DC_BDIV_QR_THRESHOLD 35 +#define DC_BDIV_Q_THRESHOLD 88 + +#define INV_MULMOD_BNM1_THRESHOLD 42 +#define INV_NEWTON_THRESHOLD 149 +#define INV_APPR_THRESHOLD 125 + +#define BINV_NEWTON_THRESHOLD 156 +#define REDC_1_TO_REDC_N_THRESHOLD 39 + +#define MU_DIV_QR_THRESHOLD 807 +#define MU_DIVAPPR_Q_THRESHOLD 807 +#define MUPI_DIV_QR_THRESHOLD 66 +#define MU_BDIV_QR_THRESHOLD 667 +#define MU_BDIV_Q_THRESHOLD 807 + +#define MATRIX22_STRASSEN_THRESHOLD 11 +#define HGCD_THRESHOLD 87 +#define GCD_DC_THRESHOLD 233 +#define GCDEXT_DC_THRESHOLD 198 +#define JACOBI_BASE_METHOD 1 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 28 +#define SET_STR_DC_THRESHOLD 390 +#define SET_STR_PRECOMPUTE_THRESHOLD 814 diff --git a/gmp-6.3.0/mpn/powerpc32/750/lshift.asm b/gmp-6.3.0/mpn/powerpc32/750/lshift.asm new file mode 100644 index 0000000..3a1c1a7 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/750/lshift.asm @@ -0,0 +1,155 @@ +dnl PowerPC 750 mpn_lshift -- mpn left shift. + +dnl Copyright 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C 750: 3.0 +C 7400: 3.0 + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C This code is the same per-limb speed as mpn/powerpc32/lshift.asm, but +C smaller and saving about 30 or so cycles of overhead. + +ASM_START() +PROLOGUE(mpn_lshift) + + C r3 dst + C r4 src + C r5 size + C r6 shift + + mtctr r5 C size + slwi r5, r5, 2 C 4*size + + subfic r7, r6, 32 C 32-shift + add r4, r4, r5 C &src[size] + + add r5, r3, r5 C &dst[size] + lwz r8, -4(r4) C src[size-1] + bdz L(one) + + lwzu r9, -8(r4) C src[size-2] + + srw r3, r8, r7 C return value + slw r8, r8, r6 C src[size-1] << shift + bdz L(two) + + +L(top): + C r3 return value + C r4 src, incrementing + C r5 dst, incrementing + C r6 lshift + C r7 32-shift + C r8 src[i+1] << shift + C r9 src[i] + C r10 + + lwzu r10, -4(r4) + srw r11, r9, r7 + + or r8, r8, r11 + stwu r8, -4(r5) + + slw r8, r9, r6 + bdz L(odd) + + C r8 src[i+1] << shift + C r9 + C r10 src[i] + + lwzu r9, -4(r4) + srw r11, r10, r7 + + or r8, r8, r11 + stwu r8, -4(r5) + + slw r8, r10, r6 + bdnz L(top) + + +L(two): + C r3 return value + C r4 + C r5 &dst[2] + C r6 shift + C r7 32-shift + C r8 src[1] << shift + C r9 src[0] + C r10 + + srw r11, r9, r7 + slw r12, r9, r6 C src[0] << shift + + or r8, r8, r11 + stw r12, -8(r5) C dst[0] + + stw r8, -4(r5) C dst[1] + blr + + +L(odd): + C r3 return value + C r4 + C r5 &dst[2] + C r6 shift + C r7 32-shift + C r8 src[1] << shift + C r9 + C r10 src[0] + + srw r11, r10, r7 + slw r12, r10, r6 + + or r8, r8, r11 + stw r12, -8(r5) C dst[0] + + stw r8, -4(r5) C dst[1] + blr + + +L(one): + C r5 &dst[1] + C r6 shift + C r7 32-shift + C r8 src[0] + + srw r3, r8, r7 C return value + slw r8, r8, r6 C src[size-1] << shift + + stw r8, -4(r5) C dst[0] + blr + +EPILOGUE(mpn_lshift) diff --git a/gmp-6.3.0/mpn/powerpc32/750/rshift.asm b/gmp-6.3.0/mpn/powerpc32/750/rshift.asm new file mode 100644 index 0000000..4825fee --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/750/rshift.asm @@ -0,0 +1,153 @@ +dnl PowerPC 750 mpn_rshift -- mpn right shift. + +dnl Copyright 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C cycles/limb +C 750: 3.0 +C 7400: 3.0 + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C This code is the same per-limb speed as mpn/powerpc32/rshift.asm, but +C smaller and saving about 30 or so cycles of overhead. + +ASM_START() +PROLOGUE(mpn_rshift) + + C r3 dst + C r4 src + C r5 size + C r6 shift + + mtctr r5 C size + lwz r8, 0(r4) C src[0] + + subfic r7, r6, 32 C 32-shift + addi r5, r3, -4 C dst-4 + + slw r3, r8, r7 C return value + bdz L(one) + + lwzu r9, 4(r4) C src[1] + srw r8, r8, r6 C src[0] >> shift + bdz L(two) + + +L(top): + C r3 return value + C r4 src, incrementing + C r5 dst, incrementing + C r6 shift + C r7 32-shift + C r8 src[i-1] >> shift + C r9 src[i] + C r10 + + lwzu r10, 4(r4) + slw r11, r9, r7 + + or r8, r8, r11 + stwu r8, 4(r5) + + srw r8, r9, r6 + bdz L(odd) + + C r8 src[i-1] >> shift + C r9 + C r10 src[i] + + lwzu r9, 4(r4) + slw r11, r10, r7 + + or r8, r8, r11 + stwu r8, 4(r5) + + srw r8, r10, r6 + bdnz L(top) + + +L(two): + C r3 return value + C r4 + C r5 &dst[size-2] + C r6 shift + C r7 32-shift + C r8 src[size-2] >> shift + C r9 src[size-1] + C r10 + + slw r11, r9, r7 + srw r12, r9, r6 C src[size-1] >> shift + + or r8, r8, r11 + stw r12, 8(r5) C dst[size-1] + + stw r8, 4(r5) C dst[size-2] + blr + + +L(odd): + C r3 return value + C r4 + C r5 &dst[size-2] + C r6 shift + C r7 32-shift + C r8 src[size-2] >> shift + C r9 + C r10 src[size-1] + + slw r11, r10, r7 + srw r12, r10, r6 + + or r8, r8, r11 + stw r12, 8(r5) C dst[size-1] + + stw r8, 4(r5) C dst[size-2] + blr + + +L(one): + C r3 return value + C r4 + C r5 dst-4 + C r6 shift + C r7 + C r8 src[0] + + srw r8, r8, r6 + + stw r8, 4(r5) C dst[0] + blr + +EPILOGUE(mpn_rshift) diff --git a/gmp-6.3.0/mpn/powerpc32/README b/gmp-6.3.0/mpn/powerpc32/README new file mode 100644 index 0000000..887e78b --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/README @@ -0,0 +1,180 @@ +Copyright 2002, 2005 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + + POWERPC 32-BIT MPN SUBROUTINES + + +This directory contains mpn functions for various 32-bit PowerPC chips. + + +CODE ORGANIZATION + + directory used for + ================================================ + powerpc generic, 604, 604e, 744x, 745x + powerpc/750 740, 750, 7400, 7410 + + +The top-level powerpc directory is currently mostly aimed at 604/604e but +should be reasonable on all powerpcs. + + + +STATUS + +The code is quite well optimized for the 604e, other chips have had less +attention. 
+
+Altivec SIMD available in 74xx might hold some promise, but unfortunately
+GMP only guarantees 32-bit data alignment, so there's lots of fiddling
+around with partial operations at the start and end of limb vectors. A
+128-bit limb would be a novel idea, but is unlikely to be practical, since
+it would have to work with ordinary +, -, * etc in the C code.
+
+Also, Altivec isn't very well suited to GMP's multiplication needs.
+Using floating-point based multiplication has much better performance
+potential for all current powerpcs, both the ones with slow integer multiply
+units (603, 740, 750, 7400, 7410) and those with fast (604, 604e, 744x,
+745x). This is because all powerpcs do some level of pipelining in the FPU:
+
+603 and 750 can sustain one fmadd every 2nd cycle.
+604 and 604e can sustain one fmadd per cycle.
+7400 and 7410 can sustain 3 fmadd in 4 cycles.
+744x and 745x can sustain 4 fmadd in 5 cycles.
+
+
+
+REGISTER NAMES
+
+The normal powerpc convention is to give registers as plain numbers, like
+"mtctr 6", but on Apple MacOS X (powerpc*-*-rhapsody* and
+powerpc*-*-darwin*) the assembler demands an "r" like "mtctr r6". Note,
+however, that when register 0 in an instruction means a literal zero, the
+"r" is omitted, for instance "lwzx r6,0,r7".
+
+The GMP code uses the "r" forms; powerpc-defs.m4 transforms them to plain
+numbers according to what GMP_ASM_POWERPC_R_REGISTERS finds is needed.
+(Note that this style isn't fully general, as the identifier r4 and the
+register r4 will not be distinguishable on some systems. However, this is
+not a problem for the limited GMP assembly usage.)
+
+
+
+GLOBAL REFERENCES
+
+Linux non-PIC
+ lis 9, __gmp_binvert_limb_table@ha
+ rlwinm 11, 5, 31, 25, 31
+ la 9, __gmp_binvert_limb_table@l(9)
+ lbzx 11, 9, 11
+
+Linux PIC (FIXME)
+.LCL0:
+ .long .LCTOC1-.LCF0
+ bcl 20, 31, .LCF0
+.LCF0:
+ mflr 30
+ lwz 7, .LCL0-.LCF0(30)
+ add 30, 7, 30
+ lwz 11, .LC0-.LCTOC1(30)
+ rlwinm 3, 5, 31, 25, 31
+ lbzx 7, 11, 3
+
+AIX (always PIC)
+LC..0:
+ .tc __gmp_binvert_limb_table[TC],__gmp_binvert_limb_table[RW]
+ lwz 9, LC..0(2)
+ rlwinm 0, 5, 31, 25, 31
+ lbzx 0, 9, 0
+
+Darwin (non-PIC)
+ lis r2, ha16(___gmp_binvert_limb_table)
+ rlwinm r9, r5, 31, 25, 31
+ la r2, lo16(___gmp_binvert_limb_table)(r2)
+ lbzx r0, r2, r9
+Darwin (PIC)
+ mflr r0
+ bcl 20, 31, L0001$pb
+L0001$pb:
+ mflr r7
+ mtlr r0
+ addis r2, r7, ha16(L___gmp_binvert_limb_table$non_lazy_ptr-L0001$pb)
+ rlwinm r9, r5, 31, 25, 31
+ lwz r2, lo16(L___gmp_binvert_limb_table$non_lazy_ptr-L0001$pb)(r2)
+ lbzx r0, r2, r9
+------
+ .non_lazy_symbol_pointer
+L___gmp_binvert_limb_table$non_lazy_ptr:
+ .indirect_symbol ___gmp_binvert_limb_table
+ .long 0
+ .subsections_via_symbols
+
+
+For GNU/Linux and Darwin, we might want to duplicate __gmp_binvert_limb_table
+into the text section in this file. We should thus be able to reach it like
+this:
+
+ blr L0
+L0: mflr r2
+ rlwinm r9, r5, 31, 25, 31
+ addi r9, r9, lo16(local_binvert_table-L0)
+ lbzx r0, r2, r9
+
+
+
+REFERENCES
+
+PowerPC Microprocessor Family: The Programming Environments for 32-bit
+Microprocessors, IBM document G522-0290-01, 2000.
+
+PowerPC 604e RISC Microprocessor User's Manual with Supplement for PowerPC
+604 Microprocessor, IBM document G552-0330-00, Freescale document
+MPC604EUM/AD, 3/1998.
+
+MPC7410/MPC7400 RISC Microprocessor User's Manual, Freescale document
+MPC7400UM/D, rev 1, 11/2002.
+
+MPC7450 RISC Microprocessor Family Reference Manual, Freescale document
+MPC7450UM, rev 5, 1/2005.
+ +The above are available online from + + http://www.ibm.com/chips/techlib/techlib.nsf/productfamilies/PowerPC + http://www.freescale.com/PowerPC + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/gmp-6.3.0/mpn/powerpc32/addlsh1_n.asm b/gmp-6.3.0/mpn/powerpc32/addlsh1_n.asm new file mode 100644 index 0000000..71645c3 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/addlsh1_n.asm @@ -0,0 +1,100 @@ +dnl PowerPC-32 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1) + +dnl Copyright 2003, 2005, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? +C 604e: 4.0 +C 75x (G3): 5.0 +C 7400,7410 (G4): 5.0 +C 744x,745x (G4+): 5.0 +C power4/ppc970: 4.25 +C power5: 5.0 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C vp r5 +C n r6 + +define(`rp',`r3') +define(`up',`r4') +define(`vp',`r5') + +define(`s0',`r6') +define(`s1',`r7') +define(`u0',`r8') +define(`v0',`r10') +define(`v1',`r11') + +ASM_START() +PROLOGUE(mpn_addlsh1_n) + mtctr r6 C copy n in ctr + addic r31, r31, 0 C clear cy + + lwz v0, 0(vp) C load v limb + lwz u0, 0(up) C load u limb + addi up, up, -4 C update up + addi rp, rp, -4 C update rp + slwi s1, v0, 1 + bdz L(end) C If done, skip loop + +L(loop): + lwz v1, 4(vp) C load v limb + adde s1, s1, u0 C add limbs with cy, set cy + srwi s0, v0, 31 C shift down previous v limb + stw s1, 4(rp) C store result limb + lwzu u0, 8(up) C load u limb and update up + rlwimi s0, v1, 1, 0,30 C left shift v limb and merge with prev v limb + + bdz L(exit) C decrement ctr and exit if done + + lwzu v0, 8(vp) C load v limb and update vp + adde s0, s0, u0 C add limbs with cy, set cy + srwi s1, v1, 31 C shift down previous v limb + stwu s0, 8(rp) C store result limb and update rp + lwz u0, 4(up) C load u limb + rlwimi s1, v0, 1, 0,30 C left shift v limb and merge with prev v limb + + bdnz L(loop) C decrement ctr and loop back + +L(end): adde r7, s1, u0 + srwi r4, v0, 31 + stw r7, 4(rp) C store last result limb + addze r3, r4 + blr +L(exit): + adde r7, s0, u0 + srwi r4, v1, 31 + stw r7, 8(rp) C store last result limb + addze r3, r4 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/addmul_1.asm b/gmp-6.3.0/mpn/powerpc32/addmul_1.asm new file mode 100644 index 0000000..07486df --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/addmul_1.asm @@ -0,0 +1,159 @@ +dnl PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add the +dnl result to a second limb vector. 
+ +dnl Copyright 1995, 1997, 1998, 2000-2003, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? +C 604e: 6.75 +C 75x (G3): 8.7-14.3 +C 7400,7410 (G4): 8.7-14.3 +C 744x,745x (G4+): 9.5 +C power4/ppc970: 6.25 +C power5: 6.25 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C n r5 +C vl r6 + +C This is optimized for the PPC604. It has not been tuned for other +C PowerPC processors. +C +C Loop Analysis for the 604: +C 12 mem insn +C 8 serializing insn +C 8 int multiply +C 25 int reg write +C 9 int ops (8 of which serialize) +C +C The multiply insns need 16 cycles/4limb. +C The integer register writes will need 13 cycles/4limb. +C All-in-all, it should be possible to get to 4 or 5 cycles/limb on PPC604, +C but that will require some clever FPNOPS and BNOPS for exact +C issue control. + + +ASM_START() +PROLOGUE(mpn_addmul_1) + cmpwi cr0,r5,9 C more than 9 limbs? + bgt cr0,L(big) C branch if more than 9 limbs + + mtctr r5 + lwz r0,0(r4) + mullw r7,r0,r6 + mulhwu r10,r0,r6 + lwz r9,0(r3) + addc r8,r7,r9 + addi r3,r3,-4 + bdz L(end) +L(loop): + lwzu r0,4(r4) + stwu r8,4(r3) + mullw r8,r0,r6 + adde r7,r8,r10 + mulhwu r10,r0,r6 + lwz r9,4(r3) + addze r10,r10 + addc r8,r7,r9 + bdnz L(loop) +L(end): stw r8,4(r3) + addze r3,r10 + blr + +L(big): stwu r1,-16(r1) + addi r5,r5,-1 + stw r30,8(r1) + srwi r0,r5,2 + stw r31,12(r1) + mtctr r0 + + lwz r7,0(r4) + mullw r8,r7,r6 + mulhwu r0,r7,r6 + lwz r7,0(r3) + addc r8,r8,r7 + stw r8,0(r3) + +L(loopU): + lwz r7,4(r4) + lwz r12,8(r4) + lwz r30,12(r4) + lwzu r31,16(r4) + mullw r8,r7,r6 + mullw r9,r12,r6 + mullw r10,r30,r6 + mullw r11,r31,r6 + adde r8,r8,r0 C add cy_limb + mulhwu r0,r7,r6 + lwz r7,4(r3) + adde r9,r9,r0 + mulhwu r0,r12,r6 + lwz r12,8(r3) + adde r10,r10,r0 + mulhwu r0,r30,r6 + lwz r30,12(r3) + adde r11,r11,r0 + mulhwu r0,r31,r6 + lwz r31,16(r3) + addze r0,r0 C new cy_limb + addc r8,r8,r7 + stw r8,4(r3) + adde r9,r9,r12 + stw r9,8(r3) + adde r10,r10,r30 + stw r10,12(r3) + adde r11,r11,r31 + stwu r11,16(r3) + bdnz L(loopU) + + andi. 
r31,r5,3 + mtctr r31 + beq cr0,L(endx) + +L(loopE): + lwzu r7,4(r4) + mullw r8,r7,r6 + adde r8,r8,r0 C add cy_limb + mulhwu r0,r7,r6 + lwz r7,4(r3) + addze r0,r0 C new cy_limb + addc r8,r8,r7 + stwu r8,4(r3) + bdnz L(loopE) +L(endx): + addze r3,r0 + lwz r30,8(r1) + lwz r31,12(r1) + addi r1,r1,16 + blr +EPILOGUE(mpn_addmul_1) diff --git a/gmp-6.3.0/mpn/powerpc32/aix.m4 b/gmp-6.3.0/mpn/powerpc32/aix.m4 new file mode 100644 index 0000000..fde2020 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/aix.m4 @@ -0,0 +1,82 @@ +divert(-1) +dnl m4 macros for AIX 32-bit assembly. + +dnl Copyright 2000-2002, 2005, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +define(`ASM_START', +` .toc') + +dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo) +dnl EPILOGUE_cpu(GSYM_PREFIX`'foo) +dnl +dnl Don't want ELF style .size in the epilogue. + +define(`PROLOGUE_cpu', +m4_assert_numargs(1) + ` + .globl $1 + .globl .$1 + .csect [DS], 2 +$1: + .long .$1, TOC[tc0], 0 + .csect [PR] + .align 2 +.$1:') + +define(`EPILOGUE_cpu', +m4_assert_numargs(1) +`') + +define(`TOC_ENTRY', `') + +define(`LEA', +m4_assert_numargs(2) +`define(`TOC_ENTRY', +` .toc +tc$2: + .tc $2[TC], $2')' +` lwz $1, tc$2(2)') + +define(`EXTERN', +m4_assert_numargs(1) +` .globl $1') + +define(`DEF_OBJECT', +m4_assert_numargs_range(1,2) +` .csect [RO], 3 + ALIGN(ifelse($#,1,2,$2)) +$1: +') + +define(`END_OBJECT', +m4_assert_numargs(1)) + +define(`ASM_END', `TOC_ENTRY') + +divert diff --git a/gmp-6.3.0/mpn/powerpc32/aors_n.asm b/gmp-6.3.0/mpn/powerpc32/aors_n.asm new file mode 100644 index 0000000..25ece09 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/aors_n.asm @@ -0,0 +1,157 @@ +dnl PowerPC-32 mpn_add_n and mpn_sub_n. + +dnl Copyright 2002, 2005, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? +C 604e: ? old: 3.25 +C 75x (G3): ? old: 3.5 +C 7400,7410 (G4): 3.25 +C 744x,745x (G4+): 4 +C POWER3/PPC630 2 +C POWER4/PPC970 2.4 +C POWER5 2.75 +C POWER6 40-140 +C POWER7 3 + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`vp', `r5') +define(`n', `r6') +define(`cy', `r7') + +ifdef(`OPERATION_add_n', ` + define(ADCSBC, adde) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc) + define(IFADD, `$1') + define(IFSUB, `')') +ifdef(`OPERATION_sub_n', ` + define(ADCSBC, subfe) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc) + define(IFADD, `') + define(IFSUB, `$1')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ASM_START() + +PROLOGUE(func_nc) +IFADD(` addic r0, cy, -1') C set carry from argument +IFSUB(` subfic r0, cy, 0') C set carry from argument + b L(ent) +EPILOGUE() + +PROLOGUE(func) +IFADD(` addic r0, n, 0') C clear carry +IFSUB(` addic r0, n, -1') C set carry +L(ent): andi. r0, n, 3 + addi r3, r3, -12 + addi n, n, 1 + cmpwi cr7, r0, 2 + srwi r0, n, 2 + sub r4, r4, r3 + sub r5, r5, r3 + mtctr r0 + bne cr0, L(n00) + + lwzx r7, r4, r3 C n = 4, 8, 12, ... + lwzx r8, r5, r3 + addi r3, r3, 4 + lwzx r9, r4, r3 + ADCSBC r7, r8, r7 + lwzx r10, r5, r3 + addi r3, r3, 4 + b L(00) + +L(n00): bge cr7, L(n01) + cmpwi cr0, r0, 0 C n = 1, 5, 9, 13, ... + lwzx r0, r4, r3 + lwzx r6, r5, r3 + addi r3, r3, 4 + ADCSBC r0, r6, r0 + ble L(ret) +L(gt1): lwzx r7, r4, r3 + lwzx r8, r5, r3 + addi r3, r3, 4 + b L(01) + +L(n10): + lwzx r9, r4, r3 C n = 3, 7, 11, 15, ... + lwzx r10, r5, r3 + addi r3, r3, 4 + lwzx r11, r4, r3 + ADCSBC r9, r10, r9 + lwzx r12, r5, r3 + addi r3, r3, 4 + b L(11) + +L(n01): bne cr7, L(n10) + cmpwi cr0, r0, 0 C n = 2, 6, 10, 14, ... + lwzx r11, r4, r3 + lwzx r12, r5, r3 + addi r3, r3, 4 + lwzx r0, r4, r3 + ADCSBC r11, r12, r11 + lwzx r6, r5, r3 + addi r3, r3, 4 + ble cr0, L(end) + + +L(lp): lwzx r7, r4, r3 + ADCSBC r0, r6, r0 + lwzx r8, r5, r3 + stwu r11, 4(r3) +L(01): lwzx r9, r4, r3 + ADCSBC r7, r8, r7 + lwzx r10, r5, r3 + stwu r0, 4(r3) +L(00): lwzx r11, r4, r3 + ADCSBC r9, r10, r9 + lwzx r12, r5, r3 + stwu r7, 4(r3) +L(11): lwzx r0, r4, r3 + ADCSBC r11, r12, r11 + lwzx r6, r5, r3 + stwu r9, 4(r3) + bdnz L(lp) + +L(end): ADCSBC r0, r6, r0 + stw r11, 4(r3) +L(ret): stw r0, 8(r3) +IFADD(` li r3, 0 ') +IFADD(` addze r3, r3 ') +IFSUB(` subfe r3, r0, r0') +IFSUB(` neg r3, r3') + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/powerpc32/bdiv_dbm1c.asm new file mode 100644 index 0000000..72b2c48 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/bdiv_dbm1c.asm @@ -0,0 +1,131 @@ +dnl PPC32 mpn_bdiv_dbm1c. + +dnl Copyright 2008 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? +C 604e: ? +C 75x (G3): ? +C 7400,7410 (G4): 9.43 +C 744x,745x (G4+): 6.28 +C power4/ppc970: ? +C power5: ? + +C TODO +C * Nothing to do... + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`bd', `r6') +define(`cy', `r7') + +ASM_START() +PROLOGUE(mpn_bdiv_dbm1c) + lwz r0, 0(r4) + + rlwinm. r12, r5, 0,30,31 + cmplwi cr6, r12, 2 + cmplwi cr7, r5, 4 + addi r5, r5, 1 + srwi r5, r5, 2 + mtctr r5 + beq cr0, L(b00) + blt cr6, L(b01) + beq cr6, L(b10) + +L(b11): mullw r5, r0, r6 + mulhwu r12, r0, r6 + lwz r0, 4(r4) + addi r4, r4, -12 + addi r3, r3, -12 + b L(3) + +L(b00): mullw r9, r0, r6 + mulhwu r8, r0, r6 + lwz r0, 4(r4) + addi r4, r4, -8 + addi r3, r3, -8 + b L(0) + +L(b01): mullw r5, r0, r6 + mulhwu r12, r0, r6 + addi r3, r3, -4 + ble cr7, L(e1) + lwz r0, 4(r4) + addi r4, r4, -4 + b L(1) + +L(b10): mullw r9, r0, r6 + mulhwu r8, r0, r6 + lwz r0, 4(r4) + ble cr7, L(e2) + + ALIGN(16) +L(top): mullw r5, r0, r6 + mulhwu r12, r0, r6 + subfc r11, r9, r7 + lwz r0, 8(r4) + subfe r7, r8, r11 + stw r11, 0(r3) +L(1): mullw r9, r0, r6 + mulhwu r8, r0, r6 + subfc r11, r5, r7 + lwz r0, 12(r4) + subfe r7, r12, r11 + stw r11, 4(r3) +L(0): mullw r5, r0, r6 + mulhwu r12, r0, r6 + subfc r11, r9, r7 + lwz r0, 16(r4) + subfe r7, r8, r11 + stw r11, 8(r3) +L(3): mullw r9, r0, r6 + mulhwu r8, r0, r6 + subfc r11, r5, r7 + lwz r0, 20(r4) + subfe r7, r12, r11 + stw r11, 12(r3) + addi r4, r4, 16 + addi r3, r3, 16 + bdnz L(top) + +L(e2): mullw r5, r0, r6 + mulhwu r12, r0, r6 + subfc r11, r9, r7 + subfe r7, r8, r11 + stw r11, 0(r3) +L(e1): subfc r11, r5, r7 + stw r11, 4(r3) + subfe r3, r12, r11 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/darwin.m4 b/gmp-6.3.0/mpn/powerpc32/darwin.m4 new file mode 100644 index 0000000..db42268 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/darwin.m4 @@ -0,0 +1,91 @@ +divert(-1) +dnl m4 macros for Mac OS 32-bit assembly. + +dnl Copyright 2005, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
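+
+dnl The LEA macro below materializes a symbol's address with a ha16/lo16
+dnl pair; the PIC variant first obtains the PC via "bcl 20, 31" and then
+dnl addresses the symbol relative to that point.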
+ +define(`ASM_START',`') + +dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,toc]) +dnl EPILOGUE_cpu(GSYM_PREFIX`'foo) +dnl + +define(`PROLOGUE_cpu', +m4_assert_numargs_range(1,2) +`ifelse(`$2',toc,, +`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter')')')dnl + .text + .globl $1 + .align 3 +$1:') + +define(`EPILOGUE_cpu', +m4_assert_numargs(1)) + + +dnl LEA -- Load Effective Address. + +define(`LEA', +m4_assert_numargs(2) +`ifdef(`PIC', +` mflr r0 C save return address + bcl 20, 31, 1f +1: mflr $1 + addis $1, $1, ha16($2-1b) + la $1, lo16($2-1b)($1) + mtlr r0 C restore return address +',` + lis $1, ha16($2) + la $1, lo16($2)($1) +')') + +define(`LEAL', +m4_assert_numargs(2) +`LEA($1,$2)') + + +define(`EXTERN', +m4_assert_numargs(1) +`dnl') + +define(`DEF_OBJECT', +m4_assert_numargs_range(1,2) +` .const + ALIGN(ifelse($#,1,2,$2)) +$1: +') + +define(`END_OBJECT', +m4_assert_numargs(1)) + +define(`ASM_END', `dnl') + +ifdef(`PIC',` +define(`PIC_SLOW')') + +divert diff --git a/gmp-6.3.0/mpn/powerpc32/diveby3.asm b/gmp-6.3.0/mpn/powerpc32/diveby3.asm new file mode 100644 index 0000000..288a7d3 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/diveby3.asm @@ -0,0 +1,93 @@ +dnl PowerPC-32 mpn_divexact_by3 -- mpn by 3 exact division + +dnl Copyright 2002, 2003, 2005, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? +C 604e: 5 +C 75x (G3): ? +C 7400,7410 (G4): 8 +C 744x,745x (G4+): 6 +C power4/ppc970: 12 +C power5: ? + +C void mpn_divexact_by3 (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C We avoid the slow subfe instruction and instead rely on an extremely unlikely +C branch. +C +C The mullw has the inverse in the first operand, since 0xAA..AB won't allow +C any early-out. The src[] data normally won't either, but there's at least +C a chance, whereas 0xAA..AB never will. If, for instance, src[] is all +C zeros (not a sensible input of course) we run at 7.0 c/l on ppc750. +C +C The mulhwu has the "3" multiplier in the second operand, which lets 750 and +C 7400 use an early-out. 
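+C
+C As an illustrative C sketch (not from the GMP sources), one limb step of
+C the loop below computes, with b the borrow out of the subtraction:
+C
+C   l = up[i] - c;                        /* b = (c > up[i]) */
+C   q = l * 0xAAAAAAABu;                  /* 0xAAAAAAAB = 1/3 mod 2^32 */
+C   rp[i] = q;
+C   c = (uint32_t) ((3 * (uint64_t) q) >> 32) + b;  /* carry to next limb */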
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`cy', `r6')
+
+ASM_START()
+PROLOGUE(mpn_divexact_by3c)
+ lwz r11, 0(up)
+ mtctr n
+ lis r12, 0xAAAA
+ ori r12, r12, 0xAAAB
+ li r10, 3
+
+ cmplw cr7, cy, r11
+ subf r11, cy, r11
+
+ mullw r0, r11, r12
+ stw r0, 0(rp)
+ bdz L(one)
+
+L(top): lwzu r9, 4(up)
+ mulhwu r7, r0, r10
+ bgt- cr7, L(adj) C very unlikely branch
+L(bko): cmplw cr7, r7, r9
+ subf r0, r7, r9
+ mullw r0, r12, r0
+ stwu r0, 4(rp)
+ bdnz L(top)
+
+L(one): mulhwu r3, r0, r10
+ blelr+ cr7
+ addi r3, r3, 1
+ blr
+
+L(adj): addi r7, r7, 1
+ b L(bko)
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/powerpc32/divrem_2.asm b/gmp-6.3.0/mpn/powerpc32/divrem_2.asm
new file mode 100644
index 0000000..74423f4
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc32/divrem_2.asm
@@ -0,0 +1,182 @@
+dnl PPC-32 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl Copyright 2007, 2008, 2012, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C norm frac
+C 7410 ~36.5 ~36.5
+C 744x, 745x 29 29
+
+C INPUT PARAMETERS
+C qp = r3
+C fn = r4
+C up = r5
+C un = r6
+C d = r7
+
+C TODO
+C * Decrease register usage.
+C * Make sure mul operands are optimal for early-out.
+C * Check that things work well for a shared library build.
+C * Write an invert_limb, perhaps inline, perhaps as a private call. Or at
+C least vastly improve the current __udiv_qrnnd_c based code.
+
+
+ASM_START()
+PROLOGUE(mpn_divrem_2)
+ stwu r1, -32(r1)
+ slwi r0, r6, 2
+ add r5, r5, r0
+ stmw r28, 8(r1)
+ addi r29, r5, -8 C up = up_param + un - 2
+ lwz r10, 4(r7)
+ lwz r12, 4(r29)
+ addi r8, r3, -12
+ lwz r7, 0(r7)
+ cmplw cr7, r12, r10
+ lwz r28, 0(r29)
+ blt- cr7, L(2)
+ bgt+ cr7, L(4)
+ cmplw cr7, r28, r7
+ blt- cr7, L(2)
+L(4): subfc r28, r7, r28
+ subfe r12, r10, r12
+ li r3, 1
+ b L(6)
+L(2): li r3, 0
+
+L(6): add r0, r4, r6
+ addic. 
r30, r0, -2 + ble- cr0, L(ret) + + slwi r9, r0, 2 + add r8, r8, r9 C rp += un + fn + mtctr r30 + +C Compute di from d1 + srwi r11, r10, 16 + nor r0, r10, r10 + divwu r31, r0, r11 + rlwinm r5, r10, 0, 16, 31 + mullw r9, r11, r31 + mullw r6, r5, r31 + subf r0, r9, r0 + slwi r0, r0, 16 + ori r0, r0, 65535 + cmplw cr7, r0, r6 + bge- cr7, L(9) + add r0, r0, r10 + cmplw cr7, r0, r10 + cmplw cr6, r6, r0 + addi r31, r31, -1 C q1-- + crorc 28, 28, 25 + blt+ cr7, L(9) + addi r31, r31, -1 C q1-- + add r0, r0, r10 +L(9): subf r0, r6, r0 + divwu r6, r0, r11 + mullw r9, r11, r6 + mullw r11, r5, r6 + subf r0, r9, r0 + slwi r0, r0, 16 + ori r0, r0, 65535 + cmplw cr7, r0, r11 + bge- cr7, L(13) + add r0, r0, r10 + cmplw cr7, r0, r10 + cmplw cr6, r11, r0 + addi r6, r6, -1 C q0-- + crorc 28, 28, 25 + blt+ cr7, L(13) +C add r0, r0, r10 C final remainder + addi r6, r6, -1 C q0-- +L(13): rlwimi r6, r31, 16, 0, 15 C assemble final quotient + +C Adjust di by including d0 + mullw r9, r10, r6 C t0 = LO(di * d1) + addc r11, r9, r7 + subfe r0, r1, r1 + mulhwu r9, r6, r7 C s1 = HI(di * d0) + addc r9, r11, r9 + addze. r0, r0 + blt cr0, L(17) +L(18): subfc r9, r10, r9 + addi r6, r6, -1 + addme. r0, r0 + bge+ cr0, L(18) +L(17): + +C r0 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r28 r29 r30 r31 +C msl di d0 qp d1 fn up un +L(loop): + mullw r0, r12, r6 C q0 = LO(n2 * di) + cmpw cr7, r30, r4 + addc r31, r0, r28 C q0 += n1 + mulhwu r9, r12, r6 C q = HI(n2 * di) + adde r12, r9, r12 C q += n2 + addi r30, r30, -1 + mullw r0, r10, r12 C d1 * q + li r9, 0 + subf r0, r0, r28 C n1 -= d1 * q + addi r5, r12, 1 + ble- cr7, L(23) + lwzu r9, -4(r29) +L(23): mullw r11, r12, r7 C t0 = LO(d0 * q) + subfc r28, r7, r9 C n0 -= d0 + subfe r0, r10, r0 C n1 -= d1 + mulhwu r12, r12, r7 C t1 = HI(d0 * q) + subfc r28, r11, r28 C n0 -= t0 + subfe r12, r12, r0 C n1 -= t1 + cmplw cr7, r12, r31 + blt+ cr7, L(24) + addc r28, r28, r7 + adde r12, r12, r10 + addi r5, r5, -1 +L(24): cmplw cr7, r12, r10 + bge- cr7, L(fix) +L(bck): stw r5, 0(r8) + addi r8, r8, -4 + bdnz L(loop) + +L(ret): stw r28, 0(r29) + stw r12, 4(r29) + lmw r28, 8(r1) + addi r1, r1, 32 + blr + +L(fix): cmplw cr6, r28, r7 + bgt+ cr7, L(28) + blt- cr6, L(bck) +L(28): subfc r28, r7, r28 + subfe r12, r10, r12 + addi r5, r5, 1 + b L(bck) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/eabi.m4 b/gmp-6.3.0/mpn/powerpc32/eabi.m4 new file mode 100644 index 0000000..cd7633c --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/eabi.m4 @@ -0,0 +1,86 @@ +divert(-1) +dnl m4 macros for powerpc32 eABI assembly. + +dnl Copyright 2003, 2005, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+define(`ASM_START',`')
+
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+ `
+ .section ".text"
+ .align 3
+ .globl $1
+ .type $1, @function
+$1:')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+` .size $1, .-$1')
+
+dnl This ought to support PIC, but it is unclear how that is done for eABI
+define(`LEA',
+m4_assert_numargs(2)
+`
+ lis $1, $2@ha
+ la $1, $2@l($1)
+')
+
+define(`EXTERN',
+m4_assert_numargs(1)
+`dnl')
+
+define(`DEF_OBJECT',
+m4_assert_numargs_range(1,2)
+`
+ .section .rodata
+ ALIGN(ifelse($#,1,2,$2))
+ .type $1, @object
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1)
+` .size $1, .-$1')
+
+define(`ASM_END', `dnl')
+
+ifdef(`PIC',`
+define(`PIC_SLOW')')
+
+dnl 64-bit "long long" parameters are put in an even-odd register pair,
+dnl skipping an even register if one would otherwise have been next in turn.
+dnl I wish somebody could explain why that is a good idea.
+define(`BROKEN_LONGLONG_PARAM')
+
+divert
diff --git a/gmp-6.3.0/mpn/powerpc32/elf.m4 b/gmp-6.3.0/mpn/powerpc32/elf.m4
new file mode 100644
index 0000000..1ed9c12
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc32/elf.m4
@@ -0,0 +1,100 @@
+divert(-1)
+dnl m4 macros for powerpc32 GNU/Linux assembly.
+
+dnl Copyright 2003, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
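+
+dnl The LEA macro below loads a symbol's address directly via an @ha/@l
+dnl pair or, under PIC, by locating _GLOBAL_OFFSET_TABLE_ with a
+dnl "bcl 20, 31" PC-relative sequence and fetching the symbol's @got entry.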
+
+define(`ASM_START',`')
+
+dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,toc])
+dnl EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs_range(1,2)
+`ifelse(`$2',toc,,
+`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter')')')dnl
+ .section ".text"
+ .align 3
+ .globl $1
+ .type $1, @function
+$1:')

+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+` .size $1, .-$1')
+
+define(`LEA',
+m4_assert_numargs(2)
+`ifdef(`PIC',`
+ mflr r0
+ bcl 20, 31, 1f
+1: mflr $1
+ addis $1, $1, (_GLOBAL_OFFSET_TABLE_-1b)@ha
+ addi $1, $1, (_GLOBAL_OFFSET_TABLE_-1b)@l
+ mtlr r0
+ lwz $1, $2@got($1)
+',`
+ lis $1, $2@ha
+ la $1, $2@l($1)
+')')
+
+
+define(`LEAL',
+m4_assert_numargs(2)
+`LEA($1,$2)')
+
+
+define(`EXTERN',
+m4_assert_numargs(1)
+`dnl')
+
+define(`DEF_OBJECT',
+m4_assert_numargs_range(1,2)
+`
+ .section .rodata
+ ALIGN(ifelse($#,1,2,$2))
+ .type $1, @object
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1)
+` .size $1, .-$1')
+
+define(`ASM_END', `dnl')
+
+ifdef(`PIC',`
+define(`PIC_SLOW')')
+
+dnl 64-bit "long long" parameters are put in an even-odd register pair,
+dnl skipping an even register if one would otherwise have been next in turn.
+dnl I wish somebody could explain why that is a good idea.
+define(`BROKEN_LONGLONG_PARAM')
+
+divert
diff --git a/gmp-6.3.0/mpn/powerpc32/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc32/gmp-mparam.h
new file mode 100644
index 0000000..e835a39
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc32/gmp-mparam.h
@@ -0,0 +1,222 @@
+/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2004, 2008-2010, 2014, 2015 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* This file is supposed to be used for 604, 604e, 744x/745x/747x (G4+), i.e.,
+ 32-bit PowerPC processors with reasonably fast integer multiply insns. The
+ values below are chosen to be best for the latter processors, since 604 is
+ largely irrelevant today.
+
+ In mpn/powerpc32/750/gmp-mparam.h there are values for 75x (G3) and for
+ 7400/7410 (G4), both of which have much slower multiply instructions.
*/ + +/* 1417 MHz PPC 7447A */ +/* FFT tuning limit = 15 M */ +/* Generated by tuneup.c, 2015-10-08, gcc 4.6 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 1 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 45 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 18 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD 2 +#define DIV_QR_1_UNNORM_THRESHOLD 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 69 + +#define MUL_TOOM22_THRESHOLD 14 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 106 +#define MUL_TOOM6H_THRESHOLD 156 +#define MUL_TOOM8H_THRESHOLD 236 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 71 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 72 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 82 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 22 +#define SQR_TOOM3_THRESHOLD 74 +#define SQR_TOOM4_THRESHOLD 142 +#define SQR_TOOM6_THRESHOLD 190 +#define SQR_TOOM8_THRESHOLD 333 + +#define MULMID_TOOM42_THRESHOLD 32 + +#define MULMOD_BNM1_THRESHOLD 9 +#define SQRMOD_BNM1_THRESHOLD 13 + +#define MUL_FFT_MODF_THRESHOLD 284 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 284, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ + { 9, 5}, { 19, 6}, { 17, 7}, { 9, 6}, \ + { 20, 7}, { 11, 6}, { 23, 7}, { 13, 8}, \ + { 7, 7}, { 19, 8}, { 11, 7}, { 25, 9}, \ + { 7, 8}, { 15, 7}, { 33, 8}, { 19, 7}, \ + { 39, 8}, { 23, 7}, { 47, 8}, { 27, 9}, \ + { 15, 8}, { 39, 9}, { 23, 8}, { 47,10}, \ + { 15, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 95,10}, { 31, 9}, \ + { 71, 8}, { 143, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 135, 8}, { 271, 9}, { 143,10}, \ + { 79, 9}, { 159, 8}, { 319, 9}, { 175,10}, \ + { 95, 9}, { 191, 8}, { 383, 9}, { 207, 8}, \ + { 415,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \ + { 575,10}, { 159, 9}, { 319,10}, { 175,11}, \ + { 95,10}, { 191, 9}, { 383,10}, { 207, 9}, \ + { 415, 8}, { 831,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \ + { 1087,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 351, 9}, { 703,11}, \ + { 191,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447, 9}, { 895,10}, { 479, 9}, { 959,12}, \ + { 127,11}, { 255,10}, { 543, 9}, { 1087,11}, \ + { 287,10}, { 607,11}, { 319,10}, { 639,11}, \ + { 351,10}, { 703, 9}, { 1407,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,10}, { 831,11}, \ + { 447,10}, { 895,11}, { 479,10}, { 959,13}, \ + { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \ + { 607,12}, { 319,11}, { 639,10}, { 1279,11}, \ + { 703,10}, { 1407,12}, { 383,11}, { 831,12}, \ + { 447,11}, { 959,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1215,10}, { 2431,12}, \ + { 639,11}, { 1279,12}, { 703,11}, { 1407,13}, \ + { 383,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1471,13}, \ + { 767,12}, { 1599,13}, { 895,12}, { 1919,14}, \ + { 511,13}, { 1023,12}, { 2111,13}, { 1151,12}, \ + { 2431,13}, { 1407,14}, { 767,13}, { 1535,12}, \ + { 3071,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define 
MUL_FFT_TABLE3_SIZE 164 +#define MUL_FFT_THRESHOLD 3712 + +#define SQR_FFT_MODF_THRESHOLD 248 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 248, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 17, 7}, { 9, 6}, { 20, 7}, { 11, 6}, \ + { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \ + { 11, 7}, { 25, 9}, { 7, 8}, { 15, 7}, \ + { 33, 8}, { 19, 7}, { 39, 8}, { 27, 9}, \ + { 15, 8}, { 39, 9}, { 23, 8}, { 47,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 39, 8}, \ + { 79, 9}, { 47,10}, { 31, 9}, { 63, 8}, \ + { 127, 9}, { 71, 8}, { 143, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 127, 8}, { 255, 7}, { 511, 9}, { 143,10}, \ + { 79, 9}, { 159, 8}, { 319, 9}, { 175, 8}, \ + { 351,10}, { 95, 9}, { 191, 8}, { 383, 9}, \ + { 207, 8}, { 415, 7}, { 831,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \ + { 287, 8}, { 575,10}, { 159, 9}, { 319,10}, \ + { 175, 9}, { 351,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207, 9}, { 415, 8}, { 831,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 351, 9}, \ + { 703,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 415, 9}, { 831,11}, { 223,10}, { 447, 9}, \ + { 895,12}, { 127,11}, { 255,10}, { 543,11}, \ + { 287,10}, { 607,11}, { 319,10}, { 639,11}, \ + { 351,10}, { 703, 9}, { 1407,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,10}, { 831,11}, \ + { 447,10}, { 895,11}, { 479,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \ + { 319,11}, { 639,10}, { 1279,11}, { 703,10}, \ + { 1407,12}, { 383,11}, { 831,12}, { 447,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,12}, { 639,11}, { 1279,12}, \ + { 703,11}, { 1407,13}, { 383,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1215,13}, { 639,12}, \ + { 1471,13}, { 767,12}, { 1599,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2111,13}, \ + { 1151,12}, { 2431,13}, { 1407,14}, { 767,13}, \ + { 1535,12}, { 3199,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 157 +#define SQR_FFT_THRESHOLD 2688 + +#define MULLO_BASECASE_THRESHOLD 2 +#define MULLO_DC_THRESHOLD 50 +#define MULLO_MUL_N_THRESHOLD 6633 +#define SQRLO_BASECASE_THRESHOLD 4 +#define SQRLO_DC_THRESHOLD 115 +#define SQRLO_SQR_THRESHOLD 5274 + +#define DC_DIV_QR_THRESHOLD 43 +#define DC_DIVAPPR_Q_THRESHOLD 141 +#define DC_BDIV_QR_THRESHOLD 51 +#define DC_BDIV_Q_THRESHOLD 120 + +#define INV_MULMOD_BNM1_THRESHOLD 43 +#define INV_NEWTON_THRESHOLD 173 +#define INV_APPR_THRESHOLD 156 + +#define BINV_NEWTON_THRESHOLD 204 +#define REDC_1_TO_REDC_N_THRESHOLD 51 + +#define MU_DIV_QR_THRESHOLD 1017 +#define MU_DIVAPPR_Q_THRESHOLD 1078 +#define MUPI_DIV_QR_THRESHOLD 84 +#define MU_BDIV_QR_THRESHOLD 872 +#define MU_BDIV_Q_THRESHOLD 1078 + +#define POWM_SEC_TABLE 1,16,102,428,1378 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 27 +#define SET_STR_DC_THRESHOLD 781 +#define SET_STR_PRECOMPUTE_THRESHOLD 1505 + +#define FAC_DSC_THRESHOLD 141 +#define FAC_ODD_THRESHOLD 34 + +#define MATRIX22_STRASSEN_THRESHOLD 12 +#define HGCD_THRESHOLD 118 +#define HGCD_APPR_THRESHOLD 161 +#define HGCD_REDUCE_THRESHOLD 1679 +#define GCD_DC_THRESHOLD 351 +#define GCDEXT_DC_THRESHOLD 273 +#define JACOBI_BASE_METHOD 4 diff --git a/gmp-6.3.0/mpn/powerpc32/invert_limb.asm b/gmp-6.3.0/mpn/powerpc32/invert_limb.asm new file mode 100644 index 0000000..612bfe5 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/invert_limb.asm @@ -0,0 +1,142 @@ +dnl PowerPC-32 mpn_invert_limb -- Invert a normalized limb. 
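The contract of this routine, as a plain C reference (a sketch using 64-bit division; the assembly instead seeds from the approx_tab lookup and refines with Newton steps to avoid a hardware divide):

#include <stdint.h>

/* For a normalized divisor d (bit 31 set), mpn_invert_limb returns
   floor((2^64 - 1) / d) - 2^32.  For such d the quotient lies in
   [2^32 + 1, 2^33 - 1], so the result fits in 32 bits. */
uint32_t invert_limb_ref (uint32_t d)
{
  uint64_t q = ~(uint64_t) 0 / d;           /* floor((2^64-1)/d) */
  return (uint32_t) (q - ((uint64_t) 1 << 32));
}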
+ +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? +C 604e: ? +C 75x (G3): ? +C 7400,7410 (G4): ? +C 744x,745x (G4+): 32 +C power4/ppc970: ? +C power5: ? + +EXTERN(approx_tab) + +ASM_START() +PROLOGUE(mpn_invert_limb) + rlwinm r6, r3, 11, 22, 30 C extract bits 30..22 to pos 2^1 + srwi r10, r3, 11 C extract bits 31..11 + LEA( r9, approx_tab) C N.B. clobbers r0 for ELF and Darwin + lhzx r9, r9, r6 C w2 + addi r0, r10, 1 + mullw r11, r9, r9 + slwi r9, r9, 4 + mulhwu r7, r11, r0 + rlwinm r11, r3, 0, 31, 31 C extract bit 0 + addi r0, r9, -1 + srwi r9, r3, 1 C d >> 1 + subf r0, r7, r0 C w1 + add r9, r9, r11 C d31 + mullw r9, r0, r9 C w1 * d31 + srwi r10, r0, 1 C w1 >> 1 + neg r11, r11 + and r11, r10, r11 + subf r11, r9, r11 + mulhwu r9, r11, r0 + slwi r0, r0, 15 + srwi r9, r9, 1 + add r0, r9, r0 C w0 + mullw r10, r0, r3 + mulhwu r9, r0, r3 + addc r11, r10, r3 + adde r3, r9, r3 + subf r3, r3, r0 + blr +EPILOGUE() + +DEF_OBJECT(approx_tab) + .short 0x7fe1,0x7fa1,0x7f61,0x7f22,0x7ee3,0x7ea4,0x7e65,0x7e27 + .short 0x7de9,0x7dab,0x7d6d,0x7d30,0x7cf3,0x7cb6,0x7c79,0x7c3d + .short 0x7c00,0x7bc4,0x7b89,0x7b4d,0x7b12,0x7ad7,0x7a9c,0x7a61 + .short 0x7a27,0x79ec,0x79b2,0x7979,0x793f,0x7906,0x78cc,0x7894 + .short 0x785b,0x7822,0x77ea,0x77b2,0x777a,0x7742,0x770b,0x76d3 + .short 0x769c,0x7665,0x762f,0x75f8,0x75c2,0x758c,0x7556,0x7520 + .short 0x74ea,0x74b5,0x7480,0x744b,0x7416,0x73e2,0x73ad,0x7379 + .short 0x7345,0x7311,0x72dd,0x72aa,0x7277,0x7243,0x7210,0x71de + .short 0x71ab,0x7179,0x7146,0x7114,0x70e2,0x70b1,0x707f,0x704e + .short 0x701c,0x6feb,0x6fba,0x6f8a,0x6f59,0x6f29,0x6ef9,0x6ec8 + .short 0x6e99,0x6e69,0x6e39,0x6e0a,0x6ddb,0x6dab,0x6d7d,0x6d4e + .short 0x6d1f,0x6cf1,0x6cc2,0x6c94,0x6c66,0x6c38,0x6c0a,0x6bdd + .short 0x6bb0,0x6b82,0x6b55,0x6b28,0x6afb,0x6acf,0x6aa2,0x6a76 + .short 0x6a49,0x6a1d,0x69f1,0x69c6,0x699a,0x696e,0x6943,0x6918 + .short 0x68ed,0x68c2,0x6897,0x686c,0x6842,0x6817,0x67ed,0x67c3 + .short 0x6799,0x676f,0x6745,0x671b,0x66f2,0x66c8,0x669f,0x6676 + .short 0x664d,0x6624,0x65fc,0x65d3,0x65aa,0x6582,0x655a,0x6532 + .short 0x650a,0x64e2,0x64ba,0x6493,0x646b,0x6444,0x641c,0x63f5 + .short 0x63ce,0x63a7,0x6381,0x635a,0x6333,0x630d,0x62e7,0x62c1 + .short 0x629a,0x6275,0x624f,0x6229,0x6203,0x61de,0x61b8,0x6193 + .short 0x616e,0x6149,0x6124,0x60ff,0x60da,0x60b6,0x6091,0x606d + .short 0x6049,0x6024,0x6000,0x5fdc,0x5fb8,0x5f95,0x5f71,0x5f4d + .short 0x5f2a,0x5f07,0x5ee3,0x5ec0,0x5e9d,0x5e7a,0x5e57,0x5e35 + .short 
0x5e12,0x5def,0x5dcd,0x5dab,0x5d88,0x5d66,0x5d44,0x5d22 + .short 0x5d00,0x5cde,0x5cbd,0x5c9b,0x5c7a,0x5c58,0x5c37,0x5c16 + .short 0x5bf5,0x5bd4,0x5bb3,0x5b92,0x5b71,0x5b51,0x5b30,0x5b10 + .short 0x5aef,0x5acf,0x5aaf,0x5a8f,0x5a6f,0x5a4f,0x5a2f,0x5a0f + .short 0x59ef,0x59d0,0x59b0,0x5991,0x5972,0x5952,0x5933,0x5914 + .short 0x58f5,0x58d6,0x58b7,0x5899,0x587a,0x585b,0x583d,0x581f + .short 0x5800,0x57e2,0x57c4,0x57a6,0x5788,0x576a,0x574c,0x572e + .short 0x5711,0x56f3,0x56d5,0x56b8,0x569b,0x567d,0x5660,0x5643 + .short 0x5626,0x5609,0x55ec,0x55cf,0x55b2,0x5596,0x5579,0x555d + .short 0x5540,0x5524,0x5507,0x54eb,0x54cf,0x54b3,0x5497,0x547b + .short 0x545f,0x5443,0x5428,0x540c,0x53f0,0x53d5,0x53b9,0x539e + .short 0x5383,0x5368,0x534c,0x5331,0x5316,0x52fb,0x52e0,0x52c6 + .short 0x52ab,0x5290,0x5276,0x525b,0x5240,0x5226,0x520c,0x51f1 + .short 0x51d7,0x51bd,0x51a3,0x5189,0x516f,0x5155,0x513b,0x5121 + .short 0x5108,0x50ee,0x50d5,0x50bb,0x50a2,0x5088,0x506f,0x5056 + .short 0x503c,0x5023,0x500a,0x4ff1,0x4fd8,0x4fbf,0x4fa6,0x4f8e + .short 0x4f75,0x4f5c,0x4f44,0x4f2b,0x4f13,0x4efa,0x4ee2,0x4eca + .short 0x4eb1,0x4e99,0x4e81,0x4e69,0x4e51,0x4e39,0x4e21,0x4e09 + .short 0x4df1,0x4dda,0x4dc2,0x4daa,0x4d93,0x4d7b,0x4d64,0x4d4d + .short 0x4d35,0x4d1e,0x4d07,0x4cf0,0x4cd8,0x4cc1,0x4caa,0x4c93 + .short 0x4c7d,0x4c66,0x4c4f,0x4c38,0x4c21,0x4c0b,0x4bf4,0x4bde + .short 0x4bc7,0x4bb1,0x4b9a,0x4b84,0x4b6e,0x4b58,0x4b41,0x4b2b + .short 0x4b15,0x4aff,0x4ae9,0x4ad3,0x4abd,0x4aa8,0x4a92,0x4a7c + .short 0x4a66,0x4a51,0x4a3b,0x4a26,0x4a10,0x49fb,0x49e5,0x49d0 + .short 0x49bb,0x49a6,0x4990,0x497b,0x4966,0x4951,0x493c,0x4927 + .short 0x4912,0x48fe,0x48e9,0x48d4,0x48bf,0x48ab,0x4896,0x4881 + .short 0x486d,0x4858,0x4844,0x482f,0x481b,0x4807,0x47f3,0x47de + .short 0x47ca,0x47b6,0x47a2,0x478e,0x477a,0x4766,0x4752,0x473e + .short 0x472a,0x4717,0x4703,0x46ef,0x46db,0x46c8,0x46b4,0x46a1 + .short 0x468d,0x467a,0x4666,0x4653,0x4640,0x462c,0x4619,0x4606 + .short 0x45f3,0x45e0,0x45cd,0x45ba,0x45a7,0x4594,0x4581,0x456e + .short 0x455b,0x4548,0x4536,0x4523,0x4510,0x44fe,0x44eb,0x44d8 + .short 0x44c6,0x44b3,0x44a1,0x448f,0x447c,0x446a,0x4458,0x4445 + .short 0x4433,0x4421,0x440f,0x43fd,0x43eb,0x43d9,0x43c7,0x43b5 + .short 0x43a3,0x4391,0x437f,0x436d,0x435c,0x434a,0x4338,0x4327 + .short 0x4315,0x4303,0x42f2,0x42e0,0x42cf,0x42bd,0x42ac,0x429b + .short 0x4289,0x4278,0x4267,0x4256,0x4244,0x4233,0x4222,0x4211 + .short 0x4200,0x41ef,0x41de,0x41cd,0x41bc,0x41ab,0x419a,0x418a + .short 0x4179,0x4168,0x4157,0x4147,0x4136,0x4125,0x4115,0x4104 + .short 0x40f4,0x40e3,0x40d3,0x40c2,0x40b2,0x40a2,0x4091,0x4081 + .short 0x4071,0x4061,0x4050,0x4040,0x4030,0x4020,0x4010,0x4000 +END_OBJECT(approx_tab) +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc32/lshift.asm b/gmp-6.3.0/mpn/powerpc32/lshift.asm new file mode 100644 index 0000000..ce85d4d --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/lshift.asm @@ -0,0 +1,168 @@ +dnl PowerPC-32 mpn_lshift -- Shift a number left. + +dnl Copyright 1995, 1998, 2000, 2002-2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C 603e:            ?
+C 604e:            3.0
+C 75x (G3):        3.0
+C 7400,7410 (G4):  3.0
+C 7445,7455 (G4+): 2.5
+C 7447,7457 (G4+): 2.25
+C power4/ppc970:   2.5
+C power5:          2.5
+
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C n	r5
+C cnt	r6
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+	cmpwi	cr0, r5, 30	C more than 30 limbs?
+	slwi	r0, r5, 2
+	add	r4, r4, r0	C make r4 point at end of s1
+	add	r7, r3, r0	C make r7 point at end of res
+	bgt	L(BIG)		C branch if more than 30 limbs
+
+	mtctr	r5		C copy size into CTR
+	subfic	r8, r6, 32
+	lwzu	r11, -4(r4)	C load first s1 limb
+	srw	r3, r11, r8	C compute function return value
+	bdz	L(end1)
+
+L(oop):	lwzu	r10, -4(r4)
+	slw	r9, r11, r6
+	srw	r12, r10, r8
+	or	r9, r9, r12
+	stwu	r9, -4(r7)
+	bdz	L(end2)
+	lwzu	r11, -4(r4)
+	slw	r9, r10, r6
+	srw	r12, r11, r8
+	or	r9, r9, r12
+	stwu	r9, -4(r7)
+	bdnz	L(oop)
+
+L(end1):
+	slw	r0, r11, r6
+	stw	r0, -4(r7)
+	blr
+L(end2):
+	slw	r0, r10, r6
+	stw	r0, -4(r7)
+	blr
+
+L(BIG):
+	stwu	r1, -48(r1)
+	stmw	r24, 8(r1)	C save registers we are supposed to preserve
+	lwzu	r9, -4(r4)
+	subfic	r8, r6, 32
+	srw	r3, r9, r8	C compute function return value
+	slw	r0, r9, r6
+	addi	r5, r5, -1
+
+	andi.	r10, r5, 3	C count for spill loop
+	beq	L(e)
+	mtctr	r10
+	lwzu	r28, -4(r4)
+	bdz	L(xe0)
+
+L(loop0):
+	slw	r12, r28, r6
+	srw	r24, r28, r8
+	lwzu	r28, -4(r4)
+	or	r24, r0, r24
+	stwu	r24, -4(r7)
+	mr	r0, r12
+	bdnz	L(loop0)	C taken at most once!
+
+L(xe0):	slw	r12, r28, r6
+	srw	r24, r28, r8
+	or	r24, r0, r24
+	stwu	r24, -4(r7)
+	mr	r0, r12
+
+L(e):	srwi	r5, r5, 2	C count for unrolled loop
+	addi	r5, r5, -1
+	mtctr	r5
+	lwz	r28, -4(r4)
+	lwz	r29, -8(r4)
+	lwz	r30, -12(r4)
+	lwzu	r31, -16(r4)
+
+L(loopU):
+	slw	r9, r28, r6
+	srw	r24, r28, r8
+	lwz	r28, -4(r4)
+	slw	r10, r29, r6
+	srw	r25, r29, r8
+	lwz	r29, -8(r4)
+	slw	r11, r30, r6
+	srw	r26, r30, r8
+	lwz	r30, -12(r4)
+	slw	r12, r31, r6
+	srw	r27, r31, r8
+	lwzu	r31, -16(r4)
+	or	r24, r0, r24
+	stw	r24, -4(r7)
+	or	r25, r9, r25
+	stw	r25, -8(r7)
+	or	r26, r10, r26
+	stw	r26, -12(r7)
+	or	r27, r11, r27
+	stwu	r27, -16(r7)
+	mr	r0, r12
+	bdnz	L(loopU)
+
+	slw	r9, r28, r6
+	srw	r24, r28, r8
+	slw	r10, r29, r6
+	srw	r25, r29, r8
+	slw	r11, r30, r6
+	srw	r26, r30, r8
+	slw	r12, r31, r6
+	srw	r27, r31, r8
+	or	r24, r0, r24
+	stw	r24, -4(r7)
+	or	r25, r9, r25
+	stw	r25, -8(r7)
+	or	r26, r10, r26
+	stw	r26, -12(r7)
+	or	r27, r11, r27
+	stw	r27, -16(r7)
+
+	stw	r12, -20(r7)
+	lmw	r24, 8(r1)	C restore registers
+	addi	r1, r1, 48
+	blr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc32/lshiftc.asm b/gmp-6.3.0/mpn/powerpc32/lshiftc.asm
new file mode 100644
index 0000000..b683def
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc32/lshiftc.asm
@@ -0,0 +1,170 @@
+dnl PowerPC-32 mpn_lshiftc.
+
+dnl Copyright 1995, 1998, 2000, 2002-2005, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C 603e:            ?
+C 604e:            3.0
+C 75x (G3):        3.0
+C 7400,7410 (G4):  3.0
+C 7445,7455 (G4+): 2.5
+C 7447,7457 (G4+): 2.25
+C power4/ppc970:   2.5
+C power5:          2.5
+
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C n	r5
+C cnt	r6
+
+ASM_START()
+PROLOGUE(mpn_lshiftc)
+	cmpwi	cr0, r5, 30	C more than 30 limbs?
+	slwi	r0, r5, 2
+	add	r4, r4, r0	C make r4 point at end of s1
+	add	r7, r3, r0	C make r7 point at end of res
+	bgt	L(BIG)		C branch if more than 30 limbs
+
+	mtctr	r5		C copy size into CTR
+	subfic	r8, r6, 32
+	lwzu	r11, -4(r4)	C load first s1 limb
+	srw	r3, r11, r8	C compute function return value
+	bdz	L(end1)
+
+L(oop):	lwzu	r10, -4(r4)
+	slw	r9, r11, r6
+	srw	r12, r10, r8
+	nor	r9, r9, r12
+	stwu	r9, -4(r7)
+	bdz	L(end2)
+	lwzu	r11, -4(r4)
+	slw	r9, r10, r6
+	srw	r12, r11, r8
+	nor	r9, r9, r12
+	stwu	r9, -4(r7)
+	bdnz	L(oop)
+
+L(end1):
+	slw	r0, r11, r6
+	nor	r0, r0, r0
+	stw	r0, -4(r7)
+	blr
+L(end2):
+	slw	r0, r10, r6
+	nor	r0, r0, r0
+	stw	r0, -4(r7)
+	blr
+
+L(BIG):
+	stwu	r1, -48(r1)
+	stmw	r24, 8(r1)	C save registers we are supposed to preserve
+	lwzu	r9, -4(r4)
+	subfic	r8, r6, 32
+	srw	r3, r9, r8	C compute function return value
+	slw	r0, r9, r6
+	addi	r5, r5, -1
+
+	andi.	r10, r5, 3	C count for spill loop
+	beq	L(e)
+	mtctr	r10
+	lwzu	r28, -4(r4)
+	bdz	L(xe0)
+
+L(loop0):
+	slw	r12, r28, r6
+	srw	r24, r28, r8
+	lwzu	r28, -4(r4)
+	nor	r24, r0, r24
+	stwu	r24, -4(r7)
+	mr	r0, r12
+	bdnz	L(loop0)	C taken at most once!
+ +L(xe0): slw r12, r28, r6 + srw r24, r28, r8 + nor r24, r0, r24 + stwu r24, -4(r7) + mr r0, r12 + +L(e): srwi r5, r5, 2 C count for unrolled loop + addi r5, r5, -1 + mtctr r5 + lwz r28, -4(r4) + lwz r29, -8(r4) + lwz r30, -12(r4) + lwzu r31, -16(r4) + +L(loopU): + slw r9, r28, r6 + srw r24, r28, r8 + lwz r28, -4(r4) + slw r10, r29, r6 + srw r25, r29, r8 + lwz r29, -8(r4) + slw r11, r30, r6 + srw r26, r30, r8 + lwz r30, -12(r4) + slw r12, r31, r6 + srw r27, r31, r8 + lwzu r31, -16(r4) + nor r24, r0, r24 + stw r24, -4(r7) + nor r25, r9, r25 + stw r25, -8(r7) + nor r26, r10, r26 + stw r26, -12(r7) + nor r27, r11, r27 + stwu r27, -16(r7) + mr r0, r12 + bdnz L(loopU) + + slw r9, r28, r6 + srw r24, r28, r8 + slw r10, r29, r6 + srw r25, r29, r8 + slw r11, r30, r6 + srw r26, r30, r8 + slw r12, r31, r6 + srw r27, r31, r8 + nor r24, r0, r24 + stw r24, -4(r7) + nor r25, r9, r25 + stw r25, -8(r7) + nor r26, r10, r26 + stw r26, -12(r7) + nor r27, r11, r27 + stw r27, -16(r7) + nor r12, r12, r12 + stw r12, -20(r7) + lmw r24, 8(r1) C restore registers + addi r1, r1, 48 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/mod_34lsub1.asm b/gmp-6.3.0/mpn/powerpc32/mod_34lsub1.asm new file mode 100644 index 0000000..6d7fe4d --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/mod_34lsub1.asm @@ -0,0 +1,145 @@ +dnl PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1. + +dnl Copyright 2002, 2003, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C 603e: ? +C 604e: 3 +C 75x (G3): 3 +C 7400,7410 (G4): 3 +C 744x,745x (G4+): 3 +C power4/ppc970: 2.5 +C power5: 2.5 + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size) +C +C There seems no need to schedule the loads back, the code is still 3.0 c/l +C on 750/7400 no matter where they're placed. +C +C Alternatives: +C +C Fetching half words would allow add instead for accumulating, instead of +C adde and its serialization. An outer loop would be required though, since +C 2^16 halfwords can overflow. lhz+add would be 2.0 c/l, but if there's +C also a bdz or bdnz for each and a pointer update say every three limbs +C then the total would be 2.67 c/l which isn't much faster than the current +C simpler code. 
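For reference, what the loop below accumulates: modulo 2^24-1 we have 2^32 == 2^8 and 2^96 == 1, so successive 32-bit limbs carry the repeating weights 1, 2^8, 2^16. That is why the code keeps three accumulators and recombines them with the rlwinm extracts after L(three). A C sketch of the contract (illustrative only; it reduces fully, while the assembly may return any value congruent to the input and lets the caller finish):

#include <stdint.h>

/* Return {src, size} reduced modulo 2^24 - 1. */
uint32_t mod_34lsub1_ref (const uint32_t *src, long size)
{
  const uint64_t m = 0xffffff;            /* 2^24 - 1 */
  uint64_t acc = 0, w = 1;                /* w = 2^(32*i) mod m */
  for (long i = 0; i < size; i++)
    {
      acc = (acc + (src[i] % m) * w) % m; /* both factors < 2^24 */
      w = (w << 8) % m;                   /* 2^32 == 2^8 (mod m) */
    }
  return (uint32_t) acc;
}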
+ +ASM_START() +PROLOGUE(mpn_mod_34lsub1) + + C r3 src + C r4 size + + mtctr r4 + addic r6, r3, 8 C &src[2], and clear CA + + lwz r3, 0(r3) C acc0 = src[0] + bdz L(done) + + lwz r4, -4(r6) C acc1 = src[1] + bdz L(two) + + lwz r5, 0(r6) C acc2 = src[2] + lis r7, 0 C no carry if just three limbs + + bdz L(three) + lis r7, 1 C 0x10000 carry pos + +L(top): + C r3 acc0 + C r4 acc1 + C r5 acc2 + C r6 src, incrementing + C r7 carry pos + + lwz r0, 4(r6) + adde r3, r3, r0 + bdz L(end0) + + lwz r0, 8(r6) + adde r4, r4, r0 + bdz L(end1) + + lwzu r0, 12(r6) + adde r5, r5, r0 + bdnz L(top) + + + srwi r7, r7, 8 +L(end0): + srwi r7, r7, 8 +L(end1): + subfe r0, r0, r0 C -1 if not CA + + andc r7, r7, r0 C final carry, 0x10000, 0x100, 1 or 0 +L(three): + rlwinm r6, r3, 0,8,31 C acc0 low + + add r7, r7, r6 + rlwinm r6, r3, 8,24,31 C acc0 high + + add r7, r7, r6 + rlwinm r6, r4, 8,8,23 C acc1 low + + add r7, r7, r6 + rlwinm r6, r4, 16,16,31 C acc1 high + + add r7, r7, r6 + rlwinm r6, r5, 16,8,15 C acc2 low + + add r7, r7, r6 + rlwinm r6, r5, 24,8,31 C acc2 high + + add r3, r7, r6 + +L(done): + blr + +L(two): + C r3 acc0 + C r4 acc1 + + rlwinm r5, r3, 8,24,31 C acc0 high + rlwinm r3, r3, 0,8,31 C acc0 low + + add r3, r3, r5 C acc0 high + low + rlwinm r5, r4, 16,16,31 C acc1 high + + add r3, r3, r5 C add acc1 high + rlwinm r5, r4, 8,8,23 C acc1 low + + add r3, r3, r5 C add acc1 low + + blr + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/mode1o.asm b/gmp-6.3.0/mpn/powerpc32/mode1o.asm new file mode 100644 index 0000000..e8a6b5e --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/mode1o.asm @@ -0,0 +1,127 @@ +dnl PowerPC-32 mpn_modexact_1_odd -- mpn by limb exact remainder. + +dnl Copyright 2002, 2003, 2005, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C 603e: ? +C 604e: 6.0 +C 75x (G3): 6.0-13.0, depending on divisor +C 7400,7410 (G4): 6.0-13.0, depending on divisor +C 744x,745x (G4+): 8.0-10.0, depending on divisor +C power4/ppc970: 12.0 +C power5: 12.0 + + +C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C +C For PIC, the inverse is established arithmetically since it measures about +C 5 cycles faster than the nonsense needed to access binvert_limb_table in +C SVR4 or Darwin style PIC. AIX might be better, since it avoids bl/mflr to +C get at the GOT/TOC/whatever. 
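The arithmetic establishment of the inverse is plain Newton (Hensel) iteration on a 2-adic inverse: if i*d == 1 (mod 2^k), then i*(2 - i*d) == 1 (mod 2^(2k)), so each step doubles the number of correct low bits; the code below writes that step as 2*i - i*i*d. A hedged C sketch of the idea, seeded with the standard (3*d)^2 identity (good to 5 bits) rather than the 4-bit xor construction or binvert_limb_table used below:

#include <stdint.h>

/* Build the inverse of an odd d modulo 2^32 by doubling correct bits. */
uint32_t binvert_ref (uint32_t d)    /* d must be odd */
{
  uint32_t v = (d * 3) ^ 2;          /* correct to 5 bits */
  v *= 2 - v * d;                    /* 10 bits */
  v *= 2 - v * d;                    /* 20 bits */
  v *= 2 - v * d;                    /* 40 >= 32 bits */
  return v;                          /* v * d == 1 (mod 2^32) */
}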
+C +C Using divwu for size==1 measured about 10 cycles slower on 604e, or about +C 3-5 cycles faster on 750. For now it doesn't seem worth bothering with. +C +C The loop allows an early-out on mullw for the inverse, and on mulhwu for +C the divisor. So the fastest is for instance divisor==1 (inverse==-1), and +C the slowest is anything giving a full 32-bits in both, such as +C divisor==0xDEADBEEF (inverse==0x904B300F). These establish the stated +C range above for 750 and 7400. + + +ASM_START() + +EXTERN(binvert_limb_table) + +PROLOGUE(mpn_modexact_1_odd) + li r6, 0 + +PROLOGUE(mpn_modexact_1c_odd) + + mtctr r4 C size + +ifdef(`PIC_SLOW',` +C Load from our table with PIC is so slow on Linux and Darwin that we avoid it + rlwinm r7, r5, 1,28,28 C (divisor << 1) & 8 + rlwinm r8, r5, 2,28,28 C (divisor << 2) & 8 + xor r7, r7, r8 C ((divisor << 1) ^ (divisor << 2)) & 8 + rlwinm r4, r5, 0,28,31 C divisor low 4 bits, speedup mullw + xor r4, r4, r7 C inverse, 4 bits + mullw r7, r4, r4 C i*i + slwi r4, r4, 1 C 2*i + rlwinm r8, r5, 0,24,31 C divisor low 8 bits, speedup mullw + mullw r7, r7, r8 C i*i*d + sub r4, r4, r7 C inverse, 8 bits +',` + LEA( r7, binvert_limb_table) + rlwinm r4, r5, 31,25,31 C (divisor/2) & 0x7F + lbzx r4, r4,r7 C inverse, 8 bits +') + + mullw r7, r4, r4 C i*i + slwi r4, r4, 1 C 2*i + mullw r7, r5, r7 C i*i*d [i*i is 16 bits, so second operand] + sub r4, r4, r7 C inverse, 16 bits + mullw r7, r4, r4 C i*i + slwi r4, r4, 1 C 2*i + mullw r7, r7, r5 C i*i*d + lwz r0, 0(r3) C src[0] + sub r4, r4, r7 C inverse, 32 bits + subfc r7, r6, r0 C l = src[0] - carry + + mullw r7, r7, r4 C q = l * inverse + bdz L(one) + + lwzu r0, 4(r3) C src[1] + mulhwu r6, r7, r5 C carry = high(q*divisor) + subfe r7, r6, r0 C l = src[1] - carry + bdz L(two) + +L(top): + mullw r7, r7, r4 C q = l * inverse + lwzu r0, 4(r3) C src[i] + mulhwu r6, r7, r5 C carry = high(q*divisor) + subfe r7, r6, r0 C l = src[i] - carry + bdnz L(top) + +L(two): mullw r7, r7, r4 C q = l * inverse +L(one): subfe r3, r3, r3 C ca 0 or -1 + mulhwu r6, r7, r5 C carry = high(q*divisor) + subf r3, r3, r6 C carry + ca + blr + +EPILOGUE(mpn_modexact_1c_odd) +EPILOGUE(mpn_modexact_1_odd) +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc32/mul_1.asm b/gmp-6.3.0/mpn/powerpc32/mul_1.asm new file mode 100644 index 0000000..e42087c --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/mul_1.asm @@ -0,0 +1,101 @@ +dnl PowerPC-32 mpn_mul_1 -- Multiply a limb vector with a limb and store the +dnl result in a second limb vector. + +dnl Copyright 1995, 1997, 2000, 2002, 2003, 2005 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? +C 604e: 4.0 +C 75x (G3): 4.5-11 +C 7400,7410 (G4): 4.5-11 +C 744x,745x (G4+): 6.0 +C power4/ppc970: 6.0 +C power5: 5.63 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C n r5 +C vl r6 + +ASM_START() +PROLOGUE(mpn_mul_1) + mtctr r5 + addi r3,r3,-4 C adjust res_ptr, it's offset before it's used + li r12,0 C clear upper product reg + addic r0,r0,0 C clear cy +C Start software pipeline + lwz r8,0(r4) + bdz L(end3) + lwzu r9,4(r4) + mullw r11,r8,r6 + mulhwu r0,r8,r6 + bdz L(end1) +C Software pipelined main loop +L(loop): + lwz r8,4(r4) + mullw r10,r9,r6 + adde r5,r11,r12 + mulhwu r12,r9,r6 + stw r5,4(r3) + bdz L(end2) + lwzu r9,8(r4) + mullw r11,r8,r6 + adde r7,r10,r0 + mulhwu r0,r8,r6 + stwu r7,8(r3) + bdnz L(loop) +C Finish software pipeline +L(end1): + mullw r10,r9,r6 + adde r5,r11,r12 + mulhwu r12,r9,r6 + stw r5,4(r3) + adde r7,r10,r0 + stwu r7,8(r3) + addze r3,r12 + blr +L(end2): + mullw r11,r8,r6 + adde r7,r10,r0 + mulhwu r0,r8,r6 + stwu r7,8(r3) + adde r5,r11,r12 + stw r5,4(r3) + addze r3,r0 + blr +L(end3): + mullw r11,r8,r6 + stw r11,4(r3) + mulhwu r3,r8,r6 + blr +EPILOGUE(mpn_mul_1) diff --git a/gmp-6.3.0/mpn/powerpc32/p3-p7/aors_n.asm b/gmp-6.3.0/mpn/powerpc32/p3-p7/aors_n.asm new file mode 100644 index 0000000..3b6685e --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/p3-p7/aors_n.asm @@ -0,0 +1,187 @@ +dnl PowerPC-32 mpn_add_n/mpn_sub_n -- mpn addition and subtraction. + +dnl Copyright 1999-2001, 2003-2005, 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 1.5 +C POWER4/PPC970 2 +C POWER5 2 +C POWER6 2.78 +C POWER7 2.15-2.87 + +C This code is based on powerpc64/aors_n.asm. 
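Both entry points implement the usual mpn add/sub contract. As a C model (illustrative only, one limb at a time, while the assembly keeps the CA bit live across adde/subfe and handles four limbs per iteration):

#include <stdint.h>

/* Add the n-limb numbers {up, n} and {vp, n} into {rp, n} and return
   the carry out, 0 or 1. */
uint32_t mpn_add_n_ref (uint32_t *rp, const uint32_t *up,
                        const uint32_t *vp, long n)
{
  uint32_t cy = 0;                   /* carry in */
  for (long i = 0; i < n; i++)
    {
      uint64_t s = (uint64_t) up[i] + vp[i] + cy;
      rp[i] = (uint32_t) s;          /* low limb of the sum */
      cy = (uint32_t) (s >> 32);     /* carry out, 0 or 1 */
    }
  return cy;
}

mpn_sub_n has the same shape with a borrow propagated instead of a carry, returning 1 when {up, n} < {vp, n}.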
+ +C INPUT PARAMETERS +C rp r3 +C up r4 +C vp r5 +C n r6 + +ifdef(`OPERATION_add_n',` + define(ADDSUBC, adde) + define(ADDSUB, addc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc) + define(GENRVAL, `addi r3, r3, 1') + define(SETCBR, `addic r0, $1, -1') + define(CLRCB, `addic r0, r0, 0') +') +ifdef(`OPERATION_sub_n',` + define(ADDSUBC, subfe) + define(ADDSUB, subfc) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc) + define(GENRVAL, `neg r3, r3') + define(SETCBR, `subfic r0, $1, 0') + define(CLRCB, `addic r0, r1, -1') +') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ASM_START() +PROLOGUE(func_nc) + SETCBR(r7) + b L(ent) +EPILOGUE() + +PROLOGUE(func) + CLRCB +L(ent): stwu r1, -32(r1) + rlwinm. r0, r6, 0,30,31 C r0 = n & 3, set cr0 + cmpwi cr6, r0, 2 + stw r28, 8(r1) + addi r6, r6, 3 C compute count... + stw r29, 12(r1) + srwi r6, r6, 2 C ...for ctr + stw r30, 16(r1) + mtctr r6 C copy count into ctr + stw r31, 20(r1) + beq cr0, L(b00) + blt cr6, L(b01) + beq cr6, L(b10) + +L(b11): lwz r8, 0(r4) C load s1 limb + lwz r9, 0(r5) C load s2 limb + lwz r10, 4(r4) C load s1 limb + lwz r11, 4(r5) C load s2 limb + lwz r12, 8(r4) C load s1 limb + addi r4, r4, 12 + lwz r0, 8(r5) C load s2 limb + addi r5, r5, 12 + ADDSUBC r29, r9, r8 + ADDSUBC r30, r11, r10 + ADDSUBC r31, r0, r12 + stw r29, 0(r3) + stw r30, 4(r3) + stw r31, 8(r3) + addi r3, r3, 12 + bdnz L(go) + b L(ret) + +L(b01): lwz r12, 0(r4) C load s1 limb + addi r4, r4, 4 + lwz r0, 0(r5) C load s2 limb + addi r5, r5, 4 + ADDSUBC r31, r0, r12 C add + stw r31, 0(r3) + addi r3, r3, 4 + bdnz L(go) + b L(ret) + +L(b10): lwz r10, 0(r4) C load s1 limb + lwz r11, 0(r5) C load s2 limb + lwz r12, 4(r4) C load s1 limb + addi r4, r4, 8 + lwz r0, 4(r5) C load s2 limb + addi r5, r5, 8 + ADDSUBC r30, r11, r10 C add + ADDSUBC r31, r0, r12 C add + stw r30, 0(r3) + stw r31, 4(r3) + addi r3, r3, 8 + bdnz L(go) + b L(ret) + +L(b00): C INITCY C clear/set cy +L(go): lwz r6, 0(r4) C load s1 limb + lwz r7, 0(r5) C load s2 limb + lwz r8, 4(r4) C load s1 limb + lwz r9, 4(r5) C load s2 limb + lwz r10, 8(r4) C load s1 limb + lwz r11, 8(r5) C load s2 limb + lwz r12, 12(r4) C load s1 limb + lwz r0, 12(r5) C load s2 limb + bdz L(end) + + addi r4, r4, 16 + addi r5, r5, 16 + + ALIGN(16) +L(top): ADDSUBC r28, r7, r6 + lwz r6, 0(r4) C load s1 limb + lwz r7, 0(r5) C load s2 limb + ADDSUBC r29, r9, r8 + lwz r8, 4(r4) C load s1 limb + lwz r9, 4(r5) C load s2 limb + ADDSUBC r30, r11, r10 + lwz r10, 8(r4) C load s1 limb + lwz r11, 8(r5) C load s2 limb + ADDSUBC r31, r0, r12 + lwz r12, 12(r4) C load s1 limb + lwz r0, 12(r5) C load s2 limb + stw r28, 0(r3) + addi r4, r4, 16 + stw r29, 4(r3) + addi r5, r5, 16 + stw r30, 8(r3) + stw r31, 12(r3) + addi r3, r3, 16 + bdnz L(top) C decrement ctr and loop back + +L(end): ADDSUBC r28, r7, r6 + ADDSUBC r29, r9, r8 + ADDSUBC r30, r11, r10 + ADDSUBC r31, r0, r12 + stw r28, 0(r3) + stw r29, 4(r3) + stw r30, 8(r3) + stw r31, 12(r3) + +L(ret): + lwz r28, 8(r1) + lwz r29, 12(r1) + subfe r3, r0, r0 C -cy + lwz r30, 16(r1) + GENRVAL + lwz r31, 20(r1) + addi r1, r1, 32 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/p3/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc32/p3/gmp-mparam.h new file mode 100644 index 0000000..3382695 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/p3/gmp-mparam.h @@ -0,0 +1,155 @@ +/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2004, 2008-2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 450 MHz POWER3 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 +#define USE_PREINV_DIVREM_1 1 +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 10 +#define MUL_TOOM33_THRESHOLD 38 +#define MUL_TOOM44_THRESHOLD 58 +#define MUL_TOOM6H_THRESHOLD 129 +#define MUL_TOOM8H_THRESHOLD 212 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 65 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 63 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 59 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 64 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 14 +#define SQR_TOOM3_THRESHOLD 53 +#define SQR_TOOM4_THRESHOLD 76 +#define SQR_TOOM6_THRESHOLD 106 +#define SQR_TOOM8_THRESHOLD 284 + +#define MULMOD_BNM1_THRESHOLD 9 +#define SQRMOD_BNM1_THRESHOLD 9 + +#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 220, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 9, 5}, { 19, 6}, { 13, 7}, { 7, 6}, \ + { 16, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \ + { 11, 7}, { 23, 9}, { 7, 8}, { 15, 7}, \ + { 33, 8}, { 23, 9}, { 15, 8}, { 35, 9}, \ + { 23,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 47,10}, { 31, 9}, \ + { 63, 8}, { 127, 9}, { 71, 8}, { 143, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \ + { 127, 8}, { 255, 9}, { 143,10}, { 79, 9}, \ + { 159, 8}, { 319, 9}, { 175, 8}, { 351,10}, \ + { 95, 9}, { 191, 8}, { 383,10}, { 111,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 143, 9}, \ + { 287, 8}, { 575,10}, { 159, 9}, { 319,10}, \ + { 175, 9}, { 351,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207, 9}, { 415,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 287, 9}, \ + { 575,11}, { 159,10}, { 351, 9}, { 703, 8}, \ + { 1407,11}, { 191,10}, { 415,11}, { 223,10}, \ + { 447, 9}, { 895,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 82 +#define MUL_FFT_THRESHOLD 2688 + +#define SQR_FFT_MODF_THRESHOLD 176 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 176, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 13, 7}, { 7, 6}, { 16, 7}, { 9, 6}, \ + { 19, 7}, { 11, 6}, { 23, 7}, { 13, 8}, \ + { 7, 7}, { 19, 8}, { 11, 7}, { 23, 9}, \ + { 7, 8}, { 15, 7}, { 31, 8}, { 23, 9}, \ + 
{ 15, 8}, { 39, 9}, { 23,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \ + { 47, 8}, { 95,10}, { 31, 9}, { 63, 8}, \ + { 127, 9}, { 71, 8}, { 143, 7}, { 287, 6}, \ + { 575, 9}, { 79, 8}, { 159,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 143, 8}, { 287, 7}, { 575,10}, \ + { 79, 9}, { 159, 8}, { 319, 9}, { 175,10}, \ + { 95, 9}, { 191, 8}, { 383,10}, { 111, 9}, \ + { 223,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 143, 9}, { 287, 8}, { 575,10}, { 159, 9}, \ + { 319,10}, { 175,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 223,12}, { 63,11}, { 127,10}, \ + { 287, 9}, { 575,11}, { 159,10}, { 351, 9}, \ + { 703, 8}, { 1407,11}, { 191,10}, { 383,11}, \ + { 223,10}, { 447, 9}, { 895,12}, { 4096,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 87 +#define SQR_FFT_THRESHOLD 1728 + +#define MULLO_BASECASE_THRESHOLD 2 +#define MULLO_DC_THRESHOLD 33 +#define MULLO_MUL_N_THRESHOLD 5240 + +#define DC_DIV_QR_THRESHOLD 32 +#define DC_DIVAPPR_Q_THRESHOLD 123 +#define DC_BDIV_QR_THRESHOLD 34 +#define DC_BDIV_Q_THRESHOLD 84 + +#define INV_MULMOD_BNM1_THRESHOLD 42 +#define INV_NEWTON_THRESHOLD 129 +#define INV_APPR_THRESHOLD 124 + +#define BINV_NEWTON_THRESHOLD 148 +#define REDC_1_TO_REDC_N_THRESHOLD 38 + +#define MU_DIV_QR_THRESHOLD 748 +#define MU_DIVAPPR_Q_THRESHOLD 748 +#define MUPI_DIV_QR_THRESHOLD 59 +#define MU_BDIV_QR_THRESHOLD 562 +#define MU_BDIV_Q_THRESHOLD 654 + +#define MATRIX22_STRASSEN_THRESHOLD 11 +#define HGCD_THRESHOLD 76 +#define GCD_DC_THRESHOLD 205 +#define GCDEXT_DC_THRESHOLD 174 +#define JACOBI_BASE_METHOD 1 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 27 +#define SET_STR_DC_THRESHOLD 181 +#define SET_STR_PRECOMPUTE_THRESHOLD 525 diff --git a/gmp-6.3.0/mpn/powerpc32/p4/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc32/p4/gmp-mparam.h new file mode 100644 index 0000000..7ac59f5 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/p4/gmp-mparam.h @@ -0,0 +1,209 @@ +/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2004, 2008-2011, 2014 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +/* 1800 MHz PowerPC-970 */ +/* FFT tuning limit = 10000000 */ +/* Generated by tuneup.c, 2014-03-12, gcc 4.0 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 1 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 42 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 14 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 45 + +#define DIV_1_VS_MUL_1_PERCENT 225 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 130 +#define MUL_TOOM6H_THRESHOLD 222 +#define MUL_TOOM8H_THRESHOLD 333 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 107 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 108 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 92 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 100 + +#define SQR_BASECASE_THRESHOLD 5 +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 85 +#define SQR_TOOM4_THRESHOLD 160 +#define SQR_TOOM6_THRESHOLD 197 +#define SQR_TOOM8_THRESHOLD 357 + +#define MULMID_TOOM42_THRESHOLD 32 + +#define MULMOD_BNM1_THRESHOLD 15 +#define SQRMOD_BNM1_THRESHOLD 16 + +#define MUL_FFT_MODF_THRESHOLD 444 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 444, 5}, { 17, 6}, { 9, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 24, 7}, { 13, 6}, { 28, 7}, { 15, 6}, \ + { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47, 8}, { 95,10}, { 31, 9}, { 63, 8}, \ + { 127, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 167,10}, { 95, 9}, { 191, 8}, { 383,10}, \ + { 111,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511,10}, { 143, 9}, { 287, 8}, { 575, 9}, \ + { 303,10}, { 159, 9}, { 319,11}, { 95,10}, \ + { 191, 9}, { 383,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \ + { 1087,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 335, 9}, { 671, 8}, { 1343,10}, \ + { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447,12}, { 127,11}, { 255,10}, { 543, 9}, \ + { 1087,11}, { 287,10}, { 607, 9}, { 1215,11}, \ + { 319,10}, { 671, 9}, { 1343,11}, { 351,10}, \ + { 703, 9}, { 1407,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,11}, { 447,13}, \ + { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 703,10}, { 1407,11}, { 735,12}, \ + { 383,11}, { 767,10}, { 1535,11}, { 831,12}, \ + { 447,10}, { 1791,11}, { 959,13}, { 255,12}, \ + { 511,11}, { 1087,12}, { 575,11}, { 1215,10}, \ + { 2431,12}, { 639,11}, { 1343,12}, { 703,11}, \ + { 1407,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 831,11}, { 1727,10}, { 3455,11}, { 1791,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1471,13}, { 767,12}, \ + { 1727,11}, { 3455,12}, { 1791,14}, { 511,13}, \ + { 1151,12}, { 2431,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 157 +#define 
MUL_FFT_THRESHOLD 6784 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \ + { 28, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \ + { 47,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 135,10}, { 79, 9}, { 159, 8}, \ + { 319,10}, { 95, 9}, { 191, 8}, { 383, 9}, \ + { 207,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \ + { 575, 9}, { 303, 8}, { 607,10}, { 159, 9}, \ + { 319,10}, { 175,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \ + { 1087,10}, { 287, 9}, { 575,10}, { 303, 9}, \ + { 607,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 335, 9}, { 671,10}, { 351, 9}, { 703,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 415, 9}, \ + { 831,11}, { 223,10}, { 447,12}, { 127,11}, \ + { 255,10}, { 543, 9}, { 1087,11}, { 287,10}, \ + { 607, 9}, { 1215,11}, { 319,10}, { 671,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,11}, { 479,13}, \ + { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 703,10}, { 1407,11}, { 735,12}, \ + { 383,11}, { 831,12}, { 447,11}, { 959,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,12}, { 639,11}, { 1343,12}, { 703,11}, \ + { 1407,13}, { 383,12}, { 831,11}, { 1727,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \ + { 639,12}, { 1471,13}, { 767,12}, { 1727,13}, \ + { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \ + { 2111,13}, { 1151,12}, { 2431,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 150 +#define SQR_FFT_THRESHOLD 4736 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 55 +#define MULLO_MUL_N_THRESHOLD 13463 +#define SQRLO_BASECASE_THRESHOLD 4 +#define SQRLO_DC_THRESHOLD 169 +#define SQRLO_SQR_THRESHOLD 9335 + +#define DC_DIV_QR_THRESHOLD 50 +#define DC_DIVAPPR_Q_THRESHOLD 196 +#define DC_BDIV_QR_THRESHOLD 51 +#define DC_BDIV_Q_THRESHOLD 166 + +#define INV_MULMOD_BNM1_THRESHOLD 50 +#define INV_NEWTON_THRESHOLD 226 +#define INV_APPR_THRESHOLD 202 + +#define BINV_NEWTON_THRESHOLD 228 +#define REDC_1_TO_REDC_N_THRESHOLD 67 + +#define MU_DIV_QR_THRESHOLD 1187 +#define MU_DIVAPPR_Q_THRESHOLD 1308 +#define MUPI_DIV_QR_THRESHOLD 114 +#define MU_BDIV_QR_THRESHOLD 998 +#define MU_BDIV_Q_THRESHOLD 1142 + +#define POWM_SEC_TABLE 3,28,78,480,1099 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 24 +#define SET_STR_DC_THRESHOLD 381 +#define SET_STR_PRECOMPUTE_THRESHOLD 1002 + +#define FAC_DSC_THRESHOLD 179 +#define FAC_ODD_THRESHOLD 28 + +#define MATRIX22_STRASSEN_THRESHOLD 9 +#define HGCD_THRESHOLD 93 +#define HGCD_APPR_THRESHOLD 109 +#define HGCD_REDUCE_THRESHOLD 2479 +#define GCD_DC_THRESHOLD 379 +#define GCDEXT_DC_THRESHOLD 273 +#define JACOBI_BASE_METHOD 4 diff --git a/gmp-6.3.0/mpn/powerpc32/p5/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc32/p5/gmp-mparam.h new file mode 100644 index 0000000..faa1e81 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/p5/gmp-mparam.h @@ -0,0 +1,156 @@ +/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2004, 2008-2011 Free Software Foundation, Inc. 
+ +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 1650 MHz POWER5 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 1 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 50 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 18 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 61 + +#define MUL_TOOM22_THRESHOLD 22 +#define MUL_TOOM33_THRESHOLD 57 +#define MUL_TOOM44_THRESHOLD 130 +#define MUL_TOOM6H_THRESHOLD 189 +#define MUL_TOOM8H_THRESHOLD 309 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 83 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88 + +#define SQR_BASECASE_THRESHOLD 6 +#define SQR_TOOM2_THRESHOLD 40 +#define SQR_TOOM3_THRESHOLD 77 +#define SQR_TOOM4_THRESHOLD 124 +#define SQR_TOOM6_THRESHOLD 140 +#define SQR_TOOM8_THRESHOLD 238 + +#define MULMID_TOOM42_THRESHOLD 40 + +#define MULMOD_BNM1_THRESHOLD 15 +#define SQRMOD_BNM1_THRESHOLD 16 + +#define POWM_SEC_TABLE 4,29,252,840,2080 + +#define MUL_FFT_MODF_THRESHOLD 412 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 412, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 21, 7}, { 11, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 55,10}, { 31, 9}, \ + { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \ + { 95,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \ + { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \ + { 159,10}, { 335, 9}, { 671,10}, { 351, 9}, \ + { 703,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 415, 9}, { 831,11}, { 223,12}, { 4096,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 71 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \ + { 27, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \ + { 23, 7}, { 47, 
8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 47,10}, { 31, 9}, \ + { 71,10}, { 47,11}, { 31,10}, { 63, 9}, \ + { 127, 8}, { 255, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \ + { 143, 9}, { 287, 8}, { 575, 9}, { 303,10}, \ + { 159,11}, { 95,10}, { 191,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671,10}, { 351,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 415,11}, { 223,10}, { 447,12}, \ + { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 76 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 2 +#define MULLO_DC_THRESHOLD 68 +#define MULLO_MUL_N_THRESHOLD 9236 + +#define DC_DIV_QR_THRESHOLD 69 +#define DC_DIVAPPR_Q_THRESHOLD 220 +#define DC_BDIV_QR_THRESHOLD 75 +#define DC_BDIV_Q_THRESHOLD 188 + +#define INV_MULMOD_BNM1_THRESHOLD 54 +#define INV_NEWTON_THRESHOLD 230 +#define INV_APPR_THRESHOLD 230 + +#define BINV_NEWTON_THRESHOLD 278 +#define REDC_1_TO_REDC_N_THRESHOLD 87 + +#define MU_DIV_QR_THRESHOLD 1210 +#define MU_DIVAPPR_Q_THRESHOLD 1308 +#define MUPI_DIV_QR_THRESHOLD 106 +#define MU_BDIV_QR_THRESHOLD 1017 +#define MU_BDIV_Q_THRESHOLD 1210 + +#define MATRIX22_STRASSEN_THRESHOLD 14 +#define HGCD_THRESHOLD 110 +#define HGCD_APPR_THRESHOLD 138 +#define HGCD_REDUCE_THRESHOLD 2578 +#define GCD_DC_THRESHOLD 408 +#define GCDEXT_DC_THRESHOLD 298 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 24 +#define SET_STR_DC_THRESHOLD 527 +#define SET_STR_PRECOMPUTE_THRESHOLD 1090 diff --git a/gmp-6.3.0/mpn/powerpc32/p6/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc32/p6/gmp-mparam.h new file mode 100644 index 0000000..c9504b6 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/p6/gmp-mparam.h @@ -0,0 +1,165 @@ +/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2004, 2008-2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 3500 MHz POWER6 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD MP_SIZE_T_MAX +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 19 +#define MUL_TOOM33_THRESHOLD 55 +#define MUL_TOOM44_THRESHOLD 88 +#define MUL_TOOM6H_THRESHOLD 137 +#define MUL_TOOM8H_THRESHOLD 181 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 57 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 56 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 57 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 56 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 56 +#define SQR_TOOM4_THRESHOLD 130 +#define SQR_TOOM6_THRESHOLD 189 +#define SQR_TOOM8_THRESHOLD 296 + +#define MULMID_TOOM42_THRESHOLD 26 + +#define MULMOD_BNM1_THRESHOLD 7 +#define SQRMOD_BNM1_THRESHOLD 12 + +#define POWM_SEC_TABLE 2,26,127,453,1068 + +#define MUL_FFT_MODF_THRESHOLD 212 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 212, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 13, 7}, { 7, 6}, { 16, 7}, { 9, 6}, \ + { 19, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \ + { 11, 7}, { 25, 9}, { 7, 8}, { 15, 7}, \ + { 31, 8}, { 19, 7}, { 39, 8}, { 23, 9}, \ + { 15, 8}, { 39, 9}, { 23, 8}, { 47,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 39, 8}, \ + { 79, 9}, { 47,10}, { 31, 9}, { 63, 8}, \ + { 127, 9}, { 71, 8}, { 143, 7}, { 287, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \ + { 127, 8}, { 255, 7}, { 511, 9}, { 143, 8}, \ + { 287,10}, { 79, 9}, { 159, 8}, { 319, 9}, \ + { 175, 8}, { 351,10}, { 95, 9}, { 191, 8}, \ + { 383, 9}, { 207,10}, { 111,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \ + { 287, 8}, { 575,10}, { 159, 9}, { 319,10}, \ + { 175, 9}, { 351,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207, 9}, { 415,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 287, 9}, \ + { 575,11}, { 159,10}, { 351, 9}, { 703,11}, \ + { 191,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447,12}, { 4096,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 89 +#define MUL_FFT_THRESHOLD 1728 + +#define SQR_FFT_MODF_THRESHOLD 184 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 184, 5}, { 6, 4}, { 13, 5}, { 13, 6}, \ + { 7, 5}, { 15, 6}, { 13, 7}, { 7, 6}, \ + { 16, 7}, { 9, 6}, { 19, 7}, { 11, 6}, \ + { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \ + { 11, 7}, { 23, 9}, { 7, 8}, { 23, 9}, \ + { 15, 8}, { 39, 9}, { 23,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \ + { 47,10}, { 31, 9}, { 63, 8}, { 127, 7}, \ + { 255, 9}, { 71, 8}, { 143, 7}, { 287, 6}, \ + { 575, 9}, { 79,10}, { 47,11}, { 31,10}, \ + { 63, 9}, { 127, 8}, { 255, 9}, { 143, 8}, \ + { 287, 7}, { 575,10}, { 79, 9}, { 159, 8}, \ + { 319, 9}, { 175, 8}, { 351,10}, { 95, 9}, \ + { 191, 8}, { 383, 9}, { 207,10}, { 111, 9}, \ + { 223,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 143, 9}, { 287, 8}, { 575,10}, { 159, 9}, \ + { 319,10}, { 175, 9}, { 351,11}, { 95,10}, \ + { 191, 9}, { 383,10}, { 207, 9}, { 415,10}, \ + { 223,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 
511,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 351, 9}, { 703, 8}, { 1407,11}, { 191,10}, \ + { 415,11}, { 223,10}, { 447, 9}, { 895,12}, \ + { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 92 +#define SQR_FFT_THRESHOLD 1600 + +#define MULLO_BASECASE_THRESHOLD 2 +#define MULLO_DC_THRESHOLD 57 +#define MULLO_MUL_N_THRESHOLD 3176 + +#define DC_DIV_QR_THRESHOLD 52 +#define DC_DIVAPPR_Q_THRESHOLD 187 +#define DC_BDIV_QR_THRESHOLD 64 +#define DC_BDIV_Q_THRESHOLD 146 + +#define INV_MULMOD_BNM1_THRESHOLD 68 +#define INV_NEWTON_THRESHOLD 182 +#define INV_APPR_THRESHOLD 182 + +#define BINV_NEWTON_THRESHOLD 186 +#define REDC_1_TO_REDC_N_THRESHOLD 60 + +#define MU_DIV_QR_THRESHOLD 924 +#define MU_DIVAPPR_Q_THRESHOLD 807 +#define MUPI_DIV_QR_THRESHOLD 73 +#define MU_BDIV_QR_THRESHOLD 667 +#define MU_BDIV_Q_THRESHOLD 823 + +#define MATRIX22_STRASSEN_THRESHOLD 8 +#define HGCD_THRESHOLD 61 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 974 +#define GCD_DC_THRESHOLD 195 +#define GCDEXT_DC_THRESHOLD 134 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 9 +#define GET_STR_PRECOMPUTE_THRESHOLD 21 +#define SET_STR_DC_THRESHOLD 190 +#define SET_STR_PRECOMPUTE_THRESHOLD 411 diff --git a/gmp-6.3.0/mpn/powerpc32/p7/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc32/p7/gmp-mparam.h new file mode 100644 index 0000000..ad48dac --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/p7/gmp-mparam.h @@ -0,0 +1,170 @@ +/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 4150 MHz POWER8/T4 */ +/* FFT tuning limit = 0.5 M */ +/* Generated by tuneup.c, 2017-02-18, gcc 6.1 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 1 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 9 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 2 +#define DIV_QR_1_NORM_THRESHOLD 4 +#define DIV_QR_1_UNNORM_THRESHOLD 3 +#define DIV_QR_2_PI2_THRESHOLD 15 +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 39 + +#define DIV_1_VS_MUL_1_PERCENT 343 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 202 +#define MUL_TOOM6H_THRESHOLD 286 +#define MUL_TOOM8H_THRESHOLD 430 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 137 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 140 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 128 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 145 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 121 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 26 +#define SQR_TOOM3_THRESHOLD 97 +#define SQR_TOOM4_THRESHOLD 236 +#define SQR_TOOM6_THRESHOLD 318 +#define SQR_TOOM8_THRESHOLD 478 + +#define MULMID_TOOM42_THRESHOLD 34 + +#define MULMOD_BNM1_THRESHOLD 18 +#define SQRMOD_BNM1_THRESHOLD 18 + +#define MUL_FFT_MODF_THRESHOLD 444 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 444, 5}, { 21, 6}, { 12, 5}, { 25, 6}, \ + { 13, 5}, { 27, 6}, { 21, 7}, { 11, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 17, 6}, \ + { 35, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 51,10}, { 15, 9}, { 31, 8}, { 63, 9}, \ + { 39, 8}, { 79, 9}, { 47,10}, { 31, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \ + { 191,10}, { 111,11}, { 63,10}, { 127, 9}, \ + { 255,10}, { 143, 9}, { 287,10}, { 159,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \ + { 1087,10}, { 287,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 351,11}, { 191,10}, { 415, 9}, \ + { 831,11}, { 223,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 70 +#define MUL_FFT_THRESHOLD 4544 + +#define SQR_FFT_MODF_THRESHOLD 332 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 332, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 47,10}, { 31, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \ + { 127, 8}, { 255, 9}, { 135,10}, { 95, 9}, \ + { 191,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \ + { 575, 9}, { 303, 8}, { 607,10}, { 159,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,10}, { 303, 9}, { 607,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 351, 9}, \ + { 703,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 415,11}, { 223,10}, { 447,12}, { 4096,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 75 
+#define SQR_FFT_THRESHOLD 3520 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 36 +#define MULLO_MUL_N_THRESHOLD 8648 +#define SQRLO_BASECASE_THRESHOLD 5 +#define SQRLO_DC_THRESHOLD 193 +#define SQRLO_SQR_THRESHOLD 6675 + +#define DC_DIV_QR_THRESHOLD 33 +#define DC_DIVAPPR_Q_THRESHOLD 134 +#define DC_BDIV_QR_THRESHOLD 51 +#define DC_BDIV_Q_THRESHOLD 134 + +#define INV_MULMOD_BNM1_THRESHOLD 66 +#define INV_NEWTON_THRESHOLD 132 +#define INV_APPR_THRESHOLD 131 + +#define BINV_NEWTON_THRESHOLD 292 +#define REDC_1_TO_REDC_N_THRESHOLD 67 + +#define MU_DIV_QR_THRESHOLD 1334 +#define MU_DIVAPPR_Q_THRESHOLD 1334 +#define MUPI_DIV_QR_THRESHOLD 62 +#define MU_BDIV_QR_THRESHOLD 1142 +#define MU_BDIV_Q_THRESHOLD 1470 + +#define POWM_SEC_TABLE 3,25,114,480,1486 + +#define GET_STR_DC_THRESHOLD 8 +#define GET_STR_PRECOMPUTE_THRESHOLD 14 +#define SET_STR_DC_THRESHOLD 644 +#define SET_STR_PRECOMPUTE_THRESHOLD 1365 + +#define FAC_DSC_THRESHOLD 107 +#define FAC_ODD_THRESHOLD 29 + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD_THRESHOLD 95 +#define HGCD_APPR_THRESHOLD 121 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 456 +#define GCDEXT_DC_THRESHOLD 386 +#define JACOBI_BASE_METHOD 4 diff --git a/gmp-6.3.0/mpn/powerpc32/powerpc-defs.m4 b/gmp-6.3.0/mpn/powerpc32/powerpc-defs.m4 new file mode 100644 index 0000000..6a61278 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/powerpc-defs.m4 @@ -0,0 +1,128 @@ +divert(-1) + +dnl m4 macros for PowerPC assembler (32 and 64 bit). + +dnl Copyright 2000, 2002, 2003, 2017, 2018, 2020 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo) +dnl +dnl This is the same as the default in mpn/asm-defs.m4, but with ALIGN(4) +dnl not 8. +dnl +dnl 4-byte alignment is normally enough, certainly it's what gcc gives. We +dnl don't want bigger alignment within PROLOGUE since it can introduce +dnl padding into multiple-entrypoint routines, and with gas such padding is +dnl zero words, which are not valid instructions. + +define(`PROLOGUE_cpu', +m4_assert_numargs(1) +` TEXT + ALIGN(4) + GLOBL `$1' GLOBL_ATTR + TYPE(`$1',`function') +`$1'LABEL_SUFFIX') + + +dnl Usage: r0 ... r31, cr0 ... cr7 +dnl +dnl Registers names, either left as "r0" etc or mapped to plain 0 etc, +dnl according to the result of the GMP_ASM_POWERPC_REGISTERS configure +dnl test. 
+ +ifelse(WANT_R_REGISTERS,no,` +forloop(i,0,31,`deflit(`r'i,i)') +forloop(i,0,31,`deflit(`v'i,i)') +forloop(i,0,31,`deflit(`f'i,i)') +forloop(i,0,7, `deflit(`cr'i,i)') +') + + +dnl Usage: ASSERT(cond,instructions) +dnl +dnl If WANT_ASSERT is 1, output the given instructions and expect the given +dnl flags condition to then be satisfied. For example, +dnl +dnl ASSERT(eq, `cmpwi r6, 123') +dnl +dnl The instructions can be omitted to just assert a flags condition with +dnl no extra calculation. For example, +dnl +dnl ASSERT(ne) +dnl +dnl The condition can be omitted to just output the given instructions when +dnl assertion checking is wanted. For example, +dnl +dnl ASSERT(, `mr r11, r0') +dnl +dnl Using a zero word for an illegal instruction is probably not ideal, +dnl since it marks the beginning of a traceback table in the 64-bit ABI. +dnl But assertions are only for development, so it doesn't matter too much. + +define(ASSERT, +m4_assert_numargs_range(1,2) +m4_assert_defined(`WANT_ASSERT') +`ifelse(WANT_ASSERT,1, + `C ASSERT + $2 +ifelse(`$1',,, +` b$1 L(ASSERT_ok`'ASSERT_counter) + W32 0 C assertion failed +L(ASSERT_ok`'ASSERT_counter): +define(`ASSERT_counter',incr(ASSERT_counter)) +')')') + +define(ASSERT_counter,1) + +dnl Manually assemble some new instructions +dnl + +define(`maddld',m4_assert_numargs(4)`dnl +.long eval(0x10000033+m4_lshift($1,21)+m4_lshift($2,16)+m4_lshift($3,11)+m4_lshift($4,6))') + +define(`maddhdu',m4_assert_numargs(4)`dnl +.long eval(0x10000031+m4_lshift($1,21)+m4_lshift($2,16)+m4_lshift($3,11)+m4_lshift($4,6))') + +define(`popcntd',m4_assert_numargs(2)`dnl +.long eval(0x7c0003f4+m4_lshift($2,21)+m4_lshift($1,16))') + +define(`divdeu',m4_assert_numargs(3)`dnl +.long eval(0x7c000312+m4_lshift($1,21)+m4_lshift($2,16)+m4_lshift($3,11))') + +define(`addex',m4_assert_numargs(4)`dnl +.long eval(0x7c000154+m4_lshift($1,21)+m4_lshift($2,16)+m4_lshift($3,11)+m4_lshift($4,9))') + +define(`aese',m4_assert_numargs(3)`dnl +.long eval(0x10000508+m4_lshift($1,21)+m4_lshift($2,16)+m4_lshift($3,11))') + +define(`aeselst',m4_assert_numargs(3)`dnl +.long eval(0x10000509+m4_lshift($1,21)+m4_lshift($2,16)+m4_lshift($3,11))') + +divert diff --git a/gmp-6.3.0/mpn/powerpc32/rshift.asm b/gmp-6.3.0/mpn/powerpc32/rshift.asm new file mode 100644 index 0000000..d86cdcb --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/rshift.asm @@ -0,0 +1,166 @@ +dnl PowerPC-32 mpn_rshift -- Shift a number right. + +dnl Copyright 1995, 1998, 2000, 2002-2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
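[Editor's note: before the assembly, it may help to state the operation in portable C. The sketch below is not GMP code; ref_rshift is an illustrative name. It assumes 32-bit limbs and a shift count 1 <= cnt <= 31, the conditions under which mpn_rshift is specified. The return value is the out-shifted low bits of up[0], left-justified, which is exactly what the `slw r3, r11, r8` below computes for the function result.]

  typedef unsigned int mp_limb_t;   /* 32-bit limb assumed */
  typedef long mp_size_t;

  /* Reference semantics for mpn_rshift: {rp,n} = {up,n} >> cnt,
     returning the bits shifted out at the low end, left-justified. */
  mp_limb_t
  ref_rshift (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, unsigned cnt)
  {
    mp_limb_t retval = up[0] << (32 - cnt);        /* out-shifted bits */
    for (mp_size_t i = 0; i < n - 1; i++)
      rp[i] = (up[i] >> cnt) | (up[i + 1] << (32 - cnt));
    rp[n - 1] = up[n - 1] >> cnt;
    return retval;
  }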
+ +include(`../config.m4') + +C cycles/limb +C 603e: ? +C 604e: 3.0 +C 75x (G3): 3.0 +C 7400,7410 (G4): 3.0 +C 7445,7455 (G4+): 2.5 +C 7447,7457 (G4+): 2.25 +C power4/ppc970: 2.5 +C power5: 2.5 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C n r5 +C cnt r6 + +ASM_START() +PROLOGUE(mpn_rshift) + cmpwi cr0, r5, 30 C more than 30 limbs? + addi r7, r3, -4 C dst-4 + bgt L(BIG) C branch if more than 12 limbs + + mtctr r5 C copy size into CTR + subfic r8, r6, 32 + lwz r11, 0(r4) C load first s1 limb + slw r3, r11, r8 C compute function return value + bdz L(end1) + +L(oop): lwzu r10, 4(r4) + srw r9, r11, r6 + slw r12, r10, r8 + or r9, r9, r12 + stwu r9, 4(r7) + bdz L(end2) + lwzu r11, 4(r4) + srw r9, r10, r6 + slw r12, r11, r8 + or r9, r9, r12 + stwu r9, 4(r7) + bdnz L(oop) + +L(end1): + srw r0, r11, r6 + stw r0, 4(r7) + blr +L(end2): + srw r0, r10, r6 + stw r0, 4(r7) + blr + +L(BIG): + stwu r1, -48(r1) + stmw r24, 8(r1) C save registers we are supposed to preserve + lwz r9, 0(r4) + subfic r8, r6, 32 + slw r3, r9, r8 C compute function return value + srw r0, r9, r6 + addi r5, r5, -1 + + andi. r10, r5, 3 C count for spill loop + beq L(e) + mtctr r10 + lwzu r28, 4(r4) + bdz L(xe0) + +L(loop0): + srw r12, r28, r6 + slw r24, r28, r8 + lwzu r28, 4(r4) + or r24, r0, r24 + stwu r24, 4(r7) + mr r0, r12 + bdnz L(loop0) C taken at most once! + +L(xe0): srw r12, r28, r6 + slw r24, r28, r8 + or r24, r0, r24 + stwu r24, 4(r7) + mr r0, r12 + +L(e): srwi r5, r5, 2 C count for unrolled loop + addi r5, r5, -1 + mtctr r5 + lwz r28, 4(r4) + lwz r29, 8(r4) + lwz r30, 12(r4) + lwzu r31, 16(r4) + +L(loopU): + srw r9, r28, r6 + slw r24, r28, r8 + lwz r28, 4(r4) + srw r10, r29, r6 + slw r25, r29, r8 + lwz r29, 8(r4) + srw r11, r30, r6 + slw r26, r30, r8 + lwz r30, 12(r4) + srw r12, r31, r6 + slw r27, r31, r8 + lwzu r31, 16(r4) + or r24, r0, r24 + stw r24, 4(r7) + or r25, r9, r25 + stw r25, 8(r7) + or r26, r10, r26 + stw r26, 12(r7) + or r27, r11, r27 + stwu r27, 16(r7) + mr r0, r12 + bdnz L(loopU) + + srw r9, r28, r6 + slw r24, r28, r8 + srw r10, r29, r6 + slw r25, r29, r8 + srw r11, r30, r6 + slw r26, r30, r8 + srw r12, r31, r6 + slw r27, r31, r8 + or r24, r0, r24 + stw r24, 4(r7) + or r25, r9, r25 + stw r25, 8(r7) + or r26, r10, r26 + stw r26, 12(r7) + or r27, r11, r27 + stw r27, 16(r7) + + stw r12, 20(r7) + lmw r24, 8(r1) C restore registers + addi r1, r1, 48 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/sec_tabselect.asm b/gmp-6.3.0/mpn/powerpc32/sec_tabselect.asm new file mode 100644 index 0000000..d50718e --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/sec_tabselect.asm @@ -0,0 +1,143 @@ +dnl PowerPC-32 mpn_sec_tabselect. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? +C 604e: ? +C 75x (G3): ? +C 7400,7410 (G4): 2.5 +C 744x,745x (G4+): 2.0 +C power4/ppc970: 2.0 +C power5: ? + +define(`rp', `r3') +define(`tp', `r4') +define(`n', `r5') +define(`nents', `r6') +define(`which', `r7') + +define(`i', `r8') +define(`j', `r9') +define(`stride', `r12') +define(`mask', `r11') + + +ASM_START() +PROLOGUE(mpn_sec_tabselect) + stwu r1, -32(r1) + addic. j, n, -4 C outer loop induction variable + stmw r27, 8(r1) + slwi stride, n, 2 + + blt cr0, L(outer_end) +L(outer_top): + mtctr nents + mr r10, tp + li r28, 0 + li r29, 0 + li r30, 0 + li r31, 0 + addic. j, j, -4 C outer loop induction variable + mr i, which + + ALIGN(16) +L(top): addic i, i, -1 C set carry iff i != 0 + subfe mask, mask, mask + lwz r0, 0(tp) + lwz r27, 4(tp) + and r0, r0, mask + and r27, r27, mask + or r28, r28, r0 + or r29, r29, r27 + lwz r0, 8(tp) + lwz r27, 12(tp) + and r0, r0, mask + and r27, r27, mask + or r30, r30, r0 + or r31, r31, r27 + add tp, tp, stride + bdnz L(top) + + stw r28, 0(rp) + stw r29, 4(rp) + stw r30, 8(rp) + stw r31, 12(rp) + addi tp, r10, 16 + addi rp, rp, 16 + bge cr0, L(outer_top) +L(outer_end): + + andi. r0, n, 2 + beq cr0, L(b0x) +L(b1x): mtctr nents + mr r10, tp + li r28, 0 + li r29, 0 + mr i, which + ALIGN(16) +L(tp2): addic i, i, -1 + subfe mask, mask, mask + lwz r0, 0(tp) + lwz r27, 4(tp) + and r0, r0, mask + and r27, r27, mask + or r28, r28, r0 + or r29, r29, r27 + add tp, tp, stride + bdnz L(tp2) + stw r28, 0(rp) + stw r29, 4(rp) + addi tp, r10, 8 + addi rp, rp, 8 + +L(b0x): andi. r0, n, 1 + beq cr0, L(b00) +L(b01): mtctr nents + mr r10, tp + li r28, 0 + mr i, which + ALIGN(16) +L(tp1): addic i, i, -1 + subfe mask, mask, mask + lwz r0, 0(tp) + and r0, r0, mask + or r28, r28, r0 + add tp, tp, stride + bdnz L(tp1) + stw r28, 0(rp) + +L(b00): lmw r27, 8(r1) + addi r1, r1, 32 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/sqr_diag_addlsh1.asm b/gmp-6.3.0/mpn/powerpc32/sqr_diag_addlsh1.asm new file mode 100644 index 0000000..f7aba33 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/sqr_diag_addlsh1.asm @@ -0,0 +1,80 @@ +dnl PowerPC-32 mpn_sqr_diag_addlsh1. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
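[Editor's note on mpn_sec_tabselect above: it reads every table entry and accumulates under a mask, so the memory access pattern is independent of the secret index `which'. The mask is built branch-free by the addic/subfe pair: addic sets carry exactly while the decremented counter is still nonzero, so subfe yields all-ones only at entry `which'. A portable sketch of the same contract follows; ref_sec_tabselect is an illustrative name, not a GMP entry point.]

  typedef unsigned int mp_limb_t;
  typedef long mp_size_t;

  /* Copy entry `which' (n limbs) out of tab[nents * n] into rp[],
     touching every entry so addresses leak nothing about `which'. */
  void
  ref_sec_tabselect (mp_limb_t *rp, const mp_limb_t *tab,
                     mp_size_t n, mp_size_t nents, mp_size_t which)
  {
    for (mp_size_t i = 0; i < n; i++)
      rp[i] = 0;
    for (mp_size_t k = 0; k < nents; k++)
      {
        mp_limb_t mask = -(mp_limb_t) (k == which); /* all-ones iff k == which */
        for (mp_size_t i = 0; i < n; i++)
          rp[i] |= tab[k * n + i] & mask;
      }
  }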
+ +include(`../config.m4') + +C cycles/limb +C 603e ? +C 604e ? +C 75x (G3) ? +C 7400,7410 (G4) ? +C 744x,745x (G4+) 6 +C power4/ppc970 ? +C power5 ? + +C This has been feebly optimised for 7447 but not for any other CPU. + +define(`rp', r3) +define(`tp', r4) +define(`up', r5) +define(`n', r6) + +ASM_START() +PROLOGUE(mpn_sqr_diag_addlsh1) + addi n, n, -1 + addi tp, tp, -4 + mtctr n + lwz r0, 0(up) + li r10, 0 + mullw r7, r0, r0 + stw r7, 0(rp) + mulhwu r6, r0, r0 + addic r31, r31, 0 C clear CF + + ALIGN(16) +L(top): lwzu r0, 4(up) + mullw r7, r0, r0 + lwz r8, 4(tp) + lwzu r9, 8(tp) + rlwimi r10, r8, 1,0,30 + srwi r11, r8, 31 + rlwimi r11, r9, 1,0,30 + adde r10, r10, r6 + adde r11, r11, r7 + stw r10, 4(rp) + srwi r10, r9, 31 + mulhwu r6, r0, r0 + stwu r11, 8(rp) + bdnz L(top) + + adde r10, r10, r6 + stw r10, 4(rp) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/sublsh1_n.asm b/gmp-6.3.0/mpn/powerpc32/sublsh1_n.asm new file mode 100644 index 0000000..6dc6460 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/sublsh1_n.asm @@ -0,0 +1,101 @@ +dnl PowerPC-32 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1) + +dnl Copyright 2003, 2005, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? 
+C 604e: 4.0 +C 75x (G3): 5.0 +C 7400,7410 (G4): 5.0 +C 744x,745x (G4+): 5.0 +C power4/ppc970: 4.25 +C power5: 5.0 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C vp r5 +C n r6 + +define(`rp',`r3') +define(`up',`r4') +define(`vp',`r5') + +define(`s0',`r6') +define(`s1',`r7') +define(`u0',`r8') +define(`v0',`r10') +define(`v1',`r11') + +ASM_START() +PROLOGUE(mpn_sublsh1_n) + mtctr r6 C copy n in ctr + + lwz v0, 0(vp) C load v limb + lwz u0, 0(up) C load u limb + addic up, up, -4 C update up; set cy + addi rp, rp, -4 C update rp + slwi s1, v0, 1 + bdz L(end) C If done, skip loop + +L(loop): + lwz v1, 4(vp) C load v limb + subfe s1, s1, u0 C add limbs with cy, set cy + srwi s0, v0, 31 C shift down previous v limb + stw s1, 4(rp) C store result limb + lwzu u0, 8(up) C load u limb and update up + rlwimi s0, v1, 1, 0,30 C left shift v limb and merge with prev v limb + + bdz L(exit) C decrement ctr and exit if done + + lwzu v0, 8(vp) C load v limb and update vp + subfe s0, s0, u0 C add limbs with cy, set cy + srwi s1, v1, 31 C shift down previous v limb + stwu s0, 8(rp) C store result limb and update rp + lwz u0, 4(up) C load u limb + rlwimi s1, v0, 1, 0,30 C left shift v limb and merge with prev v limb + + bdnz L(loop) C decrement ctr and loop back + +L(end): subfe r7, s1, u0 + srwi r4, v0, 31 + stw r7, 4(rp) C store last result limb + subfze r3, r4 + neg r3, r3 + blr +L(exit): + subfe r7, s0, u0 + srwi r4, v1, 31 + stw r7, 8(rp) C store last result limb + subfze r3, r4 + neg r3, r3 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/submul_1.asm b/gmp-6.3.0/mpn/powerpc32/submul_1.asm new file mode 100644 index 0000000..8ef37b0 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/submul_1.asm @@ -0,0 +1,151 @@ +dnl PowerPC-32 mpn_submul_1 -- Multiply a limb vector with a limb and subtract +dnl the result from a second limb vector. + +dnl Copyright 1995, 1997, 1998, 2000, 2002, 2005 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 603e: ? +C 604e: 7.5 +C 75x (G3): 9.3-15 +C 7400,7410 (G4): 9.3-15 +C 744x,745x (G4+): 10.5 +C power4/ppc970: 6.75 +C power5: 6.5 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C n r5 +C vl r6 + +C This is optimized for the PPC604. See addmul_1.asm for additional comments. + +ASM_START() +PROLOGUE(mpn_submul_1) + cmpwi cr0,r5,9 C more than 9 limbs? 
+ bgt cr0,L(big) C branch if more than 9 limbs + + mtctr r5 + lwz r0,0(r4) + mullw r7,r0,r6 + mulhwu r10,r0,r6 + lwz r9,0(r3) + subfc r8,r7,r9 + addc r7,r7,r8 C invert cy (r7 is junk) + addi r3,r3,-4 + bdz L(end) +L(loop): + lwzu r0,4(r4) + stwu r8,4(r3) + mullw r8,r0,r6 + adde r7,r8,r10 + mulhwu r10,r0,r6 + lwz r9,4(r3) + addze r10,r10 + subfc r8,r7,r9 + addc r7,r7,r8 C invert cy (r7 is junk) + bdnz L(loop) +L(end): stw r8,4(r3) + addze r3,r10 + blr + +L(big): stwu r1,-16(r1) + addi r5,r5,-1 + stw r30,8(r1) + srwi r0,r5,2 + stw r31,12(r1) + mtctr r0 + + lwz r7,0(r4) + mullw r8,r7,r6 + mulhwu r0,r7,r6 + lwz r7,0(r3) + subfc r7,r8,r7 + addc r8,r8,r7 + stw r7,0(r3) + +L(loopU): + lwz r7,4(r4) + lwz r12,8(r4) + lwz r30,12(r4) + lwzu r31,16(r4) + mullw r8,r7,r6 + mullw r9,r12,r6 + mullw r10,r30,r6 + mullw r11,r31,r6 + adde r8,r8,r0 C add cy_limb + mulhwu r0,r7,r6 + lwz r7,4(r3) + adde r9,r9,r0 + mulhwu r0,r12,r6 + lwz r12,8(r3) + adde r10,r10,r0 + mulhwu r0,r30,r6 + lwz r30,12(r3) + adde r11,r11,r0 + mulhwu r0,r31,r6 + lwz r31,16(r3) + addze r0,r0 C new cy_limb + subfc r7,r8,r7 + stw r7,4(r3) + subfe r12,r9,r12 + stw r12,8(r3) + subfe r30,r10,r30 + stw r30,12(r3) + subfe r31,r11,r31 + stwu r31,16(r3) + subfe r11,r11,r11 C invert ... + addic r11,r11,1 C ... carry + bdnz L(loopU) + + andi. r31,r5,3 + mtctr r31 + beq cr0,L(endx) + +L(loopE): + lwzu r7,4(r4) + mullw r8,r7,r6 + adde r8,r8,r0 C add cy_limb + mulhwu r0,r7,r6 + lwz r7,4(r3) + addze r0,r0 C new cy_limb + subfc r7,r8,r7 + addc r8,r8,r7 + stwu r7,4(r3) + bdnz L(loopE) +L(endx): + addze r3,r0 + lwz r30,8(r1) + lwz r31,12(r1) + addi r1,r1,16 + blr +EPILOGUE(mpn_submul_1) diff --git a/gmp-6.3.0/mpn/powerpc32/umul.asm b/gmp-6.3.0/mpn/powerpc32/umul.asm new file mode 100644 index 0000000..a5811e1 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/umul.asm @@ -0,0 +1,50 @@ +dnl PowerPC-32 umul_ppmm -- support for longlong.h + +dnl Copyright 2000, 2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2); +C + +ASM_START() +PROLOGUE(mpn_umul_ppmm) + + C r3 lowptr + C r4 m1 + C r5 m2 + + mullw r0, r4, r5 + mulhwu r9, r4, r5 + stw r0, 0(r3) + mr r3, r9 + blr + +EPILOGUE(mpn_umul_ppmm) diff --git a/gmp-6.3.0/mpn/powerpc32/vmx/copyd.asm b/gmp-6.3.0/mpn/powerpc32/vmx/copyd.asm new file mode 100644 index 0000000..dee7266 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/vmx/copyd.asm @@ -0,0 +1,203 @@ +dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd. 
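[Editor's note: for orientation before the VMX machinery, the contract of mpn_copyd is simply a limb copy that proceeds from the most significant limb downward -- the usual reason for a decreasing copy. All the complexity in the file below is alignment handling. A one-loop sketch, with ref_copyd as an illustrative name:]

  typedef unsigned int mp_limb_t;
  typedef long mp_size_t;

  /* Decreasing-order limb copy: rp[n-1] is written first, rp[0] last. */
  void
  ref_copyd (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
  {
    for (mp_size_t i = n - 1; i >= 0; i--)
      rp[i] = up[i];
  }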
+ +dnl Copyright 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C 16-byte coaligned unaligned +C cycles/limb cycles/limb +C 7400,7410 (G4): 0.5 0.64 +C 744x,745x (G4+): 0.75 0.82 +C 970 (G5): 0.78 1.02 (64-bit limbs) + +C STATUS +C * Works for all sizes and alignments. + +C TODO +C * Optimize unaligned case. Some basic tests with 2-way and 4-way unrolling +C indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80 +C c/l for 970. +C * Consider using VMX instructions also for head and tail, by using some +C read-modify-write tricks. +C * The VMX code is used from the smallest sizes it handles, but measurements +C show a large speed bump at the cutoff points. Small copying (perhaps +C using some read-modify-write technique) should be optimized. +C * Make an mpn_com based on this code. + +define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8)) +define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES)) +define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES)) + + +ifelse(GMP_LIMB_BITS,32,` + define(`LIMB32',` $1') + define(`LIMB64',`') +',` + define(`LIMB32',`') + define(`LIMB64',` $1') +') + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') + +define(`us', `v4') + + +ASM_START() +PROLOGUE(mpn_copyd) + +LIMB32(`slwi. r0, n, 2 ') +LIMB64(`sldi. r0, n, 3 ') + add rp, rp, r0 + add up, up, r0 + +LIMB32(`cmpi cr7, n, 11 ') +LIMB64(`cmpdi cr7, n, 5 ') + bge cr7, L(big) + + beqlr cr0 + +C Handle small cases with plain operations + mtctr n +L(topS): +LIMB32(`lwz r0, -4(up) ') +LIMB64(`ld r0, -8(up) ') + addi up, up, -GMP_LIMB_BYTES +LIMB32(`stw r0, -4(rp) ') +LIMB64(`std r0, -8(rp) ') + addi rp, rp, -GMP_LIMB_BYTES + bdnz L(topS) + blr + +C Handle large cases with VMX operations +L(big): + addi rp, rp, -16 + addi up, up, -16 + mfspr r12, 256 + oris r0, r12, 0xf800 C Set VRSAVE bit 0-4 + mtspr 256, r0 + +LIMB32(`rlwinm. r7, rp, 30,30,31') C (rp >> 2) mod 4 +LIMB64(`rlwinm. r7, rp, 29,31,31') C (rp >> 3) mod 2 + beq L(rp_aligned) + + subf n, r7, n +L(top0): +LIMB32(`lwz r0, 12(up) ') +LIMB64(`ld r0, 8(up) ') + addi up, up, -GMP_LIMB_BYTES +LIMB32(`addic. r7, r7, -1 ') +LIMB32(`stw r0, 12(rp) ') +LIMB64(`std r0, 8(rp) ') + addi rp, rp, -GMP_LIMB_BYTES +LIMB32(`bne L(top0) ') + +L(rp_aligned): + +LIMB32(`rlwinm. r0, up, 30,30,31') C (up >> 2) mod 4 +LIMB64(`rlwinm. 
r0, up, 29,31,31') C (up >> 3) mod 2 + +LIMB64(`srdi r7, n, 2 ') C loop count corresponding to n +LIMB32(`srwi r7, n, 3 ') C loop count corresponding to n + mtctr r7 C copy n to count register + + li r10, -16 + + beq L(up_aligned) + + lvsl us, 0, up + + addi up, up, 16 +LIMB32(`andi. r0, n, 0x4 ') +LIMB64(`andi. r0, n, 0x2 ') + beq L(1) + lvx v0, 0, up + lvx v2, r10, up + vperm v3, v2, v0, us + stvx v3, 0, rp + addi up, up, -32 + addi rp, rp, -16 + b L(lpu) +L(1): lvx v2, 0, up + addi up, up, -16 + b L(lpu) + + ALIGN(32) +L(lpu): lvx v0, 0, up + vperm v3, v0, v2, us + stvx v3, 0, rp + lvx v2, r10, up + addi up, up, -32 + vperm v3, v2, v0, us + stvx v3, r10, rp + addi rp, rp, -32 + bdnz L(lpu) + + b L(tail) + +L(up_aligned): + +LIMB32(`andi. r0, n, 0x4 ') +LIMB64(`andi. r0, n, 0x2 ') + beq L(lpa) + lvx v0, 0, up + stvx v0, 0, rp + addi up, up, -16 + addi rp, rp, -16 + b L(lpa) + + ALIGN(32) +L(lpa): lvx v0, 0, up + lvx v1, r10, up + addi up, up, -32 + nop + stvx v0, 0, rp + stvx v1, r10, rp + addi rp, rp, -32 + bdnz L(lpa) + +L(tail): +LIMB32(`rlwinm. r7, n, 0,30,31 ') C r7 = n mod 4 +LIMB64(`rlwinm. r7, n, 0,31,31 ') C r7 = n mod 2 + beq L(ret) +LIMB32(`li r10, 12 ') +L(top2): +LIMB32(`lwzx r0, r10, up ') +LIMB64(`ld r0, 8(up) ') +LIMB32(`addic. r7, r7, -1 ') +LIMB32(`stwx r0, r10, rp ') +LIMB64(`std r0, 8(rp) ') +LIMB32(`addi r10, r10, -GMP_LIMB_BYTES') +LIMB32(`bne L(top2) ') + +L(ret): mtspr 256, r12 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/vmx/copyi.asm b/gmp-6.3.0/mpn/powerpc32/vmx/copyi.asm new file mode 100644 index 0000000..992b468 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/vmx/copyi.asm @@ -0,0 +1,198 @@ +dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_copyi. + +dnl Copyright 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C 16-byte coaligned unaligned +C cycles/limb cycles/limb +C 7400,7410 (G4): 0.5 0.64 +C 744x,745x (G4+): 0.75 0.82 +C 970 (G5): 0.78 1.02 (64-bit limbs) + +C STATUS +C * Works for all sizes and alignments. + +C TODO +C * Optimize unaligned case. Some basic tests with 2-way and 4-way unrolling +C indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80 +C c/l for 970. +C * Consider using VMX instructions also for head and tail, by using some +C read-modify-write tricks. +C * The VMX code is used from the smallest sizes it handles, but measurements +C show a large speed bump at the cutoff points. Small copying (perhaps +C using some read-modify-write technique) should be optimized. 
+C * Make an mpn_com based on this code. + +define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8)) +define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES)) +define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES)) + + +ifelse(GMP_LIMB_BITS,32,` + define(`LIMB32',` $1') + define(`LIMB64',`') +',` + define(`LIMB32',`') + define(`LIMB64',` $1') +') + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') + +define(`us', `v4') + + +ASM_START() +PROLOGUE(mpn_copyi) + +LIMB32(`cmpi cr7, n, 11 ') +LIMB64(`cmpdi cr7, n, 5 ') + bge cr7, L(big) + + or. r0, n, n + beqlr cr0 + +C Handle small cases with plain operations + mtctr n +L(topS): +LIMB32(`lwz r0, 0(up) ') +LIMB64(`ld r0, 0(up) ') + addi up, up, GMP_LIMB_BYTES +LIMB32(`stw r0, 0(rp) ') +LIMB64(`std r0, 0(rp) ') + addi rp, rp, GMP_LIMB_BYTES + bdnz L(topS) + blr + +C Handle large cases with VMX operations +L(big): + mfspr r12, 256 + oris r0, r12, 0xf800 C Set VRSAVE bit 0-4 + mtspr 256, r0 + +LIMB32(`rlwinm. r7, rp, 30,30,31') C (rp >> 2) mod 4 +LIMB64(`rlwinm. r7, rp, 29,31,31') C (rp >> 3) mod 2 + beq L(rp_aligned) + + subfic r7, r7, LIMBS_PER_VR + subf n, r7, n +L(top0): +LIMB32(`lwz r0, 0(up) ') +LIMB64(`ld r0, 0(up) ') + addi up, up, GMP_LIMB_BYTES +LIMB32(`addic. r7, r7, -1 ') +LIMB32(`stw r0, 0(rp) ') +LIMB64(`std r0, 0(rp) ') + addi rp, rp, GMP_LIMB_BYTES +LIMB32(`bne L(top0) ') + +L(rp_aligned): + +LIMB32(`rlwinm. r0, up, 30,30,31') C (up >> 2) mod 4 +LIMB64(`rlwinm. r0, up, 29,31,31') C (up >> 3) mod 2 + +LIMB64(`srdi r7, n, 2 ') C loop count corresponding to n +LIMB32(`srwi r7, n, 3 ') C loop count corresponding to n + mtctr r7 C copy n to count register + + li r10, 16 + + beq L(up_aligned) + + lvsl us, 0, up + +LIMB32(`andi. r0, n, 0x4 ') +LIMB64(`andi. r0, n, 0x2 ') + beq L(1) + lvx v0, 0, up + lvx v2, r10, up + vperm v3, v0, v2, us + stvx v3, 0, rp + addi up, up, 32 + addi rp, rp, 16 + b L(lpu) +L(1): lvx v2, 0, up + addi up, up, 16 + b L(lpu) + + ALIGN(32) +L(lpu): lvx v0, 0, up + vperm v3, v2, v0, us + stvx v3, 0, rp + lvx v2, r10, up + addi up, up, 32 + vperm v3, v0, v2, us + stvx v3, r10, rp + addi rp, rp, 32 + bdnz L(lpu) + + addi up, up, -16 + b L(tail) + +L(up_aligned): + +LIMB32(`andi. r0, n, 0x4 ') +LIMB64(`andi. r0, n, 0x2 ') + beq L(lpa) + lvx v0, 0, up + stvx v0, 0, rp + addi up, up, 16 + addi rp, rp, 16 + b L(lpa) + + ALIGN(32) +L(lpa): lvx v0, 0, up + lvx v1, r10, up + addi up, up, 32 + nop + stvx v0, 0, rp + stvx v1, r10, rp + addi rp, rp, 32 + bdnz L(lpa) + +L(tail): +LIMB32(`rlwinm. r7, n, 0,30,31 ') C r7 = n mod 4 +LIMB64(`rlwinm. r7, n, 0,31,31 ') C r7 = n mod 2 + beq L(ret) +LIMB32(`li r10, 0 ') +L(top2): +LIMB32(`lwzx r0, r10, up ') +LIMB64(`ld r0, 0(up) ') +LIMB32(`addic. r7, r7, -1 ') +LIMB32(`stwx r0, r10, rp ') +LIMB64(`std r0, 0(rp) ') +LIMB32(`addi r10, r10, GMP_LIMB_BYTES') +LIMB32(`bne L(top2) ') + +L(ret): mtspr 256, r12 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc32/vmx/logops_n.asm b/gmp-6.3.0/mpn/powerpc32/vmx/logops_n.asm new file mode 100644 index 0000000..d656d3b --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/vmx/logops_n.asm @@ -0,0 +1,310 @@ +dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_and_n, mpn_andn_n, mpn_nand_n, +dnl mpn_ior_n, mpn_iorn_n, mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise +dnl logical operations. + +dnl Copyright 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C and,ior,andn,nior,xor iorn,xnor nand +C cycles/limb cycles/limb cycles/limb +C 7400,7410 (G4): 1.39 ? ? +C 744x,745x (G4+): 1.14 1.39 1.39 +C 970: 1.7 2.0 2.0 + +C STATUS +C * Works for all sizes and alignment for 32-bit limbs. +C * Works for n >= 4 for 64-bit limbs; untested for smaller operands. +C * Current performance makes this pointless for 970 + +C TODO +C * Might want to make variants when just one of the source operands needs +C vperm, and when neither needs it. The latter runs 50% faster on 7400. +C * Idea: If the source operands are equally aligned, we could do the logops +C first, then vperm before storing! That means we never need more than one +C vperm, ever! +C * Perhaps align `rp' after initial alignment loop? +C * Instead of having scalar code in the beginning and end, consider using +C read-modify-write vector code. +C * Software pipeline? Hopefully not too important, this is hairy enough +C already. +C * At least be more clever about operand loading, i.e., load v operands before +C u operands, since v operands are sometimes negated. 
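[Editor's note: this file compiles to eight entry points. As the define block that follows shows, each OPERATION_* build selects a scalar instruction (logopS, used for head and tail limbs), a vector instruction (logop), and optionally a pre- or post-negation (vnegb/vnega) for the operations AltiVec has no single instruction for: nand is vand followed by a vnor of the result, iorn negates one operand before vor, and xnor negates one operand before vxor. In scalar C the xnor case, for instance, amounts to the sketch below; ref_xnor_n is an illustrative name.]

  typedef unsigned int mp_limb_t;
  typedef long mp_size_t;

  /* mpn_xnor_n semantics: negating one operand before XOR equals
     negating the result, which is how the vnegb hook realizes it. */
  void
  ref_xnor_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
              mp_size_t n)
  {
    for (mp_size_t i = 0; i < n; i++)
      rp[i] = up[i] ^ ~vp[i];        /* == ~(up[i] ^ vp[i]) */
  }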
+ +define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8)) +define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES)) +define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES)) + +define(`vnegb', `') C default neg-before to null +define(`vnega', `') C default neg-before to null + +ifdef(`OPERATION_and_n', +` define(`func', `mpn_and_n') + define(`logopS',`and $1,$2,$3') + define(`logop', `vand $1,$2,$3')') +ifdef(`OPERATION_andn_n', +` define(`func', `mpn_andn_n') + define(`logopS',`andc $1,$2,$3') + define(`logop', `vandc $1,$2,$3')') +ifdef(`OPERATION_nand_n', +` define(`func', `mpn_nand_n') + define(`logopS',`nand $1,$2,$3') + define(`logop', `vand $1,$2,$3') + define(`vnega', `vnor $1,$2,$2')') +ifdef(`OPERATION_ior_n', +` define(`func', `mpn_ior_n') + define(`logopS',`or $1,$2,$3') + define(`logop', `vor $1,$2,$3')') +ifdef(`OPERATION_iorn_n', +` define(`func', `mpn_iorn_n') + define(`logopS',`orc $1,$2,$3') + define(`vnegb', `vnor $1,$2,$2') + define(`logop', `vor $1,$2,$3')') +ifdef(`OPERATION_nior_n', +` define(`func', `mpn_nior_n') + define(`logopS',`nor $1,$2,$3') + define(`logop', `vnor $1,$2,$3')') +ifdef(`OPERATION_xor_n', +` define(`func', `mpn_xor_n') + define(`logopS',`xor $1,$2,$3') + define(`logop', `vxor $1,$2,$3')') +ifdef(`OPERATION_xnor_n', +` define(`func',`mpn_xnor_n') + define(`logopS',`eqv $1,$2,$3') + define(`vnegb', `vnor $1,$2,$2') + define(`logop', `vxor $1,$2,$3')') + +ifelse(GMP_LIMB_BITS,`32',` + define(`LIMB32',` $1') + define(`LIMB64',`') +',` + define(`LIMB32',`') + define(`LIMB64',` $1') +') + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`vp', `r5') +define(`n', `r6') + +define(`us', `v8') +define(`vs', `v9') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +ASM_START() +PROLOGUE(func) + +LIMB32(`cmpwi cr0, n, 8 ') +LIMB64(`cmpdi cr0, n, 4 ') + bge L(big) + + mtctr n + +LIMB32(`lwz r8, 0(up) ') +LIMB32(`lwz r9, 0(vp) ') +LIMB32(`logopS( r0, r8, r9) ') +LIMB32(`stw r0, 0(rp) ') +LIMB32(`bdz L(endS) ') + +L(topS): +LIMB32(`lwzu r8, 4(up) ') +LIMB64(`ld r8, 0(up) ') +LIMB64(`addi up, up, GMP_LIMB_BYTES ') +LIMB32(`lwzu r9, 4(vp) ') +LIMB64(`ld r9, 0(vp) ') +LIMB64(`addi vp, vp, GMP_LIMB_BYTES ') + logopS( r0, r8, r9) +LIMB32(`stwu r0, 4(rp) ') +LIMB64(`std r0, 0(rp) ') +LIMB64(`addi rp, rp, GMP_LIMB_BYTES ') + bdnz L(topS) +L(endS): + blr + +L(big): mfspr r12, 256 + oris r0, r12, 0xfffc C Set VRSAVE bit 0-13 FIXME + mtspr 256, r0 + +C First loop until the destination is 16-byte aligned. This will execute 0 or 1 +C times for 64-bit machines, and 0 to 3 times for 32-bit machines. + +LIMB32(`rlwinm. r0, rp, 30,30,31') C (rp >> 2) mod 4 +LIMB64(`rlwinm. r0, rp, 29,31,31') C (rp >> 3) mod 2 + beq L(aligned) + + subfic r7, r0, LIMBS_PER_VR +LIMB32(`li r10, 0 ') + subf n, r7, n +L(top0): +LIMB32(`lwz r8, 0(up) ') +LIMB64(`ld r8, 0(up) ') + addi up, up, GMP_LIMB_BYTES +LIMB32(`lwz r9, 0(vp) ') +LIMB64(`ld r9, 0(vp) ') + addi vp, vp, GMP_LIMB_BYTES +LIMB32(`addic. 
r7, r7, -1 ') + logopS( r0, r8, r9) +LIMB32(`stwx r0, r10, rp ') +LIMB64(`std r0, 0(rp) ') +LIMB32(`addi r10, r10, GMP_LIMB_BYTES') +LIMB32(`bne L(top0) ') + + addi rp, rp, 16 C update rp, but preserve its alignment + +L(aligned): +LIMB64(`srdi r7, n, 1 ') C loop count corresponding to n +LIMB32(`srwi r7, n, 2 ') C loop count corresponding to n + mtctr r7 C copy n to count register + + li r10, 16 + lvsl us, 0, up + lvsl vs, 0, vp + + lvx v2, 0, up + lvx v3, 0, vp + bdnz L(gt1) + lvx v0, r10, up + lvx v1, r10, vp + vperm v4, v2, v0, us + vperm v5, v3, v1, vs + vnegb( v5, v5) + logop( v6, v4, v5) + vnega( v6, v6) + stvx v6, 0, rp + addi up, up, 16 + addi vp, vp, 16 + addi rp, rp, 4 + b L(tail) + +L(gt1): addi up, up, 16 + addi vp, vp, 16 + +L(top): lvx v0, 0, up + lvx v1, 0, vp + vperm v4, v2, v0, us + vperm v5, v3, v1, vs + vnegb( v5, v5) + logop( v6, v4, v5) + vnega( v6, v6) + stvx v6, 0, rp + bdz L(end) + lvx v2, r10, up + lvx v3, r10, vp + vperm v4, v0, v2, us + vperm v5, v1, v3, vs + vnegb( v5, v5) + logop( v6, v4, v5) + vnega( v6, v6) + stvx v6, r10, rp + addi up, up, 32 + addi vp, vp, 32 + addi rp, rp, 32 + bdnz L(top) + + andi. r0, up, 15 + vxor v0, v0, v0 + beq 1f + lvx v0, 0, up +1: andi. r0, vp, 15 + vxor v1, v1, v1 + beq 1f + lvx v1, 0, vp +1: vperm v4, v2, v0, us + vperm v5, v3, v1, vs + vnegb( v5, v5) + logop( v6, v4, v5) + vnega( v6, v6) + stvx v6, 0, rp + addi rp, rp, 4 + b L(tail) + +L(end): andi. r0, up, 15 + vxor v2, v2, v2 + beq 1f + lvx v2, r10, up +1: andi. r0, vp, 15 + vxor v3, v3, v3 + beq 1f + lvx v3, r10, vp +1: vperm v4, v0, v2, us + vperm v5, v1, v3, vs + vnegb( v5, v5) + logop( v6, v4, v5) + vnega( v6, v6) + stvx v6, r10, rp + + addi up, up, 16 + addi vp, vp, 16 + addi rp, rp, 20 + +L(tail): +LIMB32(`rlwinm. r7, n, 0,30,31 ') C r7 = n mod 4 +LIMB64(`rlwinm. r7, n, 0,31,31 ') C r7 = n mod 2 + beq L(ret) + addi rp, rp, 15 +LIMB32(`rlwinm rp, rp, 0,0,27 ') +LIMB64(`rldicr rp, rp, 0,59 ') + li r10, 0 +L(top2): +LIMB32(`lwzx r8, r10, up ') +LIMB64(`ldx r8, r10, up ') +LIMB32(`lwzx r9, r10, vp ') +LIMB64(`ldx r9, r10, vp ') +LIMB32(`addic. r7, r7, -1 ') + logopS( r0, r8, r9) +LIMB32(`stwx r0, r10, rp ') +LIMB64(`std r0, 0(rp) ') +LIMB32(`addi r10, r10, GMP_LIMB_BYTES') +LIMB32(`bne L(top2) ') + +L(ret): mtspr 256, r12 + blr +EPILOGUE() + +C This works for 64-bit PowerPC, since a limb ptr can only be aligned +C in 2 relevant ways, which means we can always find a pair of aligned +C pointers of rp, up, and vp. +C process words until rp is 16-byte aligned +C if (((up | vp) & 15) == 0) +C process with VMX without any vperm +C else if ((up & 15) != 0 && (vp & 15) != 0) +C process with VMX using vperm on store data +C else if ((up & 15) != 0) +C process with VMX using vperm on up data +C else +C process with VMX using vperm on vp data +C +C rlwinm, r0, up, 0,28,31 +C rlwinm r0, vp, 0,28,31 +C cmpwi cr7, r0, 0 +C cror cr6, cr0, cr7 +C crand cr0, cr0, cr7 diff --git a/gmp-6.3.0/mpn/powerpc32/vmx/mod_34lsub1.asm b/gmp-6.3.0/mpn/powerpc32/vmx/mod_34lsub1.asm new file mode 100644 index 0000000..2bb11cd --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/vmx/mod_34lsub1.asm @@ -0,0 +1,388 @@ +dnl PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1. + +dnl Copyright 2002, 2003, 2005-2007, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + + +C cycles/limb +C 603e: - +C 604e: - +C 75x (G3): - +C 7400,7410 (G4): 1 simple load-use scheduling results in 0.75 +C 744x,745x (G4+): 0.75 +C ppc970: 0.75 +C power4: - +C power5: - + +C TODO +C * Either start using the low-end masking constants, or remove them. +C * Merge multiple feed-in cases into a parameterized code block. +C * Reduce register usage. It should be possible to almost halve it. + +define(`up', `r3') +define(`n', `r4') + +define(`a0', `v3') +define(`a1', `v4') +define(`a2', `v5') +define(`c0', `v6') +define(`c1', `v7') +define(`c2', `v8') +define(`z', `v9') +define(`x0', `v10') +define(`x1', `v11') +define(`x2', `v12') +define(`x3', `v13') +define(`pv', `v14') +define(`y0', `v0') +define(`y1', `v1') +define(`y2', `v2') +define(`y3', `v15') + +ASM_START() +PROLOGUE(mpn_mod_34lsub1) + cmpwi cr0, n, 20 C tuned cutoff point + bge L(large) + + li r9, 0 C result accumulator + mulli r10, n, 0xb C 0xb = ceil(32/3) + srwi. r10, r10, 5 C r10 = floor(n/3), n < 32 + beq L(small_tail) + mtctr r10 + lwz r6, 0(up) + lwz r7, 4(up) + lwzu r8, 8(up) + subf n, r10, n + subf n, r10, n + subf n, r10, n + bdz L(small_end) + + ALIGN(16) +L(los): rlwinm r0, r6, 0,8,31 + add r9, r9, r0 C add 24b from u0 + srwi r0, r6, 24 + lwz r6, 4(up) + rlwimi r0, r7, 8, 0x00ffff00 C --111100 + add r9, r9, r0 C add 8b from u0 and 16b from u1 + srwi r0, r7, 16 + lwz r7, 8(up) + rlwimi r0, r8, 16, 0x00ff0000 C --221111 + add r9, r9, r0 C add 16b from u1 and 8b from u2 + srwi r0, r8, 8 C --222222 + lwzu r8, 12(up) + add r9, r9, r0 C add 24b from u2 + bdnz L(los) +L(small_end): + rlwinm r0, r6, 0,8,31 + add r9, r9, r0 C add 24b from u0 + srwi r0, r6, 24 + rlwimi r0, r7, 8, 0x00ffff00 C --111100 + add r9, r9, r0 C add 8b from u0 and 16b from u1 + srwi r0, r7, 16 + rlwimi r0, r8, 16, 0x00ff0000 C --221111 + add r9, r9, r0 C add 16b from u1 and 8b from u2 + srwi r0, r8, 8 C --222222 + add r9, r9, r0 C add 24b from u2 + + addi up, up, 4 + rlwinm r0, r9, 0,8,31 + srwi r9, r9, 24 + add r9, r9, r0 + +L(small_tail): + cmpi cr0, n, 1 + blt L(ret) + + lwz r6, 0(up) + rlwinm r0, r6, 0,8,31 + srwi r6, r6, 24 + add r9, r9, r0 + add r9, r9, r6 + + beq L(ret) + + lwz r6, 4(up) + rlwinm r0, r6, 8,8,23 + srwi r6, r6, 16 + add r9, r9, r0 + add r9, r9, r6 + +L(ret): mr r3, r9 + blr + + +L(large): + stwu r1, -32(r1) + mfspr r10, 256 + oris r0, r10, 0xffff C Set VRSAVE bit 0-15 + mtspr 256, r0 + + andi. 
r7, up, 15 + vxor a0, v0, v0 + lis r9, 0xaaaa + vxor a1, v0, v0 + ori r9, r9, 0xaaab + vxor a2, v0, v0 + li r5, 16 + vxor c0, v0, v0 + li r6, 32 + vxor c1, v0, v0 + LEAL( r11, cnsts) C CAUTION clobbers r0 for elf, darwin + vxor c2, v0, v0 + vxor z, v0, v0 + + beq L(aligned16) + + cmpwi cr7, r7, 8 + bge cr7, L(na4) + + lvx a2, 0, up + addi up, up, 16 + vsldoi a2, a2, z, 4 + vsldoi a2, z, a2, 12 + + addi n, n, 9 + mulhwu r0, n, r9 + srwi r0, r0, 3 C r0 = floor(n/12) + mtctr r0 + + mulli r8, r0, 12 + subf n, r8, n + b L(2) + +L(na4): bne cr7, L(na8) + + lvx a1, 0, up + addi up, up, -16 + vsldoi a1, a1, z, 8 + vsldoi a1, z, a1, 8 + + addi n, n, 6 + mulhwu r0, n, r9 + srwi r0, r0, 3 C r0 = floor(n/12) + mtctr r0 + + mulli r8, r0, 12 + subf n, r8, n + b L(1) + +L(na8): + lvx a0, 0, up + vsldoi a0, a0, z, 12 + vsldoi a0, z, a0, 4 + + addi n, n, 3 + mulhwu r0, n, r9 + srwi r0, r0, 3 C r0 = floor(n/12) + mtctr r0 + + mulli r8, r0, 12 + subf n, r8, n + b L(0) + +L(aligned16): + mulhwu r0, n, r9 + srwi r0, r0, 3 C r0 = floor(n/12) + mtctr r0 + + mulli r8, r0, 12 + subf n, r8, n + + lvx a0, 0, up +L(0): lvx a1, r5, up +L(1): lvx a2, r6, up + addi up, up, 48 +L(2): bdz L(end) + li r12, 256 + li r9, 288 + ALIGN(32) +L(top): + lvx v0, 0, up + vaddcuw v10, a0, v0 + vadduwm a0, a0, v0 + vadduwm c0, c0, v10 + + lvx v1, r5, up + vaddcuw v10, a1, v1 + vadduwm a1, a1, v1 + vadduwm c1, c1, v10 + + lvx v2, r6, up + dcbt up, r12 + dcbt up, r9 + addi up, up, 48 + vaddcuw v10, a2, v2 + vadduwm a2, a2, v2 + vadduwm c2, c2, v10 + bdnz L(top) + +L(end): +C n = 0...11 + cmpwi cr0, n, 0 + beq L(sum) + cmpwi cr0, n, 4 + ble L(tail.1..4) + cmpwi cr0, n, 8 + ble L(tail.5..8) + +L(tail.9..11): + lvx v0, 0, up + vaddcuw v10, a0, v0 + vadduwm a0, a0, v0 + vadduwm c0, c0, v10 + + lvx v1, r5, up + vaddcuw v10, a1, v1 + vadduwm a1, a1, v1 + vadduwm c1, c1, v10 + + lvx v2, r6, up + + addi r8, r11, 96 + rlwinm r3, n ,4,26,27 + lvx v11, r3, r8 + vand v2, v2, v11 + + vaddcuw v10, a2, v2 + vadduwm a2, a2, v2 + vadduwm c2, c2, v10 + b L(sum) + +L(tail.5..8): + lvx v0, 0, up + vaddcuw v10, a0, v0 + vadduwm a0, a0, v0 + vadduwm c0, c0, v10 + + lvx v1, r5, up + + addi r8, r11, 96 + rlwinm r3, n ,4,26,27 + lvx v11, r3, r8 + vand v1, v1, v11 + + vaddcuw v10, a1, v1 + vadduwm a1, a1, v1 + vadduwm c1, c1, v10 + b L(sum) + +L(tail.1..4): + lvx v0, 0, up + + addi r8, r11, 96 + rlwinm r3, n ,4,26,27 + lvx v11, r3, r8 + vand v0, v0, v11 + + vaddcuw v10, a0, v0 + vadduwm a0, a0, v0 + vadduwm c0, c0, v10 + +L(sum): lvx pv, 0, r11 + vperm x0, a0, z, pv C extract 4 24-bit field from a0 + vperm y0, c2, z, pv + lvx pv, r5, r11 + vperm x1, a1, z, pv C extract 4 24-bit field from a1 + vperm y1, c0, z, pv C extract 4 24-bit field from a1 + lvx pv, r6, r11 + vperm x2, a2, z, pv C extract 4 24-bit field from a1 + vperm y2, c1, z, pv C extract 4 24-bit field from a1 + li r10, 48 + lvx pv, r10, r11 + vperm x3, a0, z, pv C extract remaining/partial a0 fields + vperm y3, c2, z, pv C extract remaining/partial a0 fields + li r10, 64 + lvx pv, r10, r11 + vperm x3, a1, x3, pv C insert remaining/partial a1 fields + vperm y3, c0, y3, pv C insert remaining/partial a1 fields + li r10, 80 + lvx pv, r10, r11 + vperm x3, a2, x3, pv C insert remaining/partial a2 fields + vperm y3, c1, y3, pv C insert remaining/partial a2 fields + +C We now have 4 128-bit accumulators to sum + vadduwm x0, x0, x1 + vadduwm x2, x2, x3 + vadduwm x0, x0, x2 + + vadduwm y0, y0, y1 + vadduwm y2, y2, y3 + vadduwm y0, y0, y2 + + vadduwm x0, x0, y0 + +C Reduce 32-bit fields + vsumsws x0, x0, z + + 
li r7, 16 + stvx x0, r7, r1 + lwz r3, 28(r1) + + mtspr 256, r10 + addi r1, r1, 32 + blr +EPILOGUE() + +C load | v0 | v1 | v2 | +C acc | a0 | a1 | a2 | +C carry | c0 | c1 | c2 | +C | 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 128 +C |---|---|---|---|---|---|---|---|---|---|---|---| 32 +C | | | | | | | | | | | | | | | | | 24 +C | | | | | | | | | 48 + +C $---------------$---------------$---------------$---------------$ +C | . . . . . . . . . . . . . . . | +C |_______________________________________________________________| +C | | | | | | | +C <-hi16-> <--- 24 --> <--- 24 --> <--- 24 --> <--- 24 --> <-lo16-> + + +DEF_OBJECT(cnsts,16) +C Permutation vectors in the order they are used above +C # 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f + .byte 0x10,0x01,0x02,0x03, 0x10,0x06,0x07,0x00, 0x10,0x0b,0x04,0x05, 0x10,0x08,0x09,0x0a C a0 + .byte 0x10,0x07,0x00,0x01, 0x10,0x04,0x05,0x06, 0x10,0x09,0x0a,0x0b, 0x10,0x0e,0x0f,0x08 C a1 + .byte 0x10,0x00,0x01,0x02, 0x10,0x05,0x06,0x07, 0x10,0x0a,0x0b,0x04, 0x10,0x0f,0x08,0x09 C a2 + .byte 0x10,0x0d,0x0e,0x0f, 0x10,0x10,0x10,0x0c, 0x10,0x10,0x10,0x10, 0x10,0x10,0x10,0x10 C part a0 + .byte 0x10,0x11,0x12,0x13, 0x10,0x02,0x03,0x17, 0x10,0x10,0x0c,0x0d, 0x10,0x10,0x10,0x10 C part a1 + .byte 0x10,0x11,0x12,0x13, 0x10,0x15,0x16,0x17, 0x10,0x03,0x1a,0x1b, 0x10,0x0c,0x0d,0x0e C part a2 +C Masks for high end of number + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 +C Masks for low end of number +C .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +C .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +C .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +C .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff +END_OBJECT(cnsts) diff --git a/gmp-6.3.0/mpn/powerpc32/vmx/popcount.asm b/gmp-6.3.0/mpn/powerpc32/vmx/popcount.asm new file mode 100644 index 0000000..943c92d --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc32/vmx/popcount.asm @@ -0,0 +1,34 @@ +dnl PowerPC-32/VMX mpn_popcount. + +dnl Copyright 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
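+
+dnl  This file is only a forwarder: MULFUNC_PROLOGUE() below declares the
+dnl  entry point, and the actual VMX loop is pulled in from the 64-bit
+dnl  PowerPC tree by include_mpn().  For orientation, a plain C sketch of
+dnl  the contract the shared code fulfills (illustrative only, not GMP's
+dnl  implementation):
+dnl
+dnl	mp_bitcnt_t
+dnl	ref_popcount (mp_srcptr up, mp_size_t n)	/* hypothetical name */
+dnl	{
+dnl	  mp_bitcnt_t cnt = 0;
+dnl	  for (mp_size_t i = 0; i < n; i++)
+dnl	    cnt += __builtin_popcountl (up[i]);	/* set bits in one limb */
+dnl	  return cnt;
+dnl	}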
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`powerpc64/vmx/popcount.asm')
diff --git a/gmp-6.3.0/mpn/powerpc64/README b/gmp-6.3.0/mpn/powerpc64/README
new file mode 100644
index 0000000..50dd399
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/README
@@ -0,0 +1,166 @@
+Copyright 1999-2001, 2003-2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+                    POWERPC-64 MPN SUBROUTINES
+
+
+This directory contains mpn functions for 64-bit PowerPC chips.
+
+
+CODE ORGANIZATION
+
+	mpn/powerpc64           mode-neutral code
+	mpn/powerpc64/mode32    code for mode32
+	mpn/powerpc64/mode64    code for mode64
+
+
+The mode32 and mode64 sub-directories contain code which is for use in the
+respective chip mode, 32 or 64.  The top-level directory is code that's
+unaffected by the mode.
+
+The "adde" instruction is the main difference between mode32 and mode64.  It
+operates on either a 32-bit or a 64-bit quantity according to the chip mode.
+Other instructions have an operand size in their opcode and hence don't vary.
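+
+For illustration, mode32's problem with carry chains can be put in C (a
+hypothetical sketch, not code from this tree): with only a 32-bit carry
+flag available, the true carry out of a 64-bit limb addition has to be
+rebuilt from the high halves, which is the srdi/adde sequence used in the
+mode32/add_n.asm loop:
+
+	/* one limb step of add_n: sum = a + b + cy, using 32-bit carries */
+	uint64_t lo = (a & 0xffffffff) + (b & 0xffffffff) + cy;
+	uint64_t hi = (a >> 32) + (b >> 32) + (lo >> 32);  /* + low carry */
+	uint64_t sum = (hi << 32) | (lo & 0xffffffff);
+	cy = hi >> 32;		/* true carry out of the 64-bit addition */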
+
+
+POWER3/PPC630 pipeline information:
+
+Decoding is 4-way + branch and issue is 8-way with some out-of-order
+capability.
+
+Functional units:
+LS1  - ld/st unit 1
+LS2  - ld/st unit 2
+FXU1 - integer unit 1, handles any simple integer instruction
+FXU2 - integer unit 2, handles any simple integer instruction
+FXU3 - integer unit 3, handles integer multiply and divide
+FPU1 - floating-point unit 1
+FPU2 - floating-point unit 2
+
+Memory:		Any two memory operations can issue, but the memory subsystem
+		can sustain just one store per cycle.  No need for data
+		prefetch; the hardware has very sophisticated prefetch logic.
+Simple integer:	2 operations (such as add, rl*)
+Integer multiply: 1 operation every 9th cycle worst case; exact timing depends
+		on the 2nd operand's most significant bit position (10 bits per
+		cycle).  The multiply unit is not pipelined; only one multiply
+		operation may be in progress at a time.
+Integer divide:	?
+Floating-point:	Any 2 plain arithmetic instructions (such as fmul, fadd, and
+		fmadd), latency 4 cycles.
+Floating-point divide:
+		?
+Floating-point square root:
+		?
+
+POWER3/PPC630 best possible times for the main loops:
+shift:	      1.5 cycles, limited by integer unit contention.
+	      With 63 special loops, one for each shift count, we could
+	      reduce the needed integer instructions to 2, which would
+	      reduce the best possible time to 1 cycle.
+add/sub:      1.5 cycles, limited by ld/st unit contention.
+mul:	      18 cycles (average) unless floating-point operations are used,
+	      but that would only help for multiplies of perhaps 10 and more
+	      limbs.
+addmul/submul: Same situation as for mul.
+
+
+POWER4/PPC970 and POWER5 pipeline information:
+
+This is a very odd pipeline; it is basically a VLIW masquerading as a plain
+architecture.  Its issue rules are not made public, and since it is so weird,
+it is very hard to figure out any useful information from experimentation.
+An example:
+
+  A well-aligned loop with nop's takes 3, 4, 6, 7, ... cycles.
+	3 cycles for 0, 1, 2, 3, 4, 5, 6, 7 nop's
+	4 cycles for 8, 9, 10, 11, 12, 13, 14, 15 nop's
+	6 cycles for 16, 17, 18, 19, 20, 21, 22, 23 nop's
+	7 cycles for 24, 25, 26, 27 nop's
+	8 cycles for 28, 29, 30, 31 nop's
+	... continues regularly
+
+
+Functional units:
+LS1  - ld/st unit 1
+LS2  - ld/st unit 2
+FXU1 - integer unit 1, handles any integer instruction
+FXU2 - integer unit 2, handles any integer instruction
+FPU1 - floating-point unit 1
+FPU2 - floating-point unit 2
+
+While this is one integer unit less than POWER3/PPC630, the remaining units
+are more powerful; here they handle multiply and divide.
+
+Memory:		2 ld/st.  Stores go to the L2 cache, which can sustain just
+		one store per cycle.
+		L1 load latency: to gregs 3-4 cycles, to fregs 5-6 cycles.
+		Operations that modify the address register might be split
+		to also use an integer issue slot.
+Simple integer:	2 operations every cycle, latency 2.
+Integer multiply: 2 operations every 6th cycle, latency 7 cycles.
+Integer divide:	?
+Floating-point:	Any 2 plain arithmetic instructions (such as fmul, fadd, and
+		fmadd), latency 6 cycles.
+Floating-point divide:
+		?
+Floating-point square root:
+		?
+
+
+IDEAS
+
+*mul_1: Handling one limb using mulld/mulhdu and two limbs using floating-
+point operations should give performance of about 20 cycles for 3 limbs, or 7
+cycles/limb.
+
+We should probably split the single-limb operand into 32-bit chunks, and the
+multi-limb operand into 16-bit chunks, allowing us to accumulate well in fp
+registers.
+
+The problem is to get 32-bit or 16-bit words to the fp registers.  Only 64-bit
+fp memops copy bits without fiddling with them.  We might therefore need to
+load to integer registers with zero extension, store as 64 bits into temp
+space, and then load to fp regs.  Alternatively, load directly to fp space
+and add well-chosen constants to get cancellation.  (The other part is then
+obtained by a subsequent subtraction.)
+
+Possible code mix for the load-via-intregs variant:
+
+lwz,std,lfd
+fmadd,fmadd,fmul,fmul
+fctidz,stfd,ld,fctidz,stfd,ld
+add,adde
+lwz,std,lfd
+fmadd,fmadd,fmul,fmul
+fctidz,stfd,ld,fctidz,stfd,ld
+add,adde
+srd,sld,add,adde,add,adde
diff --git a/gmp-6.3.0/mpn/powerpc64/aix.m4 b/gmp-6.3.0/mpn/powerpc64/aix.m4
new file mode 100644
index 0000000..04378b8
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/aix.m4
@@ -0,0 +1,99 @@
+divert(-1)
+dnl  m4 macros for AIX 64-bit assembly.
+
+dnl  Copyright 2000-2002, 2005, 2006, 2010, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +define(`AIX') + +define(`ASM_START', + `.machine "any" + .toc') + +dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,toc]) +dnl EPILOGUE_cpu(GSYM_PREFIX`'foo) +dnl +dnl Don't want ELF style .size in the epilogue. + +define(`PROLOGUE_cpu', +m4_assert_numargs_range(1,2) +`ifelse(`$2',toc,, +`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter')')')dnl + .globl $1 + .globl .$1 + .csect [DS], 3 +$1: + .llong .$1, TOC[tc0], 0 + .csect .$1[PR], 6 +.$1:') + +define(`EPILOGUE_cpu', +m4_assert_numargs(1) +`') + +define(`TOC_ENTRY', `') + +define(`LEA', +m4_assert_numargs(2) +`define(`TOC_ENTRY', +` .toc +..$2: .tc $2[TC], $2')' + `ld $1, ..$2(2)') + +define(`LEAL', +m4_assert_numargs(2) +`LEA($1,$2)') + + +define(`EXTERN', +m4_assert_numargs(1) +` .globl $1') + +define(`EXTERN_FUNC', +m4_assert_numargs(1) +` .globl .$1') + +define(`DEF_OBJECT', +m4_assert_numargs_range(1,2) +` .csect [RO], 3 + ALIGN(ifelse($#,1,2,$2)) +$1: +') + +define(`END_OBJECT', +m4_assert_numargs(1)) + +define(`CALL', + `bl .$1 + nop') + +define(`ASM_END', `TOC_ENTRY') + +undefine(`EXTRA_REGISTER') + +divert diff --git a/gmp-6.3.0/mpn/powerpc64/com.asm b/gmp-6.3.0/mpn/powerpc64/com.asm new file mode 100644 index 0000000..074b7ff --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/com.asm @@ -0,0 +1,136 @@ +dnl PowerPC-64 mpn_com. + +dnl Copyright 2004, 2005, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 1.25 +C POWER5 ? +C POWER6 1.32 +C POWER7 1.13 + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') + +ASM_START() +PROLOGUE(mpn_com) + +ifdef(`HAVE_ABI_mode32', +` rldicl n, n, 0,32') + + cmpdi cr0, n, 4 + blt L(sml) + + addi r10, n, 4 + srdi r10, r10, 3 + mtctr r10 + + andi. 
r0, n, 1 + rlwinm r11, n, 0,30,30 + rlwinm r12, n, 0,29,29 + cmpdi cr6, r11, 0 + cmpdi cr7, r12, 0 + + beq cr0, L(xx0) +L(xx1): ld r6, 0(up) + addi up, up, 8 + nor r6, r6, r6 + std r6, 0(rp) + addi rp, rp, 8 + +L(xx0): bne cr6, L(x10) +L(x00): ld r6, 0(r4) + ld r7, 8(r4) + bne cr7, L(100) +L(000): addi rp, rp, -32 + b L(lo0) +L(100): addi up, up, -32 + b L(lo4) +L(x10): ld r8, 0(r4) + ld r9, 8(r4) + bne cr7, L(110) +L(010): addi up, up, 16 + addi rp, rp, -16 + b L(lo2) +L(110): addi up, up, -16 + addi rp, rp, -48 + b L(lo6) + +L(sml): mtctr n +L(t): ld r6, 0(up) + addi up, up, 8 + nor r6, r6, r6 + std r6, 0(rp) + addi rp, rp, 8 + bdnz L(t) + blr + + ALIGN(32) +L(top): nor r6, r6, r6 + nor r7, r7, r7 + std r6, 0(rp) + std r7, 8(rp) +L(lo2): ld r6, 0(up) + ld r7, 8(up) + nor r8, r8, r8 + nor r9, r9, r9 + std r8, 16(rp) + std r9, 24(rp) +L(lo0): ld r8, 16(up) + ld r9, 24(up) + nor r6, r6, r6 + nor r7, r7, r7 + std r6, 32(rp) + std r7, 40(rp) +L(lo6): ld r6, 32(up) + ld r7, 40(up) + nor r8, r8, r8 + nor r9, r9, r9 + std r8, 48(rp) + std r9, 56(rp) + addi rp, rp, 64 +L(lo4): ld r8, 48(up) + ld r9, 56(up) + addi up, up, 64 + bdnz L(top) + +L(end): nor r6, r6, r6 + nor r7, r7, r7 + std r6, 0(rp) + std r7, 8(rp) + nor r8, r8, r8 + nor r9, r9, r9 + std r8, 16(rp) + std r9, 24(rp) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/copyd.asm b/gmp-6.3.0/mpn/powerpc64/copyd.asm new file mode 100644 index 0000000..c6ce930 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/copyd.asm @@ -0,0 +1,84 @@ +dnl PowerPC-64 mpn_copyd + +dnl Copyright 2004, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 1 +C POWER4/PPC970 1 +C POWER5 ? +C POWER6 ? +C POWER7 1.4 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C n r5 + +ASM_START() +PROLOGUE(mpn_copyd) + rldic. r0, r5, 3, 59 C r0 = (r5 & 3) << 3; cr0 = (n == 4t)? + cmpldi cr6, r0, 16 C cr6 = (n cmp 4t + 2)? + +ifdef(`HAVE_ABI_mode32', +` rldic r6, r5, 3, 32', C byte count corresponding to n +` rldicr r6, r5, 3, 60') C byte count corresponding to n + + addi r5, r5, 4 C compute... 
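+C (The ifdef just below finishes the loop-count setup: ctr gets
+C floor((n+4)/4), the number of trips through the 4-way unrolled loop,
+C while r0 = (n mod 4) * 8 biases up and rp so that the entry point
+C selected from n mod 4 copies exactly n limbs, high limbs first.)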
+ifdef(`HAVE_ABI_mode32', +` rldicl r5, r5, 62,34', C ...branch count +` rldicl r5, r5, 62, 2') C ...branch count + mtctr r5 + + add r4, r4, r6 + add r3, r3, r6 + sub r4, r4, r0 C offset up + sub r3, r3, r0 C offset rp + + beq cr0, L(L00) + blt cr6, L(L01) + beq cr6, L(L10) + b L(L11) + + ALIGN(16) +L(oop): ld r6, 24(r4) + std r6, 24(r3) +L(L11): ld r6, 16(r4) + std r6, 16(r3) +L(L10): ld r6, 8(r4) + std r6, 8(r3) +L(L01): ld r6, 0(r4) + std r6, 0(r3) +L(L00): addi r4, r4, -32 + addi r3, r3, -32 + bdnz L(oop) + + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/copyi.asm b/gmp-6.3.0/mpn/powerpc64/copyi.asm new file mode 100644 index 0000000..9a86cb2 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/copyi.asm @@ -0,0 +1,78 @@ +dnl PowerPC-64 mpn_copyi. + +dnl Copyright 2004, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 1 +C POWER4/PPC970 1 +C POWER5 ? +C POWER6 ? +C POWER7 1.4 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C n r5 + +ASM_START() +PROLOGUE(mpn_copyi) + rldic. r0, r5, 3, 59 C r0 = (r5 & 3) << 3; cr0 = (n == 4t)? + cmpldi cr6, r0, 16 C cr6 = (n cmp 4t + 2)? + + addi r5, r5, 4 C compute... +ifdef(`HAVE_ABI_mode32', +` rldicl r5, r5, 62,34', C ...branch count +` rldicl r5, r5, 62, 2') C ...branch count + mtctr r5 + + add r4, r4, r0 C offset up + add r3, r3, r0 C offset rp + + beq cr0, L(L00) + blt cr6, L(L01) + beq cr6, L(L10) + b L(L11) + + ALIGN(16) +L(oop): ld r6, -32(r4) + std r6, -32(r3) +L(L11): ld r6, -24(r4) + std r6, -24(r3) +L(L10): ld r6, -16(r4) + std r6, -16(r3) +L(L01): ld r6, -8(r4) + std r6, -8(r3) +L(L00): addi r4, r4, 32 + addi r3, r3, 32 + bdnz L(oop) + + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/darwin.m4 b/gmp-6.3.0/mpn/powerpc64/darwin.m4 new file mode 100644 index 0000000..2c995e7 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/darwin.m4 @@ -0,0 +1,122 @@ +divert(-1) +dnl m4 macros for Mac OS 64-bit assembly. + +dnl Copyright 2005, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +define(`DARWIN') + +define(`ASM_START',`') + +dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,toc]) +dnl EPILOGUE_cpu(GSYM_PREFIX`'foo) +dnl + +define(`PROLOGUE_cpu', +m4_assert_numargs_range(1,2) +`ifelse(`$2',toc,, +`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter')')')dnl + .text + .globl $1 + .align 5 +$1:') + +define(`lea_list', `') + +dnl LEAL -- Load Effective Address Local. This is to be used for symbols +dnl defined in the same file. It will not work for externally defined +dnl symbols. + +define(`LEAL', +m4_assert_numargs(2) +`ifdef(`PIC', +` + mflr r0 C save return address + bcl 20, 31, 1f +1: mflr $1 + addis $1, $1, ha16($2-1b) + la $1, lo16($2-1b)($1) + mtlr r0 C restore return address +',` + lis $1, ha16($2) + la $1, lo16($2)($1) +')') + +dnl LEA -- Load Effective Address. This is to be used for symbols defined in +dnl another file. It will not work for locally defined symbols. + +define(`LEA', +m4_assert_numargs(2) +`ifdef(`PIC', +`define(`lea_list', +` .non_lazy_symbol_pointer +`L'$2`'$non_lazy_ptr: + .indirect_symbol $2 + .quad 0 +') + mflr r0 C save return address + bcl 20, 31, 1f +1: mflr $1 + addis $1, $1, ha16(`L'$2`'$non_lazy_ptr-1b) + ld $1, lo16(`L'$2`'$non_lazy_ptr-1b)($1) + mtlr r0 C restore return address +',` + lis $1, ha16($2) + la $1, lo16($2)($1) +')') + +define(`EXTERN', +m4_assert_numargs(1) +`dnl') + +define(`EXTERN_FUNC', +m4_assert_numargs(1) +`dnl') + +define(`DEF_OBJECT', +m4_assert_numargs_range(1,2) +` .const + ALIGN(ifelse($#,1,2,$2)) +$1: +') + +define(`END_OBJECT', +m4_assert_numargs(1)) + +define(`CALL', + `bl GSYM_PREFIX`'$1') + +define(`EPILOGUE_cpu', +`lea_list' +`define(`lea_list', `')') + +define(`ASM_END', `dnl') + +define(`EXTRA_REGISTER', r2) + +divert diff --git a/gmp-6.3.0/mpn/powerpc64/elf.m4 b/gmp-6.3.0/mpn/powerpc64/elf.m4 new file mode 100644 index 0000000..ddb5a8e --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/elf.m4 @@ -0,0 +1,123 @@ +divert(-1) +dnl m4 macros for powerpc64 GNU/Linux assembly. + +dnl Copyright 2003, 2005, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +define(`ASM_START', +`ifdef(`ELFv2_ABI', +` + .abiversion 2 +')') + +dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,toc]) +dnl EPILOGUE_cpu(GSYM_PREFIX`'foo) +dnl + +define(`PROLOGUE_cpu', +m4_assert_numargs_range(1,2) +`ifelse(`$2',toc,, +`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter')')')dnl +ifdef(`ELFv2_ABI', +` + .globl $1 + .type $1, @function + .section ".text" + .align 5 +$1: +ifelse(`$2',toc,` +0: addis 2, 12, (.TOC.-0b)@ha + addi 2, 2, (.TOC.-0b)@l + .localentry $1, .-$1 +',) +',` + .globl $1 + .globl .$1 + .section ".opd","aw" + .align 3 +$1: + .llong .$1, .TOC.@tocbase, 0 + .size $1, 24 + .type .$1, @function + .section ".text" + .align 5 +.$1: +')') + +define(`EPILOGUE_cpu', +m4_assert_numargs(1) +`ifdef(`ELFv2_ABI',` + .size $1, .-$1 +',` + .size .$1, .-.$1 +')') + +define(`TOC_ENTRY', `') + +define(`LEA', +m4_assert_numargs(2) +`define(`TOC_ENTRY', +` .section ".toc", "aw" +..$2: .tc $2[TC], $2')' + `ld $1, ..$2@toc(2)') + +define(`LEAL', +m4_assert_numargs(2) +`LEA($1,$2)') + + +define(`EXTERN', +m4_assert_numargs(1) +`dnl') + +define(`EXTERN_FUNC', +m4_assert_numargs(1) +`dnl') + +define(`DEF_OBJECT', +m4_assert_numargs_range(1,2) +` + .section .rodata + ALIGN(ifelse($#,1,2,$2)) + .type $1, @object +$1: +') + +define(`END_OBJECT', +m4_assert_numargs(1) +` .size $1, .-$1') + +define(`CALL', + `bl GSYM_PREFIX`'$1 + nop') + +define(`ASM_END', `TOC_ENTRY') + +undefine(`EXTRA_REGISTER') + +divert diff --git a/gmp-6.3.0/mpn/powerpc64/logops_n.asm b/gmp-6.3.0/mpn/powerpc64/logops_n.asm new file mode 100644 index 0000000..2fa6985 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/logops_n.asm @@ -0,0 +1,151 @@ +dnl PowerPC-64 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n, +dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations. + +dnl Copyright 2003-2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 1.75 +C POWER4/PPC970 2.10 +C POWER5 ? +C POWER6 ? 
+C POWER7 1.75 + +C n POWER3/PPC630 POWER4/PPC970 +C 1 15.00 15.33 +C 2 7.50 7.99 +C 3 5.33 6.00 +C 4 4.50 4.74 +C 5 4.20 4.39 +C 6 3.50 3.99 +C 7 3.14 3.64 +C 8 3.00 3.36 +C 9 3.00 3.36 +C 10 2.70 3.25 +C 11 2.63 3.11 +C 12 2.58 3.00 +C 13 2.61 3.02 +C 14 2.42 2.82 +C 15 2.40 2.79 +C 50 2.08 2.67 +C 100 1.85 2.31 +C 200 1.80 2.18 +C 400 1.77 2.14 +C 1000 1.76 2.10# +C 2000 1.75# 2.13 +C 4000 2.30 2.57 +C 8000 2.62 2.58 +C 16000 2.52 4.25 +C 32000 2.49 16.25 +C 64000 2.66 18.76 + +ifdef(`OPERATION_and_n', +` define(`func',`mpn_and_n') + define(`logop', `and')') +ifdef(`OPERATION_andn_n', +` define(`func',`mpn_andn_n') + define(`logop', `andc')') +ifdef(`OPERATION_nand_n', +` define(`func',`mpn_nand_n') + define(`logop', `nand')') +ifdef(`OPERATION_ior_n', +` define(`func',`mpn_ior_n') + define(`logop', `or')') +ifdef(`OPERATION_iorn_n', +` define(`func',`mpn_iorn_n') + define(`logop', `orc')') +ifdef(`OPERATION_nior_n', +` define(`func',`mpn_nior_n') + define(`logop', `nor')') +ifdef(`OPERATION_xor_n', +` define(`func',`mpn_xor_n') + define(`logop', `xor')') +ifdef(`OPERATION_xnor_n', +` define(`func',`mpn_xnor_n') + define(`logop', `eqv')') + +C INPUT PARAMETERS +C rp r3 +C up r4 +C vp r5 +C n r6 + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +ASM_START() +PROLOGUE(func) + ld r8, 0(r4) C read lowest u limb + ld r9, 0(r5) C read lowest v limb + addi r6, r6, 3 C compute branch count (1) + rldic. r0, r6, 3, 59 C r0 = (n-1 & 3) << 3; cr0 = (n == 4(t+1))? + cmpldi cr6, r0, 16 C cr6 = (n cmp 4t + 3) + +ifdef(`HAVE_ABI_mode32', +` rldicl r6, r6, 62,34', C ...branch count +` rldicl r6, r6, 62, 2') C ...branch count + mtctr r6 + + ld r6, 0(r4) C read lowest u limb (again) + ld r7, 0(r5) C read lowest v limb (again) + + add r5, r5, r0 C offset vp + add r4, r4, r0 C offset up + add r3, r3, r0 C offset rp + + beq cr0, L(L01) + blt cr6, L(L10) + beq cr6, L(L11) + b L(L00) + +L(oop): ld r8, -24(r4) + ld r9, -24(r5) + logop r10, r6, r7 + std r10, -32(r3) +L(L00): ld r6, -16(r4) + ld r7, -16(r5) + logop r10, r8, r9 + std r10, -24(r3) +L(L11): ld r8, -8(r4) + ld r9, -8(r5) + logop r10, r6, r7 + std r10, -16(r3) +L(L10): ld r6, 0(r4) + ld r7, 0(r5) + logop r10, r8, r9 + std r10, -8(r3) +L(L01): addi r5, r5, 32 + addi r4, r4, 32 + addi r3, r3, 32 + bdnz L(oop) + + logop r10, r6, r7 + std r10, -32(r3) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/lshift.asm b/gmp-6.3.0/mpn/powerpc64/lshift.asm new file mode 100644 index 0000000..880944a --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/lshift.asm @@ -0,0 +1,207 @@ +dnl PowerPC-64 mpn_lshift -- rp[] = up[] << cnt + +dnl Copyright 2003, 2005, 2010, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 2.25 +C POWER6 9.75 +C POWER7 2.15 + +C TODO +C * Try to reduce the number of needed live registers +C * Micro-optimise header code +C * Keep in synch with rshift.asm and lshiftc.asm + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`cnt', `r6') + +define(`tnc',`r0') +define(`u0',`r30') +define(`u1',`r31') +define(`retval',`r5') + +ASM_START() +PROLOGUE(mpn_lshift) + std r31, -8(r1) + std r30, -16(r1) + subfic tnc, cnt, 64 + sldi r7, n, 3 C byte count corresponding to n + add up, up, r7 C up = up + n + add rp, rp, r7 C rp = rp + n + rldicl. r30, n, 0,62 C r30 = n & 3, set cr0 + cmpdi cr6, r30, 2 + addi r31, n, 3 C compute count... + ld r10, -8(up) C load 1st limb for b00...b11 + srd retval, r10, tnc +ifdef(`HAVE_ABI_mode32', +` rldicl r31, r31, 62,34', C ...branch count +` srdi r31, r31, 2') C ...for ctr + mtctr r31 C copy count into ctr + beq cr0, L(b00) + blt cr6, L(b01) + ld r11, -16(up) C load 2nd limb for b10 and b11 + beq cr6, L(b10) + + ALIGN(16) +L(b11): sld r8, r10, cnt + srd r9, r11, tnc + ld u1, -24(up) + addi up, up, -24 + sld r12, r11, cnt + srd r7, u1, tnc + addi rp, rp, 16 + bdnz L(gt3) + + or r11, r8, r9 + sld r8, u1, cnt + b L(cj3) + + ALIGN(16) +L(gt3): ld u0, -8(up) + or r11, r8, r9 + sld r8, u1, cnt + srd r9, u0, tnc + ld u1, -16(up) + or r10, r12, r7 + b L(L11) + + ALIGN(32) +L(b10): sld r12, r10, cnt + addi rp, rp, 24 + srd r7, r11, tnc + bdnz L(gt2) + + sld r8, r11, cnt + or r10, r12, r7 + b L(cj2) + +L(gt2): ld u0, -24(up) + sld r8, r11, cnt + srd r9, u0, tnc + ld u1, -32(up) + or r10, r12, r7 + sld r12, u0, cnt + srd r7, u1, tnc + ld u0, -40(up) + or r11, r8, r9 + addi up, up, -16 + b L(L10) + + ALIGN(16) +L(b00): ld u1, -16(up) + sld r12, r10, cnt + srd r7, u1, tnc + ld u0, -24(up) + sld r8, u1, cnt + srd r9, u0, tnc + ld u1, -32(up) + or r10, r12, r7 + sld r12, u0, cnt + srd r7, u1, tnc + addi rp, rp, 8 + bdz L(cj4) + +L(gt4): addi up, up, -32 + ld u0, -8(up) + or r11, r8, r9 + b L(L00) + + ALIGN(16) +L(b01): bdnz L(gt1) + sld r8, r10, cnt + std r8, -8(rp) + b L(ret) + +L(gt1): ld u0, -16(up) + sld r8, r10, cnt + srd r9, u0, tnc + ld u1, -24(up) + sld r12, u0, cnt + srd r7, u1, tnc + ld u0, -32(up) + or r11, r8, r9 + sld r8, u1, cnt + srd r9, u0, tnc + ld u1, -40(up) + addi up, up, -40 + or r10, r12, r7 + bdz L(end) + + ALIGN(32) +L(top): sld r12, u0, cnt + srd r7, u1, tnc + ld u0, -8(up) + std r11, -8(rp) + or r11, r8, r9 +L(L00): sld r8, u1, cnt + srd r9, u0, tnc + ld u1, -16(up) + std r10, -16(rp) + or r10, r12, r7 +L(L11): sld r12, u0, cnt + srd r7, u1, tnc + ld u0, -24(up) + std r11, -24(rp) + or r11, r8, r9 +L(L10): sld r8, u1, cnt + srd r9, u0, tnc + ld u1, -32(up) + addi up, up, -32 + std r10, -32(rp) + addi rp, rp, -32 + or r10, r12, r7 + bdnz L(top) + + ALIGN(32) +L(end): sld r12, u0, cnt + srd r7, u1, tnc + std r11, -8(rp) +L(cj4): or r11, r8, r9 + sld r8, u1, cnt + std r10, -16(rp) +L(cj3): or r10, r12, r7 + std r11, -24(rp) +L(cj2): std r10, -32(rp) + std r8, -40(rp) + +L(ret): ld r31, -8(r1) + ld r30, -16(r1) +ifdef(`HAVE_ABI_mode32', +` srdi r3, retval, 32 + mr r4, retval +',` mr r3, retval') + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/lshiftc.asm 
b/gmp-6.3.0/mpn/powerpc64/lshiftc.asm new file mode 100644 index 0000000..7cf6a83 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/lshiftc.asm @@ -0,0 +1,210 @@ +dnl PowerPC-64 mpn_lshiftc -- rp[] = ~up[] << cnt + +dnl Copyright 2003, 2005, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 2.25 +C POWER6 9.5 +C POWER7 2.15 + +C TODO +C * Try to reduce the number of needed live registers +C * Micro-optimise header code +C * Keep in synch with lshift.asm and rshift.asm +C * Could the long-scheduled std insns be less scheduled? + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`cnt', `r6') + +define(`tnc',`r0') +define(`u0',`r30') +define(`u1',`r31') +define(`retval',`r5') + +ASM_START() +PROLOGUE(mpn_lshiftc) + std r31, -8(r1) + std r30, -16(r1) + subfic tnc, cnt, 64 + sldi r7, n, 3 C byte count corresponding to n + add up, up, r7 C up = up + n + add rp, rp, r7 C rp = rp + n + rldicl. r30, n, 0,62 C r30 = n & 3, set cr0 + cmpdi cr6, r30, 2 + addi r31, n, 3 C compute count... 
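+C (At this point r31 = n+3; the srdi below turns it into floor((n+3)/4),
+C the number of trips through the 4-way unrolled loop.  For reference,
+C the function computes rp[i] = ~((up[i] << cnt) | (up[i-1] >> (64-cnt))),
+C with up[-1] taken as 0, and returns up[n-1] >> (64-cnt) uncomplemented.)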
+ ld r10, -8(up) C load 1st limb for b00...b11 + srd retval, r10, tnc + srdi r31, r31, 2 C ...for ctr + mtctr r31 C copy count into ctr + beq cr0, L(b00) + blt cr6, L(b01) + ld r11, -16(up) C load 2nd limb for b10 and b11 + beq cr6, L(b10) + + ALIGN(16) +L(b11): sld r8, r10, cnt + srd r9, r11, tnc + ld u1, -24(up) + addi up, up, -24 + sld r12, r11, cnt + srd r7, u1, tnc + addi rp, rp, 16 + bdnz L(gt3) + + nor r11, r8, r9 + sld r8, u1, cnt + nor r8, r8, r8 + b L(cj3) + + ALIGN(16) +L(gt3): ld u0, -8(up) + nor r11, r8, r9 + sld r8, u1, cnt + srd r9, u0, tnc + ld u1, -16(up) + nor r10, r12, r7 + b L(L11) + + ALIGN(32) +L(b10): sld r12, r10, cnt + addi rp, rp, 24 + srd r7, r11, tnc + bdnz L(gt2) + + sld r8, r11, cnt + nor r10, r12, r7 + nor r8, r8, r8 + b L(cj2) + +L(gt2): ld u0, -24(up) + sld r8, r11, cnt + srd r9, u0, tnc + ld u1, -32(up) + nor r10, r12, r7 + sld r12, u0, cnt + srd r7, u1, tnc + ld u0, -40(up) + nor r11, r8, r9 + addi up, up, -16 + b L(L10) + + ALIGN(16) +L(b00): ld u1, -16(up) + sld r12, r10, cnt + srd r7, u1, tnc + ld u0, -24(up) + sld r8, u1, cnt + srd r9, u0, tnc + ld u1, -32(up) + nor r10, r12, r7 + sld r12, u0, cnt + srd r7, u1, tnc + addi rp, rp, 8 + bdz L(cj4) + +L(gt4): addi up, up, -32 + ld u0, -8(up) + nor r11, r8, r9 + b L(L00) + + ALIGN(16) +L(b01): bdnz L(gt1) + sld r8, r10, cnt + nor r8, r8, r8 + std r8, -8(rp) + b L(ret) + +L(gt1): ld u0, -16(up) + sld r8, r10, cnt + srd r9, u0, tnc + ld u1, -24(up) + sld r12, u0, cnt + srd r7, u1, tnc + ld u0, -32(up) + nor r11, r8, r9 + sld r8, u1, cnt + srd r9, u0, tnc + ld u1, -40(up) + addi up, up, -40 + nor r10, r12, r7 + bdz L(end) + + ALIGN(32) +L(top): sld r12, u0, cnt + srd r7, u1, tnc + ld u0, -8(up) + std r11, -8(rp) + nor r11, r8, r9 +L(L00): sld r8, u1, cnt + srd r9, u0, tnc + ld u1, -16(up) + std r10, -16(rp) + nor r10, r12, r7 +L(L11): sld r12, u0, cnt + srd r7, u1, tnc + ld u0, -24(up) + std r11, -24(rp) + nor r11, r8, r9 +L(L10): sld r8, u1, cnt + srd r9, u0, tnc + ld u1, -32(up) + addi up, up, -32 + std r10, -32(rp) + addi rp, rp, -32 + nor r10, r12, r7 + bdnz L(top) + + ALIGN(32) +L(end): sld r12, u0, cnt + srd r7, u1, tnc + std r11, -8(rp) +L(cj4): nor r11, r8, r9 + sld r8, u1, cnt + std r10, -16(rp) + nor r8, r8, r8 +L(cj3): nor r10, r12, r7 + std r11, -24(rp) +L(cj2): std r10, -32(rp) + std r8, -40(rp) + +L(ret): ld r31, -8(r1) + ld r30, -16(r1) +ifdef(`HAVE_ABI_mode32', +` srdi r3, retval, 32 + mr r4, retval +',` mr r3, retval') + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode32/add_n.asm b/gmp-6.3.0/mpn/powerpc64/mode32/add_n.asm new file mode 100644 index 0000000..1da8087 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode32/add_n.asm @@ -0,0 +1,86 @@ +dnl PowerPC-64/mode32 mpn_add_n -- Add two limb vectors of the same length > 0 +dnl and store sum in a third limb vector. + +dnl Copyright 1999-2001, 2003, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630: ? +C POWER4/PPC970: 4.25 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C vp r5 +C n r6 + +ASM_START() +PROLOGUE(mpn_add_n) + mtctr r6 C copy size into CTR + addic r0, r0, 0 C clear cy + ld r8, 0(r4) C load least significant s1 limb + ld r0, 0(r5) C load least significant s2 limb + addi r3, r3, -8 C offset res_ptr, it's updated before it's used + bdz L(end) C If done, skip loop + +L(oop): ld r9, 8(r4) C load s1 limb + ld r10, 8(r5) C load s2 limb + adde r7, r0, r8 C add limbs with cy, set cy + srdi r6, r0, 32 + srdi r11, r8, 32 + adde r6, r6, r11 C add high limb parts, set cy + std r7, 8(r3) C store result limb + bdz L(exit) C decrement CTR and exit if done + ldu r8, 16(r4) C load s1 limb and update s1_ptr + ldu r0, 16(r5) C load s2 limb and update s2_ptr + adde r7, r10, r9 C add limbs with cy, set cy + srdi r6, r10, 32 + srdi r11, r9, 32 + adde r6, r6, r11 C add high limb parts, set cy + stdu r7, 16(r3) C store result limb and update res_ptr + bdnz L(oop) C decrement CTR and loop back + +L(end): adde r7, r0, r8 + srdi r6, r0, 32 + srdi r11, r8, 32 + adde r6, r6, r11 C add limbs with cy, set cy + std r7, 8(r3) C store ultimate result limb + li r3, 0 C load cy into ... + addze r4, r3 C ... return value register + blr +L(exit): adde r7, r10, r9 + srdi r6, r10, 32 + srdi r11, r9, 32 + adde r6, r6, r11 C add limbs with cy, set cy + std r7, 16(r3) + li r3, 0 C load cy into ... + addze r4, r3 C ... return value register + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode32/addmul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode32/addmul_1.asm new file mode 100644 index 0000000..bdc3951 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode32/addmul_1.asm @@ -0,0 +1,79 @@ +dnl PowerPC-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add +dnl the result to a second limb vector. + +dnl Copyright 1999-2001, 2003, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630: ? 
+C POWER4/PPC970: 12.5 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C n r5 +C v r6,r7 or r7,r8 + +ASM_START() +PROLOGUE(mpn_addmul_1) + +ifdef(`BROKEN_LONGLONG_PARAM', +` rldimi r8, r7, 32,0 C assemble vlimb from separate 32-bit arguments + mr r6, r8 +',` + rldimi r7, r6, 32,0 C assemble vlimb from separate 32-bit arguments + mr r6, r7 +') + li r7, 0 C cy_limb = 0 + mtctr r5 + addic r0, r0, 0 + addi r3, r3, -8 + addi r4, r4, -8 + +L(oop): ldu r0, 8(r4) + mulld r9, r0, r6 + adde r12, r9, r7 C add old high limb and new low limb + srdi r5, r9, 32 + srdi r11, r7, 32 + adde r5, r5, r11 C add high limb parts, set cy + mulhdu r7, r0, r6 + addze r7, r7 + ld r10, 8(r3) + addc r9, r12, r10 + srdi r5, r12, 32 + srdi r11, r10, 32 + adde r5, r5, r11 C add high limb parts, set cy + stdu r9, 8(r3) + bdnz L(oop) + + addze r4, r7 + srdi r3, r4, 32 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode32/mul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode32/mul_1.asm new file mode 100644 index 0000000..3a17e98 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode32/mul_1.asm @@ -0,0 +1,73 @@ +dnl PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and add +dnl the result to a second limb vector. + +dnl Copyright 1999-2001, 2003, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630: ? +C POWER4/PPC970: 10 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C n r5 +C v r6,r7 or r7,r8 + +ASM_START() +PROLOGUE(mpn_mul_1) + +ifdef(`BROKEN_LONGLONG_PARAM', +` rldimi r8, r7, 32,0 C assemble vlimb from separate 32-bit arguments + mr r6, r8 +',` + rldimi r7, r6, 32,0 C assemble vlimb from separate 32-bit arguments + mr r6, r7 +') + li r7, 0 C cy_limb = 0 + mtctr r5 + addic r0, r0, 0 + addi r3, r3, -8 + addi r4, r4, -8 + +L(oop): ldu r0, 8(r4) + mulld r9, r0, r6 + adde r12, r9, r7 C add old high limb and new low limb + srdi r5, r9, 32 + srdi r11, r7, 32 + adde r5, r5, r11 C add high limb parts, set cy + mulhdu r7, r0, r6 + stdu r12, 8(r3) + bdnz L(oop) + + addze r4, r7 + srdi r3, r4, 32 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode32/p4/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode32/p4/gmp-mparam.h new file mode 100644 index 0000000..4e805a0 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode32/p4/gmp-mparam.h @@ -0,0 +1,182 @@ +/* PowerPC-64 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2008, 2009, 2011, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* 1800 MHz PPC970 */ +/* FFT tuning limit = 0.5 M */ +/* Generated by tuneup.c, 2017-01-01, gcc 4.0 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 1 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 6 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 46 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 15 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD 2 +#define DIV_QR_1_UNNORM_THRESHOLD 2 +#define DIV_QR_2_PI2_THRESHOLD 15 +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 88 + +#define DIV_1_VS_MUL_1_PERCENT 269 + +#define MUL_TOOM22_THRESHOLD 18 +#define MUL_TOOM33_THRESHOLD 60 +#define MUL_TOOM44_THRESHOLD 88 +#define MUL_TOOM6H_THRESHOLD 124 +#define MUL_TOOM8H_THRESHOLD 187 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 61 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 61 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 60 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 74 + +#define SQR_BASECASE_THRESHOLD 4 +#define SQR_TOOM2_THRESHOLD 28 +#define SQR_TOOM3_THRESHOLD 90 +#define SQR_TOOM4_THRESHOLD 143 +#define SQR_TOOM6_THRESHOLD 181 +#define SQR_TOOM8_THRESHOLD 272 + +#define MULMID_TOOM42_THRESHOLD 34 + +#define MULMOD_BNM1_THRESHOLD 10 +#define SQRMOD_BNM1_THRESHOLD 15 + +#define MUL_FFT_MODF_THRESHOLD 252 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 252, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \ + { 7, 5}, { 15, 6}, { 13, 5}, { 27, 6}, \ + { 15, 7}, { 8, 6}, { 17, 7}, { 9, 6}, \ + { 19, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \ + { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ + { 21, 9}, { 11, 8}, { 27,10}, { 7, 9}, \ + { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \ + { 23, 8}, { 47, 9}, { 27,10}, { 15, 9}, \ + { 39,10}, { 23, 9}, { 47,11}, { 15,10}, \ + { 31, 9}, { 67,10}, { 39, 9}, { 83,10}, \ + { 47, 9}, { 95, 8}, { 191,10}, { 55,11}, \ + { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \ + { 71, 9}, { 143, 8}, { 287,10}, { 79, 9}, \ + { 159, 8}, { 319,11}, { 47,10}, { 95, 9}, \ + { 191, 8}, { 383,10}, { 103,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \ + { 143, 9}, { 287,11}, { 79,10}, { 159, 9}, \ + { 319, 8}, { 639,10}, { 175, 9}, { 351, 8}, \ + { 703,11}, { 95,10}, { 191, 9}, { 383, 8}, \ + { 767,10}, { 207, 9}, { 415,10}, { 223, 9}, \ + { 447,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,11}, { 143,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319, 9}, { 639,11}, { 175,10}, \ + { 351, 
9}, { 703,12}, { 95,11}, { 191,10}, \ + { 383, 9}, { 767,11}, { 207,10}, { 415,11}, \ + { 223,10}, { 447,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 105 +#define MUL_FFT_THRESHOLD 5248 + +#define SQR_FFT_MODF_THRESHOLD 236 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 236, 5}, { 13, 6}, { 15, 7}, { 8, 6}, \ + { 17, 7}, { 9, 6}, { 19, 7}, { 17, 8}, \ + { 9, 7}, { 20, 8}, { 11, 7}, { 24, 8}, \ + { 13, 9}, { 7, 8}, { 19, 9}, { 11, 8}, \ + { 25,10}, { 7, 9}, { 15, 8}, { 33, 9}, \ + { 19, 8}, { 39, 9}, { 23, 8}, { 47, 9}, \ + { 27,10}, { 15, 9}, { 39,10}, { 23, 9}, \ + { 47,11}, { 15,10}, { 31, 9}, { 67,10}, \ + { 39, 9}, { 79, 8}, { 159,10}, { 47, 9}, \ + { 95, 8}, { 191,11}, { 31,10}, { 63, 9}, \ + { 127, 8}, { 255,10}, { 71, 9}, { 143, 8}, \ + { 287,10}, { 79, 9}, { 159, 8}, { 319,11}, \ + { 47,10}, { 95, 9}, { 191, 8}, { 383,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511,10}, { 143, 9}, { 287, 8}, { 575,11}, \ + { 79,10}, { 159, 9}, { 319, 8}, { 639,10}, \ + { 175, 9}, { 351, 8}, { 703,11}, { 95,10}, \ + { 191, 9}, { 383, 8}, { 767,10}, { 207, 9}, \ + { 415,10}, { 223,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \ + { 575,11}, { 159,10}, { 319, 9}, { 639,11}, \ + { 175,10}, { 351, 9}, { 703,11}, { 191,10}, \ + { 383, 9}, { 767,11}, { 207,10}, { 415,11}, \ + { 223,10}, { 447,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 97 +#define SQR_FFT_THRESHOLD 3200 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 56 +#define MULLO_MUL_N_THRESHOLD 8648 +#define SQRLO_BASECASE_THRESHOLD 2 +#define SQRLO_DC_THRESHOLD 106 +#define SQRLO_SQR_THRESHOLD 6293 + +#define DC_DIV_QR_THRESHOLD 28 +#define DC_DIVAPPR_Q_THRESHOLD 102 +#define DC_BDIV_QR_THRESHOLD 51 +#define DC_BDIV_Q_THRESHOLD 124 + +#define INV_MULMOD_BNM1_THRESHOLD 34 +#define INV_NEWTON_THRESHOLD 123 +#define INV_APPR_THRESHOLD 109 + +#define BINV_NEWTON_THRESHOLD 206 +#define REDC_1_TO_REDC_N_THRESHOLD 51 + +#define MU_DIV_QR_THRESHOLD 807 +#define MU_DIVAPPR_Q_THRESHOLD 807 +#define MUPI_DIV_QR_THRESHOLD 53 +#define MU_BDIV_QR_THRESHOLD 748 +#define MU_BDIV_Q_THRESHOLD 872 + +#define POWM_SEC_TABLE 2,23,66,440,1555 + +#define GET_STR_DC_THRESHOLD 7 +#define GET_STR_PRECOMPUTE_THRESHOLD 17 +#define SET_STR_DC_THRESHOLD 1035 +#define SET_STR_PRECOMPUTE_THRESHOLD 2170 + +#define FAC_DSC_THRESHOLD 542 +#define FAC_ODD_THRESHOLD 24 + +#define MATRIX22_STRASSEN_THRESHOLD 10 +#define HGCD_THRESHOLD 108 +#define HGCD_APPR_THRESHOLD 116 +#define HGCD_REDUCE_THRESHOLD 1437 +#define GCD_DC_THRESHOLD 268 +#define GCDEXT_DC_THRESHOLD 241 +#define JACOBI_BASE_METHOD 4 diff --git a/gmp-6.3.0/mpn/powerpc64/mode32/sqr_diagonal.asm b/gmp-6.3.0/mpn/powerpc64/mode32/sqr_diagonal.asm new file mode 100644 index 0000000..ff5f4b3 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode32/sqr_diagonal.asm @@ -0,0 +1,117 @@ +dnl PowerPC-64 mpn_sqr_diagonal. + +dnl Copyright 2001-2003, 2005, 2006, 20010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 18 +C POWER4/PPC970 ? +C POWER5 7.25 +C POWER6 9.5 + +C INPUT PARAMETERS +define(`rp', r3) +define(`up', r4) +define(`n', r5) + +ASM_START() +PROLOGUE(mpn_sqr_diagonal) +ifdef(`HAVE_ABI_mode32', +` rldicl n, n, 0, 32') C zero extend n + + rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 + addi n, n, 3 C compute count... + cmpdi cr6, r0, 2 + srdi n, n, 2 C ...for ctr + mtctr n C copy count into ctr + beq cr0, L(b00) + blt cr6, L(b01) + beq cr6, L(b10) + +L(b11): ld r0, 0(up) + ld r10, 8(up) + ld r12, 16(up) + addi rp, rp, -16 + mulld r7, r0, r0 + mulhdu r8, r0, r0 + mulld r9, r10, r10 + mulhdu r10, r10, r10 + mulld r11, r12, r12 + mulhdu r12, r12, r12 + addi up, up, 24 + b L(11) + + ALIGN(16) +L(b01): ld r0, 0(up) + addi rp, rp, -48 + addi up, up, 8 + mulld r11, r0, r0 + mulhdu r12, r0, r0 + b L(01) + + ALIGN(16) +L(b10): ld r0, 0(up) + ld r12, 8(up) + addi rp, rp, -32 + addi up, up, 16 + mulld r9, r0, r0 + mulhdu r10, r0, r0 + mulld r11, r12, r12 + mulhdu r12, r12, r12 + b L(10) + + ALIGN(32) +L(b00): +L(top): ld r0, 0(up) + ld r8, 8(up) + ld r10, 16(up) + ld r12, 24(up) + mulld r5, r0, r0 + mulhdu r6, r0, r0 + mulld r7, r8, r8 + mulhdu r8, r8, r8 + mulld r9, r10, r10 + mulhdu r10, r10, r10 + mulld r11, r12, r12 + mulhdu r12, r12, r12 + addi up, up, 32 + std r5, 0(rp) + std r6, 8(rp) +L(11): std r7, 16(rp) + std r8, 24(rp) +L(10): std r9, 32(rp) + std r10, 40(rp) +L(01): std r11, 48(rp) + std r12, 56(rp) + addi rp, rp, 64 + bdnz L(top) + + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode32/sub_n.asm b/gmp-6.3.0/mpn/powerpc64/mode32/sub_n.asm new file mode 100644 index 0000000..6fdc1d4 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode32/sub_n.asm @@ -0,0 +1,88 @@ +dnl PowerPC-64/mode32 mpn_sub_n -- Subtract two limb vectors of the same +dnl length and store difference in a third limb vector. + +dnl Copyright 1999-2001, 2003, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630: ? +C POWER4/PPC970: 4.25 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C vp r5 +C n r6 + +ASM_START() +PROLOGUE(mpn_sub_n) + mtctr r6 C copy size into CTR + addic r0, r6, -1 C set cy + ld r8, 0(r4) C load least significant s1 limb + ld r0, 0(r5) C load least significant s2 limb + addi r3, r3, -8 C offset res_ptr, it's updated before it's used + bdz L(end) C If done, skip loop + +L(oop): ld r9, 8(r4) C load s1 limb + ld r10, 8(r5) C load s2 limb + subfe r7, r0, r8 C subtract limbs with cy, set cy + srdi r6, r0, 32 + srdi r11, r8, 32 + subfe r6, r6, r11 + std r7, 8(r3) C store result limb + bdz L(exit) C decrement CTR and exit if done + ldu r8, 16(r4) C load s1 limb and update s1_ptr + ldu r0, 16(r5) C load s2 limb and update s2_ptr + subfe r7, r10, r9 C subtract limbs with cy, set cy + srdi r6, r10, 32 + srdi r11, r9, 32 + subfe r6, r6, r11 + stdu r7, 16(r3) C store result limb and update res_ptr + bdnz L(oop) C decrement CTR and loop back + +L(end): subfe r7, r0, r8 + srdi r6, r0, 32 + srdi r11, r8, 32 + subfe r6, r6, r11 + std r7, 8(r3) C store ultimate result limb + subfe r3, r0, r0 C load !cy into ... + subfic r4, r3, 0 C ... return value register + li r3, 0 C zero extend return value + blr +L(exit): subfe r7, r10, r9 + srdi r6, r10, 32 + srdi r11, r9, 32 + subfe r6, r6, r11 + std r7, 16(r3) + subfe r3, r0, r0 C load !cy into ... + subfic r4, r3, 0 C ... return value register + li r3, 0 C zero extend return value + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode32/submul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode32/submul_1.asm new file mode 100644 index 0000000..996eda2 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode32/submul_1.asm @@ -0,0 +1,81 @@ +dnl PowerPC-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract +dnl the result from a second limb vector. + +dnl Copyright 1999-2001, 2003, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630: ? 
+C POWER4/PPC970: 16 + +C INPUT PARAMETERS +C rp r3 +C up r4 +C n r5 +C v r6,r7 or r7,r8 + +ASM_START() +PROLOGUE(mpn_submul_1) + +ifdef(`BROKEN_LONGLONG_PARAM', +` rldimi r8, r7, 32,0 C assemble vlimb from separate 32-bit arguments + mr r6, r8 +',` + rldimi r7, r6, 32,0 C assemble vlimb from separate 32-bit arguments + mr r6, r7 +') + li r7, 0 C cy_limb = 0 + mtctr r5 + addic r0, r0, 0 + addi r3, r3, -8 + addi r4, r4, -8 + +L(oop): ldu r0, 8(r4) + mulld r9, r0, r6 + adde r12, r9, r7 C add old high limb and new low limb + srdi r5, r9, 32 + srdi r11, r7, 32 + adde r5, r5, r11 C add high limb parts, set cy + mulhdu r7, r0, r6 + addze r7, r7 + ld r10, 8(r3) + subfc r9, r12, r10 + srdi r5, r12, 32 + srdi r11, r10, 32 + subfe r5, r5, r11 C subtract high limb parts, set cy + stdu r9, 8(r3) + subfe r11, r11, r11 C invert ... + addic r11, r11, 1 C ... carry + bdnz L(oop) + + addze r4, r7 + srdi r3, r4, 32 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/aors_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/aors_n.asm new file mode 100644 index 0000000..0e8474f --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/aors_n.asm @@ -0,0 +1,189 @@ +dnl PowerPC-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction. + +dnl Copyright 1999-2001, 2003-2005, 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 1.5 +C POWER4/PPC970 2 +C POWER5 2 +C POWER6 2.63 +C POWER7 2.25-2.87 + +C This code is a little bit slower for POWER3/PPC630 than the simple code used +C previously, but it is much faster for POWER4/PPC970. The reason for the +C POWER3/PPC630 slowdown can be attributed to the saving and restoring of 4 +C registers. + +C INPUT PARAMETERS +C rp r3 +C up r4 +C vp r5 +C n r6 + +ifdef(`OPERATION_add_n',` + define(ADDSUBC, adde) + define(ADDSUB, addc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc) + define(GENRVAL, `addi r3, r3, 1') + define(SETCBR, `addic r0, $1, -1') + define(CLRCB, `addic r0, r0, 0') +') +ifdef(`OPERATION_sub_n',` + define(ADDSUBC, subfe) + define(ADDSUB, subfc) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc) + define(GENRVAL, `neg r3, r3') + define(SETCBR, `subfic r0, $1, 0') + define(CLRCB, `addic r0, r1, -1') +') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ASM_START() +PROLOGUE(func_nc) + SETCBR(r7) + b L(ent) +EPILOGUE() + +PROLOGUE(func) + CLRCB +L(ent): std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + + rldicl. 
r0, r6, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi r6, r6, 3 C compute count... + srdi r6, r6, 2 C ...for ctr + mtctr r6 C copy count into ctr + beq cr0, L(b00) + blt cr6, L(b01) + beq cr6, L(b10) + +L(b11): ld r8, 0(r4) C load s1 limb + ld r9, 0(r5) C load s2 limb + ld r10, 8(r4) C load s1 limb + ld r11, 8(r5) C load s2 limb + ld r12, 16(r4) C load s1 limb + addi r4, r4, 24 + ld r0, 16(r5) C load s2 limb + addi r5, r5, 24 + ADDSUBC r29, r9, r8 + ADDSUBC r30, r11, r10 + ADDSUBC r31, r0, r12 + std r29, 0(r3) + std r30, 8(r3) + std r31, 16(r3) + addi r3, r3, 24 + bdnz L(go) + b L(ret) + +L(b01): ld r12, 0(r4) C load s1 limb + addi r4, r4, 8 + ld r0, 0(r5) C load s2 limb + addi r5, r5, 8 + ADDSUBC r31, r0, r12 C add + std r31, 0(r3) + addi r3, r3, 8 + bdnz L(go) + b L(ret) + +L(b10): ld r10, 0(r4) C load s1 limb + ld r11, 0(r5) C load s2 limb + ld r12, 8(r4) C load s1 limb + addi r4, r4, 16 + ld r0, 8(r5) C load s2 limb + addi r5, r5, 16 + ADDSUBC r30, r11, r10 C add + ADDSUBC r31, r0, r12 C add + std r30, 0(r3) + std r31, 8(r3) + addi r3, r3, 16 + bdnz L(go) + b L(ret) + +L(b00): C INITCY C clear/set cy +L(go): ld r6, 0(r4) C load s1 limb + ld r7, 0(r5) C load s2 limb + ld r8, 8(r4) C load s1 limb + ld r9, 8(r5) C load s2 limb + ld r10, 16(r4) C load s1 limb + ld r11, 16(r5) C load s2 limb + ld r12, 24(r4) C load s1 limb + ld r0, 24(r5) C load s2 limb + bdz L(end) + + addi r4, r4, 32 + addi r5, r5, 32 + + ALIGN(16) +L(top): ADDSUBC r28, r7, r6 + ld r6, 0(r4) C load s1 limb + ld r7, 0(r5) C load s2 limb + ADDSUBC r29, r9, r8 + ld r8, 8(r4) C load s1 limb + ld r9, 8(r5) C load s2 limb + ADDSUBC r30, r11, r10 + ld r10, 16(r4) C load s1 limb + ld r11, 16(r5) C load s2 limb + ADDSUBC r31, r0, r12 + ld r12, 24(r4) C load s1 limb + ld r0, 24(r5) C load s2 limb + std r28, 0(r3) + addi r4, r4, 32 + std r29, 8(r3) + addi r5, r5, 32 + std r30, 16(r3) + std r31, 24(r3) + addi r3, r3, 32 + bdnz L(top) C decrement ctr and loop back + +L(end): ADDSUBC r28, r7, r6 + ADDSUBC r29, r9, r8 + ADDSUBC r30, r11, r10 + ADDSUBC r31, r0, r12 + std r28, 0(r3) + std r29, 8(r3) + std r30, 16(r3) + std r31, 24(r3) + +L(ret): ld r31, -8(r1) + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + + subfe r3, r0, r0 C -cy + GENRVAL + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/aorsmul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/aorsmul_1.asm new file mode 100644 index 0000000..0c12f9b --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/aorsmul_1.asm @@ -0,0 +1,225 @@ +dnl PowerPC-64 mpn_addmul_1 and mpn_submul_1. + +dnl Copyright 1999-2001, 2003-2006, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C mpn_addmul_1 mpn_submul_1 +C cycles/limb cycles/limb +C POWER3/PPC630 6-18 6-18 +C POWER4/PPC970 8 8.3 +C POWER5 8 8.25 +C POWER6 16.25 16.75 +C POWER7 3.77 4.9 + +C TODO +C * Try to reduce the number of needed live registers +C * Add support for _1c entry points + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`vl', `r6') + +ifdef(`OPERATION_addmul_1',` + define(ADDSUBC, adde) + define(ADDSUB, addc) + define(func, mpn_addmul_1) + define(func_nc, mpn_addmul_1c) C FIXME: not really supported + define(SM, `') +') +ifdef(`OPERATION_submul_1',` + define(ADDSUBC, subfe) + define(ADDSUB, subfc) + define(func, mpn_submul_1) + define(func_nc, mpn_submul_1c) C FIXME: not really supported + define(SM, `$1') +') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() +PROLOGUE(func) + std r31, -8(r1) + rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 + std r30, -16(r1) + cmpdi cr6, r0, 2 + std r29, -24(r1) + addi n, n, 3 C compute count... + std r28, -32(r1) + srdi n, n, 2 C ...for ctr + std r27, -40(r1) + mtctr n C copy count into ctr + beq cr0, L(b00) + blt cr6, L(b01) + beq cr6, L(b10) + +L(b11): ld r9, 0(up) + ld r28, 0(rp) + mulld r0, r9, r6 + mulhdu r12, r9, r6 + ADDSUB r0, r0, r28 + std r0, 0(rp) + addi rp, rp, 8 + ld r9, 8(up) + ld r27, 16(up) + addi up, up, 24 +SM(` subfe r11, r11, r11 ') + b L(bot) + + ALIGN(16) +L(b00): ld r9, 0(up) + ld r27, 8(up) + ld r28, 0(rp) + ld r29, 8(rp) + mulld r0, r9, r6 + mulhdu r5, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + addc r7, r7, r5 + addze r12, r8 + ADDSUB r0, r0, r28 + std r0, 0(rp) + ADDSUBC r7, r7, r29 + std r7, 8(rp) + addi rp, rp, 16 + ld r9, 16(up) + ld r27, 24(up) + addi up, up, 32 +SM(` subfe r11, r11, r11 ') + b L(bot) + + ALIGN(16) +L(b01): bdnz L(gt1) + ld r9, 0(up) + ld r11, 0(rp) + mulld r0, r9, r6 + mulhdu r8, r9, r6 + ADDSUB r0, r0, r11 + std r0, 0(rp) +SM(` subfe r11, r11, r11 ') +SM(` addic r11, r11, 1 ') + addze r3, r8 + blr +L(gt1): ld r9, 0(up) + ld r27, 8(up) + mulld r0, r9, r6 + mulhdu r5, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 16(up) + ld r28, 0(rp) + ld r29, 8(rp) + ld r30, 16(rp) + mulld r11, r9, r6 + mulhdu r10, r9, r6 + addc r7, r7, r5 + adde r11, r11, r8 + addze r12, r10 + ADDSUB r0, r0, r28 + std r0, 0(rp) + ADDSUBC r7, r7, r29 + std r7, 8(rp) + ADDSUBC r11, r11, r30 + std r11, 16(rp) + addi rp, rp, 24 + ld r9, 24(up) + ld r27, 32(up) + addi up, up, 40 +SM(` subfe r11, r11, r11 ') + b L(bot) + +L(b10): addic r0, r0, 0 + li r12, 0 C cy_limb = 0 + ld r9, 0(up) + ld r27, 8(up) + bdz L(end) + addi up, up, 16 + + ALIGN(16) +L(top): mulld r0, r9, r6 + mulhdu r5, r9, r6 C 9 + mulld r7, r27, r6 + mulhdu r8, r27, r6 C 27 + ld r9, 0(up) + ld r28, 0(rp) + ld r27, 8(up) + ld r29, 8(rp) + adde r0, r0, r12 C 0 12 + adde r7, r7, r5 C 5 7 + mulld r5, r9, r6 + mulhdu r10, r9, r6 C 9 + mulld r11, r27, r6 + mulhdu r12, r27, r6 C 27 + ld r9, 16(up) + ld r30, 16(rp) + ld r27, 24(up) + ld r31, 24(rp) + adde r5, r5, r8 C 8 5 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + ADDSUB r0, r0, r28 C 0 28 + std r0, 0(rp) C 0 + ADDSUBC r7, r7, r29 C 7 29 + std r7, 8(rp) C 7 + ADDSUBC r5, r5, r30 C 5 30 + std r5, 16(rp) C 5 + ADDSUBC r11, r11, r31 C 11 31 + std r11, 24(rp) C 11 + addi up, up, 32 +SM(` subfe r11, r11, r11 ') + addi rp, rp, 32 +L(bot): +SM(` 
addic r11, r11, 1 ') + bdnz L(top) + +L(end): mulld r0, r9, r6 + mulhdu r5, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r28, 0(rp) + ld r29, 8(rp) + adde r0, r0, r12 + adde r7, r7, r5 + addze r8, r8 + ADDSUB r0, r0, r28 + std r0, 0(rp) + ADDSUBC r7, r7, r29 + std r7, 8(rp) +SM(` subfe r11, r11, r11 ') +SM(` addic r11, r11, 1 ') + addze r3, r8 + ld r31, -8(r1) + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh1_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh1_n.asm new file mode 100644 index 0000000..2c5400a --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh1_n.asm @@ -0,0 +1,43 @@ +dnl PowerPC-64 mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n. + +dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH, 1) +define(RSH, 63) + +ifdef(`OPERATION_addlsh1_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n) + +include_mpn(`powerpc64/mode64/aorsorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh2_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh2_n.asm new file mode 100644 index 0000000..447791a --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlsh2_n.asm @@ -0,0 +1,43 @@ +dnl PowerPC-64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n. + +dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH, 2) +define(RSH, 62) + +ifdef(`OPERATION_addlsh2_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n) + +include_mpn(`powerpc64/mode64/aorsorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlshC_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlshC_n.asm new file mode 100644 index 0000000..6158f54 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/aorsorrlshC_n.asm @@ -0,0 +1,187 @@ +dnl PowerPC-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n. + +dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +C cycles/limb +C POWER3/PPC630 1.83 (1.5 c/l should be possible) +C POWER4/PPC970 3 (2.0 c/l should be possible) +C POWER5 3 +C POWER6 3.5-47 +C POWER7 3 + +C STATUS +C * Try combining upx+up, and vpx+vp. +C * The worst case 47 c/l for POWER6 happens if the 3rd operand for ldx is +C greater than the 2nd operand. Yes, this addition is non-commutative wrt +C performance. 
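
In C terms, the operation implemented by these lsh1/lsh2 routines is the following. This is a minimal sketch assuming 64-bit limbs; the name addlshC_n_ref and the fixed LSH value are illustrative only, and the assembly keeps the carries in the carry flag instead of recovering them with comparisons:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;
    #define LSH 1              /* compile-time shift count, 1 or 2 above */
    #define RSH (64 - LSH)

    /* rp[] = up[] + (vp[] << LSH); returns the bits shifted and carried
       out at the top, as RETVAL does with addze. */
    mp_limb_t addlshC_n_ref(mp_limb_t *rp, const mp_limb_t *up,
                            const mp_limb_t *vp, size_t n)
    {
        mp_limb_t shift_in = 0, cy = 0;
        for (size_t i = 0; i < n; i++) {
            mp_limb_t s = (vp[i] << LSH) | shift_in;  /* cf. rldimi */
            shift_in = vp[i] >> RSH;                  /* cf. srdi */
            mp_limb_t r = up[i] + s;
            mp_limb_t c1 = r < up[i];                 /* carry out of add */
            rp[i] = r + cy;
            cy = c1 + (rp[i] < r);                    /* at most one is set */
        }
        return shift_in + cy;
    }

The sub and rsb variants differ only in taking u - (v << LSH) or (v << LSH) - u per limb and in how the final borrow is folded into the return value.
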
+ +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`vp', `r5') +define(`n', `r6') + +ifdef(`DO_add', ` + define(`ADDSUBC', `addc $1, $2, $3') + define(`ADDSUBE', `adde $1, $2, $3') + define(INITCY, `addic $1, r1, 0') + define(RETVAL, `addze r3, $1') + define(`func', mpn_addlsh`'LSH`'_n)') +ifdef(`DO_sub', ` + define(`ADDSUBC', `subfc $1, $2, $3') + define(`ADDSUBE', `subfe $1, $2, $3') + define(INITCY, `addic $1, r1, -1') + define(RETVAL, `subfze r3, $1 + neg r3, r3') + define(`func', mpn_sublsh`'LSH`'_n)') +ifdef(`DO_rsb', ` + define(`ADDSUBC', `subfc $1, $3, $2') + define(`ADDSUBE', `subfe $1, $3, $2') + define(INITCY, `addic $1, r1, -1') + define(RETVAL, `addme r3, $1') + define(`func', mpn_rsblsh`'LSH`'_n)') + +define(`rpx', `r6') +define(`upx', `r7') +define(`vpx', `r12') + +define(`s0', `r0') define(`s1', `r9') +define(`u0', `r8') +define(`v0', `r10') define(`v1', `r11') + + +ASM_START() +PROLOGUE(func) + cmpldi cr0, n, 13 + bgt L(big) + + mtctr n C copy n in ctr + INITCY( r0) C clear cy + + ld v0, 0(vp) C load v limb + ld u0, 0(up) C load u limb + addi up, up, -8 C update up + addi rp, rp, -8 C update rp + sldi s1, v0, LSH + bdz L(ex1) C If done, skip loop + + ALIGN(16) +L(lo0): ld v1, 8(vp) C load v limb + ADDSUBE(s1, s1, u0) C add limbs with cy, set cy + ldu u0, 16(up) C load u limb and update up + srdi s0, v0, RSH C shift down previous v limb + std s1, 8(rp) C store result limb + rldimi s0, v1, LSH, 0 C left shift v limb and merge with prev v limb + bdz L(ex0) C decrement ctr and exit if done + ldu v0, 16(vp) C load v limb and update vp + ADDSUBE(s0, s0, u0) C add limbs with cy, set cy + ld u0, 8(up) C load u limb + srdi s1, v1, RSH C shift down previous v limb + stdu s0, 16(rp) C store result limb and update rp + rldimi s1, v0, LSH, 0 C left shift v limb and merge with prev v limb + bdnz L(lo0) C decrement ctr and loop back + +L(ex1): ADDSUBE(r7, s1, u0) + std r7, 8(rp) C store last result limb + srdi r0, v0, RSH + RETVAL( r0) + blr +L(ex0): ADDSUBE(r7, s0, u0) + std r7, 16(rp) C store last result limb + srdi r0, v1, RSH + RETVAL( r0) + blr + + +L(big): rldicl. 
r0, n, 0,63 C r0 = n & 1, set cr0 + addi r6, n, -1 C ...for ctr + srdi r6, r6, 1 C ...for ctr + mtctr r6 C copy count into ctr + beq cr0, L(b0) + +L(b1): ld v1, 0(vp) + ld u0, 0(up) + sldi s1, v1, LSH + srdi s0, v1, RSH + ld v0, 8(vp) + ADDSUBC(s1, s1, u0) C add limbs without cy, set cy + addi rpx, rp, -16 + addi rp, rp, -8 + sub upx, up, rp + sub vpx, vp, rp + sub up, up, rpx + sub vp, vp, rpx + addi up, up, 8 + addi upx, upx, 16 + addi vp, vp, 16 + addi vpx, vpx, 24 + b L(mid) + +L(b0): ld v0, 0(vp) + ld u0, 0(up) + sldi s0, v0, LSH + srdi s1, v0, RSH + ld v1, 8(vp) + ADDSUBC(s0, s0, u0) C add limbs without cy, set cy + addi rpx, rp, -8 + addi rp, rp, -16 + sub upx, up, rpx + sub vpx, vp, rpx + sub up, up, rp + sub vp, vp, rp + addi up, up, 8 + addi upx, upx, 16 + addi vp, vp, 16 + addi vpx, vpx, 24 + + ALIGN(32) +L(top): ldx u0, rp, up + ldx v0, rp, vp + rldimi s1, v1, LSH, 0 + stdu s0, 16(rp) + srdi s0, v1, RSH + ADDSUBE(s1, s1, u0) C add limbs with cy, set cy +L(mid): ldx u0, rpx, upx + ldx v1, rpx, vpx + rldimi s0, v0, LSH, 0 + stdu s1, 16(rpx) + srdi s1, v0, RSH + ADDSUBE(s0, s0, u0) C add limbs with cy, set cy + bdnz L(top) C decrement CTR and loop back + + ldx u0, rp, up + rldimi s1, v1, LSH, 0 + std s0, 16(rp) + srdi s0, v1, RSH + ADDSUBE(s1, s1, u0) C add limbs with cy, set cy + std s1, 24(rp) + + RETVAL( s0) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/powerpc64/mode64/bdiv_dbm1c.asm new file mode 100644 index 0000000..45cded9 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/bdiv_dbm1c.asm @@ -0,0 +1,132 @@ +dnl PPC64 mpn_bdiv_dbm1c. + +dnl Copyright 2008, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 6-18 +C POWER4/PPC970 8.25 +C POWER5 8.5 fluctuating as function of n % 3 +C POWER6 15 +C POWER7 4.75 + +C TODO +C * Nothing to do... + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`bd', `r6') +define(`cy', `r7') + +ASM_START() +PROLOGUE(mpn_bdiv_dbm1c) + ld r0, 0(r4) + + rldicl. 
r12, r5, 0,62 + cmpldi cr6, r12, 2 + cmpldi cr7, r5, 4 + addi r5, r5, 1 + srwi r5, r5, 2 + mtctr r5 + beq cr0, L(b00) + blt cr6, L(b01) + beq cr6, L(b10) + + ALIGN(16) +L(b11): mulld r5, r0, r6 + mulhdu r12, r0, r6 + ld r0, 8(r4) + addi r4, r4, -24 + addi r3, r3, -24 + b L(3) + + ALIGN(16) +L(b00): mulld r9, r0, r6 + mulhdu r8, r0, r6 + addi r4, r4, -16 + addi r3, r3, -16 + b L(0) + + ALIGN(16) +L(b01): mulld r5, r0, r6 + mulhdu r12, r0, r6 + addi r3, r3, -8 + ble cr7, L(e1) + ld r0, 8(r4) + addi r4, r4, -8 + b L(1) + + ALIGN(16) +L(b10): mulld r9, r0, r6 + mulhdu r8, r0, r6 + ble cr7, L(e2) + + ALIGN(16) +L(top): subfc r11, r9, r7 + ld r10, 8(r4) + ld r0, 16(r4) + subfe r7, r8, r11 + std r11, 0(r3) + mulld r5, r10, r6 + mulhdu r12, r10, r6 +L(1): mulld r9, r0, r6 + mulhdu r8, r0, r6 + subfc r11, r5, r7 + subfe r7, r12, r11 + std r11, 8(r3) +L(0): subfc r11, r9, r7 + ld r10, 24(r4) + ld r0, 32(r4) + subfe r7, r8, r11 + std r11, 16(r3) + mulld r5, r10, r6 + mulhdu r12, r10, r6 +L(3): mulld r9, r0, r6 + mulhdu r8, r0, r6 + subfc r11, r5, r7 + subfe r7, r12, r11 + std r11, 24(r3) + addi r4, r4, 32 + addi r3, r3, 32 + bdnz L(top) + +L(e2): ld r10, 8(r4) + mulld r5, r10, r6 + mulhdu r12, r10, r6 + subfc r11, r9, r7 + subfe r7, r8, r11 + std r11, 0(r3) +L(e1): subfc r11, r5, r7 + std r11, 8(r3) + subfe r3, r12, r11 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/bdiv_q_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/bdiv_q_1.asm new file mode 100644 index 0000000..307aafc --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/bdiv_q_1.asm @@ -0,0 +1,146 @@ +dnl PowerPC-64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb +dnl divisor. + +dnl Copyright 2006, 2010, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
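
The division performed by the routine that follows is exact (Hensel) division: for odd d, a limb di with di*d == 1 (mod 2^64) is built from a byte-table lookup (binvert_limb_table) plus Newton steps, after which each quotient limb costs one mulld and one mulhdu. A C sketch of both parts, assuming a compiler with unsigned __int128; the seed (3*d)^2 (5 correct bits, hence four doubling steps) stands in here for the 128-entry table the assembly reads, and the even-divisor pre-shift (the cnt/tnc registers) is omitted:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    static mp_limb_t binvert_limb_ref(mp_limb_t d)   /* d odd */
    {
        mp_limb_t v = (3 * d) ^ 2;     /* 5 low bits of 1/d correct */
        v = 2 * v - v * v * d;         /* 10 bits */
        v = 2 * v - v * v * d;         /* 20 bits */
        v = 2 * v - v * v * d;         /* 40 bits */
        v = 2 * v - v * v * d;         /* 64 bits: v*d == 1 mod 2^64 */
        return v;
    }

    /* Exact quotient {qp,n} = {up,n} / d for odd d dividing {up,n},
       mirroring the normalized-case loops below. */
    static void bdiv_q_1_ref(mp_limb_t *qp, const mp_limb_t *up,
                             size_t n, mp_limb_t d)
    {
        mp_limb_t di = binvert_limb_ref(d), b = 0;
        for (size_t i = 0; i < n; i++) {
            mp_limb_t s = up[i] - b;
            b = s > up[i];                            /* borrow */
            qp[i] = s * di;                           /* quotient limb */
            b += (mp_limb_t)(((unsigned __int128)qp[i] * d) >> 64);
        }
    }

Each Newton step v = v*(2 - v*d) doubles the number of correct low bits of the inverse, which is why three steps suffice from an 8-bit table but four are needed from the 5-bit seed above.
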
+ +include(`../config.m4') + +C cycles/limb +C norm unorm +C POWER3/PPC630 13-19 +C POWER4/PPC970 16 +C POWER5 16 16 +C POWER6 37 46 +C POWER7 12 12 +C POWER8 12 12 + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`d', `r6') +define(`di', `r7') +define(`cnt',`r8') + +define(`tnc',`r10') + +ASM_START() + +EXTERN(binvert_limb_table) + +PROLOGUE(mpn_bdiv_q_1,toc) + addi r7, n, -1 + cmpdi cr1, n, 1 + ld r12, 0(up) + li cnt, 0 + neg r0, d + and r0, d, r0 + cntlzd r0, r0 + subfic cnt, r0, 63 + srd d, d, cnt +L(7): + mtctr r7 + LEA( r10, binvert_limb_table) + rldicl r11, d, 63, 57 + lbzx r0, r10, r11 + mulld r9, r0, r0 + sldi r0, r0, 1 + mulld r9, d, r9 + subf r0, r9, r0 + mulld r10, r0, r0 + sldi r0, r0, 1 + mulld r10, d, r10 + subf r0, r10, r0 + mulld r9, r0, r0 + sldi r0, r0, 1 + mulld r9, d, r9 + subf di, r9, r0 C di = 1/d mod 2^64 +ifdef(`AIX', +` C For AIX it is not clear how to jump into another function. + b .mpn_pi1_bdiv_q_1 +',` + C For non-AIX, dispatch into the pi1 variant. + bne cr0, L(norm) + b L(unorm) +') +EPILOGUE() + +PROLOGUE(mpn_pi1_bdiv_q_1) + cmpdi cr0, cnt, 0 + ld r12, 0(up) + addic r0, n, -1 C set carry as side effect + cmpdi cr1, n, 1 + mtctr r0 + beq cr0, L(norm) + +L(unorm): + subfic tnc, cnt, 64 C set carry as side effect + li r5, 0 + srd r11, r12, cnt + beq cr1, L(ed1) + + ALIGN(16) +L(tpu): ld r12, 8(up) + nop + addi up, up, 8 + sld r0, r12, tnc + or r11, r11, r0 + subfe r9, r5, r11 + srd r11, r12, cnt + mulld r0, di, r9 + mulhdu r5, r0, d + std r0, 0(rp) + addi rp, rp, 8 + bdnz L(tpu) + + subfe r11, r5, r11 +L(ed1): mulld r0, di, r11 + std r0, 0(rp) + blr + + ALIGN(16) +L(norm): + mulld r11, r12, di + mulhdu r5, r11, d + std r11, 0(rp) + beqlr cr1 + + ALIGN(16) +L(tpn): ld r9, 8(up) + addi up, up, 8 + subfe r5, r5, r9 + mulld r11, di, r5 + mulhdu r5, r11, d C result not used in last iteration + std r11, 8(rp) + addi rp, rp, 8 + bdnz L(tpn) + + blr +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/cnd_aors_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/cnd_aors_n.asm new file mode 100644 index 0000000..24968c1 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/cnd_aors_n.asm @@ -0,0 +1,196 @@ +dnl PowerPC-64 mpn_cnd_add_n/mpn_cnd_sub_n. + +dnl Copyright 1999-2001, 2003-2005, 2007, 2011, 2012 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 2.25 +C POWER5 ? 
+C POWER6 3 +C POWER7 2 + +C INPUT PARAMETERS +define(`cnd', `r3') +define(`rp', `r4') +define(`up', `r5') +define(`vp', `r6') +define(`n', `r7') + +ifdef(`OPERATION_cnd_add_n',` + define(ADDSUBC, adde) + define(ADDSUB, addc) + define(func, mpn_cnd_add_n) + define(GENRVAL, `addi r3, r3, 1') + define(SETCBR, `addic r0, $1, -1') + define(CLRCB, `addic r0, r0, 0') +') +ifdef(`OPERATION_cnd_sub_n',` + define(ADDSUBC, subfe) + define(ADDSUB, subfc) + define(func, mpn_cnd_sub_n) + define(GENRVAL, `neg r3, r3') + define(SETCBR, `subfic r0, $1, 0') + define(CLRCB, `addic r0, r1, -1') +') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ASM_START() +PROLOGUE(func) + std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + std r27, -40(r1) + + subfic cnd, cnd, 0 + subfe cnd, cnd, cnd + + rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi n, n, 3 C compute count... + srdi n, n, 2 C ...for ctr + mtctr n C copy count into ctr + beq cr0, L(b00) + blt cr6, L(b01) + beq cr6, L(b10) + +L(b11): ld r8, 0(up) C load s1 limb + ld r9, 0(vp) C load s2 limb + ld r10, 8(up) C load s1 limb + ld r11, 8(vp) C load s2 limb + ld r12, 16(up) C load s1 limb + addi up, up, 24 + ld r0, 16(vp) C load s2 limb + addi vp, vp, 24 + and r9, r9, cnd + and r11, r11, cnd + and r0, r0, cnd + ADDSUB r29, r9, r8 + ADDSUBC r30, r11, r10 + ADDSUBC r31, r0, r12 + std r29, 0(rp) + std r30, 8(rp) + std r31, 16(rp) + addi rp, rp, 24 + bdnz L(go) + b L(ret) + +L(b01): ld r12, 0(up) C load s1 limb + addi up, up, 8 + ld r0, 0(vp) C load s2 limb + addi vp, vp, 8 + and r0, r0, cnd + ADDSUB r31, r0, r12 C add + std r31, 0(rp) + addi rp, rp, 8 + bdnz L(go) + b L(ret) + +L(b10): ld r10, 0(up) C load s1 limb + ld r11, 0(vp) C load s2 limb + ld r12, 8(up) C load s1 limb + addi up, up, 16 + ld r0, 8(vp) C load s2 limb + addi vp, vp, 16 + and r11, r11, cnd + and r0, r0, cnd + ADDSUB r30, r11, r10 C add + ADDSUBC r31, r0, r12 C add + std r30, 0(rp) + std r31, 8(rp) + addi rp, rp, 16 + bdnz L(go) + b L(ret) + +L(b00): CLRCB C clear/set cy +L(go): ld r7, 0(up) C load s1 limb + ld r27, 0(vp) C load s2 limb + ld r8, 8(up) C load s1 limb + ld r9, 8(vp) C load s2 limb + ld r10, 16(up) C load s1 limb + ld r11, 16(vp) C load s2 limb + ld r12, 24(up) C load s1 limb + ld r0, 24(vp) C load s2 limb + and r27, r27, cnd + and r9, r9, cnd + and r11, r11, cnd + and r0, r0, cnd + bdz L(end) + + addi up, up, 32 + addi vp, vp, 32 + +L(top): ADDSUBC r28, r27, r7 + ld r7, 0(up) C load s1 limb + ld r27, 0(vp) C load s2 limb + ADDSUBC r29, r9, r8 + ld r8, 8(up) C load s1 limb + ld r9, 8(vp) C load s2 limb + ADDSUBC r30, r11, r10 + ld r10, 16(up) C load s1 limb + ld r11, 16(vp) C load s2 limb + ADDSUBC r31, r0, r12 + ld r12, 24(up) C load s1 limb + ld r0, 24(vp) C load s2 limb + std r28, 0(rp) + addi up, up, 32 + std r29, 8(rp) + addi vp, vp, 32 + std r30, 16(rp) + std r31, 24(rp) + addi rp, rp, 32 + and r27, r27, cnd + and r9, r9, cnd + and r11, r11, cnd + and r0, r0, cnd + bdnz L(top) C decrement ctr and loop back + +L(end): ADDSUBC r28, r27, r7 + ADDSUBC r29, r9, r8 + ADDSUBC r30, r11, r10 + ADDSUBC r31, r0, r12 + std r28, 0(rp) + std r29, 8(rp) + std r30, 16(rp) + std r31, 24(rp) + +L(ret): ld r31, -8(r1) + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + + subfe r3, r0, r0 C -cy + GENRVAL + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/dive_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/dive_1.asm new file mode 100644 index 0000000..c2d10bd --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/dive_1.asm @@ 
-0,0 +1,135 @@ +dnl PowerPC-64 mpn_divexact_1 -- mpn by limb exact division. + +dnl Copyright 2006, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C norm unorm +C POWER3/PPC630 13-19 +C POWER4/PPC970 16 +C POWER5 16 16 +C POWER6 37 46 +C POWER7 12 12 +C POWER8 12 12 + +C TODO +C * Check if n=1 code is really an improvement. It probably isn't. +C * Make more similar to mode1o.asm. + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`d', `r6') + + +ASM_START() + +EXTERN(binvert_limb_table) + +PROLOGUE(mpn_divexact_1,toc) + addic. n, n, -1 + ld r12, 0(up) + bne cr0, L(2) + divdu r0, r12, d + std r0, 0(rp) + blr +L(2): + rldicl. r0, d, 0, 63 + li r10, 0 + bne cr0, L(7) + neg r0, d + and r0, d, r0 + cntlzd r0, r0 + subfic r0, r0, 63 + rldicl r10, r0, 0, 32 + srd d, d, r0 +L(7): + mtctr n + LEA( r5, binvert_limb_table) + rldicl r11, d, 63, 57 + lbzx r0, r5, r11 + mulld r9, r0, r0 + sldi r0, r0, 1 + mulld r9, d, r9 + subf r0, r9, r0 + mulld r5, r0, r0 + sldi r0, r0, 1 + mulld r5, d, r5 + subf r0, r5, r0 + mulld r9, r0, r0 + sldi r0, r0, 1 + mulld r9, d, r9 + subf r7, r9, r0 C r7 = 1/d mod 2^64 + + bne cr0, L(norm) + subfic r8, r10, 64 C set carry as side effect + li r5, 0 + srd r11, r12, r10 + + ALIGN(16) +L(loop0): + ld r12, 8(up) + nop + addi up, up, 8 + sld r0, r12, r8 + or r11, r11, r0 + subfe r9, r5, r11 + srd r11, r12, r10 + mulld r0, r7, r9 + mulhdu r5, r0, d + std r0, 0(rp) + addi rp, rp, 8 + bdnz L(loop0) + + subfe r0, r5, r11 + mulld r0, r7, r0 + std r0, 0(rp) + blr + + ALIGN(16) +L(norm): + mulld r11, r12, r7 + mulhdu r5, r11, d + std r11, 0(rp) + ALIGN(16) +L(loop1): + ld r9, 8(up) + addi up, up, 8 + subfe r5, r5, r9 + mulld r11, r7, r5 + mulhdu r5, r11, d C result not used in last iteration + std r11, 8(rp) + addi rp, rp, 8 + bdnz L(loop1) + + blr +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/divrem_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/divrem_1.asm new file mode 100644 index 0000000..b283877 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/divrem_1.asm @@ -0,0 +1,274 @@ +dnl PowerPC-64 mpn_divrem_1 -- Divide an mpn number by an unnormalized limb. + +dnl Copyright 2003-2005, 2007, 2008, 2010, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C norm unorm frac +C POWER3/PPC630 16-34 16-34 ~11 outdated figures +C POWER4/PPC970 28 28 19 +C POWER5 29 29 ~19 +C POWER6 49 59 ~42 +C POWER7 24.5 23 ~14 + +C INPUT PARAMETERS +C qp = r3 +C fn = r4 +C up = r5 +C un = r6 +C d = r7 + +C We use a not very predictable branch in the frac code, therefore the cycle +C count wobbles somewhat. With the alternative branch-free code, things run +C considerably slower on POWER4/PPC970 and POWER5. + +C Add preinv entry point. + + +ASM_START() + +EXTERN_FUNC(mpn_invert_limb) + +PROLOGUE(mpn_divrem_1,toc) + + mfcr r12 + add. r10, r6, r4 + std r25, -56(r1) + mr r25, r4 + mflr r0 + std r26, -48(r1) + mr r26, r5 + std r28, -32(r1) + mr r28, r6 + std r29, -24(r1) + mr r29, r3 + li r3, 0 + std r30, -16(r1) + mr r30, r7 + std r31, -8(r1) + li r31, 0 + std r27, -40(r1) + std r0, 16(r1) + stw r12, 8(r1) + stdu r1, -176(r1) + beq- cr0, L(1) + cmpdi cr7, r7, 0 + sldi r0, r10, 3 + add r11, r0, r29 + addi r29, r11, -8 + blt- cr7, L(162) + cmpdi cr4, r6, 0 + beq+ cr4, L(71) +L(163): + sldi r9, r6, 3 + add r9, r9, r5 + ld r7, -8(r9) + cmpld cr7, r7, r30 + bge- cr7, L(71) + cmpdi cr7, r10, 1 + li r0, 0 + mr r31, r7 + std r0, -8(r11) + addi r29, r29, -8 + mr r3, r7 + beq- cr7, L(1) + addi r28, r6, -1 + cmpdi cr4, r28, 0 +L(71): + cntlzd r27, r30 + sld r30, r30, r27 + sld r31, r31, r27 + mr r3, r30 + CALL( mpn_invert_limb) + beq- cr4, L(110) + sldi r9, r28, 3 + addic. r6, r28, -2 + add r9, r9, r26 + subfic r5, r27, 64 + ld r8, -8(r9) + srd r0, r8, r5 + or r31, r31, r0 + sld r7, r8, r27 + blt- cr0, L(154) + addi r28, r28, -1 + mtctr r28 + sldi r6, r6, 3 + ALIGN(16) +L(uloop): + ldx r8, r26, r6 + nop + mulld r0, r31, r3 + mulhdu r10, r31, r3 + addi r11, r31, 1 + srd r9, r8, r5 + addi r6, r6, -8 + or r9, r7, r9 + addc r0, r0, r9 + adde r10, r10, r11 + mulld r31, r10, r30 + subf r31, r31, r9 + subfc r0, r31, r0 C r <= ql + subfe r0, r0, r0 C r0 = -(r <= ql) + and r9, r30, r0 + add r31, r31, r9 + add r10, r0, r10 C qh -= (r >= ql) + cmpld cr7, r31, r30 + bge- cr7, L(164) +L(123): + std r10, 0(r29) + addi r29, r29, -8 + sld r7, r8, r27 + bdnz L(uloop) +L(154): + addi r11, r31, 1 + nop + mulld r0, r31, r3 + mulhdu r8, r31, r3 + addc r0, r0, r7 + adde r8, r8, r11 + mulld r31, r8, r30 + subf r31, r31, r7 + subfc r0, r0, r31 C r >= ql + subfe r0, r0, r0 C r0 = -(r >= ql) + not r7, r0 + add r8, r7, r8 C qh -= (r >= ql) + andc r0, r30, r0 + add r31, r31, r0 + cmpld cr7, r31, r30 + bge- cr7, L(165) +L(134): + std r8, 0(r29) + addi r29, r29, -8 +L(110): + addic. 
r0, r25, -1
+ blt- cr0, L(156)
+ mtctr r25
+ neg r9, r30
+ ALIGN(16)
+L(ufloop):
+ addi r11, r31, 1
+ nop
+ mulld r0, r3, r31
+ mulhdu r10, r3, r31
+ add r10, r10, r11
+ mulld r31, r9, r10
+ifelse(0,1,`
+ subfc r0, r0, r31
+ subfe r0, r0, r0 C r0 = -(r >= ql)
+ not r7, r0
+ add r10, r7, r10 C qh -= (r >= ql)
+ andc r0, r30, r0
+ add r31, r31, r0
+',`
+ cmpld cr7, r31, r0
+ blt cr7, L(29)
+ add r31, r30, r31
+ addi r10, r10, -1
+L(29):
+')
+ std r10, 0(r29)
+ addi r29, r29, -8
+ bdnz L(ufloop)
+L(156):
+ srd r3, r31, r27
+L(1):
+ addi r1, r1, 176
+ ld r0, 16(r1)
+ lwz r12, 8(r1)
+ mtlr r0
+ ld r25, -56(r1)
+ ld r26, -48(r1)
+ mtcrf 8, r12
+ ld r27, -40(r1)
+ ld r28, -32(r1)
+ ld r29, -24(r1)
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ blr
+L(162):
+ cmpdi cr7, r6, 0
+ beq- cr7, L(8)
+ sldi r9, r6, 3
+ addi r29, r29, -8
+ add r9, r9, r5
+ addi r28, r6, -1
+ ld r31, -8(r9)
+ subfc r9, r7, r31
+ li r9, 0
+ adde r9, r9, r9
+ neg r0, r9
+ std r9, -8(r11)
+ and r0, r0, r7
+ subf r31, r0, r31
+L(8):
+ mr r3, r30
+ CALL( mpn_invert_limb)
+ li r27, 0
+ addic. r6, r28, -1
+ blt- cr0, L(110)
+ mtctr r28
+ sldi r6, r6, 3
+ ALIGN(16)
+L(nloop):
+ addi r11, r31, 1
+ ldx r8, r26, r6
+ mulld r0, r31, r3
+ mulhdu r10, r31, r3
+ addi r6, r6, -8
+ addc r0, r0, r8
+ adde r10, r10, r11
+ mulld r31, r10, r30
+ subf r31, r31, r8 C r = nl - qh * d
+ subfc r0, r31, r0 C r <= ql
+ subfe r0, r0, r0 C r0 = -(r <= ql)
+ and r9, r30, r0
+ add r31, r31, r9
+ add r10, r0, r10 C qh -= (r >= ql)
+ cmpld cr7, r31, r30
+ bge- cr7, L(167)
+L(51):
+ std r10, 0(r29)
+ addi r29, r29, -8
+ bdnz L(nloop)
+ b L(110)
+
+L(164):
+ subf r31, r30, r31
+ addi r10, r10, 1
+ b L(123)
+L(167):
+ subf r31, r30, r31
+ addi r10, r10, 1
+ b L(51)
+L(165):
+ subf r31, r30, r31
+ addi r8, r8, 1
+ b L(134)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/divrem_2.asm b/gmp-6.3.0/mpn/powerpc64/mode64/divrem_2.asm
new file mode 100644
index 0000000..752c3d6
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc64/mode64/divrem_2.asm
@@ -0,0 +1,187 @@
+dnl PPC-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl Copyright 2007, 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C norm frac
+C POWER3/PPC630
+C POWER4/PPC970 ? ?
+C POWER5 37 ?
+C POWER6 62 ?
+C POWER7 30.5 ?
+
+C INPUT PARAMETERS
+C qp = r3
+C fn = r4
+C up = r5
+C un = r6
+C dp = r7
+
+
+ifdef(`DARWIN',,`
+define(`r2',`r31')') C FIXME!
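
The loop below is built around a 3-by-2 division step in the style of Möller and Granlund, driven by a reciprocal of the divisor's high limb from mpn_invert_limb. As a reference for what one iteration computes, here is a C paraphrase of gmp-impl.h's udiv_qr_3by2, written with branches where the assembly is branch-free; it assumes unsigned __int128, and dinv = floor((B^3 - 1) / (d1*B + d0)) - B with B = 2^64:

    #include <stdint.h>

    typedef uint64_t mp_limb_t;
    typedef unsigned __int128 mp_2limb_t;

    /* Divide (u2,u1,u0) by normalized (d1,d0), given (u2,u1) < (d1,d0).
       Returns the quotient limb; 2-limb remainder left in *r1p,*r0p. */
    static mp_limb_t udiv_qr_3by2_ref(mp_limb_t *r1p, mp_limb_t *r0p,
                                      mp_limb_t u2, mp_limb_t u1, mp_limb_t u0,
                                      mp_limb_t d1, mp_limb_t d0,
                                      mp_limb_t dinv)
    {
        mp_2limb_t D = ((mp_2limb_t)d1 << 64) | d0;
        mp_2limb_t q = (mp_2limb_t)u2 * dinv + (((mp_2limb_t)u2 << 64) | u1);
        mp_limb_t q1 = (mp_limb_t)(q >> 64), q0 = (mp_limb_t)q;

        /* candidate remainder, computed mod 2^128 */
        mp_2limb_t r = ((((mp_2limb_t)(u1 - q1 * d1)) << 64) | u0)
                       - (mp_2limb_t)q1 * d0 - D;
        q1 += 1;
        if ((mp_limb_t)(r >> 64) >= q0) {  /* estimate was one too high */
            q1 -= 1;
            r += D;
        }
        if (r >= D) {                      /* rare second adjustment */
            q1 += 1;
            r -= D;
        }
        *r1p = (mp_limb_t)(r >> 64);
        *r0p = (mp_limb_t)r;
        return q1;
    }

The two conditional adjustments correspond to the mask arithmetic and the L(fix) path in the assembly below.
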
+ +ASM_START() + +EXTERN_FUNC(mpn_invert_limb) + +PROLOGUE(mpn_divrem_2,toc) + mflr r0 + std r23, -72(r1) + std r24, -64(r1) + std r25, -56(r1) + std r26, -48(r1) + std r27, -40(r1) + std r28, -32(r1) + std r29, -24(r1) + std r30, -16(r1) + std r31, -8(r1) + std r0, 16(r1) + stdu r1, -192(r1) + mr r24, r3 + mr r25, r4 + sldi r0, r6, 3 + add r26, r5, r0 + addi r26, r26, -24 + ld r30, 8(r7) + ld r28, 0(r7) + ld r29, 16(r26) + ld r31, 8(r26) + +ifelse(0,1,` + li r23, 0 + cmpld cr7, r29, r30 + blt cr7, L(8) + bgt cr7, L(9) + cmpld cr0, r31, r28 + blt cr0, L(8) +L(9): subfc r31, r28, r31 + subfe r29, r30, r29 + li r23, 1 +',` + li r23, 0 + cmpld cr7, r29, r30 + blt cr7, L(8) + mfcr r0 + rlwinm r0, r0, 30, 31, 31 + subfc r9, r28, r31 + addze. r0, r0 + nop + beq cr0, L(8) + subfc r31, r28, r31 + subfe r29, r30, r29 + li r23, 1 +') + +L(8): + add r27, r25, r6 + addic. r27, r27, -3 + blt cr0, L(18) + mr r3, r30 + CALL( mpn_invert_limb) + mulld r10, r3, r30 + mulhdu r0, r3, r28 + addc r8, r10, r28 + subfe r11, r1, r1 + addc r10, r8, r0 + addze. r11, r11 + blt cr0, L(91) +L(40): + subfc r10, r30, r10 + addme. r11, r11 + addi r3, r3, -1 + bge cr0, L(40) +L(91): + addi r5, r27, 1 + mtctr r5 + sldi r0, r27, 3 + add r24, r24, r0 + ALIGN(16) +L(loop): + mulhdu r8, r29, r3 + mulld r6, r29, r3 + addc r6, r6, r31 + adde r8, r8, r29 + cmpd cr7, r27, r25 + mulld r0, r30, r8 + mulhdu r11, r28, r8 + mulld r10, r28, r8 + subf r31, r0, r31 + li r7, 0 + blt cr7, L(60) + ld r7, 0(r26) + addi r26, r26, -8 + nop +L(60): subfc r7, r28, r7 + subfe r31, r30, r31 + subfc r7, r10, r7 + subfe r4, r11, r31 + subfc r9, r6, r4 + subfe r9, r1, r1 + andc r6, r28, r9 + andc r0, r30, r9 + addc r31, r7, r6 + adde r29, r4, r0 + subf r8, r9, r8 + cmpld cr7, r29, r30 + bge- cr7, L(fix) +L(bck): std r8, 0(r24) + addi r24, r24, -8 + addi r27, r27, -1 + bdnz L(loop) +L(18): + std r31, 8(r26) + std r29, 16(r26) + mr r3, r23 + addi r1, r1, 192 + ld r0, 16(r1) + mtlr r0 + ld r23, -72(r1) + ld r24, -64(r1) + ld r25, -56(r1) + ld r26, -48(r1) + ld r27, -40(r1) + ld r28, -32(r1) + ld r29, -24(r1) + ld r30, -16(r1) + ld r31, -8(r1) + blr +L(fix): + mfcr r0 + rlwinm r0, r0, 30, 31, 31 + subfc r9, r28, r31 + addze. r0, r0 + beq cr0, L(bck) + subfc r31, r28, r31 + subfe r29, r30, r29 + addi r8, r8, 1 + b L(bck) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/gcd_11.asm b/gmp-6.3.0/mpn/powerpc64/mode64/gcd_11.asm new file mode 100644 index 0000000..f9792e5 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/gcd_11.asm @@ -0,0 +1,77 @@ +dnl PowerPC-64 mpn_gcd_11. + +dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/bit (approx) +C POWER3/PPC630 ? +C POWER4/PPC970 8.5 obsolete +C POWER5 ? +C POWER6 ? +C POWER7 9.4 obsolete +C POWER8 ? +C POWER9 ? +C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1 + +define(`u0', `r3') +define(`v0', `r4') + +define(`mask', `r0')dnl +define(`a1', `r4')dnl +define(`a2', `r5')dnl +define(`d1', `r6')dnl +define(`d2', `r7')dnl +define(`cnt', `r9')dnl + +ASM_START() +PROLOGUE(mpn_gcd_11) + li r12, 63 + mr r8, v0 + subf. r10, u0, v0 C r10 = d - a + beq L(end) + + ALIGN(16) +L(top): subfc r11, r8, r3 C r11 = a - d + and d2, r11, r10 + subfe mask, mask, mask + cntlzd cnt, d2 + and a1, r10, mask C d - a + andc a2, r11, mask C a - d + and d1, r3, mask C a + andc d2, r8, mask C d + or r3, a1, a2 C new a + subf cnt, cnt, r12 + or r8, d1, d2 C new d + srd r3, r3, cnt + subf. r10, r3, r8 C r10 = d - a + bne L(top) + +L(end): blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/gmp-mparam.h new file mode 100644 index 0000000..f8305f4 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/gmp-mparam.h @@ -0,0 +1,82 @@ +/* PowerPC-64 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2008, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 1600MHz PPC970 */ + +/* Generated by tuneup.c, 2009-01-14, gcc 4.0 */ + +#define MUL_TOOM22_THRESHOLD 14 +#define MUL_TOOM33_THRESHOLD 93 +#define MUL_TOOM44_THRESHOLD 135 + +#define SQR_BASECASE_THRESHOLD 6 +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 74 +#define SQR_TOOM4_THRESHOLD 136 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 44 +#define MULLO_MUL_N_THRESHOLD 234 + +#define DIV_SB_PREINV_THRESHOLD 0 /* always */ +#define DIV_DC_THRESHOLD 33 +#define POWM_THRESHOLD 89 + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD_THRESHOLD 93 +#define GCD_DC_THRESHOLD 237 +#define GCDEXT_DC_THRESHOLD 273 +#define JACOBI_BASE_METHOD 1 + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1_THRESHOLD 6 +#define MOD_1_2_THRESHOLD 9 +#define MOD_1_4_THRESHOLD 23 +#define USE_PREINV_DIVREM_1 0 +#define USE_PREINV_MOD_1 0 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */ + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 24 +#define SET_STR_DC_THRESHOLD 650 +#define SET_STR_PRECOMPUTE_THRESHOLD 1713 + +#define MUL_FFT_TABLE { 336, 672, 1856, 2816, 7168, 20480, 81920, 327680, 0 } +#define MUL_FFT_MODF_THRESHOLD 304 +#define MUL_FFT_THRESHOLD 4224 + +#define SQR_FFT_TABLE { 272, 672, 1600, 2816, 7168, 20480, 81920, 327680, 786432, 0 } +#define SQR_FFT_MODF_THRESHOLD 272 +#define SQR_FFT_THRESHOLD 2688 diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/invert_limb.asm b/gmp-6.3.0/mpn/powerpc64/mode64/invert_limb.asm new file mode 100644 index 0000000..dfdba64 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/invert_limb.asm @@ -0,0 +1,88 @@ +dnl PowerPC-64 mpn_invert_limb -- Invert a normalized limb. + +dnl Copyright 2004-2006, 2008, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
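
The value this routine produces is the usual reciprocal for division by a normalized limb d (high bit set): v = floor((B^2 - 1) / d) - B with B = 2^64. The table-plus-refinement code below avoids the actual 128-bit division; as a one-line reference, assuming a compiler with unsigned __int128:

    #include <stdint.h>

    typedef uint64_t mp_limb_t;

    static mp_limb_t invert_limb_ref(mp_limb_t d)   /* d normalized */
    {
        return (mp_limb_t)(~(unsigned __int128)0 / d
                           - ((unsigned __int128)1 << 64));
    }

The divrem routines earlier in this directory obtain their reciprocals through CALL( mpn_invert_limb) and then divide using only multiplications.
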
+ +include(`../config.m4') + +C cycles/limb (approximate) +C POWER3/PPC630 80 +C POWER4/PPC970 86 +C POWER5 86 +C POWER6 170 +C POWER7 66 + +ASM_START() +PROLOGUE(mpn_invert_limb,toc) + LEAL( r12, approx_tab) + srdi r9, r3, 32 + rlwinm r9, r9, 10, 23, 30 C (d >> 55) & 0x1fe + srdi r10, r3, 24 C d >> 24 + lis r11, 0x1000 + rldicl r8, r3, 0, 63 C d mod 2 + addi r10, r10, 1 C d40 + sldi r11, r11, 32 C 2^60 + srdi r7, r3, 1 C d/2 + add r7, r7, r8 C d63 = ceil(d/2) + neg r8, r8 C mask = -(d mod 2) + lhzx r0, r9, r12 + mullw r9, r0, r0 C v0*v0 + sldi r6, r0, 11 C v0 << 11 + addi r0, r6, -1 C (v0 << 11) - 1 + mulld r9, r9, r10 C v0*v0*d40 + srdi r9, r9, 40 C v0*v0*d40 >> 40 + subf r9, r9, r0 C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1 + mulld r0, r9, r10 C v1*d40 + sldi r6, r9, 13 C v1 << 13 + subf r0, r0, r11 C 2^60 - v1*d40 + mulld r0, r0, r9 C v1 * (2^60 - v1*d40) + srdi r0, r0, 47 C v1 * (2^60 - v1*d40) >> 47 + add r0, r0, r6 C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47) + mulld r11, r0, r7 C v2 * d63 + srdi r10, r0, 1 C v2 >> 1 + sldi r9, r0, 31 C v2 << 31 + and r8, r10, r8 C (v2 >> 1) & mask + subf r8, r11, r8 C ((v2 >> 1) & mask) - v2 * d63 + mulhdu r0, r8, r0 C p1 = v2 * (((v2 >> 1) & mask) - v2 * d63) + srdi r0, r0, 1 C p1 >> 1 + add r0, r0, r9 C v3 = (v2 << 31) + (p1 >> 1) + nop + mulld r11, r0, r3 + mulhdu r9, r0, r3 + addc r10, r11, r3 + adde r3, r9, r3 + subf r3, r3, r0 + blr +EPILOGUE() + +DEF_OBJECT(approx_tab) +forloop(i,256,512-1,dnl +` .short eval(0x7fd00/i) +')dnl +END_OBJECT(approx_tab) +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_1.asm new file mode 100644 index 0000000..8733730 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_1.asm @@ -0,0 +1,164 @@ +dnl PowerPC-64 mpn_mod_1_1p + +dnl Copyright 2010, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 17 +C POWER5 16 +C POWER6 30 +C POWER7 10.2 + +C TODO +C * Optimise, in particular the cps function. This was compiler-generated and +C then hand optimised. 
+ +C INPUT PARAMETERS +define(`ap', `r3') +define(`n', `r4') +define(`d', `r5') +define(`cps', `r6') + +ASM_START() + +EXTERN_FUNC(mpn_invert_limb) + +PROLOGUE(mpn_mod_1_1p) + sldi r10, r4, 3 + addi r4, r4, -1 + add r3, r3, r10 + ld r0, 16(r6) C B1modb + ld r12, 24(r6) C B2modb + ld r9, -8(r3) + ld r10, -16(r3) + mtctr r4 + mulhdu r8, r9, r0 + mulld r7, r9, r0 + addc r11, r7, r10 + addze r9, r8 + bdz L(end) + + ALIGN(16) +L(top): ld r4, -24(r3) + addi r3, r3, -8 + nop + mulld r10, r11, r0 + mulld r8, r9, r12 + mulhdu r11, r11, r0 + mulhdu r9, r9, r12 + addc r7, r10, r4 + addze r10, r11 + addc r11, r8, r7 + adde r9, r9, r10 + bdnz L(top) + +L(end): +ifdef(`HAVE_LIMB_LITTLE_ENDIAN', +` lwz r0, 8(r6)', +` lwz r0, 12(r6)') + ld r3, 0(r6) + cmpdi cr7, r0, 0 + beq- cr7, L(4) + subfic r10, r0, 64 + sld r9, r9, r0 + srd r10, r11, r10 + or r9, r10, r9 +L(4): subfc r10, r5, r9 + subfe r10, r10, r10 + nand r10, r10, r10 + sld r11, r11, r0 + and r10, r10, r5 + subf r9, r10, r9 + mulhdu r10, r9, r3 + mulld r3, r9, r3 + addi r9, r9, 1 + addc r8, r3, r11 + adde r3, r10, r9 + mulld r3, r3, r5 + subf r3, r3, r11 + cmpld cr7, r8, r3 + bge cr7, L(5) C FIXME: Make branch-less + add r3, r3, r5 +L(5): cmpld cr7, r3, r5 + bge- cr7, L(10) + srd r3, r3, r0 + blr + +L(10): subf r3, r5, r3 + srd r3, r3, r0 + blr +EPILOGUE() + +PROLOGUE(mpn_mod_1_1p_cps,toc) + mflr r0 + std r29, -24(r1) + std r30, -16(r1) + std r31, -8(r1) + cntlzd r31, r4 + std r0, 16(r1) + extsw r31, r31 + mr r29, r3 + stdu r1, -144(r1) + sld r30, r4, r31 + mr r3, r30 + CALL( mpn_invert_limb) + cmpdi cr7, r31, 0 + neg r0, r30 + beq- cr7, L(13) + subfic r11, r31, 64 + li r0, 1 + neg r9, r30 + srd r11, r3, r11 + sld r0, r0, r31 + or r0, r11, r0 + mulld r0, r0, r9 +L(13): mulhdu r9, r0, r3 + mulld r11, r0, r3 + add r9, r0, r9 + nor r9, r9, r9 + mulld r9, r9, r30 + cmpld cr7, r11, r9 + bge cr7, L(14) + add r9, r9, r30 +L(14): addi r1, r1, 144 + srd r0, r0, r31 + std r31, 8(r29) + std r3, 0(r29) + std r0, 16(r29) + ld r0, 16(r1) + srd r9, r9, r31 + ld r30, -16(r1) + ld r31, -8(r1) + std r9, 24(r29) + ld r29, -24(r1) + mtlr r0 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_4.asm b/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_4.asm new file mode 100644 index 0000000..0b7d6bf --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/mod_1_4.asm @@ -0,0 +1,270 @@ +dnl PowerPC-64 mpn_mod_1s_4p + +dnl Copyright 2010, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? 
+C POWER4/PPC970 9 +C POWER5 9 +C POWER6 13 +C POWER7 3.5 + +C TODO +C * Optimise, in particular the cps function. This was compiler-generated and +C then hand optimised. + +C INPUT PARAMETERS +define(`ap', `r3') +define(`n', `r4') +define(`d', `r5') +define(`cps', `r6') + +ASM_START() + +EXTERN_FUNC(mpn_invert_limb) + +PROLOGUE(mpn_mod_1s_4p) + std r23, -72(r1) + ld r23, 48(cps) + std r24, -64(r1) + std r25, -56(r1) + ld r24, 32(cps) + ld r25, 24(cps) + std r26, -48(r1) + std r27, -40(r1) + ld r26, 16(cps) + std r28, -32(r1) + std r29, -24(r1) + std r30, -16(r1) + std r31, -8(r1) + ld r30, 40(cps) + + rldicl. r0, n, 0,62 + sldi r31, n, 3 + add ap, ap, r31 C make ap point at end of operand + + cmpdi cr7, r0, 2 + beq cr0, L(b00) + blt cr7, L(b01) + beq cr7, L(b10) + +L(b11): ld r11, -16(ap) + ld r9, -8(ap) + ld r0, -24(ap) + mulhdu r27, r11, r26 + mulld r8, r11, r26 + mulhdu r11, r9, r25 + mulld r9, r9, r25 + addc r31, r8, r0 + addze r10, r27 + addc r0, r9, r31 + adde r9, r11, r10 + addi ap, ap, -40 + b L(6) + + ALIGN(16) +L(b00): ld r11, -24(ap) + ld r10, -16(ap) + ld r9, -8(ap) + ld r0, -32(ap) + mulld r8, r11, r26 + mulhdu r7, r10, r25 + mulhdu r27, r11, r26 + mulhdu r11, r9, r24 + mulld r10, r10, r25 + mulld r9, r9, r24 + addc r31, r8, r0 + addze r0, r27 + addc r8, r31, r10 + adde r10, r0, r7 + addc r0, r9, r8 + adde r9, r11, r10 + addi ap, ap, -48 + b L(6) + + ALIGN(16) +L(b01): li r9, 0 + ld r0, -8(ap) + addi ap, ap, -24 + b L(6) + + ALIGN(16) +L(b10): ld r9, -8(ap) + ld r0, -16(ap) + addi ap, ap, -32 + + ALIGN(16) +L(6): addi r10, n, 3 + srdi r7, r10, 2 + mtctr r7 + bdz L(end) + + ALIGN(16) +L(top): ld r31, -16(ap) + ld r10, -8(ap) + ld r11, 8(ap) + ld r12, 0(ap) + mulld r29, r0, r30 C rl * B4modb + mulhdu r0, r0, r30 C rl * B4modb + mulhdu r27, r10, r26 + mulld r10, r10, r26 + mulhdu r7, r9, r23 C rh * B5modb + mulld r9, r9, r23 C rh * B5modb + mulhdu r28, r11, r24 + mulld r11, r11, r24 + mulhdu r4, r12, r25 + mulld r12, r12, r25 + addc r8, r10, r31 + addze r10, r27 + addi ap, ap, -32 + addc r27, r8, r12 + adde r12, r10, r4 + addc r11, r27, r11 + adde r31, r12, r28 + addc r12, r11, r29 + adde r4, r31, r0 + addc r0, r9, r12 + adde r9, r7, r4 + bdnz L(top) + +L(end): +ifdef(`HAVE_LIMB_LITTLE_ENDIAN', +` lwz r3, 8(cps)', +` lwz r3, 12(cps)') + mulld r10, r9, r26 + mulhdu r9, r9, r26 + addc r11, r0, r10 + addze r9, r9 + ld r10, 0(cps) + subfic r8, r3, 64 + sld r9, r9, r3 + srd r8, r11, r8 + sld r11, r11, r3 + or r9, r8, r9 + mulld r0, r9, r10 + mulhdu r10, r9, r10 + addi r9, r9, 1 + addc r8, r0, r11 + adde r0, r10, r9 + mulld r0, r0, d + subf r0, r0, r11 + cmpld cr7, r8, r0 + bge cr7, L(9) + add r0, r0, d +L(9): cmpld cr7, r0, d + bge- cr7, L(16) +L(10): srd r3, r0, r3 + ld r23, -72(r1) + ld r24, -64(r1) + ld r25, -56(r1) + ld r26, -48(r1) + ld r27, -40(r1) + ld r28, -32(r1) + ld r29, -24(r1) + ld r30, -16(r1) + ld r31, -8(r1) + blr + +L(16): subf r0, d, r0 + b L(10) +EPILOGUE() + +PROLOGUE(mpn_mod_1s_4p_cps,toc) + mflr r0 + std r29, -24(r1) + std r30, -16(r1) + mr r29, r3 + std r0, 16(r1) + std r31, -8(r1) + stdu r1, -144(r1) + cntlzd r31, r4 + sld r30, r4, r31 + mr r3, r30 + CALL( mpn_invert_limb) + subfic r9, r31, 64 + li r10, 1 + sld r10, r10, r31 + srd r9, r3, r9 + neg r0, r30 + or r10, r10, r9 + mulld r10, r10, r0 + mulhdu r11, r10, r3 + nor r11, r11, r11 + subf r11, r10, r11 + mulld r11, r11, r30 + mulld r0, r10, r3 + cmpld cr7, r0, r11 + bge cr7, L(18) + add r11, r11, r30 +L(18): mulhdu r9, r11, r3 + add r9, r11, r9 + nor r9, r9, r9 + mulld r9, r9, r30 + mulld r0, r11, r3 + 
cmpld cr7, r0, r9 + bge cr7, L(19) + add r9, r9, r30 +L(19): mulhdu r0, r9, r3 + add r0, r9, r0 + nor r0, r0, r0 + mulld r0, r0, r30 + mulld r8, r9, r3 + cmpld cr7, r8, r0 + bge cr7, L(20) + add r0, r0, r30 +L(20): mulhdu r8, r0, r3 + add r8, r0, r8 + nor r8, r8, r8 + mulld r8, r8, r30 + mulld r7, r0, r3 + cmpld cr7, r7, r8 + bge cr7, L(21) + add r8, r8, r30 +L(21): srd r0, r0, r31 + addi r1, r1, 144 + srd r8, r8, r31 + srd r10, r10, r31 + srd r11, r11, r31 + std r0, 40(r29) + std r31, 8(r29) + srd r9, r9, r31 + ld r0, 16(r1) + ld r30, -16(r1) + std r8, 48(r29) + std r3, 0(r29) + mtlr r0 + ld r31, -8(r1) + std r10, 16(r29) + std r11, 24(r29) + std r9, 32(r29) + ld r29, -24(r1) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/mod_34lsub1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/mod_34lsub1.asm new file mode 100644 index 0000000..c35e0e3 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/mod_34lsub1.asm @@ -0,0 +1,132 @@ +dnl PowerPC-64 mpn_mod_34lsub1 -- modulo 2^48-1. + +dnl Copyright 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
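
The trick used below: 2^48 == 1 (mod 2^48 - 1), so a 64-bit limb
x = hi*2^48 + lo is congruent to lo + hi; and since 2^64 == 2^16, a limb in
position i carries an extra factor of 2^(16*(i mod 3)). The three
accumulators and the closing shift-and-add sequence implement exactly that
(see the r8/r9/r10 diagram at the end of the file). A short C sketch with
illustrative names, assuming GCC/Clang's unsigned __int128:

#include <stdint.h>
#include <stddef.h>

/* return a value congruent to {up,n} mod 2^48 - 1, not necessarily
   fully reduced; that is all mpn_mod_34lsub1 promises */
uint64_t mod_34lsub1_sketch(const uint64_t *up, size_t n)
{
    const uint64_t M = 0xffffffffffffULL;            /* 2^48 - 1 */
    unsigned __int128 s = 0;

    for (size_t i = 0; i < n; i++)
    {
        /* up[i]*2^(64*i) == up[i]*2^(16*(i%3))  (mod 2^48 - 1) */
        unsigned __int128 x = (unsigned __int128)up[i] << (16 * (i % 3));
        s += (uint64_t)(x & M) + (uint64_t)(x >> 48);  /* fold 2^48 -> 1 */
    }
    while (s >> 48)                                  /* fold the sum too */
        s = (s & M) + (s >> 48);
    return (uint64_t)s;
}
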
+ +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 1.33 +C POWER4/PPC970 1.5 +C POWER5 1.32 +C POWER6 2.35 +C POWER7 1 + +C INPUT PARAMETERS +define(`up',`r3') +define(`n',`r4') + +ASM_START() +PROLOGUE(mpn_mod_34lsub1) + li r8, 0 + li r9, 0 + li r10, 0 + li r11, 0 + + cmpdi cr6, n, 3 + blt cr6, L(lt3) + + li r0, -0x5556 C 0xFFFFFFFFFFFFAAAA + rldimi r0, r0, 16, 32 C 0xFFFFFFFFAAAAAAAA + rldimi r0, r0, 32, 63 C 0xAAAAAAAAAAAAAAAB + mulhdu r0, r0, n + srdi r0, r0, 1 C r0 = [n / 3] + mtctr r0 + + ld r5, 0(up) + ld r6, 8(up) + ld r7, 16(up) + addi up, up, 24 + bdz L(end) + + ALIGN(16) +L(top): addc r8, r8, r5 + nop + ld r5, 0(up) + adde r9, r9, r6 + ld r6, 8(up) + adde r10, r10, r7 + ld r7, 16(up) + addi up, up, 48 + addze r11, r11 + bdz L(endx) + addc r8, r8, r5 + nop + ld r5, -24(up) + adde r9, r9, r6 + ld r6, -16(up) + adde r10, r10, r7 + ld r7, -8(up) + addze r11, r11 + bdnz L(top) + + addi up, up, 24 +L(endx): + addi up, up, -24 + +L(end): addc r8, r8, r5 + adde r9, r9, r6 + adde r10, r10, r7 + addze r11, r11 + + sldi r5, r0, 1 + add r5, r5, r0 C r11 = n / 3 * 3 + sub n, n, r5 C n = n mod 3 +L(lt3): cmpdi cr6, n, 1 + blt cr6, L(2) + + ld r5, 0(up) + addc r8, r8, r5 + li r6, 0 + beq cr6, L(1) + + ld r6, 8(up) +L(1): adde r9, r9, r6 + addze r10, r10 + addze r11, r11 + +L(2): rldicl r0, r8, 0, 16 C r0 = r8 mod 2^48 + srdi r3, r8, 48 C r3 = r8 div 2^48 + rldic r4, r9, 16, 16 C r4 = (r9 mod 2^32) << 16 + srdi r5, r9, 32 C r5 = r9 div 2^32 + rldic r6, r10, 32, 16 C r6 = (r10 mod 2^16) << 32 + srdi r7, r10, 16 C r7 = r10 div 2^16 + + add r0, r0, r3 + add r4, r4, r5 + add r6, r6, r7 + + add r0, r0, r4 + add r6, r6, r11 + + add r3, r0, r6 + blr +EPILOGUE() + +C |__r10__|__r9___|__r8___| +C |-----|-----|-----|-----| diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/mode1o.asm b/gmp-6.3.0/mpn/powerpc64/mode64/mode1o.asm new file mode 100644 index 0000000..726339a --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/mode1o.asm @@ -0,0 +1,117 @@ +dnl PowerPC-64 mpn_modexact_1_odd -- mpn by limb exact remainder. + +dnl Copyright 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 13-19 +C POWER4/PPC970 16 +C POWER5 16 +C POWER6 ? +C POWER7 12 + +C TODO +C * Check if n=1 code is really an improvement. It probably isn't. +C * Make more similar to dive_1.asm. 
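
For orientation: the n = 1 cases right after the prologue fall back to plain
division, while L(2) lifts an 8-bit inverse from binvert_limb_table to a full
64-bit inverse of d with three Newton steps (8 -> 16 -> 32 -> 64 bits) and
the main loop runs a Hensel-style recurrence. A C sketch of that recurrence,
with illustrative names and GCC/Clang's unsigned __int128; the final c
satisfies c == (cy - a) * 2^(-64*n) (mod d), so with cy = 0 the input is
divisible by d exactly when c == 0 modulo d:

#include <stdint.h>
#include <stddef.h>

/* Hensel-style exact remainder for odd d; binv*d == 1 (mod 2^64) */
uint64_t modexact_1c_odd_sketch(const uint64_t *up, size_t n,
                                uint64_t d, uint64_t binv, uint64_t cy)
{
    uint64_t c = cy;
    for (size_t i = 0; i < n; i++)
    {
        uint64_t borrow = up[i] < c;
        uint64_t l = up[i] - c;           /* wraps mod 2^64, borrow noted */
        uint64_t q = l * binv;            /* so q*d has low limb exactly l */
        c = (uint64_t)(((unsigned __int128)q * d) >> 64) + borrow;
    }
    return c;
}
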
+ +C INPUT PARAMETERS +define(`up', `r3') +define(`n', `r4') +define(`d', `r5') +define(`cy', `r6') + + +ASM_START() + +EXTERN(binvert_limb_table) + +PROLOGUE(mpn_modexact_1c_odd,toc) + addic. n, n, -1 C set carry as side effect + ld r8, 0(up) + bne cr0, L(2) + cmpld cr7, r6, r8 + bge cr7, L(4) + subf r8, r6, r8 + divdu r3, r8, d + mulld r3, r3, d + subf. r3, r3, r8 + beqlr cr0 + subf r3, r3, d + blr + +L(4): subf r3, r8, r6 + divdu r8, r3, d + mulld r8, r8, d + subf r3, r8, r3 + blr + +L(2): LEA( r7, binvert_limb_table) + rldicl r9, d, 63, 57 + mtctr n + lbzx r0, r7, r9 + mulld r7, r0, r0 + sldi r0, r0, 1 + mulld r7, d, r7 + subf r0, r7, r0 + mulld r9, r0, r0 + sldi r0, r0, 1 + mulld r9, d, r9 + subf r0, r9, r0 + mulld r7, r0, r0 + sldi r0, r0, 1 + mulld r7, d, r7 + subf r9, r7, r0 + + ALIGN(16) +L(loop): + subfe r0, r6, r8 + ld r8, 8(up) + addi up, up, 8 + mulld r0, r9, r0 + mulhdu r6, r0, d + bdnz L(loop) + + cmpld cr7, d, r8 + blt cr7, L(10) + + subfe r0, r0, r0 + subf r6, r0, r6 + cmpld cr7, r6, r8 + subf r3, r8, r6 + bgelr cr7 + add r3, d, r3 + blr + +L(10): subfe r0, r6, r8 + mulld r0, r9, r0 + mulhdu r3, r0, d + blr +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/mul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/mul_1.asm new file mode 100644 index 0000000..27a8f8f --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/mul_1.asm @@ -0,0 +1,168 @@ +dnl PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright 1999-2001, 2003-2006, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 6-18 +C POWER4/PPC970 7.25? not updated for last file revision +C POWER5 7.25 +C POWER6 14 +C POWER7 2.9 + +C TODO +C * Try to reduce the number of needed live registers (at least r5 and r10 +C could be combined) +C * Optimize feed-in code, for speed and size. +C * Clean up r12/r7 usage in feed-in code. + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`vl', `r6') + +ASM_START() +PROLOGUE(mpn_mul_1c) + std r27, -40(r1) + std r26, -48(r1) + mr r12, r7 + b L(ent) +EPILOGUE() +PROLOGUE(mpn_mul_1) + std r27, -40(r1) + std r26, -48(r1) + li r12, 0 C cy_limb = 0 +L(ent): ld r26, 0(up) + + rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addic n, n, 3 C compute count... 
+ srdi n, n, 2 C ...for ctr + mtctr n C copy count into ctr + beq cr0, L(b00) + blt cr6, L(b01) + beq cr6, L(b10) + +L(b11): mr r7, r12 + mulld r0, r26, r6 + mulhdu r12, r26, r6 + addi up, up, 8 + addc r0, r0, r7 + std r0, 0(rp) + addi rp, rp, 8 + b L(fic) + +L(b00): ld r27, 8(up) + addi up, up, 16 + mulld r0, r26, r6 + mulhdu r5, r26, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + addc r0, r0, r12 + adde r7, r7, r5 + addze r12, r8 + std r0, 0(rp) + std r7, 8(rp) + addi rp, rp, 16 + b L(fic) + + nop C alignment +L(b01): bdnz L(gt1) + mulld r0, r26, r6 + mulhdu r8, r26, r6 + addc r0, r0, r12 + std r0, 0(rp) + b L(ret) +L(gt1): ld r27, 8(up) + nop + mulld r0, r26, r6 + mulhdu r5, r26, r6 + ld r26, 16(up) + mulld r7, r27, r6 + mulhdu r8, r27, r6 + mulld r9, r26, r6 + mulhdu r10, r26, r6 + addc r0, r0, r12 + adde r7, r7, r5 + adde r9, r9, r8 + addze r12, r10 + std r0, 0(rp) + std r7, 8(rp) + std r9, 16(rp) + addi up, up, 24 + addi rp, rp, 24 + b L(fic) + + nop +L(fic): ld r26, 0(up) +L(b10): ld r27, 8(up) + addi up, up, 16 + bdz L(end) + +L(top): mulld r0, r26, r6 + mulhdu r5, r26, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r26, 0(up) + ld r27, 8(up) + adde r0, r0, r12 + adde r7, r7, r5 + mulld r9, r26, r6 + mulhdu r10, r26, r6 + mulld r11, r27, r6 + mulhdu r12, r27, r6 + ld r26, 16(up) + ld r27, 24(up) + std r0, 0(rp) + adde r9, r9, r8 + std r7, 8(rp) + adde r11, r11, r10 + std r9, 16(rp) + addi up, up, 32 + std r11, 24(rp) + + addi rp, rp, 32 + bdnz L(top) + +L(end): mulld r0, r26, r6 + mulhdu r5, r26, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + adde r0, r0, r12 + adde r7, r7, r5 + std r0, 0(rp) + std r7, 8(rp) +L(ret): addze r3, r8 + ld r27, -40(r1) + ld r26, -48(r1) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/mul_basecase.asm b/gmp-6.3.0/mpn/powerpc64/mode64/mul_basecase.asm new file mode 100644 index 0000000..1873187 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/mul_basecase.asm @@ -0,0 +1,708 @@ +dnl PowerPC-64 mpn_mul_basecase. + +dnl Copyright 1999-2001, 2003-2006, 2008 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
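
All the unrolled paths in the file below compute the plain schoolbook
product: one mul_1-style pass for the first v limb, then one addmul_1-style
pass per remaining v limb. Stripped of scheduling, the reference algorithm is
the following C sketch (illustrative names; GCC/Clang's unsigned __int128;
un >= vn >= 1, as mpn_mul_basecase requires):

#include <stdint.h>
#include <stddef.h>

/* rp[0..un+vn-1] = {up,un} * {vp,vn}; requires un >= vn >= 1 */
void mul_basecase_sketch(uint64_t *rp, const uint64_t *up, size_t un,
                         const uint64_t *vp, size_t vn)
{
    unsigned __int128 t = 0;

    /* first row: rp = up * vp[0]   (the mul_1 pass) */
    for (size_t i = 0; i < un; i++)
    {
        t += (unsigned __int128)up[i] * vp[0];
        rp[i] = (uint64_t)t;
        t >>= 64;
    }
    rp[un] = (uint64_t)t;

    /* remaining rows: rp[j..] += up * vp[j]   (the addmul_1 passes) */
    for (size_t j = 1; j < vn; j++)
    {
        t = 0;
        for (size_t i = 0; i < un; i++)
        {
            t += (unsigned __int128)up[i] * vp[j] + rp[i + j];
            rp[i + j] = (uint64_t)t;
            t >>= 64;
        }
        rp[un + j] = (uint64_t)t;
    }
}
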
+ +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 6-18 +C POWER4/PPC970 8 +C POWER5 8 +C POWER6 24 + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`un', `r5') +define(`vp', `r6') +define(`vn', `r7') + +define(`v0', `r25') +define(`outer_rp', `r22') +define(`outer_up', `r23') + +ASM_START() +PROLOGUE(mpn_mul_basecase) + +C Special code for un <= 2, for efficiency of these important cases, +C and since it simplifies the default code. + cmpdi cr0, un, 2 + bgt cr0, L(un_gt2) + cmpdi cr6, vn, 1 + ld r7, 0(vp) + ld r5, 0(up) + mulld r8, r5, r7 C weight 0 + mulhdu r9, r5, r7 C weight 1 + std r8, 0(rp) + beq cr0, L(2x) + std r9, 8(rp) + blr + ALIGN(16) +L(2x): ld r0, 8(up) + mulld r8, r0, r7 C weight 1 + mulhdu r10, r0, r7 C weight 2 + addc r9, r9, r8 + addze r10, r10 + bne cr6, L(2x2) + std r9, 8(rp) + std r10, 16(rp) + blr + ALIGN(16) +L(2x2): ld r6, 8(vp) + nop + mulld r8, r5, r6 C weight 1 + mulhdu r11, r5, r6 C weight 2 + addc r9, r9, r8 + std r9, 8(rp) + adde r11, r11, r10 + mulld r12, r0, r6 C weight 2 + mulhdu r0, r0, r6 C weight 3 + addze r0, r0 + addc r11, r11, r12 + addze r0, r0 + std r11, 16(rp) + std r0, 24(rp) + blr + +L(un_gt2): + std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + std r27, -40(r1) + std r26, -48(r1) + std r25, -56(r1) + std r24, -64(r1) + std r23, -72(r1) + std r22, -80(r1) + + mr outer_rp, rp + mr outer_up, up + + ld v0, 0(vp) C new v limb + addi vp, vp, 8 + ld r26, 0(up) + + rldicl. r0, un, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi un, un, 1 C compute count... + srdi un, un, 2 C ...for ctr + mtctr un C copy inner loop count into ctr + beq cr0, L(b0) + blt cr6, L(b1) + beq cr6, L(b2) + + + ALIGN(16) +L(b3): mulld r0, r26, v0 + mulhdu r12, r26, v0 + addic r0, r0, 0 + std r0, 0(rp) + ld r26, 8(up) + ld r27, 16(up) + bdz L(end_m_3) + + ALIGN(16) +L(lo_m_3): + mulld r0, r26, v0 + mulhdu r31, r26, v0 + ld r26, 24(up) + nop + mulld r24, r27, v0 + mulhdu r8, r27, v0 + ld r27, 32(up) + nop + adde r0, r0, r12 + adde r24, r24, r31 + mulld r9, r26, v0 + mulhdu r10, r26, v0 + ld r26, 40(up) + nop + mulld r11, r27, v0 + mulhdu r12, r27, v0 + ld r27, 48(up) + std r0, 8(rp) + adde r9, r9, r8 + std r24, 16(rp) + adde r11, r11, r10 + std r9, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + bdnz L(lo_m_3) + + ALIGN(16) +L(end_m_3): + mulld r0, r26, v0 + mulhdu r31, r26, v0 + + mulld r24, r27, v0 + mulhdu r8, r27, v0 + + adde r0, r0, r12 + adde r24, r24, r31 + + std r0, 8(rp) + std r24, 16(rp) + addze r8, r8 + std r8, 24(rp) + addic. 
vn, vn, -1 + beq L(ret) + + ALIGN(16) +L(outer_lo_3): + mtctr un C copy inner loop count into ctr + addi rp, outer_rp, 8 + mr up, outer_up + addi outer_rp, outer_rp, 8 + ld v0, 0(vp) C new v limb + addi vp, vp, 8 + ld r26, 0(up) + ld r28, 0(rp) + mulld r0, r26, v0 + mulhdu r12, r26, v0 + addc r0, r0, r28 + std r0, 0(rp) + ld r26, 8(up) + ld r27, 16(up) + bdz L(end_3) + + ALIGN(16) C registers dying +L(lo_3): + mulld r0, r26, v0 C + mulhdu r10, r26, v0 C 26 + ld r26, 24(up) C + ld r28, 8(rp) C + mulld r24, r27, v0 C + mulhdu r8, r27, v0 C 27 + ld r27, 32(up) C + ld r29, 16(rp) C + adde r0, r0, r12 C 0 12 + adde r24, r24, r10 C 24 10 + mulld r9, r26, v0 C + mulhdu r10, r26, v0 C 26 + ld r26, 40(up) C + ld r30, 24(rp) C + mulld r11, r27, v0 C + mulhdu r12, r27, v0 C 27 + ld r27, 48(up) C + ld r31, 32(rp) C + adde r9, r9, r8 C 8 9 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 8(rp) C 0 + adde r24, r24, r29 C 7 29 + std r24, 16(rp) C 7 + adde r9, r9, r30 C 9 30 + std r9, 24(rp) C 9 + adde r11, r11, r31 C 11 31 + std r11, 32(rp) C 11 + addi up, up, 32 C + addi rp, rp, 32 C + bdnz L(lo_3) C + + ALIGN(16) +L(end_3): + mulld r0, r26, v0 + mulhdu r10, r26, v0 + ld r28, 8(rp) + nop + mulld r24, r27, v0 + mulhdu r8, r27, v0 + ld r29, 16(rp) + nop + adde r0, r0, r12 + adde r24, r24, r10 + addze r8, r8 + addc r0, r0, r28 + std r0, 8(rp) + adde r24, r24, r29 + std r24, 16(rp) + addze r8, r8 + std r8, 24(rp) + + addic. vn, vn, -1 + bne L(outer_lo_3) + b L(ret) + + + ALIGN(16) +L(b0): ld r27, 8(up) + addi up, up, 8 + mulld r0, r26, v0 + mulhdu r10, r26, v0 + mulld r24, r27, v0 + mulhdu r8, r27, v0 + addc r24, r24, r10 + addze r12, r8 + std r0, 0(rp) + std r24, 8(rp) + addi rp, rp, 8 + ld r26, 8(up) + ld r27, 16(up) + bdz L(end_m_0) + + ALIGN(16) +L(lo_m_0): + mulld r0, r26, v0 + mulhdu r31, r26, v0 + ld r26, 24(up) + nop + mulld r24, r27, v0 + mulhdu r8, r27, v0 + ld r27, 32(up) + nop + adde r0, r0, r12 + adde r24, r24, r31 + mulld r9, r26, v0 + mulhdu r10, r26, v0 + ld r26, 40(up) + nop + mulld r11, r27, v0 + mulhdu r12, r27, v0 + ld r27, 48(up) + std r0, 8(rp) + adde r9, r9, r8 + std r24, 16(rp) + adde r11, r11, r10 + std r9, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + bdnz L(lo_m_0) + + ALIGN(16) +L(end_m_0): + mulld r0, r26, v0 + mulhdu r31, r26, v0 + + mulld r24, r27, v0 + mulhdu r8, r27, v0 + + adde r0, r0, r12 + adde r24, r24, r31 + + std r0, 8(rp) + addze r8, r8 + std r24, 16(rp) + addic. 
vn, vn, -1 + std r8, 24(rp) + nop + beq L(ret) + + ALIGN(16) +L(outer_lo_0): + mtctr un C copy inner loop count into ctr + addi rp, outer_rp, 16 + addi up, outer_up, 8 + addi outer_rp, outer_rp, 8 + ld v0, 0(vp) C new v limb + addi vp, vp, 8 + ld r26, -8(up) + ld r27, 0(up) + ld r28, -8(rp) + ld r29, 0(rp) + nop + nop + mulld r0, r26, v0 + mulhdu r10, r26, v0 + mulld r24, r27, v0 + mulhdu r8, r27, v0 + addc r24, r24, r10 + addze r12, r8 + addc r0, r0, r28 + std r0, -8(rp) + adde r24, r24, r29 + std r24, 0(rp) + ld r26, 8(up) + ld r27, 16(up) + bdz L(end_0) + + ALIGN(16) C registers dying +L(lo_0): + mulld r0, r26, v0 C + mulhdu r10, r26, v0 C 26 + ld r26, 24(up) C + ld r28, 8(rp) C + mulld r24, r27, v0 C + mulhdu r8, r27, v0 C 27 + ld r27, 32(up) C + ld r29, 16(rp) C + adde r0, r0, r12 C 0 12 + adde r24, r24, r10 C 24 10 + mulld r9, r26, v0 C + mulhdu r10, r26, v0 C 26 + ld r26, 40(up) C + ld r30, 24(rp) C + mulld r11, r27, v0 C + mulhdu r12, r27, v0 C 27 + ld r27, 48(up) C + ld r31, 32(rp) C + adde r9, r9, r8 C 8 9 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 8(rp) C 0 + adde r24, r24, r29 C 7 29 + std r24, 16(rp) C 7 + adde r9, r9, r30 C 9 30 + std r9, 24(rp) C 9 + adde r11, r11, r31 C 11 31 + std r11, 32(rp) C 11 + addi up, up, 32 C + addi rp, rp, 32 C + bdnz L(lo_0) C + + ALIGN(16) +L(end_0): + mulld r0, r26, v0 + mulhdu r10, r26, v0 + ld r28, 8(rp) + nop + mulld r24, r27, v0 + mulhdu r8, r27, v0 + ld r29, 16(rp) + nop + adde r0, r0, r12 + adde r24, r24, r10 + addze r8, r8 + addic. vn, vn, -1 + addc r0, r0, r28 + std r0, 8(rp) + adde r24, r24, r29 + std r24, 16(rp) + addze r8, r8 + std r8, 24(rp) + bne L(outer_lo_0) + b L(ret) + + + ALIGN(16) +L(b1): ld r27, 8(up) + nop + mulld r0, r26, v0 + mulhdu r31, r26, v0 + ld r26, 16(up) + mulld r24, r27, v0 + mulhdu r8, r27, v0 + mulld r9, r26, v0 + mulhdu r10, r26, v0 + addc r24, r24, r31 + adde r9, r9, r8 + addze r12, r10 + std r0, 0(rp) + std r24, 8(rp) + std r9, 16(rp) + addi up, up, 16 + addi rp, rp, 16 + ld r26, 8(up) + ld r27, 16(up) + bdz L(end_m_1) + + ALIGN(16) +L(lo_m_1): + mulld r0, r26, v0 + mulhdu r31, r26, v0 + ld r26, 24(up) + nop + mulld r24, r27, v0 + mulhdu r8, r27, v0 + ld r27, 32(up) + nop + adde r0, r0, r12 + adde r24, r24, r31 + mulld r9, r26, v0 + mulhdu r10, r26, v0 + ld r26, 40(up) + nop + mulld r11, r27, v0 + mulhdu r12, r27, v0 + ld r27, 48(up) + std r0, 8(rp) + adde r9, r9, r8 + std r24, 16(rp) + adde r11, r11, r10 + std r9, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + bdnz L(lo_m_1) + + ALIGN(16) +L(end_m_1): + mulld r0, r26, v0 + mulhdu r31, r26, v0 + + mulld r24, r27, v0 + mulhdu r8, r27, v0 + + adde r0, r0, r12 + adde r24, r24, r31 + + std r0, 8(rp) + addze r8, r8 + std r24, 16(rp) + addic. 
vn, vn, -1 + std r8, 24(rp) + nop + beq L(ret) + + ALIGN(16) +L(outer_lo_1): + mtctr un C copy inner loop count into ctr + addi rp, outer_rp, 24 + addi up, outer_up, 16 + addi outer_rp, outer_rp, 8 + ld v0, 0(vp) C new v limb + addi vp, vp, 8 + ld r26, -16(up) + ld r27, -8(up) + mulld r0, r26, v0 + mulhdu r31, r26, v0 + ld r26, 0(up) + ld r28, -16(rp) + mulld r24, r27, v0 + mulhdu r8, r27, v0 + ld r29, -8(rp) + ld r30, 0(rp) + mulld r9, r26, v0 + mulhdu r10, r26, v0 + addc r24, r24, r31 + adde r9, r9, r8 + addze r12, r10 + addc r0, r0, r28 + std r0, -16(rp) + adde r24, r24, r29 + std r24, -8(rp) + adde r9, r9, r30 + std r9, 0(rp) + ld r26, 8(up) + ld r27, 16(up) + bdz L(end_1) + + ALIGN(16) C registers dying +L(lo_1): + mulld r0, r26, v0 C + mulhdu r10, r26, v0 C 26 + ld r26, 24(up) C + ld r28, 8(rp) C + mulld r24, r27, v0 C + mulhdu r8, r27, v0 C 27 + ld r27, 32(up) C + ld r29, 16(rp) C + adde r0, r0, r12 C 0 12 + adde r24, r24, r10 C 24 10 + mulld r9, r26, v0 C + mulhdu r10, r26, v0 C 26 + ld r26, 40(up) C + ld r30, 24(rp) C + mulld r11, r27, v0 C + mulhdu r12, r27, v0 C 27 + ld r27, 48(up) C + ld r31, 32(rp) C + adde r9, r9, r8 C 8 9 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 8(rp) C 0 + adde r24, r24, r29 C 7 29 + std r24, 16(rp) C 7 + adde r9, r9, r30 C 9 30 + std r9, 24(rp) C 9 + adde r11, r11, r31 C 11 31 + std r11, 32(rp) C 11 + addi up, up, 32 C + addi rp, rp, 32 C + bdnz L(lo_1) C + + ALIGN(16) +L(end_1): + mulld r0, r26, v0 + mulhdu r10, r26, v0 + ld r28, 8(rp) + nop + mulld r24, r27, v0 + mulhdu r8, r27, v0 + ld r29, 16(rp) + nop + adde r0, r0, r12 + adde r24, r24, r10 + addze r8, r8 + addic. vn, vn, -1 + addc r0, r0, r28 + std r0, 8(rp) + adde r24, r24, r29 + std r24, 16(rp) + addze r8, r8 + std r8, 24(rp) + bne L(outer_lo_1) + b L(ret) + + + ALIGN(16) +L(b2): ld r27, 8(up) + addi up, up, -8 + addi rp, rp, -8 + li r12, 0 + addic r12, r12, 0 + + ALIGN(16) +L(lo_m_2): + mulld r0, r26, v0 + mulhdu r31, r26, v0 + ld r26, 24(up) + nop + mulld r24, r27, v0 + mulhdu r8, r27, v0 + ld r27, 32(up) + nop + adde r0, r0, r12 + adde r24, r24, r31 + mulld r9, r26, v0 + mulhdu r10, r26, v0 + ld r26, 40(up) + nop + mulld r11, r27, v0 + mulhdu r12, r27, v0 + ld r27, 48(up) + std r0, 8(rp) + adde r9, r9, r8 + std r24, 16(rp) + adde r11, r11, r10 + std r9, 24(rp) + addi up, up, 32 + std r11, 32(rp) + + addi rp, rp, 32 + bdnz L(lo_m_2) + + ALIGN(16) +L(end_m_2): + mulld r0, r26, v0 + mulhdu r31, r26, v0 + + mulld r24, r27, v0 + mulhdu r8, r27, v0 + + adde r0, r0, r12 + adde r24, r24, r31 + + std r0, 8(rp) + addze r8, r8 + std r24, 16(rp) + addic. 
vn, vn, -1 + std r8, 24(rp) + nop + beq L(ret) + + ALIGN(16) +L(outer_lo_2): + mtctr un C copy inner loop count into ctr + addi rp, outer_rp, 0 + addi up, outer_up, -8 + addi outer_rp, outer_rp, 8 + ld v0, 0(vp) C new v limb + addi vp, vp, 8 + ld r26, 8(up) + ld r27, 16(up) + li r12, 0 + addic r12, r12, 0 + + ALIGN(16) C registers dying +L(lo_2): + mulld r0, r26, v0 C + mulhdu r10, r26, v0 C 26 + ld r26, 24(up) C + ld r28, 8(rp) C + mulld r24, r27, v0 C + mulhdu r8, r27, v0 C 27 + ld r27, 32(up) C + ld r29, 16(rp) C + adde r0, r0, r12 C 0 12 + adde r24, r24, r10 C 24 10 + mulld r9, r26, v0 C + mulhdu r10, r26, v0 C 26 + ld r26, 40(up) C + ld r30, 24(rp) C + mulld r11, r27, v0 C + mulhdu r12, r27, v0 C 27 + ld r27, 48(up) C + ld r31, 32(rp) C + adde r9, r9, r8 C 8 9 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 8(rp) C 0 + adde r24, r24, r29 C 7 29 + std r24, 16(rp) C 7 + adde r9, r9, r30 C 9 30 + std r9, 24(rp) C 9 + adde r11, r11, r31 C 11 31 + std r11, 32(rp) C 11 + addi up, up, 32 C + addi rp, rp, 32 C + bdnz L(lo_2) C + + ALIGN(16) +L(end_2): + mulld r0, r26, v0 + mulhdu r10, r26, v0 + ld r28, 8(rp) + nop + mulld r24, r27, v0 + mulhdu r8, r27, v0 + ld r29, 16(rp) + nop + adde r0, r0, r12 + adde r24, r24, r10 + addze r8, r8 + addic. vn, vn, -1 + addc r0, r0, r28 + std r0, 8(rp) + adde r24, r24, r29 + std r24, 16(rp) + addze r8, r8 + std r8, 24(rp) + bne L(outer_lo_2) + b L(ret) + + +L(ret): ld r31, -8(r1) + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + ld r26, -48(r1) + ld r25, -56(r1) + ld r24, -64(r1) + ld r23, -72(r1) + ld r22, -80(r1) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p3/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p3/gmp-mparam.h new file mode 100644 index 0000000..61a437b --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p3/gmp-mparam.h @@ -0,0 +1,179 @@ +/* POWER3/PowerPC630 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2008-2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 18 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 17 +#define USE_PREINV_DIVREM_1 0 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 10 +#define MUL_TOOM33_THRESHOLD 33 +#define MUL_TOOM44_THRESHOLD 46 +#define MUL_TOOM6H_THRESHOLD 77 +#define MUL_TOOM8H_THRESHOLD 139 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 49 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 47 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 49 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 49 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 34 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 14 +#define SQR_TOOM3_THRESHOLD 45 +#define SQR_TOOM4_THRESHOLD 64 +#define SQR_TOOM6_THRESHOLD 85 +#define SQR_TOOM8_THRESHOLD 139 + +#define MULMID_TOOM42_THRESHOLD 22 + +#define MULMOD_BNM1_THRESHOLD 8 +#define SQRMOD_BNM1_THRESHOLD 10 + +#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 220, 5}, { 9, 6}, { 5, 5}, { 11, 6}, \ + { 13, 7}, { 7, 6}, { 15, 7}, { 13, 8}, \ + { 7, 7}, { 15, 8}, { 13, 9}, { 7, 8}, \ + { 19, 9}, { 11, 8}, { 23,10}, { 7, 9}, \ + { 15, 8}, { 33, 9}, { 23,10}, { 15, 9}, \ + { 35, 8}, { 71,10}, { 23, 9}, { 47,11}, \ + { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 79,10}, { 55,11}, { 31,10}, { 63, 9}, \ + { 127,10}, { 71, 9}, { 143, 8}, { 287,10}, \ + { 79,11}, { 47,10}, { 95, 9}, { 191,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511,10}, { 143, 9}, { 287,11}, { 79,10}, \ + { 159, 9}, { 319, 8}, { 639,10}, { 175, 9}, \ + { 351,11}, { 95,10}, { 191, 9}, { 383,11}, \ + { 111,10}, { 223,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \ + { 575,11}, { 159,10}, { 319, 9}, { 639,11}, \ + { 175,10}, { 351,12}, { 95,11}, { 191,10}, \ + { 383, 9}, { 767,11}, { 223,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 287,10}, \ + { 575, 9}, { 1151,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 767,12}, { 223,11}, { 447,10}, { 895,13}, \ + { 127,12}, { 255,11}, { 511,12}, { 287,11}, \ + { 575,10}, { 1151,12}, { 319,11}, { 639,12}, \ + { 351,11}, { 703,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 415,11}, { 831,10}, { 1663,12}, \ + { 447,11}, { 895,14}, { 16384,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 120 +#define MUL_FFT_THRESHOLD 2688 + +#define SQR_FFT_MODF_THRESHOLD 188 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 188, 5}, { 9, 6}, { 5, 5}, { 11, 6}, \ + { 13, 7}, { 13, 8}, { 7, 7}, { 16, 8}, \ + { 9, 7}, { 19, 8}, { 13, 9}, { 7, 8}, \ + { 19, 9}, { 11, 8}, { 23,10}, { 7, 9}, \ + { 15, 8}, { 31, 9}, { 19, 8}, { 39, 9}, \ + { 23,10}, { 15, 9}, { 39,10}, { 23,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79, 8}, { 159,10}, { 47, 9}, { 95, 8}, \ + { 191,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255,10}, { 71, 9}, { 143, 8}, { 287,10}, \ + { 79, 9}, { 159,11}, { 47,10}, { 95, 9}, \ + { 191,12}, { 31,11}, { 63,10}, { 127, 9}, \ + { 255, 8}, { 511,10}, { 143, 9}, { 287,11}, \ + { 79,10}, { 159, 9}, { 319, 8}, { 
639,10}, \ + { 175,11}, { 95,10}, { 191, 9}, { 383,11}, \ + { 111,10}, { 223,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \ + { 575,11}, { 159,10}, { 319, 9}, { 639,11}, \ + { 175,12}, { 95,11}, { 191,10}, { 383, 9}, \ + { 767,11}, { 223,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 287,10}, { 575,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 767,12}, { 223,11}, \ + { 447,10}, { 895,13}, { 127,12}, { 255,11}, \ + { 511,12}, { 287,11}, { 575,10}, { 1151,12}, \ + { 319,11}, { 639,12}, { 351,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 447,11}, { 895,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 118 +#define SQR_FFT_THRESHOLD 1728 + +#define MULLO_BASECASE_THRESHOLD 2 +#define MULLO_DC_THRESHOLD 27 +#define MULLO_MUL_N_THRESHOLD 2511 + +#define DC_DIV_QR_THRESHOLD 23 +#define DC_DIVAPPR_Q_THRESHOLD 87 +#define DC_BDIV_QR_THRESHOLD 27 +#define DC_BDIV_Q_THRESHOLD 60 + +#define INV_MULMOD_BNM1_THRESHOLD 27 +#define INV_NEWTON_THRESHOLD 91 +#define INV_APPR_THRESHOLD 91 + +#define BINV_NEWTON_THRESHOLD 115 +#define REDC_1_TO_REDC_N_THRESHOLD 31 + +#define MU_DIV_QR_THRESHOLD 551 +#define MU_DIVAPPR_Q_THRESHOLD 551 +#define MUPI_DIV_QR_THRESHOLD 42 +#define MU_BDIV_QR_THRESHOLD 483 +#define MU_BDIV_Q_THRESHOLD 492 + +#define POWM_SEC_TABLE 2,23,140,556,713,746 + +#define MATRIX22_STRASSEN_THRESHOLD 8 +#define HGCD_THRESHOLD 56 +#define HGCD_APPR_THRESHOLD 51 +#define HGCD_REDUCE_THRESHOLD 688 +#define GCD_DC_THRESHOLD 333 +#define GCDEXT_DC_THRESHOLD 126 +#define JACOBI_BASE_METHOD 1 + +#define GET_STR_DC_THRESHOLD 17 +#define GET_STR_PRECOMPUTE_THRESHOLD 28 +#define SET_STR_DC_THRESHOLD 375 +#define SET_STR_PRECOMPUTE_THRESHOLD 812 + +#define FAC_DSC_THRESHOLD 351 +#define FAC_ODD_THRESHOLD 0 /* always */ diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p4/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p4/gmp-mparam.h new file mode 100644 index 0000000..3c40fb9 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p4/gmp-mparam.h @@ -0,0 +1,214 @@ +/* POWER4/PowerPC970 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2008-2010, 2014, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 1800 MHz PPC970 */ +/* FFT tuning limit = 15 M */ +/* Generated by tuneup.c, 2015-10-09, gcc 4.0 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 16 +#define USE_PREINV_DIVREM_1 0 +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 35 + +#define DIV_1_VS_MUL_1_PERCENT 218 + +#define MUL_TOOM22_THRESHOLD 14 +#define MUL_TOOM33_THRESHOLD 53 +#define MUL_TOOM44_THRESHOLD 136 +#define MUL_TOOM6H_THRESHOLD 197 +#define MUL_TOOM8H_THRESHOLD 272 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 90 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 76 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 22 +#define SQR_TOOM3_THRESHOLD 73 +#define SQR_TOOM4_THRESHOLD 202 +#define SQR_TOOM6_THRESHOLD 0 /* always */ +#define SQR_TOOM8_THRESHOLD 430 + +#define MULMID_TOOM42_THRESHOLD 34 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 13 + +#define MUL_FFT_MODF_THRESHOLD 444 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 444, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 10, 5}, { 21, 6}, { 13, 5}, { 28, 6}, \ + { 19, 7}, { 10, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 49, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 55,11}, \ + { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 83,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 63, 9}, { 127,10}, { 87,11}, \ + { 47,10}, { 103,12}, { 31,11}, { 63,10}, \ + { 135, 9}, { 271,11}, { 79,10}, { 159, 9}, \ + { 319,10}, { 167,11}, { 95, 9}, { 383, 8}, \ + { 767,10}, { 199,11}, { 111,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,11}, { 143,10}, { 287, 9}, { 575,10}, \ + { 303, 9}, { 607,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335, 9}, { 671,10}, { 351,12}, \ + { 95,10}, { 383, 9}, { 767,10}, { 415, 9}, \ + { 831,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,10}, { 607,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 335,10}, { 671,11}, \ + { 351,10}, { 703,11}, { 383,10}, { 767,11}, \ + { 415,10}, { 831,12}, { 223,10}, { 895,13}, \ + { 127,12}, { 255,11}, { 543,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 671,12}, { 351,11}, \ + { 703,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 895,14}, { 127,13}, { 255,12}, { 607,13}, \ + { 319,12}, { 703,13}, { 383,12}, { 895,14}, \ + { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \ + { 1151,13}, { 703,14}, { 383,13}, { 895,15}, \ + { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1087,12}, { 2175,13}, { 1151,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1407,14}, { 767,13}, \ + { 1663,14}, { 895,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 159 +#define MUL_FFT_THRESHOLD 9088 + +#define 
SQR_FFT_MODF_THRESHOLD 344 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 344, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ + { 9, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 13, 5}, { 28, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 12, 6}, { 25, 7}, { 14, 6}, \ + { 29, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95, 9}, { 191,10}, { 103,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511,10}, { 135, 9}, { 271, 8}, { 543,11}, \ + { 79, 9}, { 319, 8}, { 639,11}, { 95,10}, \ + { 191, 9}, { 383, 8}, { 767,10}, { 207, 9}, \ + { 415,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287, 9}, \ + { 575,10}, { 303, 9}, { 607,10}, { 319, 9}, \ + { 639,10}, { 335,11}, { 175,10}, { 351, 9}, \ + { 703,11}, { 191,10}, { 383, 9}, { 767,11}, \ + { 207,10}, { 415, 9}, { 831,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543,11}, { 287,10}, { 575,11}, { 303,10}, \ + { 607,11}, { 319,10}, { 639,11}, { 335,10}, \ + { 671,11}, { 351,10}, { 703,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,10}, { 831,12}, \ + { 223,10}, { 895,11}, { 479,13}, { 127,12}, \ + { 255,11}, { 543,12}, { 287,11}, { 607,12}, \ + { 319,11}, { 671,12}, { 351,11}, { 703,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,10}, { 1663,11}, { 895,12}, { 479,14}, \ + { 127,13}, { 255,12}, { 607,13}, { 319,12}, \ + { 703,13}, { 383,12}, { 831,11}, { 1663,12}, \ + { 927,14}, { 255,13}, { 511,12}, { 1023,13}, \ + { 575,12}, { 1151,13}, { 639,12}, { 1279,13}, \ + { 703,14}, { 383,13}, { 895,15}, { 255,14}, \ + { 511,13}, { 1023,12}, { 2175,13}, { 1151,12}, \ + { 2303,13}, { 1215,14}, { 639,13}, { 1343,12}, \ + { 2687,13}, { 1407,14}, { 767,13}, { 1663,14}, \ + { 895,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 174 +#define SQR_FFT_THRESHOLD 6272 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 43 +#define MULLO_MUL_N_THRESHOLD 18087 +#define SQRLO_BASECASE_THRESHOLD 2 +#define SQRLO_DC_THRESHOLD 79 +#define SQRLO_SQR_THRESHOLD 12322 + +#define DC_DIV_QR_THRESHOLD 42 +#define DC_DIVAPPR_Q_THRESHOLD 159 +#define DC_BDIV_QR_THRESHOLD 46 +#define DC_BDIV_Q_THRESHOLD 110 + +#define INV_MULMOD_BNM1_THRESHOLD 26 +#define INV_NEWTON_THRESHOLD 177 +#define INV_APPR_THRESHOLD 165 + +#define BINV_NEWTON_THRESHOLD 198 +#define REDC_1_TO_REDC_N_THRESHOLD 56 + +#define MU_DIV_QR_THRESHOLD 1017 +#define MU_DIVAPPR_Q_THRESHOLD 1142 +#define MUPI_DIV_QR_THRESHOLD 90 +#define MU_BDIV_QR_THRESHOLD 924 +#define MU_BDIV_Q_THRESHOLD 1017 + +#define POWM_SEC_TABLE 7,17,86,579,1925 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 23 +#define SET_STR_DC_THRESHOLD 788 +#define SET_STR_PRECOMPUTE_THRESHOLD 1713 + +#define FAC_DSC_THRESHOLD 512 +#define FAC_ODD_THRESHOLD 25 + +#define MATRIX22_STRASSEN_THRESHOLD 10 +#define HGCD_THRESHOLD 113 +#define HGCD_APPR_THRESHOLD 115 +#define HGCD_REDUCE_THRESHOLD 4633 +#define GCD_DC_THRESHOLD 330 +#define GCDEXT_DC_THRESHOLD 242 +#define JACOBI_BASE_METHOD 4 diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p5/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p5/gmp-mparam.h new file mode 100644 index 
0000000..15b009c --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p5/gmp-mparam.h @@ -0,0 +1,219 @@ +/* POWER5 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* POWER5 (friggms.hpc.ntnu.no) */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 15 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 +#define USE_PREINV_DIVREM_1 0 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 40 + +#define MUL_TOOM22_THRESHOLD 21 +#define MUL_TOOM33_THRESHOLD 24 +#define MUL_TOOM44_THRESHOLD 70 +#define MUL_TOOM6H_THRESHOLD 262 +#define MUL_TOOM8H_THRESHOLD 393 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 49 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 126 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 85 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 94 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 70 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 24 +#define SQR_TOOM3_THRESHOLD 81 +#define SQR_TOOM4_THRESHOLD 142 +#define SQR_TOOM6_THRESHOLD 189 +#define SQR_TOOM8_THRESHOLD 284 + +#define MULMID_TOOM42_THRESHOLD 36 + +#define MULMOD_BNM1_THRESHOLD 12 +#define SQRMOD_BNM1_THRESHOLD 15 + +#define MUL_FFT_MODF_THRESHOLD 304 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 348, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 10, 5}, { 21, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 135,11}, { 79,10}, { 159, 9}, { 319,11}, \ + { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 319,12}, \ + { 95,11}, { 191,10}, { 383,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543,11}, { 287,10}, { 575, 9}, { 1151,11}, \ + { 319,10}, { 639,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ 
+ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,10}, { 1087,12}, \ + { 287,11}, { 575,10}, { 1151,12}, { 319,11}, \ + { 639,12}, { 351,11}, { 703,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,11}, { 895,14}, { 127,13}, { 255,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,10}, \ + { 2175,12}, { 575,11}, { 1151,12}, { 607,13}, \ + { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \ + { 1343,12}, { 703,11}, { 1407,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 831,13}, { 447,12}, \ + { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \ + { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \ + { 2431,10}, { 4863,13}, { 639,12}, { 1343,13}, \ + { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1663,13}, { 959,12}, \ + { 1919,11}, { 3839,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1215,12}, { 2431,11}, \ + { 4863,14}, { 639,13}, { 1343,12}, { 2687,13}, \ + { 1407,12}, { 2815,13}, { 1471,12}, { 2943,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1663,14}, \ + { 895,13}, { 1919,12}, { 3839,15}, { 511,14}, \ + { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2943,15}, { 767,14}, { 1535,13}, { 3199,14}, \ + { 1663,13}, { 3327,14}, { 1919,13}, { 3839,16}, \ + { 511,15}, { 1023,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,12}, { 11775,15}, { 1535,14}, \ + { 3327,15}, { 1791,14}, { 16384,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 208 +#define MUL_FFT_THRESHOLD 4224 + +#define SQR_FFT_MODF_THRESHOLD 284 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 272, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ + { 19, 7}, { 17, 8}, { 9, 7}, { 21, 8}, \ + { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \ + { 15,10}, { 31, 9}, { 63,10}, { 47,11}, \ + { 31,10}, { 71, 9}, { 143,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \ + { 143,11}, { 79,10}, { 159, 9}, { 319,10}, \ + { 175, 9}, { 351,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207, 9}, { 415,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \ + { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319,11}, { 175,10}, { 351,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207,10}, { 415,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,12}, { 223,11}, { 447,10}, { 895,11}, \ + { 479,10}, { 959,12}, { 255,11}, { 511,10}, \ + { 1023,11}, { 543,12}, { 287,11}, { 575,12}, \ + { 319,11}, { 639,12}, { 351,11}, { 703,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,12}, { 447,11}, { 895,12}, { 479,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1023,12}, \ + { 543,11}, { 1087,12}, { 575,13}, { 319,12}, \ + { 639,11}, { 1279,12}, { 703,11}, { 1407,13}, \ + { 383,12}, { 831,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1215,13}, { 639,12}, { 1279,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 831,12}, { 1663,13}, \ + { 959,12}, { 1919,15}, { 255,14}, { 511,13}, \ + { 1023,12}, { 2047,13}, { 1087,12}, { 2175,13}, \ + { 1215,14}, { 639,13}, { 1407,12}, { 2815,14}, \ + { 767,13}, { 1663,14}, { 895,13}, 
{ 1919,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2431,12}, { 4863,14}, { 1407,13}, { 2815,15}, \ + { 767,14}, { 1663,13}, { 3327,14}, { 1919,13}, \ + { 3839,16}, { 511,15}, { 1023,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,12}, \ + { 11775,15}, { 1535,14}, { 3327,15}, { 1791,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 190 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 6 +#define MULLO_DC_THRESHOLD 60 +#define MULLO_MUL_N_THRESHOLD 7463 + +#define DC_DIV_QR_THRESHOLD 58 +#define DC_DIVAPPR_Q_THRESHOLD 232 +#define DC_BDIV_QR_THRESHOLD 78 +#define DC_BDIV_Q_THRESHOLD 238 + +#define INV_MULMOD_BNM1_THRESHOLD 92 +#define INV_NEWTON_THRESHOLD 155 +#define INV_APPR_THRESHOLD 157 + +#define BINV_NEWTON_THRESHOLD 155 +#define REDC_1_TO_REDC_N_THRESHOLD 61 + +#define MU_DIV_QR_THRESHOLD 998 +#define MU_DIVAPPR_Q_THRESHOLD 979 +#define MUPI_DIV_QR_THRESHOLD 79 +#define MU_BDIV_QR_THRESHOLD 823 +#define MU_BDIV_Q_THRESHOLD 942 + +#define MATRIX22_STRASSEN_THRESHOLD 14 +#define HGCD_THRESHOLD 74 +#define HGCD_APPR_THRESHOLD 155 +#define HGCD_REDUCE_THRESHOLD 2479 +#define GCD_DC_THRESHOLD 351 +#define GCDEXT_DC_THRESHOLD 288 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 21 +#define SET_STR_DC_THRESHOLD 650 +#define SET_STR_PRECOMPUTE_THRESHOLD 1585 + +#define FAC_DSC_THRESHOLD 662 +#define FAC_ODD_THRESHOLD 28 diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p6/aorsmul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p6/aorsmul_1.asm new file mode 100644 index 0000000..c572b91 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p6/aorsmul_1.asm @@ -0,0 +1,185 @@ +dnl PowerPC-64 mpn_addmul_1 and mpn_submul_1 optimised for power6. + +dnl Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C mpn_addmul_1 mpn_submul_1 +C cycles/limb cycles/limb +C POWER3/PPC630 ? ? +C POWER4/PPC970 ? ? +C POWER5 ? ? +C POWER6 12.25 12.8 +C POWER7 ? ? + +C TODO +C * Reduce register usage. +C * Schedule function entry code. +C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling +C would bring us to 9 c/l. +C * Handle n = 1 and perhaps n = 2 separately, without saving any registers. 
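
The two entry points below share one body through the ADDSUB/ADDSUBC/CLRRSC
macros selected by OPERATION_addmul_1 or OPERATION_submul_1. The semantics
being implemented, as a C sketch (illustrative name; GCC/Clang's unsigned
__int128; the accumulation cannot overflow, since
(2^64-1)^2 + 2*(2^64-1) = 2^128 - 1):

#include <stdint.h>
#include <stddef.h>

/* rp[] += up[] * v0, returning the limb carried out (mpn_addmul_1) */
uint64_t addmul_1_sketch(uint64_t *rp, const uint64_t *up,
                         size_t n, uint64_t v0)
{
    uint64_t cy = 0;
    for (size_t i = 0; i < n; i++)
    {
        unsigned __int128 t = (unsigned __int128)up[i] * v0 + rp[i] + cy;
        rp[i] = (uint64_t)t;
        cy = (uint64_t)(t >> 64);
    }
    return cy;
}

/* mpn_submul_1 is the same loop with the low limb subtracted from rp[i]
   and a borrow propagated instead of a carry */
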
+ +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`v0', `r6') + +ifdef(`OPERATION_addmul_1',` + define(ADDSUBC, adde) + define(ADDSUB, addc) + define(func, mpn_addmul_1) + define(func_nc, mpn_addmul_1c) C FIXME: not really supported + define(AM, `$1') + define(SM, `') + define(CLRRSC, `addic $1, r0, 0') +') +ifdef(`OPERATION_submul_1',` + define(ADDSUBC, subfe) + define(ADDSUB, subfc) + define(func, mpn_submul_1) + define(func_nc, mpn_submul_1c) C FIXME: not really supported + define(AM, `') + define(SM, `$1') + define(CLRRSC, `subfc $1, r0, r0') +') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() +PROLOGUE(func) + std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + std r27, -40(r1) + + rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi n, n, 3 C compute count... + srdi n, n, 2 C ...for ctr + mtctr n C copy loop count into ctr + beq cr0, L(b0) + blt cr6, L(b1) + beq cr6, L(b2) + +L(b3): ld r8, 0(up) + ld r7, 8(up) + ld r27, 16(up) + addi up, up, 16 + addi rp, rp, 16 + mulld r5, r8, v0 + mulhdu r8, r8, v0 + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r29, -16(rp) + ld r30, -8(rp) + ld r31, 0(rp) + addc r9, r9, r8 + adde r11, r11, r7 + addze r12, r27 + ADDSUB r5, r5, r29 + b L(l3) + +L(b2): ld r7, 0(up) + ld r27, 8(up) + addi up, up, 8 + addi rp, rp, 8 + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r30, -8(rp) + ld r31, 0(rp) + addc r11, r11, r7 + addze r12, r27 + ADDSUB r9, r9, r30 + b L(l2) + +L(b1): ld r27, 0(up) + ld r31, 0(rp) + mulld r11, r27, v0 + mulhdu r12, r27, v0 + ADDSUB r11, r11, r31 + b L(l1) + +L(b0): addi up, up, -8 + addi rp, rp, -8 + CLRRSC( r12) C clear r12 and clr/set cy + + ALIGN(32) +L(top): +SM(` subfe r11, r0, r0') C complement... +SM(` addic r11, r11, 1') C ...carry flag + ld r10, 8(up) + ld r8, 16(up) + ld r7, 24(up) + ld r27, 32(up) + addi up, up, 32 + addi rp, rp, 32 + mulld r0, r10, v0 + mulhdu r10, r10, v0 + mulld r5, r8, v0 + mulhdu r8, r8, v0 + mulld r9, r7, v0 + mulhdu r7, r7, v0 + mulld r11, r27, v0 + mulhdu r27, r27, v0 + ld r28, -24(rp) + adde r0, r0, r12 + ld r29, -16(rp) + adde r5, r5, r10 + ld r30, -8(rp) + ld r31, 0(rp) + adde r9, r9, r8 + adde r11, r11, r7 + addze r12, r27 + ADDSUB r0, r0, r28 + std r0, -24(rp) + ADDSUBC r5, r5, r29 +L(l3): std r5, -16(rp) + ADDSUBC r9, r9, r30 +L(l2): std r9, -8(rp) + ADDSUBC r11, r11, r31 +L(l1): std r11, 0(rp) + bdnz L(top) + +AM(` addze r3, r12') +SM(` subfe r11, r0, r0') C complement... + ld r31, -8(r1) +SM(` subf r3, r11, r12') + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p6/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p6/gmp-mparam.h new file mode 100644 index 0000000..c7e2f89 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p6/gmp-mparam.h @@ -0,0 +1,160 @@ +/* POWER6 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2003, 2009-2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3500 MHz POWER6 (kolga.bibsys.no) */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 6 +#define USE_PREINV_DIVREM_1 0 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 21 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 50 +#define MUL_TOOM44_THRESHOLD 106 +#define MUL_TOOM6H_THRESHOLD 274 +#define MUL_TOOM8H_THRESHOLD 339 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 62 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 76 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 66 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 88 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 24 +#define SQR_TOOM3_THRESHOLD 49 +#define SQR_TOOM4_THRESHOLD 130 +#define SQR_TOOM6_THRESHOLD 226 +#define SQR_TOOM8_THRESHOLD 272 + +#define MULMID_TOOM42_THRESHOLD 36 + +#define MULMOD_BNM1_THRESHOLD 14 +#define SQRMOD_BNM1_THRESHOLD 14 + +#define MUL_FFT_MODF_THRESHOLD 380 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \ + { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \ + { 21, 9}, { 11, 8}, { 25, 9}, { 15, 8}, \ + { 33, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \ + { 15,10}, { 31, 9}, { 63,10}, { 47,11}, \ + { 31,10}, { 71,11}, { 47,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \ + { 135, 9}, { 271,11}, { 79, 9}, { 319, 8}, \ + { 639,10}, { 175,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207,12}, { 63,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303, 9}, { 607,10}, \ + { 319, 9}, { 639,11}, { 175,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207,10}, { 415,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 79 +#define MUL_FFT_THRESHOLD 3520 + +#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 280, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \ + { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \ + { 21, 9}, { 11, 8}, { 25, 9}, { 15, 8}, \ + { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \ + { 63,10}, { 47,11}, { 31,10}, { 71, 9}, \ + { 143,11}, { 47,12}, { 31,11}, { 63, 9}, \ + { 255, 8}, { 511, 9}, { 271,10}, { 143,11}, \ + { 
79,10}, { 159, 9}, { 319,10}, { 175, 9}, \ + { 351,11}, { 95,10}, { 191, 9}, { 383,10}, \ + { 207,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511, 8}, { 1023,10}, { 271, 9}, { 543,11}, \ + { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319, 9}, { 639,11}, { 175,10}, { 351,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415,13}, { 8192,14}, { 16384,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 80 +#define SQR_FFT_THRESHOLD 2752 + +#define MULLO_BASECASE_THRESHOLD 5 +#define MULLO_DC_THRESHOLD 62 +#define MULLO_MUL_N_THRESHOLD 2995 + +#define DC_DIV_QR_THRESHOLD 59 +#define DC_DIVAPPR_Q_THRESHOLD 200 +#define DC_BDIV_QR_THRESHOLD 70 +#define DC_BDIV_Q_THRESHOLD 168 + +#define INV_MULMOD_BNM1_THRESHOLD 53 +#define INV_NEWTON_THRESHOLD 170 +#define INV_APPR_THRESHOLD 166 + +#define BINV_NEWTON_THRESHOLD 220 +#define REDC_1_TO_REDC_N_THRESHOLD 67 + +#define MU_DIV_QR_THRESHOLD 998 +#define MU_DIVAPPR_Q_THRESHOLD 942 +#define MUPI_DIV_QR_THRESHOLD 57 +#define MU_BDIV_QR_THRESHOLD 889 +#define MU_BDIV_Q_THRESHOLD 1078 + +#define POWM_SEC_TABLE 4,26,216,804,1731 + +#define MATRIX22_STRASSEN_THRESHOLD 13 +#define HGCD_THRESHOLD 106 +#define HGCD_APPR_THRESHOLD 109 +#define HGCD_REDUCE_THRESHOLD 2205 +#define GCD_DC_THRESHOLD 492 +#define GCDEXT_DC_THRESHOLD 327 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 16 +#define GET_STR_PRECOMPUTE_THRESHOLD 28 +#define SET_STR_DC_THRESHOLD 537 +#define SET_STR_PRECOMPUTE_THRESHOLD 1576 + +#define FAC_DSC_THRESHOLD 426 +#define FAC_ODD_THRESHOLD 0 /* always */ diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p6/mul_basecase.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p6/mul_basecase.asm new file mode 100644 index 0000000..3d32b46 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p6/mul_basecase.asm @@ -0,0 +1,589 @@ +dnl PowerPC-64 mpn_mul_basecase. + +dnl Copyright 1999-2001, 2003-2006, 2008, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 ? +C POWER6 12.25 + +C TODO +C * Reduce register usage. At least 4 register less can be used. +C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling +C would bring us to 9 c/l. +C * The bdz insns for b1 and b2 will never branch, +C * Align things better, perhaps by moving things like pointer updates from +C before to after loops. 
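For orientation before the code: mpn_mul_basecase computes the full (un+vn)-limb
product {rp, un+vn} = {up, un} x {vp, vn} by the schoolbook method, and the
b0/b1/b2/b3 paths below are just the un mod 4 feed-ins to the 4-way unrolled
inner loops.  A rough portable model of the semantics (a sketch, not the GMP
code; uint64_t stands in for the 64-bit mp_limb_t):

#include <stdint.h>
#include <stddef.h>

typedef unsigned __int128 u128;   /* for 64x64 -> 128 bit products */

/* Schoolbook multiply: rp[0 .. un+vn-1] = {up,un} * {vp,vn}.
   Assumes un >= vn >= 1 and that rp overlaps neither up nor vp. */
static void
mul_basecase (uint64_t *rp, const uint64_t *up, size_t un,
              const uint64_t *vp, size_t vn)
{
  for (size_t i = 0; i < un + vn; i++)
    rp[i] = 0;
  for (size_t j = 0; j < vn; j++)          /* one pass per v limb */
    {
      uint64_t cy = 0;
      for (size_t i = 0; i < un; i++)
        {
          u128 t = (u128) up[i] * vp[j] + rp[i + j] + cy;
          rp[i + j] = (uint64_t) t;        /* low half stays in place */
          cy = (uint64_t) (t >> 64);       /* high half carries on */
        }
      rp[j + un] = cy;                     /* carry out of this pass */
    }
}

The asm avoids the zeroing pass: the first pass (the L(lo_m_*) loops) writes
rp outright, and only the later L(outer_lo_*) passes read-modify-write rp.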
+ +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`un', `r5') +define(`vp', `r6') +define(`vn', `r7') + +define(`v0', `r25') +define(`outer_rp', `r22') +define(`outer_up', `r23') + +ASM_START() +PROLOGUE(mpn_mul_basecase) + +C Special code for un <= 2, for efficiency of these important cases, +C and since it simplifies the default code. + cmpdi cr0, un, 2 + bgt cr0, L(un_gt2) + cmpdi cr6, vn, 1 + ld r7, 0(vp) + ld r5, 0(up) + mulld r8, r5, r7 C weight 0 + mulhdu r9, r5, r7 C weight 1 + std r8, 0(rp) + beq cr0, L(2x) + std r9, 8(rp) + blr + ALIGN(16) +L(2x): ld r0, 8(up) + mulld r8, r0, r7 C weight 1 + mulhdu r10, r0, r7 C weight 2 + addc r9, r9, r8 + addze r10, r10 + bne cr6, L(2x2) + std r9, 8(rp) + std r10, 16(rp) + blr + ALIGN(16) +L(2x2): ld r6, 8(vp) + nop + mulld r8, r5, r6 C weight 1 + mulhdu r11, r5, r6 C weight 2 + mulld r12, r0, r6 C weight 2 + mulhdu r0, r0, r6 C weight 3 + addc r9, r9, r8 + std r9, 8(rp) + adde r11, r11, r10 + addze r0, r0 + addc r11, r11, r12 + addze r0, r0 + std r11, 16(rp) + std r0, 24(rp) + blr + +L(un_gt2): + std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + std r27, -40(r1) + std r26, -48(r1) + std r25, -56(r1) + std r24, -64(r1) + std r23, -72(r1) + std r22, -80(r1) + std r21, -88(r1) + std r20, -96(r1) + + mr outer_rp, rp + mr outer_up, up + + ld v0, 0(vp) C new v limb + addi vp, vp, 8 + ld r26, 0(up) + + rldicl. r0, un, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi un, un, 4 C compute count... + srdi un, un, 2 C ...for ctr + mtctr un C copy inner loop count into ctr + beq cr0, L(b0) + blt cr6, L(b1) + beq cr6, L(b2) + + + ALIGN(16) +L(b3): + ld r27, 8(up) + ld r20, 16(up) + mulld r0, r26, v0 + mulhdu r31, r26, v0 + mulld r24, r27, v0 + mulhdu r8, r27, v0 + mulld r9, r20, v0 + mulhdu r10, r20, v0 + addc r24, r24, r31 + adde r9, r9, r8 + addze r12, r10 + std r0, 0(rp) + std r24, 8(rp) + std r9, 16(rp) + addi up, up, 16 + addi rp, rp, 16 + bdz L(end_m_3) + + ALIGN(32) +L(lo_m_3): + ld r26, 8(up) + ld r27, 16(up) + ld r20, 24(up) + ld r21, 32(up) + mulld r0, r26, v0 + mulhdu r31, r26, v0 + mulld r24, r27, v0 + mulhdu r8, r27, v0 + mulld r9, r20, v0 + mulhdu r27, r20, v0 + mulld r11, r21, v0 + mulhdu r26, r21, v0 + adde r0, r0, r12 + adde r24, r24, r31 + std r0, 8(rp) + adde r9, r9, r8 + std r24, 16(rp) + adde r11, r11, r27 + std r9, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + mr r12, r26 + bdnz L(lo_m_3) + + ALIGN(16) +L(end_m_3): + addze r12, r12 + addic. 
vn, vn, -1 + std r12, 8(rp) + beq L(ret) + + ALIGN(16) +L(outer_lo_3): + mtctr un C copy inner loop count into ctr + addi rp, outer_rp, 24 + addi up, outer_up, 16 + addi outer_rp, outer_rp, 8 + ld v0, 0(vp) C new v limb + addi vp, vp, 8 + ld r26, -16(up) + ld r27, -8(up) + ld r20, 0(up) + mulld r0, r26, v0 + mulhdu r31, r26, v0 + mulld r24, r27, v0 + mulhdu r8, r27, v0 + mulld r9, r20, v0 + mulhdu r10, r20, v0 + ld r28, -16(rp) + ld r29, -8(rp) + ld r30, 0(rp) + addc r24, r24, r31 + adde r9, r9, r8 + addze r12, r10 + addc r0, r0, r28 + std r0, -16(rp) + adde r24, r24, r29 + std r24, -8(rp) + adde r9, r9, r30 + std r9, 0(rp) + bdz L(end_3) + + ALIGN(32) C registers dying +L(lo_3): + ld r26, 8(up) + ld r27, 16(up) + ld r20, 24(up) C + ld r21, 32(up) C + addi up, up, 32 C + addi rp, rp, 32 C + mulld r0, r26, v0 C + mulhdu r10, r26, v0 C 26 + mulld r24, r27, v0 C + mulhdu r8, r27, v0 C 27 + mulld r9, r20, v0 C + mulhdu r27, r20, v0 C 26 + mulld r11, r21, v0 C + mulhdu r26, r21, v0 C 27 + ld r28, -24(rp) C + adde r0, r0, r12 C 0 12 + ld r29, -16(rp) C + adde r24, r24, r10 C 24 10 + ld r30, -8(rp) C + ld r31, 0(rp) C + adde r9, r9, r8 C 8 9 + adde r11, r11, r27 C 27 11 + addze r12, r26 C 26 + addc r0, r0, r28 C 0 28 + std r0, -24(rp) C 0 + adde r24, r24, r29 C 7 29 + std r24, -16(rp) C 7 + adde r9, r9, r30 C 9 30 + std r9, -8(rp) C 9 + adde r11, r11, r31 C 11 31 + std r11, 0(rp) C 11 + bdnz L(lo_3) C + + ALIGN(16) +L(end_3): + addze r12, r12 + addic. vn, vn, -1 + std r12, 8(rp) + bne L(outer_lo_3) + b L(ret) + + + ALIGN(16) +L(b1): + mulld r0, r26, v0 + mulhdu r12, r26, v0 + addic r0, r0, 0 + std r0, 0(rp) + bdz L(end_m_1) + + ALIGN(16) +L(lo_m_1): + ld r26, 8(up) + ld r27, 16(up) + ld r20, 24(up) + ld r21, 32(up) + mulld r0, r26, v0 + mulhdu r31, r26, v0 + mulld r24, r27, v0 + mulhdu r8, r27, v0 + mulld r9, r20, v0 + mulhdu r27, r20, v0 + mulld r11, r21, v0 + mulhdu r26, r21, v0 + adde r0, r0, r12 + adde r24, r24, r31 + std r0, 8(rp) + adde r9, r9, r8 + std r24, 16(rp) + adde r11, r11, r27 + std r9, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + mr r12, r26 + bdnz L(lo_m_1) + + ALIGN(16) +L(end_m_1): + addze r12, r12 + addic. vn, vn, -1 + std r12, 8(rp) + beq L(ret) + + ALIGN(16) +L(outer_lo_1): + mtctr un C copy inner loop count into ctr + addi rp, outer_rp, 8 + mr up, outer_up + addi outer_rp, outer_rp, 8 + ld v0, 0(vp) C new v limb + addi vp, vp, 8 + ld r26, 0(up) + ld r28, 0(rp) + mulld r0, r26, v0 + mulhdu r12, r26, v0 + addc r0, r0, r28 + std r0, 0(rp) + bdz L(end_1) + + ALIGN(32) C registers dying +L(lo_1): + ld r26, 8(up) + ld r27, 16(up) + ld r20, 24(up) C + ld r21, 32(up) C + addi up, up, 32 C + addi rp, rp, 32 C + mulld r0, r26, v0 C + mulhdu r10, r26, v0 C 26 + mulld r24, r27, v0 C + mulhdu r8, r27, v0 C 27 + mulld r9, r20, v0 C + mulhdu r27, r20, v0 C 26 + mulld r11, r21, v0 C + mulhdu r26, r21, v0 C 27 + ld r28, -24(rp) C + adde r0, r0, r12 C 0 12 + ld r29, -16(rp) C + adde r24, r24, r10 C 24 10 + ld r30, -8(rp) C + ld r31, 0(rp) C + adde r9, r9, r8 C 8 9 + adde r11, r11, r27 C 27 11 + addze r12, r26 C 26 + addc r0, r0, r28 C 0 28 + std r0, -24(rp) C 0 + adde r24, r24, r29 C 7 29 + std r24, -16(rp) C 7 + adde r9, r9, r30 C 9 30 + std r9, -8(rp) C 9 + adde r11, r11, r31 C 11 31 + std r11, 0(rp) C 11 + bdnz L(lo_1) C + + ALIGN(16) +L(end_1): + addze r12, r12 + addic. 
vn, vn, -1 + std r12, 8(rp) + bne L(outer_lo_1) + b L(ret) + + + ALIGN(16) +L(b0): + addi up, up, -8 + addi rp, rp, -8 + li r12, 0 + addic r12, r12, 0 + bdz L(end_m_0) + + ALIGN(16) +L(lo_m_0): + ld r26, 8(up) + ld r27, 16(up) + ld r20, 24(up) + ld r21, 32(up) + mulld r0, r26, v0 + mulhdu r31, r26, v0 + mulld r24, r27, v0 + mulhdu r8, r27, v0 + mulld r9, r20, v0 + mulhdu r27, r20, v0 + mulld r11, r21, v0 + mulhdu r26, r21, v0 + adde r0, r0, r12 + adde r24, r24, r31 + std r0, 8(rp) + adde r9, r9, r8 + std r24, 16(rp) + adde r11, r11, r27 + std r9, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + mr r12, r26 + bdnz L(lo_m_0) + + ALIGN(16) +L(end_m_0): + addze r12, r12 + addic. vn, vn, -1 + std r12, 8(rp) + beq L(ret) + + ALIGN(16) +L(outer_lo_0): + mtctr un C copy inner loop count into ctr + addi rp, outer_rp, 0 + addi up, outer_up, -8 + addi outer_rp, outer_rp, 8 + ld v0, 0(vp) C new v limb + addi vp, vp, 8 + li r12, 0 + addic r12, r12, 0 + bdz L(end_0) + + ALIGN(32) C registers dying +L(lo_0): + ld r26, 8(up) + ld r27, 16(up) + ld r20, 24(up) C + ld r21, 32(up) C + addi up, up, 32 C + addi rp, rp, 32 C + mulld r0, r26, v0 C + mulhdu r10, r26, v0 C 26 + mulld r24, r27, v0 C + mulhdu r8, r27, v0 C 27 + mulld r9, r20, v0 C + mulhdu r27, r20, v0 C 26 + mulld r11, r21, v0 C + mulhdu r26, r21, v0 C 27 + ld r28, -24(rp) C + adde r0, r0, r12 C 0 12 + ld r29, -16(rp) C + adde r24, r24, r10 C 24 10 + ld r30, -8(rp) C + ld r31, 0(rp) C + adde r9, r9, r8 C 8 9 + adde r11, r11, r27 C 27 11 + addze r12, r26 C 26 + addc r0, r0, r28 C 0 28 + std r0, -24(rp) C 0 + adde r24, r24, r29 C 7 29 + std r24, -16(rp) C 7 + adde r9, r9, r30 C 9 30 + std r9, -8(rp) C 9 + adde r11, r11, r31 C 11 31 + std r11, 0(rp) C 11 + bdnz L(lo_0) C + + ALIGN(16) +L(end_0): + addze r12, r12 + addic. vn, vn, -1 + std r12, 8(rp) + bne L(outer_lo_0) + b L(ret) + + + ALIGN(16) +L(b2): ld r27, 8(up) + addi up, up, 8 + mulld r0, r26, v0 + mulhdu r10, r26, v0 + mulld r24, r27, v0 + mulhdu r8, r27, v0 + addc r24, r24, r10 + addze r12, r8 + std r0, 0(rp) + std r24, 8(rp) + addi rp, rp, 8 + bdz L(end_m_2) + + ALIGN(16) +L(lo_m_2): + ld r26, 8(up) + ld r27, 16(up) + ld r20, 24(up) + ld r21, 32(up) + mulld r0, r26, v0 + mulhdu r31, r26, v0 + mulld r24, r27, v0 + mulhdu r8, r27, v0 + mulld r9, r20, v0 + mulhdu r27, r20, v0 + mulld r11, r21, v0 + mulhdu r26, r21, v0 + adde r0, r0, r12 + adde r24, r24, r31 + std r0, 8(rp) + adde r9, r9, r8 + std r24, 16(rp) + adde r11, r11, r27 + std r9, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + mr r12, r26 + bdnz L(lo_m_2) + + ALIGN(16) +L(end_m_2): + addze r12, r12 + addic. 
vn, vn, -1 + std r12, 8(rp) + beq L(ret) + + ALIGN(16) +L(outer_lo_2): + mtctr un C copy inner loop count into ctr + addi rp, outer_rp, 16 + addi up, outer_up, 8 + addi outer_rp, outer_rp, 8 + ld v0, 0(vp) C new v limb + addi vp, vp, 8 + ld r26, -8(up) + ld r27, 0(up) + ld r28, -8(rp) + ld r29, 0(rp) + mulld r0, r26, v0 + mulhdu r10, r26, v0 + mulld r24, r27, v0 + mulhdu r8, r27, v0 + addc r24, r24, r10 + addze r12, r8 + addc r0, r0, r28 + std r0, -8(rp) + adde r24, r24, r29 + std r24, 0(rp) + bdz L(end_2) + + ALIGN(16) C registers dying +L(lo_2): + ld r26, 8(up) + ld r27, 16(up) + ld r20, 24(up) C + ld r21, 32(up) C + addi up, up, 32 C + addi rp, rp, 32 C + mulld r0, r26, v0 C + mulhdu r10, r26, v0 C 26 + mulld r24, r27, v0 C + mulhdu r8, r27, v0 C 27 + mulld r9, r20, v0 C + mulhdu r27, r20, v0 C 26 + mulld r11, r21, v0 C + mulhdu r26, r21, v0 C 27 + ld r28, -24(rp) C + adde r0, r0, r12 C 0 12 + ld r29, -16(rp) C + adde r24, r24, r10 C 24 10 + ld r30, -8(rp) C + ld r31, 0(rp) C + adde r9, r9, r8 C 8 9 + adde r11, r11, r27 C 27 11 + addze r12, r26 C 26 + addc r0, r0, r28 C 0 28 + std r0, -24(rp) C 0 + adde r24, r24, r29 C 7 29 + std r24, -16(rp) C 7 + adde r9, r9, r30 C 9 30 + std r9, -8(rp) C 9 + adde r11, r11, r31 C 11 31 + std r11, 0(rp) C 11 + bdnz L(lo_2) C + + ALIGN(16) +L(end_2): + addze r12, r12 + addic. vn, vn, -1 + std r12, 8(rp) + bne L(outer_lo_2) +C b L(ret) + +L(ret): ld r31, -8(r1) + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + ld r26, -48(r1) + ld r25, -56(r1) + ld r24, -64(r1) + ld r23, -72(r1) + ld r22, -80(r1) + ld r21, -88(r1) + ld r20, -96(r1) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/aormul_2.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aormul_2.asm new file mode 100644 index 0000000..8731e01 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aormul_2.asm @@ -0,0 +1,135 @@ +dnl PowerPC-64 mpn_mul_2 and mpn_addmul_2. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb +C mul_2 addmul_2 +C POWER3/PPC630 ? ? +C POWER4/PPC970 ? ? +C POWER5 ? ? +C POWER6 ? ? +C POWER7-SMT4 3 3 +C POWER7-SMT2 ? ? +C POWER7-SMT1 ? ? 
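Functionally, the addmul_2 entry adds {up,n} * (vp[0] + B*vp[1]) to {rp,n},
writes the n+1 low limbs of the result to rp[0..n], and returns the top limb;
mul_2 is the same with the rp addend dropped.  A portable sketch of the double
carry chain (the cy0/cy1 names deliberately mirror the registers defined
below; uint64_t models a limb, B = 2^64):

#include <stdint.h>
#include <stddef.h>

typedef unsigned __int128 u128;

/* {rp,n} += {up,n} * {vp[0],vp[1]}; rp[0..n] receives the n+1 low limbs of
   the result and the highest limb is returned.  Two carry limbs are live at
   once, one per weight, as cy0/cy1 are in the asm. */
static uint64_t
addmul_2 (uint64_t *rp, const uint64_t *up, size_t n, const uint64_t *vp)
{
  uint64_t cy0 = 0, cy1 = 0;
  for (size_t i = 0; i < n; i++)
    {
      u128 t0 = (u128) up[i] * vp[0] + rp[i] + cy0;               /* weight i   */
      u128 t1 = (u128) up[i] * vp[1] + (uint64_t) (t0 >> 64) + cy1; /* weight i+1 */
      rp[i] = (uint64_t) t0;
      cy0 = (uint64_t) t1;
      cy1 = (uint64_t) (t1 >> 64);
    }
  rp[n] = cy0;
  return cy1;
}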
+ +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`vp', `r6') + +define(`cy0', `r10') +ifdef(`EXTRA_REGISTER', +` define(`cy1', EXTRA_REGISTER)', +` define(`cy1', `r31')') + +ifdef(`OPERATION_mul_2',` + define(`AM', `') + define(`ADDX', `addc') + define(`func', `mpn_mul_2') +') +ifdef(`OPERATION_addmul_2',` + define(`AM', `$1') + define(`ADDX', `adde') + define(`func', `mpn_addmul_2') +') + +MULFUNC_PROLOGUE(mpn_mul_2 mpn_addmul_2) + +ASM_START() +PROLOGUE(func) + +ifdef(`EXTRA_REGISTER',,` + std r31, -8(r1) +') + andi. r12, n, 1 + addi r0, n, 1 + srdi r0, r0, 1 + mtctr r0 + ld r11, 0(vp) C v0 + li cy0, 0 + ld r12, 8(vp) C v1 + li cy1, 0 + ld r5, 0(up) + beq L(lo0) + addi up, up, -8 + addi rp, rp, -8 + b L(lo1) + + ALIGN(32) +L(top): +AM(` ld r0, -8(rp)') + ld r5, 0(up) +AM(` addc r6, r6, r0') + ADDX r7, r7, r8 + addze r9, r9 + addc r6, r6, cy0 + adde cy0, r7, cy1 + std r6, -8(rp) + addze cy1, r9 +L(lo0): mulld r6, r11, r5 C v0 * u[i] weight 0 + mulhdu r7, r11, r5 C v0 * u[i] weight 1 + mulld r8, r12, r5 C v1 * u[i] weight 1 + mulhdu r9, r12, r5 C v1 * u[i] weight 2 +AM(` ld r0, 0(rp)') + ld r5, 8(up) +AM(` addc r6, r6, r0') + ADDX r7, r7, r8 + addze r9, r9 + addc r6, r6, cy0 + adde cy0, r7, cy1 + std r6, 0(rp) + addze cy1, r9 +L(lo1): mulld r6, r11, r5 C v0 * u[i] weight 0 + mulhdu r7, r11, r5 C v0 * u[i] weight 1 + addi up, up, 16 + addi rp, rp, 16 + mulld r8, r12, r5 C v1 * u[i] weight 1 + mulhdu r9, r12, r5 C v1 * u[i] weight 2 + bdnz L(top) + +L(end): +AM(` ld r0, -8(rp)') +AM(` addc r6, r6, r0') + ADDX r7, r7, r8 + addze r9, r9 + addc r6, r6, cy0 + std r6, -8(rp) + adde cy0, r7, cy1 + addze cy1, r9 + std cy0, 0(rp) + mr r3, cy1 + +ifdef(`EXTRA_REGISTER',,` + ld r31, -8(r1) +') + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/aors_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aors_n.asm new file mode 100644 index 0000000..857c701 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aors_n.asm @@ -0,0 +1,128 @@ +dnl PowerPC-64 mpn_add_n, mpn_sub_n optimised for POWER7. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 ? +C POWER6 ? +C POWER7 2.18 + +C This is a tad bit slower than the cnd_aors_n.asm code, which is of course an +C anomaly. 
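The whole file turns on one mechanism: the inter-limb carry (or borrow) lives
in the CA bit across a chain of adde/subfe, and the SETCBR/CLRCB macros seed
CA for the _nc entry points that take an incoming carry.  The same chain in
portable C, as a sketch (mpn_add_n is the cy == 0 case; subtraction mirrors
this with a borrow):

#include <stdint.h>
#include <stddef.h>

/* rp[] = up[] + vp[] over n limbs with incoming carry cy (0 or 1);
   returns the outgoing carry. */
static uint64_t
add_nc (uint64_t *rp, const uint64_t *up, const uint64_t *vp,
        size_t n, uint64_t cy)
{
  for (size_t i = 0; i < n; i++)
    {
      uint64_t s = up[i] + vp[i];
      uint64_t c1 = s < up[i];   /* carry from the limb addition */
      rp[i] = s + cy;
      cy = c1 + (rp[i] < s);     /* the two carries never both fire */
    }
  return cy;
}

The final GENRVAL step in the asm converts the CA convention back into this
0/1 return value.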
+ +ifdef(`OPERATION_add_n',` + define(ADDSUBC, adde) + define(ADDSUB, addc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc) + define(GENRVAL, `addi r3, r3, 1') + define(SETCBR, `addic r0, $1, -1') + define(CLRCB, `addic r0, r0, 0') +') +ifdef(`OPERATION_sub_n',` + define(ADDSUBC, subfe) + define(ADDSUB, subfc) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc) + define(GENRVAL, `neg r3, r3') + define(SETCBR, `subfic r0, $1, 0') + define(CLRCB, `addic r0, r1, -1') +') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`vp', `r5') +define(`n', `r6') + +ASM_START() +PROLOGUE(func_nc) + SETCBR(r7) + b L(ent) +EPILOGUE() + +PROLOGUE(func) + CLRCB +L(ent): + andi. r7, n, 1 + beq L(bx0) + +L(bx1): ld r7, 0(up) + ld r9, 0(vp) + ADDSUBC r11, r9, r7 + std r11, 0(rp) + cmpldi cr6, n, 1 + beq cr6, L(end) + addi up, up, 8 + addi vp, vp, 8 + addi rp, rp, 8 + +L(bx0): addi r0, n, 2 C compute branch... + srdi r0, r0, 2 C ...count + mtctr r0 + + andi. r7, n, 2 + bne L(mid) + + addi up, up, 16 + addi vp, vp, 16 + addi rp, rp, 16 + + ALIGN(32) +L(top): ld r6, -16(up) + ld r7, -8(up) + ld r8, -16(vp) + ld r9, -8(vp) + ADDSUBC r10, r8, r6 + ADDSUBC r11, r9, r7 + std r10, -16(rp) + std r11, -8(rp) +L(mid): ld r6, 0(up) + ld r7, 8(up) + ld r8, 0(vp) + ld r9, 8(vp) + ADDSUBC r10, r8, r6 + ADDSUBC r11, r9, r7 + std r10, 0(rp) + std r11, 8(rp) + addi up, up, 32 + addi vp, vp, 32 + addi rp, rp, 32 + bdnz L(top) + +L(end): subfe r3, r0, r0 C -cy + GENRVAL + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm new file mode 100644 index 0000000..ddf5fd8 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm @@ -0,0 +1,43 @@ +dnl PowerPC-64 mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n. + +dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
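This wrapper only pins LSH/RSH to 1/63; the shared aorsorrlshC_n.asm that it
includes then computes rp = up + (vp << 1), up - (vp << 1) or (vp << 1) - up,
reassembling the shifted limb stream with an rldimi/srdi pair.  A portable
model of the addlsh1 case (a sketch; note the return value is 0..2, because
the add and the shift can each carry out at the top):

#include <stdint.h>
#include <stddef.h>

/* rp[] = up[] + 2 * vp[] over n limbs; returns the carry, 0..2. */
static uint64_t
addlsh1_n (uint64_t *rp, const uint64_t *up, const uint64_t *vp, size_t n)
{
  uint64_t shift_in = 0, cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint64_t w = (vp[i] << 1) | shift_in;  /* shifted limb, stitched */
      shift_in = vp[i] >> 63;                /* bit pushed out on top */
      uint64_t s = up[i] + w;
      uint64_t c1 = s < w;
      rp[i] = s + cy;
      cy = c1 + (rp[i] < s);
    }
  return cy + shift_in;          /* add carry plus the shifted-out bit */
}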
+ +include(`../config.m4') + + +define(LSH, 1) +define(RSH, 63) + +ifdef(`OPERATION_addlsh1_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n) + +include_mpn(`powerpc64/mode64/p7/aorsorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm new file mode 100644 index 0000000..3f9d88d --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm @@ -0,0 +1,43 @@ +dnl PowerPC-64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n. + +dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH, 2) +define(RSH, 62) + +ifdef(`OPERATION_addlsh2_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n) + +include_mpn(`powerpc64/mode64/p7/aorsorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm new file mode 100644 index 0000000..5251202 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm @@ -0,0 +1,129 @@ +dnl PowerPC-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n. + +dnl Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +C cycles/limb +C POWER3/PPC630 ? 
+C POWER4/PPC970 ? +C POWER5 ? +C POWER6 ? +C POWER7 2.5 + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`vp', `r5') +define(`n', `r6') + +ifdef(`DO_add', ` + define(`ADDSUBC', `addc $1, $2, $3') + define(`ADDSUBE', `adde $1, $2, $3') + define(INITCY, `addic $1, r1, 0') + define(RETVAL, `addze r3, $1') + define(`func', mpn_addlsh`'LSH`'_n)') +ifdef(`DO_sub', ` + define(`ADDSUBC', `subfc $1, $2, $3') + define(`ADDSUBE', `subfe $1, $2, $3') + define(INITCY, `addic $1, r1, -1') + define(RETVAL, `subfze r3, $1 + neg r3, r3') + define(`func', mpn_sublsh`'LSH`'_n)') +ifdef(`DO_rsb', ` + define(`ADDSUBC', `subfc $1, $3, $2') + define(`ADDSUBE', `subfe $1, $3, $2') + define(INITCY, `addic $1, r1, -1') + define(RETVAL, `addme r3, $1') + define(`func', mpn_rsblsh`'LSH`'_n)') + +define(`s0', `r0') define(`s1', `r9') +define(`u0', `r6') define(`u1', `r7') +define(`v0', `r10') define(`v1', `r11') + + +ASM_START() +PROLOGUE(func) + rldic r7, n, 3, 59 + add up, up, r7 + add vp, vp, r7 + add rp, rp, r7 + +ifdef(`DO_add', ` + addic r0, n, 3 C set cy flag as side effect +',` + subfc r0, r0, r0 C set cy flag + addi r0, n, 3 +') + srdi r0, r0, 2 + mtctr r0 + + andi. r0, n, 1 + beq L(bx0) + +L(bx1): andi. r0, n, 2 + li s0, 0 + bne L(lo3) + b L(lo1) + +L(bx0): andi. r0, n, 2 + li s1, 0 + bne L(lo2) + + ALIGN(32) +L(top): addi rp, rp, 32 + ld v0, 0(vp) + addi vp, vp, 32 + rldimi s1, v0, LSH, 0 + ld u0, 0(up) + addi up, up, 32 + srdi s0, v0, RSH + ADDSUBE(s1, s1, u0) + std s1, -32(rp) +L(lo3): ld v1, -24(vp) + rldimi s0, v1, LSH, 0 + ld u1, -24(up) + srdi s1, v1, RSH + ADDSUBE(s0, s0, u1) + std s0, -24(rp) +L(lo2): ld v0, -16(vp) + rldimi s1, v0, LSH, 0 + ld u0, -16(up) + srdi s0, v0, RSH + ADDSUBE(s1, s1, u0) + std s1, -16(rp) +L(lo1): ld v1, -8(vp) + rldimi s0, v1, LSH, 0 + ld u1, -8(up) + srdi s1, v1, RSH + ADDSUBE(s0, s0, u1) + std s0, -8(rp) + bdnz L(top) C decrement CTR and loop back + + RETVAL( s1) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_11.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_11.asm new file mode 100644 index 0000000..f04e896 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_11.asm @@ -0,0 +1,67 @@ +dnl PowerPC-64 mpn_gcd_11. + +dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/bit (approx) +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 7.6 obsolete +C POWER8 ? +C POWER9 ? 
+C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1 + +C INPUT PARAMETERS +define(`u0', `r3') +define(`v0', `r4') + +define(`cnt', `r9')dnl + +ASM_START() +PROLOGUE(mpn_gcd_11) + li r12, 63 + b L(odd) + + ALIGN(16) +L(top): and r8, r11, r10 C isolate lsb + cntlzd cnt, r8 + isel v0, u0, v0, 29 C v = min(u,v) + isel u0, r10, r11, 29 C u = |u - v| + subf cnt, cnt, r12 C cnt = 63-cnt + srd u0, u0, cnt +L(odd): cmpld cr7, v0, u0 + subf r10, u0, v0 C r10 = v - u + subf r11, v0, u0 C r11 = u - v + bne cr7, L(top) + +L(end): blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_22.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_22.asm new file mode 100644 index 0000000..ade30e4 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/gcd_22.asm @@ -0,0 +1,146 @@ +dnl PowerPC-64 mpn_gcd_22 optimised for POWER7 and POWER8. + +dnl Copyright 2000-2002, 2005, 2009, 2011-2013, 2019 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/bit (approx) +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 12.3 +C POWER8 13.4 +C POWER9 10.6 + +C We define SLOW if this target uses a slow struct return mechanism, with +C r3 as an implicit parameter for the struct pointer. +undefine(`SLOW')dnl +ifdef(`AIX',`define(`SLOW',`due to AIX')',` + ifdef(`DARWIN',,` + ifdef(`ELFv2_ABI',,`define(`SLOW',`due to ELFv1')')dnl + ') +') + +ifdef(`SLOW',` +define(`IFSLOW', `$1') +define(`u1', `r4') +define(`u0', `r5') +define(`v1', `r6') +define(`v0', `r7') +',` +define(`IFSLOW', `') +define(`u1', `r3') +define(`u0', `r4') +define(`v1', `r5') +define(`v0', `r6') +') + +define(`tmp', `r0') +define(`t0', `r8') +define(`t1', `r9') +define(`s0', `r10') +define(`s1', `r11') +define(`cnt', `r12') + +ASM_START() +PROLOGUE(mpn_gcd_22) +L(top): subfc. t0, v0, u0 C 0 12 + beq cr0, L(lowz) + subfe t1, v1, u1 C 2 14 + subfe. tmp, tmp, tmp C 4 set cr0 from the carry bit + subfc s0, u0, v0 C 0 + subfe s1, u1, v1 C 2 + +L(bck): and tmp, s0, t0 C 2 + cntlzd cnt, tmp C 4 + addi tmp, cnt, 1 C 6 + subfic cnt, cnt, 63 C 6 + + isel v0, v0, u0, 2 C 6 use condition set by subfe + isel v1, v1, u1, 2 C 6 + isel u0, t0, s0, 2 C 6 + isel u1, t1, s1, 2 C 6 + + srd u0, u0, cnt C 8 + sld tmp, u1, tmp C 8 + srd u1, u1, cnt C 8 + or u0, u0, tmp C 10 + + or. 
r0, u1, v1 C 10 + bne L(top) + + + li r0, 63 + b L(odd) + ALIGN(16) +L(top1):isel v0, u0, v0, 29 C v = min(u,v) + isel u0, r10, r11, 29 C u = |u - v| + subf cnt, cnt, r0 C cnt = 63-cnt + srd u0, u0, cnt +L(odd): subf r10, u0, v0 C r10 = v - u + subf r11, v0, u0 C r11 = u - v + cmpld cr7, v0, u0 + and r8, r11, r10 C isolate lsb + cntlzd cnt, r8 + bne cr7, L(top1) + +ifdef(`SLOW',` + std v0, 0(r3) + std r10, 8(r3) C zero +',` + mr r3, v0 + li r4, 0 +') + blr + + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + subfc. t0, v1, u1 C 2 8 + beq L(end) + li t1, 0 + subfe. tmp, tmp, tmp C 4 set cr0 from the carry bit + subf s0, u1, v1 C 2 + li s1, 0 + b L(bck) + +L(end): +ifdef(`SLOW',` + std v0, 0(r3) + std v1, 8(r3) + blr +',` + mr r3, v0 + mr r4, v1 + blr +') +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p7/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p7/gmp-mparam.h new file mode 100644 index 0000000..9da4080 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p7/gmp-mparam.h @@ -0,0 +1,175 @@ +/* POWER7 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3720 MHz POWER7/SMT4 */ +/* FFT tuning limit = 0.5 M */ +/* Generated by tuneup.c, 2019-10-02, gcc 4.8 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 16 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 +#define USE_PREINV_DIVREM_1 0 +/* From gcc110.osuosl.org, 2023-07-27 */ +#define DIV_QR_1N_PI1_METHOD 3 /* 8.45% faster than 4 */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 27 + +#define DIV_1_VS_MUL_1_PERCENT 341 + +#define MUL_TOOM22_THRESHOLD 22 +#define MUL_TOOM33_THRESHOLD 71 +#define MUL_TOOM44_THRESHOLD 196 +#define MUL_TOOM6H_THRESHOLD 298 +#define MUL_TOOM8H_THRESHOLD 406 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 140 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 132 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 139 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 120 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 105 +#define SQR_TOOM4_THRESHOLD 190 +#define SQR_TOOM6_THRESHOLD 318 +#define SQR_TOOM8_THRESHOLD 547 + +#define MULMID_TOOM42_THRESHOLD 56 + +#define MULMOD_BNM1_THRESHOLD 18 +#define SQRMOD_BNM1_THRESHOLD 20 + +#define MUL_FFT_MODF_THRESHOLD 436 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 436, 5}, { 21, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 33, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 21, 9}, { 11, 8}, \ + { 29, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 43,10}, \ + { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 63, 9}, \ + { 127,10}, { 79,11}, { 47,10}, { 103,12}, \ + { 31,11}, { 63,10}, { 135,11}, { 79,10}, \ + { 159,11}, { 95,10}, { 191, 9}, { 383,11}, \ + { 111,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,11}, { 143,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319,12}, { 95,11}, { 191,10}, \ + { 383, 9}, { 767,11}, { 207,10}, { 415,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 83 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 368 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 368, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 29, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \ + { 63, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 135,11}, { 79,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383,11}, { 111,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \ + { 575,10}, { 303,11}, { 159,10}, { 319, 9}, \ + { 639,12}, { 
95,11}, { 191,10}, { 383, 9}, \ + { 767,13}, { 8192,14}, { 16384,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 84 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 3 +#define MULLO_DC_THRESHOLD 35 +#define MULLO_MUL_N_THRESHOLD 9449 +#define SQRLO_BASECASE_THRESHOLD 3 +#define SQRLO_DC_THRESHOLD 119 +#define SQRLO_SQR_THRESHOLD 6440 + +#define DC_DIV_QR_THRESHOLD 33 +#define DC_DIVAPPR_Q_THRESHOLD 124 +#define DC_BDIV_QR_THRESHOLD 62 +#define DC_BDIV_Q_THRESHOLD 144 + +#define INV_MULMOD_BNM1_THRESHOLD 67 +#define INV_NEWTON_THRESHOLD 123 +#define INV_APPR_THRESHOLD 123 + +#define BINV_NEWTON_THRESHOLD 284 +#define REDC_1_TO_REDC_2_THRESHOLD 18 +#define REDC_2_TO_REDC_N_THRESHOLD 109 + +#define MU_DIV_QR_THRESHOLD 1387 +#define MU_DIVAPPR_Q_THRESHOLD 1334 +#define MUPI_DIV_QR_THRESHOLD 50 +#define MU_BDIV_QR_THRESHOLD 1308 +#define MU_BDIV_Q_THRESHOLD 1499 + +#define POWM_SEC_TABLE 1,23,121,579,642 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 18 +#define SET_STR_DC_THRESHOLD 1562 +#define SET_STR_PRECOMPUTE_THRESHOLD 3100 + +#define FAC_DSC_THRESHOLD 774 +#define FAC_ODD_THRESHOLD 25 + +#define MATRIX22_STRASSEN_THRESHOLD 18 +#define HGCD2_DIV1_METHOD 5 /* 3.27% faster than 3 */ +#define HGCD_THRESHOLD 118 +#define HGCD_APPR_THRESHOLD 150 +#define HGCD_REDUCE_THRESHOLD 3014 +#define GCD_DC_THRESHOLD 386 +#define GCDEXT_DC_THRESHOLD 365 +#define JACOBI_BASE_METHOD 4 /* 27.64% faster than 1 */ diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p8/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p8/gmp-mparam.h new file mode 100644 index 0000000..09348e0 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p8/gmp-mparam.h @@ -0,0 +1,171 @@ +/* POWER8 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 4150 MHz POWER8/SMT4 */ +/* FFT tuning limit = 0.5 M */ +/* Generated by tuneup.c, 2019-09-24, gcc 7.2 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10 +#define USE_PREINV_DIVREM_1 0 +/* From gcc112.osuosl.org, 2023-07-27 */ +#define DIV_QR_1N_PI1_METHOD 3 /* 13.00% faster than 4 */ +#define DIV_QR_1_NORM_THRESHOLD 2 +#define DIV_QR_1_UNNORM_THRESHOLD 1 +#define DIV_QR_2_PI2_THRESHOLD 9 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 34 + +#define DIV_1_VS_MUL_1_PERCENT 276 + +#define MUL_TOOM22_THRESHOLD 18 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 195 +#define MUL_TOOM6H_THRESHOLD 278 +#define MUL_TOOM8H_THRESHOLD 406 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 131 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 121 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 138 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 97 +#define SQR_TOOM4_THRESHOLD 178 +#define SQR_TOOM6_THRESHOLD 303 +#define SQR_TOOM8_THRESHOLD 454 + +#define MULMID_TOOM42_THRESHOLD 42 + +#define MULMOD_BNM1_THRESHOLD 15 +#define SQRMOD_BNM1_THRESHOLD 19 + +#define MUL_FFT_MODF_THRESHOLD 404 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 404, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \ + { 25, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 63,10}, { 39, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 131,10}, \ + { 79,11}, { 47,10}, { 95,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 135,11}, \ + { 79,10}, { 159,11}, { 95, 8}, { 767, 7}, \ + { 1599,11}, { 111,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \ + { 575,11}, { 159,12}, { 95,11}, { 191,10}, \ + { 383,13}, { 8192,14}, { 16384,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 80 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,11}, { 79, 9}, { 319,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303, 9}, { 607,10}, \ + { 319,12}, { 95,11}, { 191,10}, { 383,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } 
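Each FFT table entry pairs a size parameter with an FFT depth k; at multiply
time mpn_fft_best_k (mul_fft.c) walks the table, scaling each entry's size by
the k selected so far, and returns the k of the last entry passed.  A
simplified reading of that scan (a sketch only; the real code packs n and k
into the bitfields of struct fft_table_nk):

struct fft_nk { long n; int k; };

/* Pick the FFT split 2^k for an n-limb operand: advance while n exceeds
   the running threshold tab[i].n << last_k, keeping the last k seen. */
static int
fft_best_k (const struct fft_nk *tab, long n)
{
  int last_k = tab[0].k;
  for (const struct fft_nk *t = tab + 1; ; t++)
    {
      if (n <= (t->n << last_k))
        break;
      last_k = t->k;
    }
  return last_k;
}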
+#define SQR_FFT_TABLE3_SIZE 71 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 3 +#define MULLO_DC_THRESHOLD 33 +#define MULLO_MUL_N_THRESHOLD 9174 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 114 +#define SQRLO_SQR_THRESHOLD 6461 + +#define DC_DIV_QR_THRESHOLD 38 +#define DC_DIVAPPR_Q_THRESHOLD 158 +#define DC_BDIV_QR_THRESHOLD 48 +#define DC_BDIV_Q_THRESHOLD 112 + +#define INV_MULMOD_BNM1_THRESHOLD 74 +#define INV_NEWTON_THRESHOLD 132 +#define INV_APPR_THRESHOLD 131 + +#define BINV_NEWTON_THRESHOLD 278 +#define REDC_1_TO_REDC_2_THRESHOLD 56 +#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ + +#define MU_DIV_QR_THRESHOLD 1142 +#define MU_DIVAPPR_Q_THRESHOLD 1142 +#define MUPI_DIV_QR_THRESHOLD 46 +#define MU_BDIV_QR_THRESHOLD 1142 +#define MU_BDIV_Q_THRESHOLD 1470 + +#define POWM_SEC_TABLE 3,19,117,672,1867 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 18 +#define SET_STR_DC_THRESHOLD 608 +#define SET_STR_PRECOMPUTE_THRESHOLD 2405 + +#define FAC_DSC_THRESHOLD 164 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 14 +#define HGCD2_DIV1_METHOD 1 /* 6.88% faster than 3 */ +#define HGCD_THRESHOLD 114 +#define HGCD_APPR_THRESHOLD 118 +#define HGCD_REDUCE_THRESHOLD 2205 +#define GCD_DC_THRESHOLD 440 +#define GCDEXT_DC_THRESHOLD 345 +#define JACOBI_BASE_METHOD 1 /* 0.74% faster than 4 */ diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p8/invert_limb.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p8/invert_limb.asm new file mode 100644 index 0000000..53ea0e0 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p8/invert_limb.asm @@ -0,0 +1,53 @@ +dnl PowerPC-64 mpn_invert_limb -- Invert a normalized limb. + +dnl Copyright 2015, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb (approximate) +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 ? +C POWER8 32 + +C This runs on POWER7 and later, but is faster only on later CPUs. +C We might want to inline this, considering its small footprint. + +ASM_START() +PROLOGUE(mpn_invert_limb) + sldi. r4, r3, 1 + neg r5, r3 + divdeu( r3, r5, r3) + beq- L(1) + blr +L(1): li r3, -1 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/add_n_sub_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/add_n_sub_n.asm new file mode 100644 index 0000000..2426a00 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/add_n_sub_n.asm @@ -0,0 +1,112 @@ +dnl PowerPC-64 mpn_add_n_sub_n optimised for POWER9. 
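mpn_add_n_sub_n produces the sum and the difference of the same two operands
in a single pass, which is what lets the POWER9 code below run two
independent flag chains at once: the subtraction borrows through CA
(subfc/subfe) while the addition carries through OV (addex), and the epilogue
packs both into the return value 2*carry + borrow.  A portable model (a
sketch; uint64_t models a limb):

#include <stdint.h>
#include <stddef.h>

/* arp[] = up[] + vp[] and srp[] = up[] - vp[], n limbs, one pass.
   Returns 2*carry_out + borrow_out. */
static uint64_t
add_n_sub_n (uint64_t *arp, uint64_t *srp,
             const uint64_t *up, const uint64_t *vp, size_t n)
{
  uint64_t cy = 0, bw = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint64_t u = up[i], v = vp[i];
      uint64_t s = u + v;        /* sum limb and its carry */
      uint64_t c1 = s < u;
      arp[i] = s + cy;
      cy = c1 + (arp[i] < s);
      uint64_t d = u - v;        /* difference limb and its borrow */
      uint64_t b1 = u < v;
      srp[i] = d - bw;
      bw = b1 + (d < bw);
    }
  return 2 * cy + bw;
}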
+ +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 - +C POWER8 - +C POWER9 2.25 + + +C INPUT PARAMETERS +define(`arp', `r3') +define(`srp', `r4') +define(`up', `r5') +define(`vp', `r6') +define(`n', `r7') + +ASM_START() +PROLOGUE(mpn_add_n_sub_n) + cmpdi cr7, n, 2 + subfo r0, r0, r0 C clear OV + rldicl. r9, n, 0, 63 C n & 1 + beq cr0, L(bx0) + +L(bx1): ld r10, 0(up) + ld r11, 0(vp) + ble cr7, L(1) + srdi r7, r7, 1 + mtctr r7 + ld r8, 8(up) + ld r9, 8(vp) + addex( r0, r10, r11, 0) + subfc r12, r11, r10 + addi up, up, -8 + addi vp, vp, -8 + b L(lo1) + +L(bx0): ld r8, 0(up) + ld r9, 0(vp) + ld r10, 8(up) + ld r11, 8(vp) + addex( r0, r8, r9, 0) + subfc r12, r9, r8 + addi arp, arp, 8 + addi srp, srp, 8 + ble cr7, L(end) + addi r7, r7, -1 + srdi r7, r7, 1 + mtctr r7 + +L(top): ld r8, 16(up) + ld r9, 16(vp) + std r0, -8(arp) + std r12, -8(srp) + addex( r0, r10, r11, 0) + subfe r12, r11, r10 +L(lo1): ld r10, 24(up) + ld r11, 24(vp) + std r0, 0(arp) + std r12, 0(srp) + addex( r0, r8, r9, 0) + subfe r12, r9, r8 + addi up, up, 16 + addi vp, vp, 16 + addi arp, arp, 16 + addi srp, srp, 16 + bdnz L(top) + +L(end): std r0, -8(arp) + std r12, -8(srp) +L(1): addex( r0, r10, r11, 0) + subfe r12, r11, r10 + std r0, 0(arp) + std r12, 0(srp) + subfe r3, r3, r3 + addex( r3, r3, r3, 0) + rldicl r3, r3, 1, 62 + blr +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/addaddmul_1msb0.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addaddmul_1msb0.asm new file mode 100644 index 0000000..95b8faa --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addaddmul_1msb0.asm @@ -0,0 +1,106 @@ +dnl Power9 mpn_addaddmul_1msb0 + +dnl Copyright 2021 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 1-way 2-way 4-way 8-way 16-way mul_1+addmul_1 +C power9: 4.55 3.87 3.55 3.35 3.25 5.16 + +C TODO +C * Only WAYS = 4 currently has proper feed-in code. +C * Try ldu/stdu to save the explicit updates. +C * Try using madd in a long dependent chain, only breaking the recurrency +C once per iteration. +C * Some cycles could perhaps be saved by scheduling the crX-setting insns. + +define(`rp', r3) +define(`ap', r4) +define(`bp', r5) +define(`n', r6) +define(`u0', r7) +define(`v0', r8) + +define(`BLOCK',` +L(lo`'eval((WAYS-$1)%4)): + ld r10, eval(8*$1)(ap) + ld r11, eval(8*$1)(bp) + mulld r12, r10, u0 + mulhdu r10, r10, u0 + maddld( r6, r11, v0, r12) + maddhdu(r11, r11, v0, r12) + adde r12, r6, r0 + std r12, eval(8*$1)(rp) + add r0, r10, r11') + +ifdef(`WAYS',,`define(`WAYS',4)') + +PROLOGUE(mpn_addaddmul_1msb0) + addi r10, n, WAYS-1 + srdi r10, r10, m4_log2(WAYS) + mtctr r10 + addic r0, r3, 0 + li r0, 0 +ifelse(WAYS,4,` + rldicl. r9, n, 0, 63 + rldicl r10, n, 63, 63 + cmpdi cr7, r10, 0 + bne cr0, L(bx1) + +L(bx0): beq cr7, L(lo0) + +L(b10): addi ap, ap, -16 + addi bp, bp, -16 + addi rp, rp, -16 + b L(lo2) + +L(bx1): bne cr7, L(b11) + +L(b01): addi ap, ap, -24 + addi bp, bp, -24 + addi rp, rp, -24 + b L(lo1) + +L(b11): addi ap, ap, -8 + addi bp, bp, -8 + addi rp, rp, -8 + b L(lo3) +') + +L(top): forloop(i,0,eval(WAYS-1),`BLOCK(i)') + + addi ap, ap, eval(8*WAYS) + addi bp, bp, eval(8*WAYS) + addi rp, rp, eval(8*WAYS) + bdnz L(top) + + addze r3, r0 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_1.asm new file mode 100644 index 0000000..8f49606 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_1.asm @@ -0,0 +1,130 @@ +dnl Power9 mpn_addmul_1. + +dnl Copyright 2017, 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 - +C POWER8 - +C POWER9 2.5 + +C TODO +C * Schedule for Power9 pipeline. 
+C * Unroll 4x if that proves beneficial. +C * This is marginally faster (but much smaller) than ../aorsmul_1.asm. + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`v0', `r6') + +ASM_START() +PROLOGUE(mpn_addmul_1) + cmpdi cr6, n, 2 + addi r0, n, -1 C FIXME: postpone + srdi r0, r0, 1 C FIXME: postpone + mtctr r0 C FIXME: postpone + rldicl. r0, n, 0,63 C r0 = n & 3, set cr0 + bne cr0, L(b1) + +L(b0): ld r10, 0(rp) + ld r12, 0(up) + ld r11, 8(rp) + ld r0, 8(up) + maddld( r9, r12, v0, r10) + maddhdu(r7, r12, v0, r10) + ble cr6, L(2) + ld r10, 16(rp) + ld r12, 16(up) + maddld( r8, r0, v0, r11) + maddhdu(r5, r0, v0, r11) + addic up, up, 16 + addi rp, rp, -8 + b L(mid) + +L(b1): ld r11, 0(rp) + ld r0, 0(up) + ble cr6, L(1) + ld r10, 8(rp) + ld r12, 8(up) + maddld( r8, r0, v0, r11) + maddhdu(r5, r0, v0, r11) + ld r11, 16(rp) + ld r0, 16(up) + maddld( r9, r12, v0, r10) + maddhdu(r7, r12, v0, r10) + addic up, up, 24 + bdz L(end) + + ALIGN(16) +L(top): ld r10, 24(rp) + ld r12, 0(up) + std r8, 0(rp) + adde r9, r5, r9 + maddld( r8, r0, v0, r11) C W:0,2,4 + maddhdu(r5, r0, v0, r11) C W:1,3,5 +L(mid): ld r11, 32(rp) + ld r0, 8(up) + std r9, 8(rp) + adde r8, r7, r8 + maddld( r9, r12, v0, r10) C W:1,3,5 + maddhdu(r7, r12, v0, r10) C W:2,4,6 + addi rp, rp, 16 + addi up, up, 16 + bdnz L(top) + +L(end): std r8, 0(rp) + maddld( r8, r0, v0, r11) + adde r9, r5, r9 + maddhdu(r5, r0, v0, r11) + std r9, 8(rp) + adde r8, r7, r8 + std r8, 16(rp) + addze r3, r5 + blr + +L(2): maddld( r8, r0, v0, r11) + maddhdu(r5, r0, v0, r11) + std r9, 0(rp) + addc r8, r7, r8 + std r8, 8(rp) + addze r3, r5 + blr + +L(1): maddld( r8, r0, v0, r11) + std r8, 0(rp) + maddhdu(r3, r0, v0, r11) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_2.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_2.asm new file mode 100644 index 0000000..846a894 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/addmul_2.asm @@ -0,0 +1,193 @@ +dnl Power9 mpn_addmul_2. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C power9: 1.62 + +C STATUS +C * Not written with any power9 pipeline understanding. +C * The 4x unrolling was not motivated by any timing tests. +C * No local scheduling for performance tweaking has been done. +C * Decrease load scheduling! 
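The workhorses below are the ISA 3.0 fused multiply-add instructions: maddld
gives the low 64 bits of a*b + c and maddhdu the high 64 bits, so every limb
product absorbs an addend for free.  On top of that, two carry chains run in
parallel, adde consuming the CA bit and addex consuming OV (cleared by the
subfo in the prologue), so the two accumulation streams never serialize on a
single flag.  A C model of the two instructions (a sketch; unsigned __int128
stands in for the 128-bit intermediate):

#include <stdint.h>

typedef unsigned __int128 u128;

/* maddld rt,ra,rb,rc : rt = low 64 bits of ra*rb + rc */
static inline uint64_t
maddld_c (uint64_t a, uint64_t b, uint64_t c)
{
  return (uint64_t) ((u128) a * b + c);
}

/* maddhdu rt,ra,rb,rc : rt = high 64 bits of unsigned ra*rb + rc */
static inline uint64_t
maddhdu_c (uint64_t a, uint64_t b, uint64_t c)
{
  return (uint64_t) (((u128) a * b + c) >> 64);
}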
+ +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') C Note: Reused as scratch +define(`vp', `r6') C Note: Reused for v1 + +define(`v0', `r7') +define(`v1', `r6') + + +ASM_START() +PROLOGUE(mpn_addmul_2) + std r26, -48(r1) + std r27, -40(r1) + std r28, -32(r1) + std r29, -24(r1) + std r30, -16(r1) + std r31, -8(r1) + + subfic r0, r1, 0 C clear CA + subfo r0, r0, r0 C clear OV and r0 + + cmpdi cr7, n, 4 + + ld v0, 0(vp) + ld v1, 8(vp) + + srdi r10, n, 2 + mtctr r10 + + rldicl. r9, n, 0, 63 + bne cr0, L(bx1) + +L(bx0): rldicl. r9, n, 63, 63 + + ld r28, 0(rp) + ld r8, 0(up) + ld r11, 8(rp) + ld r9, 8(up) + maddld( r26, r8, v0, r28) + maddhdu(r31, r8, v0, r28) + blt cr7, L(2) + ld r28, 16(rp) + mulld r5, r8, v1 + mulhdu r10, r8, v1 + bne cr0, L(b10) + +L(b00): addi up, up, -8 + addi rp, rp, -24 + b L(lo0) + +L(b10): addi up, up, 8 + addi rp, rp, -8 + b L(lo2) + +L(2): addi rp, rp, -8 + mulld r5, r8, v1 + mulhdu r10, r8, v1 + b L(cj2) + +L(bx1): rldicl. r9, n, 63, 63 + + ld r29, 0(rp) + ld r9, 0(up) + ld r10, 8(rp) + ld r8, 8(up) + maddld( r27, r9, v0, r29) + maddhdu(r30, r9, v0, r29) + ld r29, 16(rp) + mulld r12, r9, v1 + mulhdu r11, r9, v1 + bne cr0, L(b11) + +L(b01): addi rp, rp, -16 + b L(lo1) +L(b11): addi up, up, 16 + blt cr7, L(end) + +L(top): ld r9, 0(up) + maddld( r26, r8, v0, r10) C 0 4 -> adde + maddhdu(r31, r8, v0, r10) C 1 5 + adde r0, r27, r0 C 7 11 + ld r28, 24(rp) + std r0, 0(rp) + maddld( r5, r8, v1, r29) C 1 5 -> addex + maddhdu(r10, r8, v1, r29) C 2 6 + addex( r0, r12, r30, 0) C 8 12 +L(lo2): ld r8, 8(up) + maddld( r27, r9, v0, r11) C 1 5 -> adde + maddhdu(r30, r9, v0, r11) C 2 6 + adde r0, r26, r0 C 8 12 + ld r29, 32(rp) + std r0, 8(rp) + maddld( r12, r9, v1, r28) C 2 6 -> addex + maddhdu(r11, r9, v1, r28) C 3 7 + addex( r0, r5, r31, 0) C 5 9 13 +L(lo1): ld r9, 16(up) + maddld( r26, r8, v0, r10) C 2 6 -> adde + maddhdu(r31, r8, v0, r10) C 3 7 + adde r0, r27, r0 C 5 9 13 + ld r28, 40(rp) + std r0, 16(rp) + maddld( r5, r8, v1, r29) C 3 7 -> addex + maddhdu(r10, r8, v1, r29) C 4 8 + addex( r0, r12, r30, 0) C 6 10 +L(lo0): ld r8, 24(up) + maddld( r27, r9, v0, r11) C 3 7 -> adde + maddhdu(r30, r9, v0, r11) C 4 8 + adde r0, r26, r0 C 6 10 + ld r29, 48(rp) + std r0, 24(rp) + maddld( r12, r9, v1, r28) C 4 8 -> addex + maddhdu(r11, r9, v1, r28) C 5 9 + addex( r0, r5, r31, 0) C 7 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(top) + +L(end): ld r9, 0(up) + maddld( r26, r8, v0, r10) C 0 4 + maddhdu(r31, r8, v0, r10) C 1 5 + adde r0, r27, r0 C 7 11 + std r0, 0(rp) C -4 + maddld( r5, r8, v1, r29) C 1 5 + maddhdu(r10, r8, v1, r29) C 2 6 + addex( r0, r12, r30, 0) C 8 12 +L(cj2): maddld( r27, r9, v0, r11) C 1 5 -2 + maddhdu(r30, r9, v0, r11) C 2 6 -1 + adde r0, r26, r0 C 8 12 -3 + std r0, 8(rp) C -3 + mulld r12, r9, v1 C 2 6 -1 + mulhdu r11, r9, v1 C 3 7 0 = return limb + addex( r0, r5, r31, 0) C 5 9 13 + adde r0, r27, r0 C 5 9 13 -2 + std r0, 16(rp) C -2 + addex( r0, r12, r30, 0) C 6 10 -1 + adde r0, r0, r10 C -1 + std r0, 24(rp) C -1 + li r4, 0 + addze r3, r11 + addex( r3, r3, r4, 0) + +L(ret): ld r26, -48(r1) + ld r27, -40(r1) + ld r28, -32(r1) + ld r29, -24(r1) + ld r30, -16(r1) + ld r31, -8(r1) + blr +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/aorsmul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/aorsmul_1.asm new file mode 100644 index 0000000..e4ca3a8 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/aorsmul_1.asm @@ -0,0 +1,179 @@ +dnl POWER9 mpn_addmul_1 and mpn_submul_1. + +dnl Copyright 2018 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C mpn_addmul_1 mpn_submul_1 +C cycles/limb cycles/limb +C POWER3/PPC630 - - +C POWER4/PPC970 - - +C POWER5 - - +C POWER6 - - +C POWER7 - - +C POWER8 - - +C POWER9 2.63 2.63 + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`v0', `r6') + + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUBC', adde) + define(`ADDSUB', addc) + define(`func', mpn_addmul_1) + define(`AM', `$1') + define(`SM', `') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUBC', subfe) + define(`ADDSUB', subfc) + define(`func', mpn_submul_1) + define(`AM', `') + define(`SM', `$1') +') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() +PROLOGUE(func) + cmpdi cr7, n, 3 + srdi r10, n, 2 + mtctr r10 + rldicl. r9, n, 0, 63 + ld r11, 0(up) + bne cr0, L(bx1) + +L(bx0): rldicl. r9, n, 63, 63 +AM(` subfzeo r12, n ') C ov = 0, ca = 0 +AM(` li r12, 0 ') +SM(` subfco r12, r12, r12 ') C r12 = 0, ov = 0, ca = 1 + ld r9, 8(up) + mulld r0, r11, v0 + mulhdu r5, r11, v0 + blt cr7, L(2) + ld r8, 16(up) + bne cr0, L(b10) + +L(b00): addi rp, rp, -24 + b L(lo0) +L(b10): addi rp, rp, -8 + addi up, up, 16 + b L(lo2) + +L(2): addi rp, rp, -8 + b L(cj2) + +L(bx1): rldicl. 
r9, n, 63, 63 +AM(` subfzeo r5, n ') C ov = 0, ca = 0 +AM(` li r5, 0 ') +SM(` subfco r5, r5, r5 ') C r5 = 0, ov = 0, ca = 1 + blt cr7, L(1) + ld r8, 8(up) + mulld r7, r11, v0 + mulhdu r12, r11, v0 + ld r9, 16(up) + bne cr0, L(b11) + +L(b01): addi rp, rp, -16 + addi up, up, 8 + b L(lo1) + +L(1): mulld r7, r11, v0 + mulhdu r12, r11, v0 + ld r11, 0(rp) + ADDSUB r10, r7, r11 + std r10, 0(rp) +AM(` addze r3, r12 ') +SM(` subfe r0, r0, r0 ') +SM(` sub r3, r12, r0 ') + blr + +L(b11): addi up, up, 24 + ble cr7, L(end) + + ALIGN(16) +L(top): ld r11, 0(rp) + mulld r0, r8, v0 + addex( r7, r7, r5, 0) + mulhdu r5, r8, v0 + ld r8, 0(up) + ADDSUBC r10, r7, r11 + std r10, 0(rp) +L(lo2): ld r11, 8(rp) + mulld r7, r9, v0 + addex( r0, r0, r12, 0) + mulhdu r12, r9, v0 + ld r9, 8(up) + ADDSUBC r10, r0, r11 + std r10, 8(rp) +L(lo1): ld r11, 16(rp) + mulld r0, r8, v0 + addex( r7, r7, r5, 0) + mulhdu r5, r8, v0 + ld r8, 16(up) + ADDSUBC r10, r7, r11 + std r10, 16(rp) +L(lo0): ld r11, 24(rp) + mulld r7, r9, v0 + addex( r0, r0, r12, 0) + mulhdu r12, r9, v0 + ld r9, 24(up) + ADDSUBC r10, r0, r11 + std r10, 24(rp) + addi up, up, 32 + addi rp, rp, 32 + bdnz L(top) + +L(end): ld r11, 0(rp) + mulld r0, r8, v0 + addex( r7, r7, r5, 0) + mulhdu r5, r8, v0 + ADDSUBC r10, r7, r11 + std r10, 0(rp) +L(cj2): ld r11, 8(rp) + mulld r7, r9, v0 + addex( r0, r0, r12, 0) + mulhdu r12, r9, v0 + ADDSUBC r10, r0, r11 + std r10, 8(rp) + ld r11, 16(rp) + addex( r7, r7, r5, 0) + ADDSUBC r10, r7, r11 + std r10, 16(rp) + li r0, 0 + addex( r3, r12, r0, 0) +AM(` addze r3, r3 ') +SM(` subfe r0, r0, r0 ') +SM(` sub r3, r3, r0 ') + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_11.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_11.asm new file mode 100644 index 0000000..2dc982d --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_11.asm @@ -0,0 +1,64 @@ +dnl PowerPC-64 mpn_gcd_11. + +dnl Copyright 2000-2002, 2005, 2009, 2011-2013, 2019 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
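+
+C Algorithm: binary GCD on two odd limbs.  Each step replaces the larger
+C operand by the odd part of |u - v|; the isel pair below picks min(u,v)
+C and |u - v| branch-free from the cr7 comparison.  A hedged C sketch of
+C the idea (ref_gcd_11 is a hypothetical name; __builtin_ctzll is the GCC
+C builtin corresponding to cnttzd):
+C
+C   mp_limb_t
+C   ref_gcd_11 (mp_limb_t u, mp_limb_t v)   /* u, v both odd */
+C   {
+C     while (u != v)
+C       {
+C         mp_limb_t d = u > v ? u - v : v - u;  /* even and nonzero */
+C         d >>= __builtin_ctzll (d);            /* strip trailing zeros */
+C         if (u > v) u = d; else v = d;
+C       }
+C     return u;
+C   }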
+ +include(`../config.m4') + +C cycles/bit (approx) +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 - +C POWER8 - +C POWER9 5.75 +C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1 + +define(`u0', `r3') +define(`v0', `r4') + +define(`cnt', `r9')dnl + +ASM_START() +PROLOGUE(mpn_gcd_11) + b L(odd) + + ALIGN(16) +L(top): isel v0, u0, v0, 29 C v = min(u,v) + isel u0, r10, r11, 29 C u = |v - u| + srd u0, u0, cnt +L(odd): subf r10, u0, v0 C r10 = v - u + subf r11, v0, u0 C r11 = u - v + cmpld cr7, v0, u0 + cnttzd cnt, r10 + bne cr7, L(top) + +L(end): blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_22.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_22.asm new file mode 100644 index 0000000..12d11b0 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gcd_22.asm @@ -0,0 +1,143 @@ +dnl PowerPC-64 mpn_gcd_22 optimised for POWER9. + +dnl Copyright 2000-2002, 2005, 2009, 2011-2013, 2019 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/bit (approx) +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 - +C POWER8 - +C POWER9 9.58 + +C We define SLOW if this target uses a slow struct return mechanism, with +C r3 as an implicit parameter for the struct pointer. +undefine(`SLOW')dnl +ifdef(`AIX',`define(`SLOW',`due to AIX')',` + ifdef(`DARWIN',,` + ifdef(`ELFv2_ABI',,`define(`SLOW',`due to ELFv1')')dnl + ') +') + +ifdef(`SLOW',` +define(`IFSLOW', `$1') +define(`u1', `r4') +define(`u0', `r5') +define(`v1', `r6') +define(`v0', `r7') +',` +define(`IFSLOW', `') +define(`u1', `r3') +define(`u0', `r4') +define(`v1', `r5') +define(`v0', `r6') +') + +define(`tmp', `r0') +define(`t0', `r8') +define(`t1', `r9') +define(`s0', `r10') +define(`s1', `r11') +define(`cnt', `r12') + +ASM_START() +PROLOGUE(mpn_gcd_22) + cmpld cr7, v0, u0 +L(top): subfc t0, v0, u0 C 0 12 + beq cr7, L(lowz) + subfe t1, v1, u1 C 2 14 + subfe. tmp, tmp, tmp C 4 set cr0 from the carry bit + subfc s0, u0, v0 C 0 + subfe s1, u1, v1 C 2 + +L(bck): cnttzd cnt, t0 C 2 + subfic tmp, cnt, 64 C 4 + + isel v0, v0, u0, 2 C 6 use condition set by subfe + isel u0, t0, s0, 2 C 6 + isel v1, v1, u1, 2 C 6 + isel u1, t1, s1, 2 C 6 + + srd u0, u0, cnt C 8 + sld tmp, u1, tmp C 8 + srd u1, u1, cnt C 8 + or u0, u0, tmp C 10 + + or. 
r0, u1, v1 C 10 + cmpld cr7, v0, u0 + bne L(top) + + + b L(odd) + ALIGN(16) +L(top1):isel v0, u0, v0, 29 C v = min(u,v) + isel u0, r10, r11, 29 C u = |u - v| + srd u0, u0, cnt +L(odd): subf r10, u0, v0 C r10 = v - u + subf r11, v0, u0 C r11 = u - v + cmpld cr7, v0, u0 + cnttzd cnt, r10 + bne cr7, L(top1) + +ifdef(`SLOW',` + std v0, 0(r3) + std r10, 8(r3) +',` + mr r3, v0 + li r4, 0 +') + blr + + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + subfc. t0, v1, u1 C 2 8 + beq L(end) + li t1, 0 + subfe. tmp, tmp, tmp C 4 set cr0 from the carry bit + subf s0, u1, v1 C 2 + li s1, 0 + b L(bck) + +L(end): +ifdef(`SLOW',` + std v0, 0(r3) + std v1, 8(r3) + blr +',` + mr r3, v0 + mr r4, v1 + blr +') +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/gmp-mparam.h b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gmp-mparam.h new file mode 100644 index 0000000..f29a84e --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/gmp-mparam.h @@ -0,0 +1,254 @@ +/* POWER9 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 2200MHz POWER9 */ +/* FFT tuning limit = 221,245,838 */ +/* Generated by tuneup.c, 2019-10-29, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 7 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 44 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 +#define USE_PREINV_DIVREM_1 0 +/* From gcc120.osuosl.org, 2023-07-27 */ +#define DIV_QR_1N_PI1_METHOD 3 /* 6.48% faster than 4 */ +#define DIV_QR_1_NORM_THRESHOLD 3 +#define DIV_QR_1_UNNORM_THRESHOLD 2 +#define DIV_QR_2_PI2_THRESHOLD 7 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 33 + +#define DIV_1_VS_MUL_1_PERCENT 365 + +#define MUL_TOOM22_THRESHOLD 34 +#define MUL_TOOM33_THRESHOLD 109 +#define MUL_TOOM44_THRESHOLD 458 +#define MUL_TOOM6H_THRESHOLD 517 +#define MUL_TOOM8H_THRESHOLD 608 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 113 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 292 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 204 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 211 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 178 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 46 +#define SQR_TOOM3_THRESHOLD 158 +#define SQR_TOOM4_THRESHOLD 674 +#define SQR_TOOM6_THRESHOLD 0 /* always */ +#define SQR_TOOM8_THRESHOLD 898 + +#define MULMID_TOOM42_THRESHOLD 70 + +#define MULMOD_BNM1_THRESHOLD 17 +#define SQRMOD_BNM1_THRESHOLD 25 + +#define MUL_FFT_MODF_THRESHOLD 404 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 404, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \ + { 13, 5}, { 27, 6}, { 27, 7}, { 14, 6}, \ + { 29, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \ + { 63, 9}, { 35, 8}, { 71, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 159,11}, { 95,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,12}, { 95,11}, { 191,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543,11}, { 287,10}, { 575,11}, { 303,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 335,10}, \ + { 671,11}, { 351,10}, { 703,11}, { 367,10}, \ + { 735,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,10}, { 831,12}, { 223,11}, { 447,10}, \ + { 895,11}, { 479,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \ + { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \ + { 639,10}, { 1279,11}, { 671,12}, { 351,11}, \ + { 703,10}, { 1407,11}, { 735,13}, { 191,12}, \ + { 383,11}, { 767,10}, { 1535,11}, { 799,12}, \ + { 415,11}, { 831,10}, { 1663,11}, { 863,12}, \ + { 447,11}, { 895,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \ + { 1087,12}, { 575,11}, { 1151,12}, { 607,13}, \ + { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \ + { 1343,12}, { 703,11}, { 1407,12}, { 735,11}, \ + { 1471,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 799,11}, { 1599,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 895,11}, { 1791,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,11}, { 2175,13}, \ + { 575,12}, { 1215,13}, { 639,12}, { 1343,13}, \ + { 703,12}, { 1471,14}, { 383,13}, { 767,12}, \ + { 1599,13}, { 
831,12}, { 1727,13}, { 895,11}, \ + { 3583,12}, { 1919,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,14}, { 767,13}, \ + { 1599,12}, { 3199,13}, { 1727,14}, { 895,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2687,14}, { 1407,13}, { 2943,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \ + { 6911,14}, { 1919,16}, { 511,15}, { 1023,14}, \ + { 2175,13}, { 4479,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,13}, { 5887,15}, { 1535,14}, \ + { 3455,13}, { 6911,15}, { 1791,14}, { 3839,13}, \ + { 7679,16}, { 1023,15}, { 2047,14}, { 4351,15}, \ + { 2303,14}, { 4863,15}, { 2815,14}, { 5887,16}, \ + { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,17}, { 1023,16}, { 2047,15}, { 4351,14}, \ + { 8959,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 243 +#define MUL_FFT_THRESHOLD 3712 + +#define SQR_FFT_MODF_THRESHOLD 404 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 404, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 29, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 29, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \ + { 159,11}, { 95,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,12}, \ + { 95,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 335,10}, { 671,11}, { 351,10}, \ + { 703,11}, { 367,10}, { 735,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,12}, { 223,11}, \ + { 447,10}, { 895,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \ + { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \ + { 671,12}, { 351,11}, { 703,10}, { 1407,11}, \ + { 735,13}, { 191,12}, { 383,11}, { 767,10}, \ + { 1535,12}, { 415,11}, { 831,12}, { 447,11}, \ + { 895,12}, { 479,14}, { 127,13}, { 255,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \ + { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \ + { 639,11}, { 1279,12}, { 671,11}, { 1343,12}, \ + { 703,11}, { 1407,12}, { 735,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \ + { 831,13}, { 447,12}, { 895,11}, { 1791,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1023,11}, \ + { 2047,12}, { 1087,13}, { 575,12}, { 1215,13}, \ + { 639,12}, { 1343,13}, { 703,12}, { 1407,14}, \ + { 383,13}, { 767,12}, { 1599,13}, { 831,12}, \ + { 1727,13}, { 895,12}, { 1791,13}, { 959,15}, \ + { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,14}, { 767,13}, \ + { 1599,12}, { 3199,13}, { 1727,14}, { 895,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2687,14}, { 1407,13}, { 2815,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,14}, \ + { 1919,16}, { 511,15}, { 1023,14}, { 2175,13}, \ + { 4479,14}, { 2431,13}, { 4863,15}, { 1279,14}, \ + { 2943,13}, { 5887,15}, { 
1535,14}, { 3455,13}, \ + { 6911,15}, { 1791,14}, { 3839,16}, { 1023,15}, \ + { 2047,14}, { 4479,15}, { 2303,14}, { 4863,15}, \ + { 2559,14}, { 5119,15}, { 2815,14}, { 5887,16}, \ + { 1535,15}, { 3327,14}, { 6911,15}, { 3839,17}, \ + { 1023,16}, { 2047,15}, { 4351,14}, { 8959,15}, \ + { 4863,16}, { 2559,15}, { 5887,14}, { 11775,16}, \ + { 3071,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 230 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 3 +#define MULLO_DC_THRESHOLD 39 +#define MULLO_MUL_N_THRESHOLD 7246 +#define SQRLO_BASECASE_THRESHOLD 6 +#define SQRLO_DC_THRESHOLD 40 +#define SQRLO_SQR_THRESHOLD 6440 + +#define DC_DIV_QR_THRESHOLD 30 +#define DC_DIVAPPR_Q_THRESHOLD 88 +#define DC_BDIV_QR_THRESHOLD 35 +#define DC_BDIV_Q_THRESHOLD 62 + +#define INV_MULMOD_BNM1_THRESHOLD 79 +#define INV_NEWTON_THRESHOLD 11 +#define INV_APPR_THRESHOLD 11 + +#define BINV_NEWTON_THRESHOLD 264 +#define REDC_1_TO_REDC_2_THRESHOLD 8 +#define REDC_2_TO_REDC_N_THRESHOLD 79 + +#define MU_DIV_QR_THRESHOLD 1442 +#define MU_DIVAPPR_Q_THRESHOLD 1470 +#define MUPI_DIV_QR_THRESHOLD 0 /* always */ +#define MU_BDIV_QR_THRESHOLD 1470 +#define MU_BDIV_Q_THRESHOLD 1652 + +#define POWM_SEC_TABLE 1,16,151,839 + +#define GET_STR_DC_THRESHOLD 7 +#define GET_STR_PRECOMPUTE_THRESHOLD 15 +#define SET_STR_DC_THRESHOLD 406 +#define SET_STR_PRECOMPUTE_THRESHOLD 885 + +#define FAC_DSC_THRESHOLD 179 +#define FAC_ODD_THRESHOLD 53 + +#define MATRIX22_STRASSEN_THRESHOLD 19 +#define HGCD2_DIV1_METHOD 1 /* 9.10% faster than 3 */ +#define HGCD_THRESHOLD 45 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 2479 +#define GCD_DC_THRESHOLD 321 +#define GCDEXT_DC_THRESHOLD 258 +#define JACOBI_BASE_METHOD 4 /* 15.45% faster than 1 */ + +/* Tuneup completed successfully, took 179422 seconds */ diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_1.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_1.asm new file mode 100644 index 0000000..363f095 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_1.asm @@ -0,0 +1,126 @@ +dnl Power9 mpn_mul_1. + +dnl Copyright 2017, 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 ? +C POWER6 ? +C POWER7 ? +C POWER8 ? +C POWER9 2.47 + +C TODO +C * Schedule for Power9 pipeline. +C * Unroll 4x if that proves beneficial. 
+C * This is marginally faster (but much smaller) than ../mul_1.asm.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`v0', `r6')
+
+ASM_START()
+PROLOGUE(mpn_mul_1c)
+ b L(ent)
+EPILOGUE()
+PROLOGUE(mpn_mul_1)
+ li r7, 0
+L(ent): ld r11, 0(up)
+ cmpdi cr6, n, 2
+ addi r0, n, -1 C FIXME: postpone
+ srdi r0, r0, 1 C FIXME: postpone
+ mtctr r0 C FIXME: postpone
+ rldicl. r12, n, 0,63 C r12 = n & 1, set cr0
+ bne cr0, L(b1)
+
+L(b0): ld r0, 8(up)
+ maddld( r9, r11, v0, r7)
+ maddhdu(r7, r11, v0, r7)
+ ble cr6, L(2)
+ ld r12, 16(up)
+ mulld r8, r0, v0
+ mulhdu r5, r0, v0
+ addic up, up, 16
+ addi rp, rp, -8
+ b L(mid)
+
+L(b1): ld r0, 0(up)
+ ble cr6, L(1)
+ ld r12, 8(up)
+ maddld( r8, r11, v0, r7)
+ maddhdu(r5, r11, v0, r7)
+ ld r0, 16(up)
+ mulld r9, r12, v0
+ mulhdu r7, r12, v0
+ addic up, up, 24
+ bdz L(end)
+
+ ALIGN(16)
+L(top): ld r12, 0(up)
+ std r8, 0(rp)
+ adde r9, r5, r9
+ mulld r8, r0, v0
+ mulhdu r5, r0, v0
+L(mid): ld r0, 8(up)
+ std r9, 8(rp)
+ adde r8, r7, r8
+ mulld r9, r12, v0
+ mulhdu r7, r12, v0
+ addi rp, rp, 16
+ addi up, up, 16
+ bdnz L(top)
+
+L(end): std r8, 0(rp)
+ mulld r8, r0, v0
+ adde r9, r5, r9
+ mulhdu r5, r0, v0
+ std r9, 8(rp)
+ adde r8, r7, r8
+ std r8, 16(rp)
+ addze r3, r5
+ blr
+
+L(2): mulld r8, r0, v0
+ mulhdu r5, r0, v0
+ std r9, 0(rp)
+ addc r8, r7, r8
+ std r8, 8(rp)
+ addze r3, r5
+ blr
+
+L(1): maddld( r8, r0, v0, r7)
+ std r8, 0(rp)
+ maddhdu(r3, r0, v0, r7)
+ blr
+EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_2.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_2.asm new file mode 100644 index 0000000..01b50a3 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_2.asm @@ -0,0 +1,181 @@ +dnl Power9 mpn_mul_2.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C power9: 1.58
+
+C STATUS
+C * Not written with any power9 pipeline understanding.
+C * The 4x unrolling was not motivated by any timing tests.
+C * No local scheduling for performance tweaking has been done.
+C * Decrease load scheduling!
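+
+C For reference: mpn_mul_2 multiplies {up,n} by the 2-limb number {vp,2},
+C stores the n+1 least significant product limbs at rp, and returns the
+C most significant limb.  A hedged C sketch of those semantics in terms of
+C the public mpn_mul_1 and mpn_addmul_1 (ref_mul_2 is a hypothetical name,
+C for illustration only):
+C
+C   mp_limb_t
+C   ref_mul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
+C   {
+C     rp[n] = mpn_mul_1 (rp, up, n, vp[0]);         /* v0 pass, carry limb */
+C     return mpn_addmul_1 (rp + 1, up, n, vp[1]);   /* v1 pass, one limb up */
+C   }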
+ +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') C Note: Reused as scratch +define(`vp', `r6') C Note: Reused for v1 + +define(`v0', `r7') +define(`v1', `r6') + + +ASM_START() +PROLOGUE(mpn_mul_2) + std r28, -32(r1) + std r29, -24(r1) + std r30, -16(r1) + std r31, -8(r1) + + subfic r0, n, 0 C clear CA + subfo r0, r0, r0 C clear OV and r0 + + cmpdi cr7, n, 4 + + ld v0, 0(vp) + ld v1, 8(vp) + + srdi r10, n, 2 + mtctr r10 + + rldicl. r9, n, 0, 63 + bne cr0, L(bx1) + +L(bx0): rldicl. r9, n, 63, 63 + + ld r8, 0(up) + ld r9, 8(up) + li r11, 0 + mulld r28, r8, v0 + mulhdu r31, r8, v0 + blt cr7, L(2) + mulld r5, r8, v1 + mulhdu r10, r8, v1 + bne cr0, L(b10) + +L(b00): addi up, up, -8 + addi rp, rp, -24 + b L(lo0) + +L(b10): addi up, up, 8 + addi rp, rp, -8 + b L(lo2) + +L(2): addi rp, rp, -8 + mulld r5, r8, v1 + mulhdu r10, r8, v1 + b L(cj2) + +L(bx1): rldicl. r9, n, 63, 63 + + ld r9, 0(up) + ld r8, 8(up) + li r10, 0 + mulld r29, r9, v0 + mulhdu r30, r9, v0 + mulld r12, r9, v1 + mulhdu r11, r9, v1 + bne cr0, L(b11) + +L(b01): addi rp, rp, -16 + b L(lo1) +L(b11): addi up, up, 16 + blt cr7, L(end) + +L(top): ld r9, 0(up) + maddld( r28, r8, v0, r10) C 0 4 -> adde + maddhdu(r31, r8, v0, r10) C 1 5 + adde r0, r29, r0 C 7 11 + std r0, 0(rp) + mulld r5, r8, v1 C 1 5 -> addex + mulhdu r10, r8, v1 C 2 6 + addex( r0, r12, r30, 0) C 8 12 +L(lo2): ld r8, 8(up) + maddld( r29, r9, v0, r11) C 1 5 -> adde + maddhdu(r30, r9, v0, r11) C 2 6 + adde r0, r28, r0 C 8 12 + std r0, 8(rp) + mulld r12, r9, v1 C 2 6 -> addex + mulhdu r11, r9, v1 C 3 7 + addex( r0, r5, r31, 0) C 5 9 13 +L(lo1): ld r9, 16(up) + maddld( r28, r8, v0, r10) C 2 6 -> adde + maddhdu(r31, r8, v0, r10) C 3 7 + adde r0, r29, r0 C 5 9 13 + std r0, 16(rp) + mulld r5, r8, v1 C 3 7 -> addex + mulhdu r10, r8, v1 C 4 8 + addex( r0, r12, r30, 0) C 6 10 +L(lo0): ld r8, 24(up) + maddld( r29, r9, v0, r11) C 3 7 -> adde + maddhdu(r30, r9, v0, r11) C 4 8 + adde r0, r28, r0 C 6 10 + std r0, 24(rp) + mulld r12, r9, v1 C 4 8 -> addex + mulhdu r11, r9, v1 C 5 9 + addex( r0, r5, r31, 0) C 7 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(top) + +L(end): ld r9, 0(up) + maddld( r28, r8, v0, r10) C 0 4 + maddhdu(r31, r8, v0, r10) C 1 5 + adde r0, r29, r0 C 7 11 + std r0, 0(rp) C -4 + mulld r5, r8, v1 C 1 5 + mulhdu r10, r8, v1 C 2 6 + addex( r0, r12, r30, 0) C 8 12 +L(cj2): maddld( r29, r9, v0, r11) C 1 5 -2 + maddhdu(r30, r9, v0, r11) C 2 6 -1 + adde r0, r28, r0 C 8 12 -3 + std r0, 8(rp) C -3 + mulld r12, r9, v1 C 2 6 -1 + mulhdu r11, r9, v1 C 3 7 0 = return limb + addex( r0, r5, r31, 0) C 5 9 13 + adde r0, r29, r0 C 5 9 13 -2 + std r0, 16(rp) C -2 + addex( r0, r12, r30, 0) C 6 10 -1 + adde r0, r0, r10 C -1 + std r0, 24(rp) C -1 + li r4, 0 + addze r3, r11 + addex( r3, r3, r4, 0) + +L(ret): ld r28, -32(r1) + ld r29, -24(r1) + ld r30, -16(r1) + ld r31, -8(r1) + blr +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_basecase.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_basecase.asm new file mode 100644 index 0000000..8f3d322 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/mul_basecase.asm @@ -0,0 +1,415 @@ +dnl Power9 mpn_mul_basecase. + +dnl Copyright 1999-2001, 2003-2006, 2008, 2017-2018 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 - +C POWER8 - +C POWER9 1.62 + +C TODO +C * Check if (inner) loop alignment affects performance. +C * Could we schedule loads less in addmul_2/mul_2? That would save some regs +C and make the tail code more manageable. +C * Postpone some register saves to main loop. +C * Perhaps write more small operands (3x1, 3x2, 3x3) code. +C * Consider restoring rp,up after loop using arithmetic, eliminating rp2, up2. +C On the other hand, the current rp,up restore register are useful for OSP. +C * Do OSP. This should save a lot with the current deep addmul_2 pipeline. + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`un', `r5') +define(`vp', `r6') +define(`vn', `r7') + +define(`v0', `r0') +define(`v1', `r7') +define(`rp2', `r24') +define(`up2', `r25') + +ASM_START() +PROLOGUE(mpn_mul_basecase) + cmpdi cr0, un, 2 + bgt cr0, L(un_gt2) + cmpdi cr6, vn, 1 + ld r7, 0(vp) + ld r5, 0(up) + mulld r8, r5, r7 C weight 0 + mulhdu r9, r5, r7 C weight 1 + std r8, 0(rp) + beq cr0, L(2x) + std r9, 8(rp) + blr + ALIGN(16) +L(2x): ld r0, 8(up) + mulld r8, r0, r7 C weight 1 + mulhdu r10, r0, r7 C weight 2 + addc r9, r9, r8 + addze r10, r10 + bne cr6, L(2x2) + std r9, 8(rp) + std r10, 16(rp) + blr + ALIGN(16) +L(2x2): ld r6, 8(vp) + mulld r8, r5, r6 C weight 1 + mulhdu r11, r5, r6 C weight 2 + addc r9, r9, r8 + std r9, 8(rp) + adde r11, r11, r10 + mulld r12, r0, r6 C weight 2 + mulhdu r0, r0, r6 C weight 3 + addze r0, r0 + addc r11, r11, r12 + addze r0, r0 + std r11, 16(rp) + std r0, 24(rp) + blr + +L(un_gt2): + std r22, -80(r1) + std r23, -72(r1) + std r24, -64(r1) + std r25, -56(r1) + std r26, -48(r1) + std r27, -40(r1) + std r28, -32(r1) + std r29, -24(r1) + std r30, -16(r1) + std r31, -8(r1) + mr rp2, r3 C rp + mr up2, r4 C up + srdi r22, r5, 2 C un + subfic r23, r7, 0 C -vn, clear CA + subfo r0, r0, r0 C clear OV (and r0) + + cmpdi cr6, un, 3 + rldicl r0, un, 0, 63 C r0 = un & 1 + cmpdi cr7, r0, 0 + rldicl r0, un, 63, 63 C FIXME: unused for vn = 1 + cmpdi cr5, r0, 0 C FIXME: unused for vn = 1 + + ld v0, 0(vp) + rldicl. 
r9, vn, 0, 63 + beq cr0, L(vn_evn) + +L(vn_odd): + addi r10, un, -2 + ld r5, 0(up) + srdi r10, r10, 1 + mtctr r10 + bne cr7, L(m1_b1) + +L(m1_b0): + ld r10, 8(up) + mulld r9, r5, v0 + mulhdu r11, r5, v0 + ld r12, 16(up) + mulld r8, r10, v0 + mulhdu r5, r10, v0 + addi rp, rp, -8 + b L(m1_mid) + +L(m1_b1): + ld r12, 8(up) + mulld r8, r5, v0 + mulhdu r5, r5, v0 + ld r10, 16(up) + mulld r9, r12, v0 + mulhdu r11, r12, v0 + addi up, up, 8 + beq cr6, L(m1_end) C jump taken means un = 3, vn = {1,3} + + ALIGN(16) +L(m1_top): + ld r12, 16(up) + std r8, 0(rp) + adde r9, r5, r9 + mulld r8, r10, v0 + mulhdu r5, r10, v0 +L(m1_mid): + ld r10, 24(up) + std r9, 8(rp) + adde r8, r11, r8 + mulld r9, r12, v0 + mulhdu r11, r12, v0 + addi rp, rp, 16 + addi up, up, 16 + bdnz L(m1_top) + +L(m1_end): + std r8, 0(rp) + mulld r8, r10, v0 + adde r9, r5, r9 + mulhdu r5, r10, v0 + std r9, 8(rp) + adde r8, r11, r8 + std r8, 16(rp) + addze r10, r5 + std r10, 24(rp) + + addi rp2, rp2, 8 + addi vp, vp, 8 + addic. r23, r23, 1 + b L(do_outer) + +L(vn_evn): + ld v1, 8(vp) + addi r23, r23, 2 + mtctr r22 + bne cr7, L(m2_bx1) + +L(m2_bx0): + ld r8, 0(up) + ld r9, 8(up) + li r11, 0 + mulld r28, r8, v0 + mulhdu r31, r8, v0 + mulld r5, r8, v1 + mulhdu r10, r8, v1 + li r12, 0 + bne cr5, L(m2_b10) + +L(m2_b00): + addi up, up, -8 + addi rp, rp, -24 + b L(m2_lo0) + +L(m2_b10): + addi up, up, 8 + addi rp, rp, -8 + b L(m2_lo2) + +L(m2_bx1): + ld r9, 0(up) + ld r8, 8(up) + li r10, 0 + mulld r29, r9, v0 + mulhdu r30, r9, v0 + mulld r12, r9, v1 + mulhdu r11, r9, v1 + li r5, 0 + bne cr5, L(m2_b11) + +L(m2_b01): + addi rp, rp, -16 + b L(m2_lo1) +L(m2_b11): + addi up, up, 16 + beq cr6, L(m2_end) C taken means un = 3, vn = 2. We're done. + +L(m2_top): + ld r9, 0(up) + maddld( r28, r8, v0, r10) + maddhdu(r31, r8, v0, r10) + adde r5, r29, r5 + std r5, 0(rp) + mulld r5, r8, v1 + mulhdu r10, r8, v1 + addex( r12, r12, r30, 0) +L(m2_lo2): + ld r8, 8(up) + maddld( r29, r9, v0, r11) + maddhdu(r30, r9, v0, r11) + adde r12, r28, r12 + std r12, 8(rp) + mulld r12, r9, v1 + mulhdu r11, r9, v1 + addex( r5, r5, r31, 0) +L(m2_lo1): + ld r9, 16(up) + maddld( r28, r8, v0, r10) + maddhdu(r31, r8, v0, r10) + adde r5, r29, r5 + std r5, 16(rp) + mulld r5, r8, v1 + mulhdu r10, r8, v1 + addex( r12, r12, r30, 0) +L(m2_lo0): + ld r8, 24(up) + maddld( r29, r9, v0, r11) + maddhdu(r30, r9, v0, r11) + adde r12, r28, r12 + std r12, 24(rp) + mulld r12, r9, v1 + mulhdu r11, r9, v1 + addex( r5, r5, r31, 0) + addi up, up, 32 + addi rp, rp, 32 + bdnz L(m2_top) + +L(m2_end): + ld r9, 0(up) + maddld( r28, r8, v0, r10) + maddhdu(r31, r8, v0, r10) + adde r5, r29, r5 + std r5, 0(rp) + mulld r5, r8, v1 + mulhdu r10, r8, v1 + b L(cj) + +L(outer): + ld v0, 0(vp) + ld v1, 8(vp) + addi r23, r23, 2 + mtctr r22 + bne cr7, L(bx1) + +L(bx0): ld r26, 0(rp2) + ld r8, 0(up2) + ld r11, 8(rp2) + ld r9, 8(up2) + maddld( r28, r8, v0, r26) + maddhdu(r31, r8, v0, r26) + ld r26, 16(rp2) + mulld r5, r8, v1 + mulhdu r10, r8, v1 + li r12, 0 + bne cr5, L(b10) + +L(b00): addi up, up2, -8 + addi rp, rp2, -24 + b L(lo0) + +L(b10): addi up, up2, 8 + addi rp, rp2, -8 + b L(lo2) + +L(bx1): ld r27, 0(rp2) + ld r9, 0(up2) + ld r10, 8(rp2) + ld r8, 8(up2) + maddld( r29, r9, v0, r27) + maddhdu(r30, r9, v0, r27) + ld r27, 16(rp2) + mulld r12, r9, v1 + mulhdu r11, r9, v1 + li r5, 0 + bne cr5, L(b11) + +L(b01): addi up, up2, 0 + addi rp, rp2, -16 + b L(lo1) +L(b11): addi up, up2, 16 + addi rp, rp2, 0 + beq cr6, L(end) C taken means un = 3, vn = 3. We're done. 
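+
+C Main addmul_2-style inner loop, 4-way unrolled: each pass folds the two
+C current v limbs into four u limbs and four rp limbs.  Two carry chains
+C run in parallel, the v0 products through CA (adde) and the v1 products
+C through OV (addex), so the maddld/maddhdu streams never wait on each
+C other's carries.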
+ +L(top): ld r9, 0(up) + maddld( r28, r8, v0, r10) + maddhdu(r31, r8, v0, r10) + adde r5, r29, r5 + ld r26, 24(rp) + std r5, 0(rp) + maddld( r5, r8, v1, r27) + maddhdu(r10, r8, v1, r27) + addex( r12, r12, r30, 0) +L(lo2): ld r8, 8(up) + maddld( r29, r9, v0, r11) + maddhdu(r30, r9, v0, r11) + adde r12, r28, r12 + ld r27, 32(rp) + std r12, 8(rp) + maddld( r12, r9, v1, r26) + maddhdu(r11, r9, v1, r26) + addex( r5, r5, r31, 0) +L(lo1): ld r9, 16(up) + maddld( r28, r8, v0, r10) + maddhdu(r31, r8, v0, r10) + adde r5, r29, r5 + ld r26, 40(rp) + std r5, 16(rp) + maddld( r5, r8, v1, r27) + maddhdu(r10, r8, v1, r27) + addex( r12, r12, r30, 0) +L(lo0): ld r8, 24(up) + maddld( r29, r9, v0, r11) + maddhdu(r30, r9, v0, r11) + adde r12, r28, r12 + ld r27, 48(rp) + std r12, 24(rp) + maddld( r12, r9, v1, r26) + maddhdu(r11, r9, v1, r26) + addex( r5, r5, r31, 0) + addi up, up, 32 + addi rp, rp, 32 + bdnz L(top) + +L(end): ld r9, 0(up) + maddld( r28, r8, v0, r10) + maddhdu(r31, r8, v0, r10) + adde r5, r29, r5 + std r5, 0(rp) + maddld( r5, r8, v1, r27) + maddhdu(r10, r8, v1, r27) +L(cj): addex( r12, r12, r30, 0) + maddld( r29, r9, v0, r11) + maddhdu(r30, r9, v0, r11) + adde r12, r28, r12 + std r12, 8(rp) + mulld r12, r9, v1 + mulhdu r11, r9, v1 + addex( r5, r5, r31, 0) + adde r5, r29, r5 + std r5, 16(rp) + addex( r12, r12, r30, 0) + adde r12, r12, r10 + std r12, 24(rp) + li r4, 0 + addze r5, r11 + addex( r5, r5, r4, 0) + std r5, 32(rp) + + cmpdi cr0, r23, 0 + addi rp2, rp2, 16 + addi vp, vp, 16 +L(do_outer): + bne cr0, L(outer) +L(ret): + ld r22, -80(r1) + ld r23, -72(r1) + ld r24, -64(r1) + ld r25, -56(r1) + ld r26, -48(r1) + ld r27, -40(r1) + ld r28, -32(r1) + ld r29, -24(r1) + ld r30, -16(r1) + ld r31, -8(r1) + blr +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/p9/sqr_basecase.asm b/gmp-6.3.0/mpn/powerpc64/mode64/p9/sqr_basecase.asm new file mode 100644 index 0000000..2d4fa63 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/p9/sqr_basecase.asm @@ -0,0 +1,555 @@ +dnl Power9 mpn_sqr_basecase. + +dnl Copyright 1999-2001, 2003-2006, 2008, 2017-2018 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 - +C POWER8 - +C POWER9 1.62 + +C TODO +C * Completely separate evn and odd code into two outer loops. Also consider +C unrolling these two outer loops and thereby eliminate all branches. +C * Avoid the reloading of u1 before every loop start. +C * Reduce register usage. 
+C * Consider getting rid of cy and instead load 3 u limbs, use addc+adde+adde. +C * Consider skewing conditional adjustments to allow mask creation with subfe +C like in the un=3 code. It might streamline the adjustments (or not). + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`un', `r5') + +define(`u0', `r0') +define(`u1', `r7') +define(`rp2', `r24') +define(`up2', `r25') +define(`cy', `r6') + +define(`LSHU1U0',` + addc u0, u0, u0 + adde u1, u1, u1 + li cy, 0 + addze cy, cy +') +define(`LSHU1U',` + addc u0, u0, u0 + add u0, u0, cy + adde u1, u1, u1 + li cy, 0 + addze cy, cy +') +define(`LSHU1UF',` + addc u0, u0, u0 + add u0, u0, cy + adde u1, u1, u1 +') +define(`LSHU1UHF',` + add u0, u0, u0 + add u0, u0, cy +') +C These are cleverer replacements, but they tend to leave CA set, disturbing +C the main accumulation code! Breaking that false dependency might have a +C positive performance impact. Note that the subfe here results in a mask for +C our adjustments. +define(`xLSHU1U0',` + addc u0, u0, u0 + adde u1, u1, u1 + subfe cy, cy, cy +') +define(`xLSHU1U',` + subfic cy, cy, 0 + adde u0, u0, u0 + adde u1, u1, u1 + subfe cy, cy, cy +') +define(`xLSHU1U',` + subfic cy, cy, 0 + adde u0, u0, u0 +') + +ASM_START() +PROLOGUE(mpn_sqr_basecase) + ld r0, 0(up) C n = 1 + mulld r8, r0, r0 C weight 0 + mulhdu r9, r0, r0 C weight 1 + std r8, 0(rp) + cmpdi cr0, un, 2 + bge cr0, L(ge2) + std r9, 8(rp) + blr + +L(ge2): bgt cr0, L(gt2) + ld r6, 8(up) + mulld r10, r6, r6 C u1 * u1 + mulhdu r11, r6, r6 C u1 * u1 + mulld r4, r6, r0 C u1 * u0 + mulhdu r5, r6, r0 C u1 * u0 + addc r4, r4, r4 + adde r5, r5, r5 + addze r11, r11 + addc r9, r9, r4 + adde r10, r10, r5 + addze r11, r11 + std r9, 8(rp) + std r10, 16(rp) + std r11, 24(rp) + blr + +L(gt2): cmpdi cr0, un, 3 + bgt cr0, L(gt3) + std r30, -16(r1) + std r31, -8(r1) + subfo r12, r12, r12 C clear OV (and result register) + ld r8, 8(r4) + mulld r5, r8, r8 C W2 + mulhdu r10, r8, r8 C W3 + sradi r11, u0, 63 C CAUTION: clobbers CA + and r11, r11, r8 C W3 + addc u0, u0, u0 + adde u1, r8, r8 + subfe r6, r6, r6 C mask + ld r4, 16(r4) C W2 + mulld r12, r8, u0 C W1 u1 x u0 + mulhdu r8, r8, u0 C W2 u1 x u0 + maddld( r31, r4, u0, r11) C W2 + maddhdu(r30, r4, u0, r11) C W3 + andc r6, r4, r6 C W4 + addc r9, r12, r9 C W1 + std r9, 8(rp) C W1 + mulld r9, r4, u1 C W3 + mulhdu r11, r4, u1 C W4 + addex( r5, r5, r8, 0) C W2 + adde r5, r31, r5 C W2 + std r5, 16(rp) C W2 + maddld( r5, r4, r4, r6) C W4 u2^2 + maddhdu(r6, r4, r4, r6) C W5 u2^2 + addex( r9, r9, r30, 0) C W3 + adde r9, r9, r10 C W3 + std r9, 24(rp) C W3 + adde r5, r5, r11 C W4 + addze r6, r6 C W5 + li r8, 0 + addex( r5, r5, r8, 0) C W4 + std r5, 32(rp) C W4 + addex( r6, r6, r8, 0) C W5 + std r6, 40(rp) C W5 + ld r30, -16(r1) + ld r31, -8(r1) + blr + +L(gt3): std r22, -80(r1) + std r23, -72(r1) + std r24, -64(r1) + std r25, -56(r1) + std r26, -48(r1) + std r27, -40(r1) + std r28, -32(r1) + std r29, -24(r1) + std r30, -16(r1) + std r31, -8(r1) + + mr rp2, rp + mr up2, up + addi r22, un, -1 C count for loop FIXME: Adjust + subfo r0, r0, r0 C clear OV (and r0) + rldicl r0, un, 0, 63 C r0 = un & 1 + cmpdi cr7, r0, 0 + + ld u0, 0(up2) + ld u1, 8(up2) + + cmpdi cr5, r22, 4 + srdi r31, r22, 2 + addi r22, r22, -2 + mtctr r31 + + beq cr7, L(m2_evn) +L(m2_odd): + rldicl. 
r31, r22, 63, 63 C r22 & 2 + mulld r23, u0, u0 + mulhdu r12, u0, u0 + mulld r5, u1, u1 + mulhdu r10, u1, u1 + + sradi r11, u0, 63 + and r11, r11, u1 + + LSHU1U0 + + ld r8, 8(up2) + ld r9, 16(up2) + mulld r28, r8, u0 C W u1 x u0 + mulhdu r31, r8, u0 C W u1 x u0 + std r23, 0(rp2) + + bne cr0, L(m2_11) +L(m2_01): + addi up, up2, 16 + addi rp, rp2, 0 + b L(m2_lo2) +L(m2_11): + addi up, up2, 0 + addi rp, rp2, -16 + b L(m2_lo0) + +L(m2_evn): + rldicl. r31, r22, 63, 63 C r22 & 2 + mulld r23, u0, u0 + mulhdu r5, u0, u0 + mulld r12, u1, u1 + mulhdu r11, u1, u1 + + sradi r10, u0, 63 + and r10, r10, u1 + + LSHU1U0 + + ld r9, 8(up2) + ld r8, 16(up2) + mulld r29, r9, u0 C W u1 x u0 + mulhdu r30, r9, u0 C W u1 x u0 + std r23, 0(rp2) + + beq cr0, L(m2_10) +L(m2_00): + addi up, up2, 8 + addi rp, rp2, -8 + b L(m2_lo1) +L(m2_10): + addi up, up2, 24 + addi rp, rp2, 8 + ble cr5, L(m2_end) + +L(m2_top): + ld r9, 0(up) + maddld( r28, r8, u0, r10) + maddhdu(r31, r8, u0, r10) + adde r5, r29, r5 + std r5, 0(rp) + mulld r5, r8, u1 + mulhdu r10, r8, u1 + addex( r12, r12, r30, 0) +L(m2_lo2): + ld r8, 8(up) + maddld( r29, r9, u0, r11) + maddhdu(r30, r9, u0, r11) + adde r12, r28, r12 + std r12, 8(rp) + mulld r12, r9, u1 + mulhdu r11, r9, u1 + addex( r5, r5, r31, 0) +L(m2_lo1): + ld r9, 16(up) + maddld( r28, r8, u0, r10) + maddhdu(r31, r8, u0, r10) + adde r5, r29, r5 + std r5, 16(rp) + mulld r5, r8, u1 + mulhdu r10, r8, u1 + addex( r12, r12, r30, 0) +L(m2_lo0): + ld r8, 24(up) + maddld( r29, r9, u0, r11) + maddhdu(r30, r9, u0, r11) + adde r12, r28, r12 + std r12, 24(rp) + mulld r12, r9, u1 + mulhdu r11, r9, u1 + addex( r5, r5, r31, 0) + addi up, up, 32 + addi rp, rp, 32 + bdnz L(m2_top) + +L(m2_end): + ld r9, 0(up) + maddld( r28, r8, u0, r10) + maddhdu(r31, r8, u0, r10) + adde r5, r29, r5 + std r5, 0(rp) + mulld r5, r8, u1 + mulhdu r10, r8, u1 + b L(cj) C jump to addmul_2 tail + +L(outer): + addi up2, up2, 16 + addi rp2, rp2, 32 + + ld u0, 0(up2) + ld u1, 8(up2) + + cmpdi cr5, r22, 4 + srdi r31, r22, 2 + addi r22, r22, -2 + mtctr r31 + + ld r26, 0(rp2) + ld r27, 16(rp2) + + rldicl. 
r31, r22, 63, 63 C r22 & 2 + beq cr7, L(evn) + +L(odd): maddld( r23, u0, u0, r26) C W u2^2 + maddhdu(r12, u0, u0, r26) C W u2^2 + maddld( r5, u1, u1, r27) C W u3^2 + maddhdu(r10, u1, u1, r27) C W u3^2 + ld r26, 8(rp2) + + ld r8, -8(up2) + sradi r8, r8, 63 C CAUTION: clobbers CA + and r8, r8, u0 + sradi r11, u0, 63 C CAUTION: clobbers CA + and r11, r11, u1 + + LSHU1U + + addc r23, r23, r8 + + ld r8, 8(up2) + ld r9, 16(up2) + maddld( r28, r8, u0, r26) C W u3 x u2 + maddhdu(r31, r8, u0, r26) C W u3 x u2 + ld r26, 24(rp2) + std r23, 0(rp2) C W0 + + bne cr0, L(11) +L(01): + addi up, up2, 16 + addi rp, rp2, 0 + b L(lo2) +L(11): + addi up, up2, 0 + addi rp, rp2, -16 + b L(lo0) + +L(evn): maddld( r23, u0, u0, r26) C W u2^2 + maddhdu(r5, u0, u0, r26) C W u2^2 + maddld( r12, u1, u1, r27) C W u3^2 + maddhdu(r11, u1, u1, r27) C W u3^2 + ld r27, 8(rp2) + + ld r9, -8(up2) + sradi r9, r9, 63 C CAUTION: clobbers CA + and r9, r9, u0 + sradi r10, u0, 63 C CAUTION: clobbers CA + and r10, r10, u1 + + LSHU1U + + addc r23, r23, r9 + + ld r9, 8(up2) + ld r8, 16(up2) + maddld( r29, r9, u0, r27) C W u3 x u2 + maddhdu(r30, r9, u0, r27) C W u3 x u2 + ld r27, 24(rp2) + std r23, 0(rp2) C W0 + + beq cr0, L(10) +L(00): + addi up, up2, 8 + addi rp, rp2, -8 + b L(lo1) +L(10): + addi up, up2, 24 + addi rp, rp2, 8 + ble cr5, L(end) + +L(top): ld r9, 0(up) + maddld( r28, r8, u0, r10) + maddhdu(r31, r8, u0, r10) + adde r5, r29, r5 + ld r26, 24(rp) + std r5, 0(rp) + maddld( r5, r8, u1, r27) + maddhdu(r10, r8, u1, r27) + addex( r12, r12, r30, 0) +L(lo2): ld r8, 8(up) + maddld( r29, r9, u0, r11) + maddhdu(r30, r9, u0, r11) + adde r12, r28, r12 + ld r27, 32(rp) + std r12, 8(rp) + maddld( r12, r9, u1, r26) + maddhdu(r11, r9, u1, r26) + addex( r5, r5, r31, 0) +L(lo1): ld r9, 16(up) + maddld( r28, r8, u0, r10) + maddhdu(r31, r8, u0, r10) + adde r5, r29, r5 + ld r26, 40(rp) + std r5, 16(rp) + maddld( r5, r8, u1, r27) + maddhdu(r10, r8, u1, r27) + addex( r12, r12, r30, 0) +L(lo0): ld r8, 24(up) + maddld( r29, r9, u0, r11) + maddhdu(r30, r9, u0, r11) + adde r12, r28, r12 + ld r27, 48(rp) + std r12, 24(rp) + maddld( r12, r9, u1, r26) + maddhdu(r11, r9, u1, r26) + addex( r5, r5, r31, 0) + addi up, up, 32 + addi rp, rp, 32 + bdnz L(top) + +L(end): ld r9, 0(up) + maddld( r28, r8, u0, r10) + maddhdu(r31, r8, u0, r10) + adde r5, r29, r5 + std r5, 0(rp) + maddld( r5, r8, u1, r27) + maddhdu(r10, r8, u1, r27) +L(cj): addex( r12, r12, r30, 0) + maddld( r29, r9, u0, r11) + maddhdu(r30, r9, u0, r11) + adde r12, r28, r12 + std r12, 8(rp) + mulld r12, r9, u1 + mulhdu r11, r9, u1 + addex( r5, r5, r31, 0) + adde r5, r29, r5 + std r5, 16(rp) + addex( r12, r12, r30, 0) + adde r12, r12, r10 + std r12, 24(rp) + li r4, 0 + addze r5, r11 + addex( r5, r5, r4, 0) + std r5, 32(rp) + bgt cr5, L(outer) + +L(corner): + ld u0, 16(up2) + ld u1, 24(up2) + ld r26, 32(rp2) + bne cr7, L(corner_odd) + +L(corner_evn): + ld r27, 40(rp2) + maddld( r23, u0, u0, r26) C W u2^2 + maddhdu(r5, u0, u0, r26) C W u2^2 + mulld r12, u1, u1 C W u3^2 + mulhdu r11, u1, u1 C W u3^2 + + ld r9, 8(up2) + sradi r9, r9, 63 C CAUTION: clobbers CA + and r9, r9, u0 + sradi r10, u0, 63 C CAUTION: clobbers CA + and r10, r10, u1 + + LSHU1UHF + + addc r23, r23, r9 + + ld r9, 24(up2) + maddld( r29, r9, u0, r27) C W u3 x u2 + maddhdu(r30, r9, u0, r27) C W u3 x u2 + std r23, 32(rp2) + adde r5, r29, r5 + std r5, 40(rp2) + addex( r12, r12, r30, 0) + adde r12, r12, r10 C W FIXME can this co? 
+ std r12, 48(rp2) + li r4, 0 + addex( r5, r11, r4, 0) + addze r5, r5 + std r5, 56(rp2) + b L(ret) + +L(corner_odd): + ld r27, 48(rp2) + maddld( r23, u0, u0, r26) C W u2^2 + maddhdu(r12, u0, u0, r26) C W u2^2 + maddld( r5, u1, u1, r27) C W u3^2 + maddhdu(r10, u1, u1, r27) C W u3^2 + ld r26, 40(rp2) + + ld r8, 8(up2) + sradi r8, r8, 63 C CAUTION: clobbers CA + and r8, r8, u0 + sradi r11, u0, 63 C CAUTION: clobbers CA + and r11, r11, u1 + + LSHU1UF + + addc r23, r23, r8 + + ld r8, 24(up2) + ld r9, 32(up2) + maddld( r28, r8, u0, r26) C W u3 x u2 + maddhdu(r31, r8, u0, r26) C W u3 x u2 + std r23, 32(rp2) + maddld( r29, r9, u0, r11) + maddhdu(r30, r9, u0, r11) + adde r12, r28, r12 + std r12, 40(rp2) + mulld r12, r9, u1 + mulhdu r11, r9, u1 + addex( r5, r5, r31, 0) + adde r5, r29, r5 + std r5, 48(rp2) + addex( r12, r12, r30, 0) + adde r12, r12, r10 + std r12, 56(rp2) + mulld r23, r9, r9 C W u2^2 + mulhdu r12, r9, r9 C W u2^2 + adde r23, r23, r11 + addze r12, r12 + sradi r4, r8, 63 C CAUTION: clobbers CA + and r4, r4, r9 + addex( r23, r23, r4, 0) + std r23, 64(rp2) + li r4, 0 + addex( r12, r12, r4, 0) + std r12, 72(rp2) + +L(ret): ld r22, -80(r1) + ld r23, -72(r1) + ld r24, -64(r1) + ld r25, -56(r1) + ld r26, -48(r1) + ld r27, -40(r1) + ld r28, -32(r1) + ld r29, -24(r1) + ld r30, -16(r1) + ld r31, -8(r1) + blr +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/rsh1aors_n.asm b/gmp-6.3.0/mpn/powerpc64/mode64/rsh1aors_n.asm new file mode 100644 index 0000000..1f57bdf --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/rsh1aors_n.asm @@ -0,0 +1,173 @@ +dnl PowerPC-64 mpn_rsh1add_n, mpn_rsh1sub_n + +dnl Copyright 2003, 2005, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 2.9 +C POWER5 ? 
+C POWER6 3.5 +C POWER7 2.25 + +define(`rp', `r3') +define(`up', `r4') +define(`vp', `r5') +define(`n', `r6') + +ifdef(`OPERATION_rsh1add_n', ` + define(`ADDSUBC', `addc') + define(`ADDSUBE', `adde') + define(INITCY, `addic $1, r1, 0') + define(`func', mpn_rsh1add_n)') +ifdef(`OPERATION_rsh1sub_n', ` + define(`ADDSUBC', `subfc') + define(`ADDSUBE', `subfe') + define(INITCY, `addic $1, r1, -1') + define(`func', mpn_rsh1sub_n)') + +define(`s0', `r9') +define(`s1', `r7') +define(`x0', `r0') +define(`x1', `r12') +define(`u0', `r8') +define(`v0', `r10') + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n) + +ASM_START() +PROLOGUE(func) + ld u0, 0(up) + ld v0, 0(vp) + + cmpdi cr6, n, 2 + + addi r0, n, 1 + srdi r0, r0, 2 + mtctr r0 C copy size to count register + + andi. r0, n, 1 + bne cr0, L(bx1) + +L(bx0): ADDSUBC x1, v0, u0 + ld u0, 8(up) + ld v0, 8(vp) + ADDSUBE x0, v0, u0 + ble cr6, L(n2) + ld u0, 16(up) + ld v0, 16(vp) + srdi s0, x1, 1 + rldicl r11, x1, 0, 63 C return value + ADDSUBE x1, v0, u0 + andi. n, n, 2 + bne cr0, L(b10) +L(b00): addi rp, rp, -24 + b L(lo0) +L(b10): addi up, up, 16 + addi vp, vp, 16 + addi rp, rp, -8 + b L(lo2) + + ALIGN(16) +L(bx1): ADDSUBC x0, v0, u0 + ble cr6, L(n1) + ld u0, 8(up) + ld v0, 8(vp) + ADDSUBE x1, v0, u0 + ld u0, 16(up) + ld v0, 16(vp) + srdi s1, x0, 1 + rldicl r11, x0, 0, 63 C return value + ADDSUBE x0, v0, u0 + andi. n, n, 2 + bne cr0, L(b11) +L(b01): addi up, up, 8 + addi vp, vp, 8 + addi rp, rp, -16 + b L(lo1) +L(b11): addi up, up, 24 + addi vp, vp, 24 + bdz L(end) + + ALIGN(32) +L(top): ld u0, 0(up) + ld v0, 0(vp) + srdi s0, x1, 1 + rldimi s1, x1, 63, 0 + std s1, 0(rp) + ADDSUBE x1, v0, u0 +L(lo2): ld u0, 8(up) + ld v0, 8(vp) + srdi s1, x0, 1 + rldimi s0, x0, 63, 0 + std s0, 8(rp) + ADDSUBE x0, v0, u0 +L(lo1): ld u0, 16(up) + ld v0, 16(vp) + srdi s0, x1, 1 + rldimi s1, x1, 63, 0 + std s1, 16(rp) + ADDSUBE x1, v0, u0 +L(lo0): ld u0, 24(up) + ld v0, 24(vp) + srdi s1, x0, 1 + rldimi s0, x0, 63, 0 + std s0, 24(rp) + ADDSUBE x0, v0, u0 + addi up, up, 32 + addi vp, vp, 32 + addi rp, rp, 32 + bdnz L(top) + +L(end): srdi s0, x1, 1 + rldimi s1, x1, 63, 0 + std s1, 0(rp) +L(cj2): srdi s1, x0, 1 + rldimi s0, x0, 63, 0 + std s0, 8(rp) +L(cj1): ADDSUBE x1, x1, x1 C pseudo-depends on x1 + rldimi s1, x1, 63, 0 + std s1, 16(rp) + mr r3, r11 + blr + +L(n1): srdi s1, x0, 1 + rldicl r11, x0, 0, 63 C return value + ADDSUBE x1, x1, x1 C pseudo-depends on x1 + rldimi s1, x1, 63, 0 + std s1, 0(rp) + mr r3, r11 + blr + +L(n2): addi rp, rp, -8 + srdi s0, x1, 1 + rldicl r11, x1, 0, 63 C return value + b L(cj2) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/mode64/sqr_basecase.asm b/gmp-6.3.0/mpn/powerpc64/mode64/sqr_basecase.asm new file mode 100644 index 0000000..e76bb88 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/mode64/sqr_basecase.asm @@ -0,0 +1,863 @@ +dnl PowerPC-64 mpn_sqr_basecase. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 6-18 +C POWER4/PPC970 8 +C POWER5 8 +C POWER6 16.25 +C POWER7 3.77 + +C NOTES +C * This is very crude, cleanup! +C * Try to reduce the number of needed live registers. +C * Rewrite for POWER6 to use 8 consecutive muls, not 2 groups of 4. The +C cost will be more live registers. +C * Rewrite for POWER7 to use addmul_2 building blocks; this will reduce code +C size a lot and speed things up perhaps 25%. +C * Use computed goto in order to compress the code. +C * Implement a larger final corner. +C * Schedule callee-saves register saves into other insns. This could save +C about 5 cycles/call. (We cannot analogously optimise the restores, since +C the sqr_diag_addlsh1 loop has no wind-down code as currently written.) +C * Should the alternating std/adde sequences be split? Some pipelines handle +C adde poorly, and might sequentialise all these instructions. +C * The sqr_diag_addlsh1 loop was written for POWER6 and its preferences for +C adjacent integer multiply insns. Except for the multiply insns, the code +C was not carefully optimised for POWER6 or any other CPU. +C * Perform cross-jumping in sqr_diag_addlsh1's feed-in code, into the loop. + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') + +define(`rp_outer', `r25') +define(`up_outer', `r21') +define(`rp_saved', `r22') +define(`up_saved', `r23') +define(`n_saved', `r24') + +ASM_START() +PROLOGUE(mpn_sqr_basecase) + cmpdi cr0, n, 2 + bge cr0, L(ge2) + ld r5, 0(up) C n = 1 + nop + mulld r8, r5, r5 C weight 0 + mulhdu r9, r5, r5 C weight 1 + std r8, 0(rp) + std r9, 8(rp) + blr + ALIGN(16) +L(ge2): bgt cr0, L(gt2) + ld r0, 0(up) C n = 2 + nop + mulld r8, r0, r0 C u0 * u0 + mulhdu r9, r0, r0 C u0 * u0 + ld r6, 8(up) + mulld r10, r6, r6 C u1 * u1 + mulhdu r11, r6, r6 C u1 * u1 + mulld r4, r6, r0 C u1 * u0 + mulhdu r5, r6, r0 C u1 * u0 + addc r4, r4, r4 + adde r5, r5, r5 + addze r11, r11 + addc r9, r9, r4 + adde r10, r10, r5 + addze r11, r11 + std r8, 0(rp) + std r9, 8(rp) + std r10, 16(rp) + std r11, 24(rp) + blr + + ALIGN(16) +L(gt2): std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + std r27, -40(r1) + std r26, -48(r1) + std r25, -56(r1) + std r24, -64(r1) + std r23, -72(r1) + std r22, -80(r1) + std r21, -88(r1) + + mr rp_saved, rp + mr up_saved, up + mr n_saved, n + mr rp_outer, rp + mr up_outer, up + + rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addic r7, n, 2 C compute count... 
+ srdi r7, r7, 2 C ...for ctr + mtctr r7 C copy count into ctr + beq- cr0, L(b0) + blt- cr6, L(b1) + beq- cr6, L(b2) + +L(b3): ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + addi up, up, 24 + li r12, 0 C carry limb + bdz L(em3) + + ALIGN(16) +L(tm3): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 0(up) + ld r27, 8(up) + adde r0, r0, r12 + adde r7, r7, r26 + mulld r26, r9, r6 + mulhdu r10, r9, r6 + mulld r11, r27, r6 + mulhdu r12, r27, r6 + ld r9, 16(up) + ld r27, 24(up) + std r0, 8(rp) + adde r26, r26, r8 + std r7, 16(rp) + adde r11, r11, r10 + std r26, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + bdnz L(tm3) + +L(em3): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + adde r0, r0, r12 + adde r7, r7, r26 + std r0, 8(rp) + std r7, 16(rp) + addze r8, r8 + std r8, 24(rp) + addi n, n, 2 + b L(outer_loop) + +L(b0): ld r6, 0(up) + ld r27, 8(up) + mulld r7, r27, r6 + mulhdu r12, r27, r6 + std r7, 8(rp) + addi rp, rp, 8 + ld r9, 16(up) + ld r27, 24(up) + addi up, up, 32 + bdz L(em0) + + ALIGN(16) +L(tm0): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 0(up) + ld r27, 8(up) + adde r0, r0, r12 + adde r7, r7, r26 + mulld r26, r9, r6 + mulhdu r10, r9, r6 + mulld r11, r27, r6 + mulhdu r12, r27, r6 + ld r9, 16(up) + ld r27, 24(up) + std r0, 8(rp) + adde r26, r26, r8 + std r7, 16(rp) + adde r11, r11, r10 + std r26, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + bdnz L(tm0) + +L(em0): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + adde r0, r0, r12 + adde r7, r7, r26 + std r0, 8(rp) + std r7, 16(rp) + addze r8, r8 + std r8, 24(rp) + addi n, n, 2 + b L(outer_loop_ent_2) + +L(b1): ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r12, r27, r6 + addc r7, r7, r26 + std r0, 8(rp) + std r7, 16(rp) + addi rp, rp, 16 + ld r9, 24(up) + ld r27, 32(up) + addi up, up, 40 + bdz L(em1) + + ALIGN(16) +L(tm1): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 0(up) + ld r27, 8(up) + adde r0, r0, r12 + adde r7, r7, r26 + mulld r26, r9, r6 + mulhdu r10, r9, r6 + mulld r11, r27, r6 + mulhdu r12, r27, r6 + ld r9, 16(up) + ld r27, 24(up) + std r0, 8(rp) + adde r26, r26, r8 + std r7, 16(rp) + adde r11, r11, r10 + std r26, 24(rp) + addi up, up, 32 + std r11, 32(rp) + addi rp, rp, 32 + bdnz L(tm1) + +L(em1): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + adde r0, r0, r12 + adde r7, r7, r26 + std r0, 8(rp) + std r7, 16(rp) + addze r8, r8 + std r8, 24(rp) + addi n, n, 2 + b L(outer_loop_ent_3) + +L(b2): addi r7, r7, -1 C FIXME + mtctr r7 C FIXME + ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 24(up) + mulld r11, r9, r6 + mulhdu r10, r9, r6 + addc r7, r7, r26 + adde r11, r11, r8 + addze r12, r10 + std r0, 8(rp) + std r7, 16(rp) + std r11, 24(rp) + addi rp, rp, 24 + ld r9, 32(up) + ld r27, 40(up) + addi up, up, 48 + bdz L(em2) + + ALIGN(16) +L(tm2): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 0(up) + ld r27, 8(up) + adde r0, r0, r12 + adde r7, r7, r26 + mulld r26, r9, r6 + mulhdu r10, r9, r6 + mulld r11, r27, r6 + mulhdu r12, r27, r6 + ld r9, 16(up) + ld r27, 24(up) + std r0, 8(rp) + adde r26, r26, r8 + std r7, 16(rp) + adde r11, r11, r10 + std r26, 24(rp) + addi up, up, 32 + std r11, 32(rp) 
+ addi rp, rp, 32 + bdnz L(tm2) + +L(em2): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + adde r0, r0, r12 + adde r7, r7, r26 + std r0, 8(rp) + std r7, 16(rp) + addze r8, r8 + std r8, 24(rp) + addi n, n, 2 + b L(outer_loop_ent_0) + + +L(outer_loop): + addi n, n, -1 + addi up_outer, up_outer, 8 + addi rp_outer, rp_outer, 16 + + mr up, up_outer + addi rp, rp_outer, 8 + + srdi r0, n, 2 + mtctr r0 + + bdz L(outer_end) + + ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r9, 24(up) + ld r28, 0(rp) + ld r29, 8(rp) + ld r30, 16(rp) + mulld r11, r9, r6 + mulhdu r10, r9, r6 + addc r7, r7, r26 + adde r11, r11, r8 + addze r12, r10 + addc r0, r0, r28 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + adde r11, r11, r30 + std r11, 16(rp) + addi rp, rp, 24 + ld r9, 32(up) + ld r27, 40(up) + addi up, up, 48 + bdz L(ea1) + + ALIGN(16) +L(ta1): mulld r0, r9, r6 + mulhdu r26, r9, r6 C 9 + mulld r7, r27, r6 + mulhdu r8, r27, r6 C 27 + ld r9, 0(up) + ld r28, 0(rp) + ld r27, 8(up) + ld r29, 8(rp) + adde r0, r0, r12 C 0 12 + adde r7, r7, r26 C 5 7 + mulld r26, r9, r6 + mulhdu r10, r9, r6 C 9 + mulld r11, r27, r6 + mulhdu r12, r27, r6 C 27 + ld r9, 16(up) + ld r30, 16(rp) + ld r27, 24(up) + ld r31, 24(rp) + adde r26, r26, r8 C 8 5 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 0(rp) C 0 + adde r7, r7, r29 C 7 29 + std r7, 8(rp) C 7 + adde r26, r26, r30 C 5 30 + std r26, 16(rp) C 5 + adde r11, r11, r31 C 11 31 + std r11, 24(rp) C 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(ta1) + +L(ea1): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r28, 0(rp) + ld r29, 8(rp) + adde r0, r0, r12 + adde r7, r7, r26 + addze r8, r8 + addc r0, r0, r28 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + addze r8, r8 + std r8, 16(rp) + +L(outer_loop_ent_0): + addi n, n, -1 + addi up_outer, up_outer, 8 + addi rp_outer, rp_outer, 16 + + mr up, up_outer + addi rp, rp_outer, 8 + + srdi r0, n, 2 + mtctr r0 + + ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + ld r28, 0(rp) + ld r29, 8(rp) + mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + addc r0, r0, r28 + adde r7, r7, r26 + addze r12, r8 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + addi rp, rp, 16 + ld r9, 24(up) + ld r27, 32(up) + addi up, up, 40 + bdz L(ea0) + + ALIGN(16) +L(ta0): mulld r0, r9, r6 + mulhdu r26, r9, r6 C 9 + mulld r7, r27, r6 + mulhdu r8, r27, r6 C 27 + ld r9, 0(up) + ld r28, 0(rp) + ld r27, 8(up) + ld r29, 8(rp) + adde r0, r0, r12 C 0 12 + adde r7, r7, r26 C 5 7 + mulld r26, r9, r6 + mulhdu r10, r9, r6 C 9 + mulld r11, r27, r6 + mulhdu r12, r27, r6 C 27 + ld r9, 16(up) + ld r30, 16(rp) + ld r27, 24(up) + ld r31, 24(rp) + adde r26, r26, r8 C 8 5 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 0(rp) C 0 + adde r7, r7, r29 C 7 29 + std r7, 8(rp) C 7 + adde r26, r26, r30 C 5 30 + std r26, 16(rp) C 5 + adde r11, r11, r31 C 11 31 + std r11, 24(rp) C 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(ta0) + +L(ea0): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r28, 0(rp) + ld r29, 8(rp) + adde r0, r0, r12 + adde r7, r7, r26 + addze r8, r8 + addc r0, r0, r28 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + addze r8, r8 + std r8, 16(rp) + +L(outer_loop_ent_3): + addi n, n, -1 + addi up_outer, up_outer, 8 + addi rp_outer, rp_outer, 16 + + mr up, up_outer + addi rp, 
rp_outer, 8 + + srdi r0, n, 2 + mtctr r0 + + ld r6, 0(up) + ld r9, 8(up) + ld r28, 0(rp) + mulld r0, r9, r6 + mulhdu r12, r9, r6 + addc r0, r0, r28 + std r0, 0(rp) + addi rp, rp, 8 + ld r9, 16(up) + ld r27, 24(up) + addi up, up, 32 + bdz L(ea3) + + ALIGN(16) +L(ta3): mulld r0, r9, r6 + mulhdu r26, r9, r6 C 9 + mulld r7, r27, r6 + mulhdu r8, r27, r6 C 27 + ld r9, 0(up) + ld r28, 0(rp) + ld r27, 8(up) + ld r29, 8(rp) + adde r0, r0, r12 C 0 12 + adde r7, r7, r26 C 5 7 + mulld r26, r9, r6 + mulhdu r10, r9, r6 C 9 + mulld r11, r27, r6 + mulhdu r12, r27, r6 C 27 + ld r9, 16(up) + ld r30, 16(rp) + ld r27, 24(up) + ld r31, 24(rp) + adde r26, r26, r8 C 8 5 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 0(rp) C 0 + adde r7, r7, r29 C 7 29 + std r7, 8(rp) C 7 + adde r26, r26, r30 C 5 30 + std r26, 16(rp) C 5 + adde r11, r11, r31 C 11 31 + std r11, 24(rp) C 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(ta3) + +L(ea3): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r28, 0(rp) + ld r29, 8(rp) + adde r0, r0, r12 + adde r7, r7, r26 + addze r8, r8 + addc r0, r0, r28 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + addze r8, r8 + std r8, 16(rp) + + +L(outer_loop_ent_2): + addi n, n, -1 + addi up_outer, up_outer, 8 + addi rp_outer, rp_outer, 16 + + mr up, up_outer + addi rp, rp_outer, 8 + + srdi r0, n, 2 + mtctr r0 + + addic r0, r0, 0 + li r12, 0 C cy_limb = 0 + ld r6, 0(up) + ld r9, 8(up) + ld r27, 16(up) + bdz L(ea2) + addi up, up, 24 + + ALIGN(16) +L(ta2): mulld r0, r9, r6 + mulhdu r26, r9, r6 C 9 + mulld r7, r27, r6 + mulhdu r8, r27, r6 C 27 + ld r9, 0(up) + ld r28, 0(rp) + ld r27, 8(up) + ld r29, 8(rp) + adde r0, r0, r12 C 0 12 + adde r7, r7, r26 C 5 7 + mulld r26, r9, r6 + mulhdu r10, r9, r6 C 9 + mulld r11, r27, r6 + mulhdu r12, r27, r6 C 27 + ld r9, 16(up) + ld r30, 16(rp) + ld r27, 24(up) + ld r31, 24(rp) + adde r26, r26, r8 C 8 5 + adde r11, r11, r10 C 10 11 + addze r12, r12 C 12 + addc r0, r0, r28 C 0 28 + std r0, 0(rp) C 0 + adde r7, r7, r29 C 7 29 + std r7, 8(rp) C 7 + adde r26, r26, r30 C 5 30 + std r26, 16(rp) C 5 + adde r11, r11, r31 C 11 31 + std r11, 24(rp) C 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(ta2) + +L(ea2): mulld r0, r9, r6 + mulhdu r26, r9, r6 + mulld r7, r27, r6 + mulhdu r8, r27, r6 + ld r28, 0(rp) + ld r29, 8(rp) + adde r0, r0, r12 + adde r7, r7, r26 + addze r8, r8 + addc r0, r0, r28 + std r0, 0(rp) + adde r7, r7, r29 + std r7, 8(rp) + addze r8, r8 + std r8, 16(rp) + + b L(outer_loop) + +L(outer_end): + ld r6, 0(up) + ld r9, 8(up) + ld r11, 0(rp) + mulld r0, r9, r6 + mulhdu r8, r9, r6 + addc r0, r0, r11 + std r0, 0(rp) + addze r8, r8 + std r8, 8(rp) + +define(`rp', `rp_saved') +define(`up', `r5') +define(`n', `r6') +define(`climb', `r0') + + addi r4, rp_saved, 8 + mr r5, up_saved + mr r6, n_saved + + rldicl. r0, n, 0,62 C r0 = n & 3, set cr0 + cmpdi cr6, r0, 2 + addi n, n, 2 C compute count... 
+ srdi n, n, 2 C ...for ctr + mtctr n C put loop count into ctr + beq cr0, L(xb0) + blt cr6, L(xb1) + beq cr6, L(xb2) + +L(xb3): ld r6, 0(up) + ld r7, 8(up) + ld r12, 16(up) + addi up, up, 24 + mulld r24, r6, r6 + mulhdu r25, r6, r6 + mulld r26, r7, r7 + mulhdu r27, r7, r7 + mulld r28, r12, r12 + mulhdu r29, r12, r12 + ld r10, 8(rp) + ld r11, 16(rp) + ld r6, 24(rp) + ld r7, 32(rp) + addc r10, r10, r10 + adde r11, r11, r11 + adde r6, r6, r6 + adde r7, r7, r7 + addze climb, r29 + addc r10, r10, r25 + adde r11, r11, r26 + adde r6, r6, r27 + adde r7, r7, r28 + std r24, 0(rp) + std r10, 8(rp) + std r11, 16(rp) + std r6, 24(rp) + std r7, 32(rp) + addi rp, rp, 40 + bdnz L(top) + b L(end) + +L(xb2): ld r6, 0(up) + ld r7, 8(up) + addi up, up, 16 + mulld r24, r6, r6 + mulhdu r25, r6, r6 + mulld r26, r7, r7 + mulhdu r27, r7, r7 + ld r10, 8(rp) + ld r11, 16(rp) + addc r10, r10, r10 + adde r11, r11, r11 + addze climb, r27 + addc r10, r10, r25 + adde r11, r11, r26 + std r24, 0(rp) + std r10, 8(rp) + std r11, 16(rp) + addi rp, rp, 24 + bdnz L(top) + b L(end) + +L(xb0): ld r6, 0(up) + ld r7, 8(up) + ld r12, 16(up) + ld r23, 24(up) + addi up, up, 32 + mulld r24, r6, r6 + mulhdu r25, r6, r6 + mulld r26, r7, r7 + mulhdu r27, r7, r7 + mulld r28, r12, r12 + mulhdu r29, r12, r12 + mulld r30, r23, r23 + mulhdu r31, r23, r23 + ld r10, 8(rp) + ld r11, 16(rp) + ld r6, 24(rp) + ld r7, 32(rp) + ld r12, 40(rp) + ld r23, 48(rp) + addc r10, r10, r10 + adde r11, r11, r11 + adde r6, r6, r6 + adde r7, r7, r7 + adde r12, r12, r12 + adde r23, r23, r23 + addze climb, r31 + std r24, 0(rp) + addc r10, r10, r25 + std r10, 8(rp) + adde r11, r11, r26 + std r11, 16(rp) + adde r6, r6, r27 + std r6, 24(rp) + adde r7, r7, r28 + std r7, 32(rp) + adde r12, r12, r29 + std r12, 40(rp) + adde r23, r23, r30 + std r23, 48(rp) + addi rp, rp, 56 + bdnz L(top) + b L(end) + +L(xb1): ld r6, 0(up) + addi up, up, 8 + mulld r24, r6, r6 + mulhdu climb, r6, r6 + std r24, 0(rp) + addic rp, rp, 8 C clear carry as side-effect + + ALIGN(32) +L(top): ld r6, 0(up) + ld r7, 8(up) + ld r12, 16(up) + ld r23, 24(up) + addi up, up, 32 + mulld r24, r6, r6 + mulhdu r25, r6, r6 + mulld r26, r7, r7 + mulhdu r27, r7, r7 + mulld r28, r12, r12 + mulhdu r29, r12, r12 + mulld r30, r23, r23 + mulhdu r31, r23, r23 + ld r8, 0(rp) + ld r9, 8(rp) + adde r8, r8, r8 + adde r9, r9, r9 + ld r10, 16(rp) + ld r11, 24(rp) + adde r10, r10, r10 + adde r11, r11, r11 + ld r6, 32(rp) + ld r7, 40(rp) + adde r6, r6, r6 + adde r7, r7, r7 + ld r12, 48(rp) + ld r23, 56(rp) + adde r12, r12, r12 + adde r23, r23, r23 + addze r31, r31 + addc r8, r8, climb + std r8, 0(rp) + adde r9, r9, r24 + std r9, 8(rp) + adde r10, r10, r25 + std r10, 16(rp) + adde r11, r11, r26 + std r11, 24(rp) + adde r6, r6, r27 + std r6, 32(rp) + adde r7, r7, r28 + std r7, 40(rp) + adde r12, r12, r29 + std r12, 48(rp) + adde r23, r23, r30 + std r23, 56(rp) + mr climb, r31 + addi rp, rp, 64 + bdnz L(top) + +L(end): addze climb, climb + std climb, 0(rp) + + ld r31, -8(r1) + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + ld r26, -48(r1) + ld r25, -56(r1) + ld r24, -64(r1) + ld r23, -72(r1) + ld r22, -80(r1) + ld r21, -88(r1) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/p6/lshift.asm b/gmp-6.3.0/mpn/powerpc64/p6/lshift.asm new file mode 100644 index 0000000..1a200fb --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/p6/lshift.asm @@ -0,0 +1,132 @@ +dnl PowerPC-64 mpn_lshift -- rp[] = up[] << cnt + +dnl Copyright 2003, 2005, 2010, 2013 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 2.25 +C POWER6 4 + +C TODO +C * Micro-optimise header code +C * Perhaps do 4-way unrolling, for 2.5 c/l on POWER6. The code is 4236 +C bytes, 4-way code would become about 50% larger. + +C INPUT PARAMETERS +define(`rp_param', `r3') +define(`up', `r4') +define(`n', `r5') +define(`cnt', `r6') + +define(`tnc',`r0') +define(`retval',`r3') +define(`rp', `r7') + +ASM_START() +PROLOGUE(mpn_lshift,toc) + +ifdef(`HAVE_ABI_mode32',` + rldicl n, n, 0,32 C FIXME: avoid this zero extend +') + mflr r12 + sldi r8, n, 3 + sldi r10, cnt, 6 C multiply cnt by size of a SHIFT block + LEAL( r11, L(e1)) C address of L(e1) label in SHIFT(1) + add up, up, r8 C make up point at end of up[] + add r11, r11, r10 C address of L(oN) for N = cnt + srdi r10, n, 1 + add rp, rp_param, r8 C make rp point at end of rp[] + subfic tnc, cnt, 64 + rlwinm. r8, n, 0,31,31 C extract bit 0 + mtctr r10 + beq L(evn) + +L(odd): ld r9, -8(up) + cmpdi cr0, n, 1 C n = 1? + beq L(1) + ld r8, -16(up) + addi r11, r11, -84 C L(o1) - L(e1) - 64 + mtlr r11 + srd r3, r9, tnc C retval + addi up, up, 8 + addi rp, rp, -8 + blr C branch to L(oN) + +L(evn): ld r8, -8(up) + ld r9, -16(up) + addi r11, r11, -64 + mtlr r11 + srd r3, r8, tnc C retval + blr C branch to L(eN) + +L(1): srd r3, r9, tnc C retval + sld r8, r9, cnt + std r8, -8(rp) + mtlr r12 +ifdef(`HAVE_ABI_mode32', +` mr r4, r3 + srdi r3, r3, 32 +') + blr + + +define(SHIFT,` +L(lo$1):ld r8, -24(up) + std r11, -8(rp) + addi rp, rp, -16 +L(o$1): srdi r10, r8, eval(64-$1) + rldimi r10, r9, $1, 0 + ld r9, -32(up) + addi up, up, -16 + std r10, 0(rp) +L(e$1): srdi r11, r9, eval(64-$1) + rldimi r11, r8, $1, 0 + bdnz L(lo$1) + std r11, -8(rp) + sldi r10, r9, $1 + b L(com) + nop + nop +') + + ALIGN(64) +forloop(`i',1,63,`SHIFT(i)') + +L(com): std r10, -16(rp) + mtlr r12 +ifdef(`HAVE_ABI_mode32', +` mr r4, r3 + srdi r3, r3, 32 +') + blr +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/p6/lshiftc.asm b/gmp-6.3.0/mpn/powerpc64/p6/lshiftc.asm new file mode 100644 index 0000000..e4b3caa --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/p6/lshiftc.asm @@ -0,0 +1,136 @@ +dnl PowerPC-64 mpn_lshiftc -- rp[] = ~up[] << cnt + +dnl Copyright 2003, 2005, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
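The distinctive trick in this p6 shift family is the dispatch: variable-count shifts are slow on POWER6, so the forloop() at the bottom emits 63 copies of the SHIFT block, one per possible cnt, each padded to 64 bytes so that the entry address is simply L(e1) + 64*cnt (computed into r11 and reached via mtlr/blr). Every shift executed inside the loop therefore has an immediate count. A C analogue of that per-count specialization, assuming 64-bit limbs and 1 <= cnt <= 63; the helper names are illustrative:

    #include <gmp.h>

    /* One body per count; with CNT a compile-time constant, the compiler
       emits immediate-form shifts, as each asm SHIFT(i) block does. */
    #define LSHIFT_FIXED(CNT)                                           \
      static mp_limb_t                                                  \
      lshift_##CNT (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)    \
      {                                                                 \
        mp_limb_t retval = up[n - 1] >> (64 - CNT);                     \
        for (mp_size_t i = n - 1; i > 0; i--)                           \
          rp[i] = (up[i] << CNT) | (up[i - 1] >> (64 - CNT));           \
        rp[0] = up[0] << CNT;                                           \
        return retval;                                                  \
      }

    LSHIFT_FIXED (1)   /* the m4 forloop expands the asm analogue for 1..63 */
    LSHIFT_FIXED (2)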
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 2.25 +C POWER6 4 + +C TODO +C * Micro-optimise header code +C * Perhaps do 4-way unrolling, for 2.5 c/l on POWER6. The code is 4236 +C bytes, 4-way code would become about 50% larger. + +C INPUT PARAMETERS +define(`rp_param', `r3') +define(`up', `r4') +define(`n', `r5') +define(`cnt', `r6') + +define(`tnc',`r0') +define(`retval',`r3') +define(`rp', `r7') + +ASM_START() +PROLOGUE(mpn_lshiftc,toc) + +ifdef(`HAVE_ABI_mode32',` + rldicl n, n, 0,32 C FIXME: avoid this zero extend +') + mflr r12 + sldi r8, n, 3 + sldi r10, cnt, 6 C multiply cnt by size of a SHIFT block + LEAL( r11, L(e1)) C address of L(e1) label in SHIFT(1) + add up, up, r8 C make up point at end of up[] + add r11, r11, r10 C address of L(oN) for N = cnt + srdi r10, n, 1 + add rp, rp_param, r8 C make rp point at end of rp[] + subfic tnc, cnt, 64 + rlwinm. r8, n, 0,31,31 C extract bit 0 + mtctr r10 + beq L(evn) + +L(odd): ld r9, -8(up) + cmpdi cr0, n, 1 C n = 1? + beq L(1) + ld r8, -16(up) + addi r11, r11, -88 C L(o1) - L(e1) - 64 + mtlr r11 + srd r3, r9, tnc C retval + addi up, up, 8 + addi rp, rp, -8 + blr C branch to L(oN) + +L(evn): ld r8, -8(up) + ld r9, -16(up) + addi r11, r11, -64 + mtlr r11 + srd r3, r8, tnc C retval + blr C branch to L(eN) + +L(1): srd r3, r9, tnc C retval + sld r8, r9, cnt + nor r8, r8, r8 + std r8, -8(rp) + mtlr r12 +ifdef(`HAVE_ABI_mode32', +` mr r4, r3 + srdi r3, r3, 32 +') + blr + + +define(SHIFT,` +L(lo$1):ld r8, -24(up) + nor r11, r11, r11 + std r11, -8(rp) + addi rp, rp, -16 +L(o$1): srdi r10, r8, eval(64-$1) + rldimi r10, r9, $1, 0 + ld r9, -32(up) + addi up, up, -16 + nor r10, r10, r10 + std r10, 0(rp) +L(e$1): srdi r11, r9, eval(64-$1) + rldimi r11, r8, $1, 0 + bdnz L(lo$1) + sldi r10, r9, $1 + b L(com) + nop +') + + ALIGN(64) +forloop(`i',1,63,`SHIFT(i)') + +L(com): nor r11, r11, r11 + nor r10, r10, r10 + std r11, -8(rp) + std r10, -16(rp) + mtlr r12 +ifdef(`HAVE_ABI_mode32', +` mr r4, r3 + srdi r3, r3, 32 +') + blr +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/p6/rshift.asm b/gmp-6.3.0/mpn/powerpc64/p6/rshift.asm new file mode 100644 index 0000000..9e848c1 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/p6/rshift.asm @@ -0,0 +1,131 @@ +dnl PowerPC-64 mpn_rshift -- rp[] = up[] << cnt + +dnl Copyright 2003, 2005, 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
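mpn_lshiftc above is plain lshift with every stored limb complemented (the extra nor r10,r10,r10 / nor r11,r11,r11 on each store path); the returned out-shifted bits are not complemented. As a reference loop, again assuming 64-bit limbs and 1 <= cnt <= 63:

    #include <gmp.h>

    mp_limb_t
    ref_lshiftc (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, unsigned cnt)
    {
      mp_limb_t retval = up[n - 1] >> (64 - cnt);   /* not complemented */
      for (mp_size_t i = n - 1; i > 0; i--)
        rp[i] = ~((up[i] << cnt) | (up[i - 1] >> (64 - cnt)));
      rp[0] = ~(up[0] << cnt);
      return retval;
    }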
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 2 +C POWER6 3.5 (mysteriously 3.0 for cnt=1) + +C TODO +C * Micro-optimise header code +C * Perhaps do 4-way unrolling, for 2.5 c/l on POWER6. The code is 4248 +C bytes, 4-way code would become about 50% larger. + +C INPUT PARAMETERS +define(`rp_param', `r3') +define(`up', `r4') +define(`n', `r5') +define(`cnt', `r6') + +define(`tnc',`r0') +define(`retval',`r3') +define(`rp', `r7') + +ASM_START() +PROLOGUE(mpn_rshift,toc) + +ifdef(`HAVE_ABI_mode32',` + rldicl n, n, 0,32 C FIXME: avoid this zero extend +') + mflr r12 + LEAL( r11, L(e1)) C address of L(e1) label in SHIFT(1) + sldi r10, cnt, 6 C multiply cnt by size of a SHIFT block + add r11, r11, r10 C address of L(oN) for N = cnt + srdi r10, n, 1 + mr rp, rp_param + subfic tnc, cnt, 64 + rlwinm. r8, n, 0,31,31 C extract bit 0 + mtctr r10 + beq L(evn) + +L(odd): ld r9, 0(up) + cmpdi cr0, n, 1 C n = 1? + beq L(1) + ld r8, 8(up) + addi r11, r11, -84 C L(o1) - L(e1) - 64 + mtlr r11 + sld r3, r9, tnc C retval + addi up, up, 8 + addi rp, rp, 8 + blr C branch to L(oN) + +L(evn): ld r8, 0(up) + ld r9, 8(up) + addi r11, r11, -64 + mtlr r11 + sld r3, r8, tnc C retval + addi up, up, 16 + blr C branch to L(eN) + +L(1): sld r3, r9, tnc C retval + srd r8, r9, cnt + std r8, 0(rp) + mtlr r12 +ifdef(`HAVE_ABI_mode32', +` mr r4, r3 + srdi r3, r3, 32 +') + blr + + +define(SHIFT,` +L(lo$1):ld r8, 0(up) + std r11, 0(rp) + addi rp, rp, 16 +L(o$1): srdi r10, r9, $1 + rldimi r10, r8, eval(64-$1), 0 + ld r9, 8(up) + addi up, up, 16 + std r10, -8(rp) +L(e$1): srdi r11, r8, $1 + rldimi r11, r9, eval(64-$1), 0 + bdnz L(lo$1) + std r11, 0(rp) + srdi r10, r9, $1 + b L(com) + nop + nop +') + + ALIGN(64) +forloop(`i',1,63,`SHIFT(i)') + +L(com): std r10, 8(rp) + mtlr r12 +ifdef(`HAVE_ABI_mode32', +` mr r4, r3 + srdi r3, r3, 32 +') + blr +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/powerpc64/p7/copyd.asm b/gmp-6.3.0/mpn/powerpc64/p7/copyd.asm new file mode 100644 index 0000000..f04ca58 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/p7/copyd.asm @@ -0,0 +1,128 @@ +dnl PowerPC-64 mpn_copyd. + +dnl Copyright 2004, 2005, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 ? +C POWER6 1.25 +C POWER7 1.09 + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') + +ASM_START() +PROLOGUE(mpn_copyd) + +ifdef(`HAVE_ABI_mode32', +` rldicl n, n, 0,32') + + sldi r0, n, 3 + add up, up, r0 C point at u[] end + add rp, rp, r0 C point at r[] end + + cmpdi cr0, n, 4 + blt L(sml) + + addi r10, n, 4 + srdi r10, r10, 3 + mtctr r10 + + andi. r0, n, 1 + rlwinm r11, n, 0,30,30 + rlwinm r12, n, 0,29,29 + cmpdi cr6, r11, 0 + cmpdi cr7, r12, 0 + + beq cr0, L(xx0) +L(xx1): ld r6, -8(up) + addi up, up, -8 + std r6, -8(rp) + addi rp, rp, -8 + +L(xx0): bne cr6, L(x10) +L(x00): ld r6, -8(up) + ld r7, -16(up) + bne cr7, L(100) +L(000): addi rp, rp, 32 + b L(lo0) +L(100): addi up, up, 32 + b L(lo4) +L(x10): ld r8, -8(up) + ld r9, -16(up) + bne cr7, L(110) +L(010): addi up, up, -16 + addi rp, rp, 16 + b L(lo2) +L(110): addi up, up, 16 + addi rp, rp, 48 + b L(lo6) + +L(sml): cmpdi cr0, n, 0 + beqlr- cr0 + mtctr n +L(t): ld r6, -8(up) + addi up, up, -8 + std r6, -8(rp) + addi rp, rp, -8 + bdnz L(t) + blr + + ALIGN(32) +L(top): std r6, -8(rp) + std r7, -16(rp) +L(lo2): ld r6, -8(up) + ld r7, -16(up) + std r8, -24(rp) + std r9, -32(rp) +L(lo0): ld r8, -24(up) + ld r9, -32(up) + std r6, -40(rp) + std r7, -48(rp) +L(lo6): ld r6, -40(up) + ld r7, -48(up) + std r8, -56(rp) + std r9, -64(rp) + addi rp, rp, -64 +L(lo4): ld r8, -56(up) + ld r9, -64(up) + addi up, up, -64 + bdnz L(top) + +L(end): std r6, -8(rp) + std r7, -16(rp) + std r8, -24(rp) + std r9, -32(rp) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/p7/copyi.asm b/gmp-6.3.0/mpn/powerpc64/p7/copyi.asm new file mode 100644 index 0000000..854cf9f --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/p7/copyi.asm @@ -0,0 +1,129 @@ +dnl PowerPC-64 mpn_copyi. + +dnl Copyright 2004, 2005, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
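mpn_copyd above copies its n limbs from the high end downward, which is the overlap-safe order when rp sits above up; the companion mpn_copyi introduced next copies upward for the opposite overlap. The contract, minus the unrolled software pipelining:

    #include <gmp.h>

    void
    ref_copyd (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
    {
      for (mp_size_t i = n; i-- > 0;)
        rp[i] = up[i];
    }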
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 ? +C POWER6 1.25 +C POWER7 1.09 + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') + +C TODO +C * Try rolling the two loop leading std to the end, allowing the code to +C handle also n = 2. +C * Consider using 4 pointers, schedule ptr update early wrt use. + +ASM_START() +PROLOGUE(mpn_copyi) + +ifdef(`HAVE_ABI_mode32', +` rldicl n, n, 0,32') + + cmpdi cr0, n, 4 + blt L(sml) + + addi r10, n, 4 + srdi r10, r10, 3 + mtctr r10 + + andi. r0, n, 1 + rlwinm r11, n, 0,30,30 + rlwinm r12, n, 0,29,29 + cmpdi cr6, r11, 0 + cmpdi cr7, r12, 0 + + beq cr0, L(xx0) +L(xx1): ld r6, 0(up) + addi up, up, 8 + std r6, 0(rp) + addi rp, rp, 8 + +L(xx0): bne cr6, L(x10) +L(x00): ld r6, 0(up) + ld r7, 8(up) + bne cr7, L(100) +L(000): addi rp, rp, -32 + b L(lo0) +L(100): addi up, up, -32 + b L(lo4) +L(x10): ld r8, 0(up) + ld r9, 8(up) + bne cr7, L(110) +L(010): addi up, up, 16 + addi rp, rp, -16 + b L(lo2) +L(110): addi up, up, -16 + addi rp, rp, -48 + b L(lo6) + +L(sml): cmpdi cr0, n, 0 + beqlr- cr0 + mtctr n +L(t): ld r6, 0(up) + addi up, up, 8 + std r6, 0(rp) + addi rp, rp, 8 + bdnz L(t) + blr + + ALIGN(32) +L(top): std r6, 0(rp) + std r7, 8(rp) +L(lo2): ld r6, 0(up) + ld r7, 8(up) + std r8, 16(rp) + std r9, 24(rp) +L(lo0): ld r8, 16(up) + ld r9, 24(up) + std r6, 32(rp) + std r7, 40(rp) +L(lo6): ld r6, 32(up) + ld r7, 40(up) + std r8, 48(rp) + std r9, 56(rp) + addi rp, rp, 64 +L(lo4): ld r8, 48(up) + ld r9, 56(up) + addi up, up, 64 + bdnz L(top) + +L(end): std r6, 0(rp) + std r7, 8(rp) + std r8, 16(rp) + std r9, 24(rp) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/p7/hamdist.asm b/gmp-6.3.0/mpn/powerpc64/p7/hamdist.asm new file mode 100644 index 0000000..960b3bc --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/p7/hamdist.asm @@ -0,0 +1,110 @@ +dnl PowerPC-64 mpn_hamdist. + +dnl Copyright 2012, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
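The hamdist routine below computes the Hamming distance of two n-limb operands, i.e. the total population count of up[] XOR vp[]. Its loop keeps two independent accumulator chains (r0/r30 and r12/r31) so consecutive popcntd results never feed each other, hiding the instruction's latency. A portable statement of the result, assuming a GCC-style builtin and 64-bit limbs:

    #include <gmp.h>

    mp_bitcnt_t
    ref_hamdist (const mp_limb_t *up, const mp_limb_t *vp, mp_size_t n)
    {
      mp_bitcnt_t cnt = 0;
      for (mp_size_t i = 0; i < n; i++)
        cnt += __builtin_popcountll (up[i] ^ vp[i]);  /* popcntd in the asm */
      return cnt;
    }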
+ +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 2.87 + +define(`up', r3) +define(`vp', r4) +define(`n', r5) + +ASM_START() +PROLOGUE(mpn_hamdist) + std r30, -16(r1) + std r31, -8(r1) + + addi r0, n, 1 +ifdef(`HAVE_ABI_mode32', +` rldicl r0, r0, 63,33', C ...branch count +` srdi r0, r0, 1') C ...for ctr + mtctr r0 + + andi. r0, n, 1 + + li r0, 0 + li r12, 0 + + beq L(evn) + +L(odd): ld r6, 0(up) + addi up, up, 8 + ld r8, 0(vp) + addi vp, vp, 8 + xor r10, r6, r8 + popcntd(r0, r10) + bdz L(e1) + +L(evn): ld r6, 0(up) + ld r8, 0(vp) + ld r7, 8(up) + ld r9, 8(vp) + xor r10, r6, r8 + addi up, up, 16 + addi vp, vp, 16 + li r30, 0 + li r31, 0 + bdz L(end) + + nop + nop +C ALIGN(16) +L(top): add r0, r0, r30 + ld r6, 0(up) + ld r8, 0(vp) + xor r11, r7, r9 + popcntd(r30, r10) + add r12, r12, r31 + ld r7, 8(up) + ld r9, 8(vp) + xor r10, r6, r8 + popcntd(r31, r11) + addi up, up, 16 + addi vp, vp, 16 + bdnz L(top) + +L(end): add r0, r0, r30 + xor r11, r7, r9 + popcntd(r30, r10) + add r12, r12, r31 + popcntd(r31, r11) + + add r0, r0, r30 + add r12, r12, r31 +L(e1): add r3, r0, r12 + ld r30, -16(r1) + ld r31, -8(r1) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/p7/popcount.asm b/gmp-6.3.0/mpn/powerpc64/p7/popcount.asm new file mode 100644 index 0000000..129ffef --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/p7/popcount.asm @@ -0,0 +1,90 @@ +dnl PowerPC-64 mpn_popcount. + +dnl Copyright 2012, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 2 + +define(`up', r3) +define(`n', r4) + +ASM_START() +PROLOGUE(mpn_popcount) + addi r0, n, 1 +ifdef(`HAVE_ABI_mode32', +` rldicl r0, r0, 63,33', C ...branch count +` srdi r0, r0, 1') C ...for ctr + mtctr r0 + + andi. 
r0, n, 1 + + li r0, 0 + li r12, 0 + beq L(evn) + +L(odd): ld r4, 0(up) + addi up, up, 8 + popcntd(r0, r4) + bdz L(e1) + +L(evn): ld r4, 0(up) + ld r5, 8(up) + popcntd(r8, r4) + popcntd(r9, r5) + bdz L(e2) + + ld r4, 16(up) + ld r5, 24(up) + bdz L(e4) + addi up, up, 32 + +L(top): add r0, r0, r8 + popcntd(r8, r4) + ld r4, 0(up) + add r12, r12, r9 + popcntd(r9, r5) + ld r5, 8(up) + addi up, up, 16 + bdnz L(top) + +L(e4): add r0, r0, r8 + popcntd(r8, r4) + add r12, r12, r9 + popcntd(r9, r5) +L(e2): add r0, r0, r8 + add r12, r12, r9 +L(e1): add r3, r0, r12 + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/rshift.asm b/gmp-6.3.0/mpn/powerpc64/rshift.asm new file mode 100644 index 0000000..7654a16 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/rshift.asm @@ -0,0 +1,207 @@ +dnl PowerPC-64 mpn_rshift -- rp[] = up[] >> cnt + +dnl Copyright 2003, 2005, 2010, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 2.25 +C POWER6 9.75 +C POWER7 2.15 + +C TODO +C * Try to reduce the number of needed live registers +C * Micro-optimise header code +C * Keep in synch with lshift.asm and lshiftc.asm + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`cnt', `r6') + +define(`tnc',`r0') +define(`u0',`r30') +define(`u1',`r31') +define(`retval',`r5') + +ASM_START() +PROLOGUE(mpn_rshift) + std r31, -8(r1) + std r30, -16(r1) + subfic tnc, cnt, 64 +C sldi r30, n, 3 C byte count corresponding to n +C add rp, rp, r30 C rp = rp + n +C add up, up, r30 C up = up + n + rldicl. r30, n, 0,62 C r30 = n & 3, set cr0 + cmpdi cr6, r30, 2 + addi r31, n, 3 C compute count... 
+ ld r10, 0(up) C load 1st limb for b00...b11 + sld retval, r10, tnc +ifdef(`HAVE_ABI_mode32', +` rldicl r31, r31, 62,34', C ...branch count +` srdi r31, r31, 2') C ...for ctr + mtctr r31 C copy count into ctr + beq cr0, L(b00) + blt cr6, L(b01) + ld r11, 8(up) C load 2nd limb for b10 and b11 + beq cr6, L(b10) + + ALIGN(16) +L(b11): srd r8, r10, cnt + sld r9, r11, tnc + ld u1, 16(up) + addi up, up, 24 + srd r12, r11, cnt + sld r7, u1, tnc + addi rp, rp, -16 + bdnz L(gt3) + + or r11, r8, r9 + srd r8, u1, cnt + b L(cj3) + + ALIGN(16) +L(gt3): ld u0, 0(up) + or r11, r8, r9 + srd r8, u1, cnt + sld r9, u0, tnc + ld u1, 8(up) + or r10, r12, r7 + b L(L11) + + ALIGN(32) +L(b10): srd r12, r10, cnt + addi rp, rp, -24 + sld r7, r11, tnc + bdnz L(gt2) + + srd r8, r11, cnt + or r10, r12, r7 + b L(cj2) + +L(gt2): ld u0, 16(up) + srd r8, r11, cnt + sld r9, u0, tnc + ld u1, 24(up) + or r10, r12, r7 + srd r12, u0, cnt + sld r7, u1, tnc + ld u0, 32(up) + or r11, r8, r9 + addi up, up, 16 + b L(L10) + + ALIGN(16) +L(b00): ld u1, 8(up) + srd r12, r10, cnt + sld r7, u1, tnc + ld u0, 16(up) + srd r8, u1, cnt + sld r9, u0, tnc + ld u1, 24(up) + or r10, r12, r7 + srd r12, u0, cnt + sld r7, u1, tnc + addi rp, rp, -8 + bdz L(cj4) + +L(gt4): addi up, up, 32 + ld u0, 0(up) + or r11, r8, r9 + b L(L00) + + ALIGN(16) +L(b01): bdnz L(gt1) + srd r8, r10, cnt + std r8, 0(rp) + b L(ret) + +L(gt1): ld u0, 8(up) + srd r8, r10, cnt + sld r9, u0, tnc + ld u1, 16(up) + srd r12, u0, cnt + sld r7, u1, tnc + ld u0, 24(up) + or r11, r8, r9 + srd r8, u1, cnt + sld r9, u0, tnc + ld u1, 32(up) + addi up, up, 40 + or r10, r12, r7 + bdz L(end) + + ALIGN(32) +L(top): srd r12, u0, cnt + sld r7, u1, tnc + ld u0, 0(up) + std r11, 0(rp) + or r11, r8, r9 +L(L00): srd r8, u1, cnt + sld r9, u0, tnc + ld u1, 8(up) + std r10, 8(rp) + or r10, r12, r7 +L(L11): srd r12, u0, cnt + sld r7, u1, tnc + ld u0, 16(up) + std r11, 16(rp) + or r11, r8, r9 +L(L10): srd r8, u1, cnt + sld r9, u0, tnc + ld u1, 24(up) + addi up, up, 32 + std r10, 24(rp) + addi rp, rp, 32 + or r10, r12, r7 + bdnz L(top) + + ALIGN(32) +L(end): srd r12, u0, cnt + sld r7, u1, tnc + std r11, 0(rp) +L(cj4): or r11, r8, r9 + srd r8, u1, cnt + std r10, 8(rp) +L(cj3): or r10, r12, r7 + std r11, 16(rp) +L(cj2): std r10, 24(rp) + std r8, 32(rp) + +L(ret): ld r31, -8(r1) + ld r30, -16(r1) +ifdef(`HAVE_ABI_mode32', +` srdi r3, retval, 32 + mr r4, retval +',` mr r3, retval') + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/sec_tabselect.asm b/gmp-6.3.0/mpn/powerpc64/sec_tabselect.asm new file mode 100644 index 0000000..085577c --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/sec_tabselect.asm @@ -0,0 +1,147 @@ +dnl PowerPC-64 mpn_sec_tabselect. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
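mpn_sec_tabselect, introduced above and implemented below, reads all nents table entries unconditionally and keeps entry number `which', so the sequence of load addresses never depends on the secret index; selection uses an all-ones/all-zero mask that the asm derives from the borrow chain (addic i,i,-1 / subfe mask,mask,mask). The branch-free shape in C, as a sketch of the documented semantics:

    #include <gmp.h>

    void
    ref_sec_tabselect (mp_limb_t *rp, const mp_limb_t *tab,
                       mp_size_t n, mp_size_t nents, mp_size_t which)
    {
      for (mp_size_t k = 0; k < n; k++)
        rp[k] = 0;
      for (mp_size_t i = 0; i < nents; i++)
        {
          mp_limb_t mask = -(mp_limb_t) (i == which);  /* ~0 or 0, no branch */
          for (mp_size_t k = 0; k < n; k++)
            rp[k] |= tab[i * n + k] & mask;
        }
    }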
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 1.75 +C POWER4/PPC970 2.0 +C POWER5 ? +C POWER6 5.0 +C POWER7 1.75 + +define(`rp', `r3') +define(`tp', `r4') +define(`n', `r5') +define(`nents', `r6') +define(`which', `r7') + +define(`i', `r8') +define(`j', `r9') +define(`stride', `r12') +define(`mask', `r11') + + +ASM_START() +PROLOGUE(mpn_sec_tabselect) + addic. j, n, -4 C outer loop induction variable + std r31, -8(r1) + std r30, -16(r1) + std r29, -24(r1) + std r28, -32(r1) + std r27, -40(r1) + sldi stride, n, 3 + + blt cr0, L(outer_end) +L(outer_top): + mtctr nents + mr r10, tp + li r28, 0 + li r29, 0 + li r30, 0 + li r31, 0 + addic. j, j, -4 C outer loop induction variable + mr i, which + + ALIGN(16) +L(top): addic i, i, -1 C set carry iff i != 0 + subfe mask, mask, mask + ld r0, 0(tp) + ld r27, 8(tp) + and r0, r0, mask + and r27, r27, mask + or r28, r28, r0 + or r29, r29, r27 + ld r0, 16(tp) + ld r27, 24(tp) + and r0, r0, mask + and r27, r27, mask + or r30, r30, r0 + or r31, r31, r27 + add tp, tp, stride + bdnz L(top) + + std r28, 0(rp) + std r29, 8(rp) + std r30, 16(rp) + std r31, 24(rp) + addi tp, r10, 32 + addi rp, rp, 32 + bge cr0, L(outer_top) +L(outer_end): + + rldicl. r0, n, 63, 63 + beq cr0, L(b0x) +L(b1x): mtctr nents + mr r10, tp + li r28, 0 + li r29, 0 + mr i, which + ALIGN(16) +L(tp2): addic i, i, -1 + subfe mask, mask, mask + ld r0, 0(tp) + ld r27, 8(tp) + and r0, r0, mask + and r27, r27, mask + or r28, r28, r0 + or r29, r29, r27 + add tp, tp, stride + bdnz L(tp2) + std r28, 0(rp) + std r29, 8(rp) + addi tp, r10, 16 + addi rp, rp, 16 + +L(b0x): rldicl. r0, n, 0, 63 + beq cr0, L(b00) +L(b01): mtctr nents + mr r10, tp + li r28, 0 + mr i, which + ALIGN(16) +L(tp1): addic i, i, -1 + subfe mask, mask, mask + ld r0, 0(tp) + and r0, r0, mask + or r28, r28, r0 + add tp, tp, stride + bdnz L(tp1) + std r28, 0(rp) + +L(b00): ld r31, -8(r1) + ld r30, -16(r1) + ld r29, -24(r1) + ld r28, -32(r1) + ld r27, -40(r1) + blr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/powerpc64/umul.asm b/gmp-6.3.0/mpn/powerpc64/umul.asm new file mode 100644 index 0000000..7fcc72f --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/umul.asm @@ -0,0 +1,53 @@ +dnl PowerPC-64 umul_ppmm -- support for longlong.h + +dnl Copyright 2000, 2001, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2); +C + +ASM_START() +PROLOGUE(mpn_umul_ppmm) + + C r3 lowptr + C r4 m1 + C r5 m2 + + mulld r0, r4, r5 + mulhdu r4, r4, r5 + std r0, 0(r3) +ifdef(`HAVE_ABI_mode32', +` srdi r3, r4, 32 +',` mr r3, r4 +') + blr + +EPILOGUE(mpn_umul_ppmm) diff --git a/gmp-6.3.0/mpn/powerpc64/vmx/popcount.asm b/gmp-6.3.0/mpn/powerpc64/vmx/popcount.asm new file mode 100644 index 0000000..b95fb88 --- /dev/null +++ b/gmp-6.3.0/mpn/powerpc64/vmx/popcount.asm @@ -0,0 +1,230 @@ +dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_popcount. + +dnl Copyright 2006, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C 7400,7410 (G4): ? +C 744x,745x (G4+): 1.125 +C 970 (G5): 2.25 + +C TODO +C * Rewrite the awkward huge n outer loop code. +C * Two lvx, two vperm, and two vxor could make us a similar hamdist. +C * Compress cnsts table in 64-bit mode, only half the values are needed. + +define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8)) +define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES)) +define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES)) + +define(`OPERATION_popcount') + +define(`ap', `r3') +define(`n', `r4') + +define(`rtab', `v10') +define(`cnt4', `v11') + +ifelse(GMP_LIMB_BITS,32,` + define(`LIMB32',` $1') + define(`LIMB64',`') +',` + define(`LIMB32',`') + define(`LIMB64',` $1') +') + +C The inner loop handles up to 2^34 bits, i.e., 2^31 64-limbs, due to overflow +C in vsum4ubs. For large operands, we work in chunks, of size LIMBS_PER_CHUNK. 
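The VMX kernel below counts bits four vectors at a time with a table lookup: rtab (the first row of cnsts) holds the popcount of every 4-bit value, vperm uses the low nibble of each byte as an index into it, and vsrb by cnt4 exposes the high nibbles for a second lookup. vsum4ubs then folds byte counts into 32-bit accumulators, whose limited range is exactly why huge operands are processed in LIMBS_PER_CHUNK pieces. A scalar model of the per-nibble table step, self-contained C:

    /* Same 16 entries as the first cnsts row (0x00,0x01,0x01,0x02, ...). */
    static const unsigned char nibble_cnt[16] =
      { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 };

    unsigned
    limb_popcount (unsigned long long x)
    {
      unsigned c = 0;
      for (int i = 0; i < 16; i++)   /* 16 nibbles in a 64-bit limb */
        {
          c += nibble_cnt[x & 0xf];
          x >>= 4;
        }
      return c;
    }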
+define(`LIMBS_PER_CHUNK', 0x1000) +define(`LIMBS_CHUNK_THRES', 0x1001) + +ASM_START() +PROLOGUE(mpn_popcount,toc) + mfspr r10, 256 + oris r0, r10, 0xfffc C Set VRSAVE bit 0-13 + mtspr 256, r0 + +ifdef(`HAVE_ABI_mode32', +` rldicl n, n, 0, 32') C zero extend n + +C Load various constants into vector registers + LEAL( r11, cnsts) + li r12, 16 + vspltisb cnt4, 4 C 0x0404...04 used as shift count + + li r7, 160 + lvx rtab, 0, r11 + +LIMB64(`lis r0, LIMBS_CHUNK_THRES ') +LIMB64(`cmpd cr7, n, r0 ') + + lvx v0, 0, ap + addi r7, r11, 80 + rlwinm r6, ap, 2,26,29 + lvx v8, r7, r6 + vand v0, v0, v8 + +LIMB32(`rlwinm r8, ap, 30,30,31 ') +LIMB64(`rlwinm r8, ap, 29,31,31 ') + add n, n, r8 C compensate n for rounded down `ap' + + vxor v1, v1, v1 + li r8, 0 C grand total count + + vxor v12, v12, v12 C zero total count + vxor v13, v13, v13 C zero total count + + addic. n, n, -LIMBS_PER_VR + ble L(sum) + + addic. n, n, -LIMBS_PER_VR + ble L(lsum) + +C For 64-bit machines, handle huge n that would overflow vsum4ubs +LIMB64(`ble cr7, L(small) ') +LIMB64(`addis r9, n, -LIMBS_PER_CHUNK ') C remaining n +LIMB64(`lis n, LIMBS_PER_CHUNK ') + + ALIGN(16) +L(small): +LIMB32(`srwi r7, n, 3 ') C loop count corresponding to n +LIMB64(`srdi r7, n, 2 ') C loop count corresponding to n + addi r7, r7, 1 + mtctr r7 C copy n to count register + b L(ent) + + ALIGN(16) +L(top): + lvx v0, 0, ap +L(ent): lvx v1, r12, ap + addi ap, ap, 32 + vsrb v8, v0, cnt4 + vsrb v9, v1, cnt4 + vperm v2, rtab, rtab, v0 + vperm v3, rtab, rtab, v8 + vperm v4, rtab, rtab, v1 + vperm v5, rtab, rtab, v9 + vaddubm v6, v2, v3 + vaddubm v7, v4, v5 + vsum4ubs v12, v6, v12 + vsum4ubs v13, v7, v13 + bdnz L(top) + + andi. n, n, eval(LIMBS_PER_2VR-1) + beq L(rt) + + lvx v0, 0, ap + vxor v1, v1, v1 + cmpwi n, LIMBS_PER_VR + ble L(sum) +L(lsum): + vor v1, v0, v0 + lvx v0, r12, ap +L(sum): +LIMB32(`rlwinm r6, n, 4,26,27 ') +LIMB64(`rlwinm r6, n, 5,26,26 ') + addi r7, r11, 16 + lvx v8, r7, r6 + vand v0, v0, v8 + vsrb v8, v0, cnt4 + vsrb v9, v1, cnt4 + vperm v2, rtab, rtab, v0 + vperm v3, rtab, rtab, v8 + vperm v4, rtab, rtab, v1 + vperm v5, rtab, rtab, v9 + vaddubm v6, v2, v3 + vaddubm v7, v4, v5 + vsum4ubs v12, v6, v12 + vsum4ubs v13, v7, v13 + + ALIGN(16) +L(rt): vadduwm v3, v12, v13 + li r7, -16 C FIXME: does all ppc32 and ppc64 ABIs + stvx v3, r7, r1 C FIXME: ...support storing below sp? + + lwz r7, -16(r1) + add r8, r8, r7 + lwz r7, -12(r1) + add r8, r8, r7 + lwz r7, -8(r1) + add r8, r8, r7 + lwz r7, -4(r1) + add r8, r8, r7 + +C Handle outer loop for huge n. We inherit cr7 and r0 from above. 
+LIMB64(`ble cr7, L(ret) + vxor v12, v12, v12 C zero total count + vxor v13, v13, v13 C zero total count + mr n, r9 + cmpd cr7, n, r0 + ble cr7, L(2) + addis r9, n, -LIMBS_PER_CHUNK C remaining n + lis n, LIMBS_PER_CHUNK +L(2): srdi r7, n, 2 C loop count corresponding to n + mtctr r7 C copy n to count register + b L(top) +') + + ALIGN(16) +L(ret): mr r3, r8 + mtspr 256, r10 + blr +EPILOGUE() + +DEF_OBJECT(cnsts,16) +C Counts for vperm + .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03 + .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04 +C Masks for high end of number + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + + .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 + .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 +C Masks for low end of number + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + + .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + + .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + + .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff +END_OBJECT(cnsts) +ASM_END() diff --git a/gmp-6.3.0/mpn/powlo.c b/gmp-6.3.0/mpn/powlo.c new file mode 120000 index 0000000..62a2d78 --- /dev/null +++ b/gmp-6.3.0/mpn/powlo.c @@ -0,0 +1 @@ +../mpn/generic/powlo.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/powm.c b/gmp-6.3.0/mpn/powm.c new file mode 120000 index 0000000..b1d8744 --- /dev/null +++ b/gmp-6.3.0/mpn/powm.c @@ -0,0 +1 @@ +../mpn/generic/powm.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/pre_mod_1.c b/gmp-6.3.0/mpn/pre_mod_1.c new file mode 120000 index 0000000..ed1e6e2 --- /dev/null +++ b/gmp-6.3.0/mpn/pre_mod_1.c @@ -0,0 +1 @@ +../mpn/generic/pre_mod_1.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/random.c b/gmp-6.3.0/mpn/random.c new file mode 120000 index 0000000..b84c7c6 --- /dev/null +++ b/gmp-6.3.0/mpn/random.c @@ -0,0 +1 @@ +../mpn/generic/random.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/random2.c b/gmp-6.3.0/mpn/random2.c new file mode 120000 index 0000000..e9bd5b1 --- /dev/null +++ b/gmp-6.3.0/mpn/random2.c @@ -0,0 +1 @@ +../mpn/generic/random2.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/redc_1.c b/gmp-6.3.0/mpn/redc_1.c new file mode 120000 index 0000000..61d4a17 --- /dev/null +++ b/gmp-6.3.0/mpn/redc_1.c @@ -0,0 +1 @@ +../mpn/generic/redc_1.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/redc_2.c b/gmp-6.3.0/mpn/redc_2.c new file mode 120000 index 0000000..b2ede58 --- /dev/null +++ b/gmp-6.3.0/mpn/redc_2.c @@ -0,0 +1 @@ +../mpn/generic/redc_2.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/redc_n.c b/gmp-6.3.0/mpn/redc_n.c new file mode 120000 index 0000000..e9c260f --- /dev/null +++ b/gmp-6.3.0/mpn/redc_n.c @@ -0,0 +1 @@ +../mpn/generic/redc_n.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/remove.c b/gmp-6.3.0/mpn/remove.c new file mode 120000 index 0000000..057e2ca --- /dev/null +++ b/gmp-6.3.0/mpn/remove.c @@ -0,0 +1 @@ +../mpn/generic/remove.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/riscv/64/aors_n.asm b/gmp-6.3.0/mpn/riscv/64/aors_n.asm new file mode 100644 index 0000000..d267037 --- /dev/null +++ b/gmp-6.3.0/mpn/riscv/64/aors_n.asm @@ -0,0 +1,135 @@ +dnl 
RISC-V/64 mpn_add_n and mpn_sub_n. + +dnl Copyright 2016 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', `a0') +define(`ap', `a1') +define(`bp', `a2') +define(`n', `a3') + +define(`i', `a6') + +ifdef(`OPERATION_add_n',` + define(`ADDSUB', `add') + define(`CMPCY', `sltu $1, $2, $3') + define(`func', `mpn_add_n') +') +ifdef(`OPERATION_sub_n',` + define(`ADDSUB', `sub') + define(`CMPCY', `sltu $1, $3, $4') + define(`func', `mpn_sub_n') +') + +MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n) + +ASM_START() +PROLOGUE(func) + li t6, 0 + srli i, n, 2 + + ld a4, 0(ap) + ld a5, 0(bp) + + andi t0, n, 1 + andi t1, n, 2 + bnez t0, L(bx1) +L(bx0): bnez t1, L(b10) +L(b00): addi rp, rp, -8 + addi i, i, -1 + j L(b0) +L(b10): addi bp, bp, -16 + addi ap, ap, -16 + addi rp, rp, -24 + j L(b2) +L(bx1): bnez t1, L(b11) +L(b01): beqz i, L(1) + addi bp, bp, 8 + addi ap, ap, 8 + addi i, i, -1 + j L(b1) +L(1): ADDSUB t0, a4, a5 + sd t0, 0(rp) + CMPCY( a0, t0, a4, a5) + ret +L(b11): addi bp, bp, -8 + addi ap, ap, -8 + addi rp, rp, -16 + j L(b3) + + ALIGN( 16) +L(top): addi bp, bp, 32 + addi ap, ap, 32 + addi rp, rp, 32 + addi i, i, -1 +L(b1): ADDSUB t0, a4, a5 + CMPCY( t2, t0, a4, a5) + ld a4, 0(ap) + ld a5, 0(bp) + ADDSUB t4, t0, t6 + CMPCY( t3, t4, t0, t6) + sd t4, 0(rp) + or t6, t2, t3 + +L(b0): ADDSUB t1, a4, a5 + CMPCY( t2, t1, a4, a5) + ld a4, 8(ap) + ld a5, 8(bp) + ADDSUB t4, t1, t6 + CMPCY( t3, t4, t1, t6) + sd t4, 8(rp) + or t6, t2, t3 +L(b3): ADDSUB t0, a4, a5 + CMPCY( t2, t0, a4, a5) + ld a4, 16(ap) + ld a5, 16(bp) + ADDSUB t4, t0, t6 + CMPCY( t3, t4, t0, t6) + sd t4, 16(rp) + or t6, t2, t3 +L(b2): ADDSUB t1, a4, a5 + CMPCY( t2, t1, a4, a5) + ld a4, 24(ap) + ld a5, 24(bp) + ADDSUB t4, t1, t6 + CMPCY( t3, t4, t1, t6) + sd t4, 24(rp) + or t6, t2, t3 + bne i, x0, L(top) + +L(end): ADDSUB t0, a4, a5 + CMPCY( t2, t0, a4, a5) + ADDSUB t4, t0, t6 + CMPCY( t3, t4, t0, t6) + sd t4, 32(rp) + or a0, t2, t3 + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/riscv/64/aorsmul_1.asm b/gmp-6.3.0/mpn/riscv/64/aorsmul_1.asm new file mode 100644 index 0000000..1125a9f --- /dev/null +++ b/gmp-6.3.0/mpn/riscv/64/aorsmul_1.asm @@ -0,0 +1,75 @@ +dnl RISC-V/64 mpn_addmul_1 and mpn_submul_1. + +dnl Copyright 2016 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
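RISC-V has no carry flag, so the aors_n loop above reconstructs it arithmetically: for s = a + b mod 2^64 the carry-out is exactly (s < a), which is what the CMPCY/sltu macro computes, and because adding the incoming carry can itself overflow, two sltu results are OR-ed into the running carry t6 (at most one of the two can be 1). One limb step of the add case in C, assuming 64-bit limbs:

    #include <gmp.h>

    mp_limb_t
    add_step (mp_limb_t *r, mp_limb_t a, mp_limb_t b, mp_limb_t cy_in)
    {
      mp_limb_t s  = a + b;        /* ADDSUB */
      mp_limb_t c1 = s < a;        /* CMPCY: carry out of a + b */
      mp_limb_t t  = s + cy_in;    /* fold in the incoming carry */
      mp_limb_t c2 = t < s;        /* carry out of that addition */
      *r = t;
      return c1 | c2;              /* never both 1, so OR is safe */
    }

The addmul_1/submul_1 loop that follows threads its carry the same way, interleaving the sltu pair with mul/mulhu.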
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', `a0') +define(`up', `a1') +define(`n', `a2') +define(`v0', `a3') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`CMPCY', `sltu $1, $2, $3') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`CMPCY', `sltu $1, $3, $2') + define(`func', `mpn_submul_1') +') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() +PROLOGUE(func) + li a6, 0 + +L(top): ld a7, 0(up) + addi up, up, 8 C bookkeeping + ld a4, 0(rp) + addi rp, rp, 8 C bookkeeping + mul a5, a7, v0 + addi n, n, -1 C bookkeeping + mulhu a7, a7, v0 + ADDSUB a5, a4, a5 + ADDSUB a6, a5, a6 C cycle 0, 3, ... + CMPCY( a4, a5, a4) + add a4, a4, a7 + CMPCY( a5, a6, a5) C cycle 1, 4, ... + sd a6, -8(rp) + add a6, a4, a5 C cycle 2, 5, ... + bne n, x0, L(top) C bookkeeping + +L(end): mv a0, a6 + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/riscv/64/cnd_aors_n.asm b/gmp-6.3.0/mpn/riscv/64/cnd_aors_n.asm new file mode 100644 index 0000000..b3d5651 --- /dev/null +++ b/gmp-6.3.0/mpn/riscv/64/cnd_aors_n.asm @@ -0,0 +1,97 @@ +dnl RISC-V/64 mpn_cnd_add_n and mpn_cnd_sub_n. + +dnl Copyright 2016, 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
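The conditional add/subtract that follows must not branch on cnd (it is secret in the sec_* call chains), so cnd is first flattened to a mask: sltiu mask,cnd,1 yields 1 only when cnd is zero, and addi mask,mask,-1 turns that into all zeros (skip) or all ones (perform); every vp limb is then ANDed with the mask before the usual carry loop. The same computation in C:

    #include <gmp.h>

    mp_limb_t
    cnd_mask (mp_limb_t cnd)
    {
      return (mp_limb_t) (cnd == 0) - 1;   /* 0 when cnd==0, ~0 otherwise */
    }
    /* used per limb as: v = vp[i] & cnd_mask (cnd); */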
+ +include(`../config.m4') + +C INPUT PARAMETERS +define(`cnd', `a0') +define(`rp', `a1') +define(`up', `a2') +define(`vp', `a3') +define(`n', `a4') + +define(`mask', `t5') + +ifdef(`OPERATION_cnd_add_n',` + define(`ADDSUB', `add') + define(`CMPCY', `sltu $1, $2, $3') + define(`func', `mpn_cnd_add_n') +') +ifdef(`OPERATION_cnd_sub_n',` + define(`ADDSUB', `sub') + define(`CMPCY', `sltu $1, $3, $4') + define(`func', `mpn_cnd_sub_n') +') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ASM_START() +PROLOGUE(func) + li t6, 0 + + sltiu mask, cnd, 1 + addi mask, mask, -1 + + andi t0, n, 1 + beq t0, x0, L(top) + addi up, up, 8 + addi vp, vp, -8 + addi rp, rp, -8 + addi n, n, -1 + j L(mid) + +L(top): ld a0, 0(up) + ld a6, 0(vp) + addi n, n, -2 C bookkeeping + addi up, up, 16 C bookkeeping + and a6, a6, mask + ADDSUB t0, a0, a6 + CMPCY( t2, t0, a0, a6) + ADDSUB t4, t0, t6 C cycle 3, 9, ... + CMPCY( t3, t4, t0, t6) C cycle 4, 10, ... + sd t4, 0(rp) + add t6, t2, t3 C cycle 5, 11, ... +L(mid): ld a5, -8(up) + ld a7, 8(vp) + addi vp, vp, 16 C bookkeeping + addi rp, rp, 16 C bookkeeping + and a7, a7, mask + ADDSUB t1, a5, a7 + CMPCY( t2, t1, a5, a7) + ADDSUB t4, t1, t6 C cycle 0, 6, ... + CMPCY( t3, t4, t1, t6) C cycle 1, 7, ... + sd t4, -8(rp) + add t6, t2, t3 C cycle 2, 8, ... + bne n, x0, L(top) C bookkeeping + +L(end): mv a0, t6 + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/riscv/64/copyd.asm b/gmp-6.3.0/mpn/riscv/64/copyd.asm new file mode 100644 index 0000000..42557f8 --- /dev/null +++ b/gmp-6.3.0/mpn/riscv/64/copyd.asm @@ -0,0 +1,87 @@ +dnl RISC-V/64 mpn_copyd + +dnl Copyright 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
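+dnl
+dnl  copyd copies from the highest limb downwards, which keeps it correct
+dnl  when the destination overlaps the source from above (rp > up).  In C
+dnl  terms (illustration only):
+dnl
+dnl	for (i = n - 1; i >= 0; i--)
+dnl	  rp[i] = up[i];
+dnl
+dnl  The code below unrolls this four limbs per iteration and dispatches on
+dnl  n mod 4 to enter the rotated loop at the right point.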
+ +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', `a0') +define(`ap', `a1') +define(`n', `a2') + +define(`i', `a6') + +ASM_START() +PROLOGUE(mpn_copyd) + slli t0, n, 3 + add ap, ap, t0 + add rp, rp, t0 + srli i, n, 2 + + andi t0, n, 1 + andi t1, n, 2 + bnez t0, L(bx1) +L(bx0): beqz n, L(ret) + ld t0, -8(ap) + bnez t1, L(b10) +L(b00): addi rp, rp, 8 + addi i, i, -1 + j L(b0) +L(b10): addi ap, ap, 16 + addi rp, rp, 24 + j L(b2) +L(bx1): ld t2, -8(ap) + bnez t1, L(b11) + beqz i, L(1) + addi ap, ap, -8 + addi i, i, -1 + j L(b1) +L(1): sd t2, -8(rp) + ret +L(b11): addi ap, ap, 8 + addi rp, rp, 16 + j L(b3) + + ALIGN( 16) +L(top): addi ap, ap, -32 + addi rp, rp, -32 + addi i, i, -1 +L(b1): ld t0, -8(ap) + sd t2, -8(rp) +L(b0): ld t2, -16(ap) + sd t0, -16(rp) +L(b3): ld t0, -24(ap) + sd t2, -24(rp) +L(b2): ld t2, -32(ap) + sd t0, -32(rp) + bnez i, L(top) + +L(end): sd t2, -40(rp) +L(ret): ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/riscv/64/copyi.asm b/gmp-6.3.0/mpn/riscv/64/copyi.asm new file mode 100644 index 0000000..7a0b7fa --- /dev/null +++ b/gmp-6.3.0/mpn/riscv/64/copyi.asm @@ -0,0 +1,84 @@ +dnl RISC-V/64 mpn_copyi + +dnl Copyright 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', `a0') +define(`ap', `a1') +define(`n', `a2') + +define(`i', `a6') + +ASM_START() +PROLOGUE(mpn_copyi) + srli i, n, 2 + + andi t0, n, 1 + andi t1, n, 2 + bnez t0, L(bx1) +L(bx0): beqz n, L(ret) + ld t0, 0(ap) + bnez t1, L(b10) +L(b00): addi rp, rp, -8 + addi i, i, -1 + j L(b0) +L(b10): addi ap, ap, -16 + addi rp, rp, -24 + j L(b2) +L(bx1): ld t2, 0(ap) + bnez t1, L(b11) + beqz i, L(1) + addi ap, ap, 8 + addi i, i, -1 + j L(b1) +L(1): sd t2, 0(rp) + ret +L(b11): addi ap, ap, -8 + addi rp, rp, -16 + j L(b3) + + ALIGN( 16) +L(top): addi ap, ap, 32 + addi rp, rp, 32 + addi i, i, -1 +L(b1): ld t0, 0(ap) + sd t2, 0(rp) +L(b0): ld t2, 8(ap) + sd t0, 8(rp) +L(b3): ld t0, 16(ap) + sd t2, 16(rp) +L(b2): ld t2, 24(ap) + sd t0, 24(rp) + bnez i, L(top) + +L(end): sd t2, 32(rp) +L(ret): ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/riscv/64/lshift.asm b/gmp-6.3.0/mpn/riscv/64/lshift.asm new file mode 100644 index 0000000..413063f --- /dev/null +++ b/gmp-6.3.0/mpn/riscv/64/lshift.asm @@ -0,0 +1,121 @@ +dnl RISC-V/64 mpn_lshift + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp_arg',`a0') +define(`ap', `a1') +define(`n', `a2') +define(`cnt', `a3') + +define(`rp', `a4') +define(`tnc', `t5') +define(`i', `a7') + +ASM_START() +PROLOGUE(mpn_lshift) + slli t0, n, 3 + add ap, ap, t0 + add rp, rp_arg, t0 + sub tnc, x0, cnt + srli i, n, 2 + + ld t0, -8(ap) + srl a0, t0, tnc + + andi t6, n, 1 + andi a5, n, 2 + bnez t6, L(bx1) + + sll t3, t0, cnt + ld t0, -16(ap) + addi i, i, -1 + bnez a5, L(b10) + addi rp, rp, 16 + j L(b0) +L(b10): addi ap, ap, -16 + bge i, x0, L(b2) +L(eq2): srl t4, t0, tnc + sll t2, t0, cnt + or t4, t3, t4 + sd t4, -8(rp) + sd t2, -16(rp) + ret + +L(bx1): sll t2, t0, cnt + bnez a5, L(b11) + bnez i, L(gt1) + sd t2, -8(rp) + ret +L(gt1): ld t0, -16(ap) + addi ap, ap, -8 + addi rp, rp, 8 + addi i, i, -1 + j L(b1) +L(b11): ld t0, -16(ap) + addi ap, ap, 8 + addi rp, rp, 24 + j L(b3) + +L(top): addi ap, ap, -32 + addi rp, rp, -32 + addi i, i, -1 +L(b2): srl t4, t0, tnc + sll t2, t0, cnt + ld t0, -8(ap) + or t4, t3, t4 + sd t4, -8(rp) +L(b1): srl t4, t0, tnc + sll t3, t0, cnt + ld t0, -16(ap) + or t4, t2, t4 + sd t4, -16(rp) +L(b0): srl t4, t0, tnc + sll t2, t0, cnt + ld t0, -24(ap) + or t4, t3, t4 + sd t4, -24(rp) +L(b3): srl t4, t0, tnc + sll t3, t0, cnt + ld t0, -32(ap) + or t4, t2, t4 + sd t4, -32(rp) + bnez i, L(top) + +L(end): srl t4, t0, tnc + sll t2, t0, cnt + or t4, t3, t4 + sd t4, -40(rp) + sd t2, -48(rp) + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/riscv/64/mul_1.asm b/gmp-6.3.0/mpn/riscv/64/mul_1.asm new file mode 100644 index 0000000..e35eaa9 --- /dev/null +++ b/gmp-6.3.0/mpn/riscv/64/mul_1.asm @@ -0,0 +1,58 @@ +dnl RISC-V/64 mpn_mul_1. + +dnl Copyright 2016 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', `a0') +define(`up', `a1') +define(`n', `a2') +define(`v0', `a3') + +ASM_START() +PROLOGUE(mpn_mul_1) + li a6, 0 + +L(top): ld a7, 0(up) + addi up, up, 8 C bookkeeping + addi rp, rp, 8 C bookkeeping + mul a5, a7, v0 + addi n, n, -1 C bookkeeping + mulhu a7, a7, v0 + add a6, a5, a6 C cycle 0, 3, ... + sltu a5, a6, a5 C cycle 1, 4, ... + sd a6, -8(rp) + add a6, a7, a5 C cycle 2, 5, ... + bne n, x0, L(top) C bookkeeping + +L(end): mv a0, a6 + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/riscv/64/rshift.asm b/gmp-6.3.0/mpn/riscv/64/rshift.asm new file mode 100644 index 0000000..0069765 --- /dev/null +++ b/gmp-6.3.0/mpn/riscv/64/rshift.asm @@ -0,0 +1,119 @@ +dnl RISC-V/64 mpn_rshift + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
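+dnl
+dnl  rshift shifts n limbs right by cnt bits, 0 < cnt < 64, and returns the
+dnl  bits shifted out at the low end, placed at the top of a limb.  Each
+dnl  result limb combines two adjacent source limbs.  In C terms
+dnl  (illustration only):
+dnl
+dnl	tnc = 64 - cnt;
+dnl	retval = up[0] << tnc;
+dnl	for (i = 0; i < n - 1; i++)
+dnl	  rp[i] = (up[i] >> cnt) | (up[i + 1] << tnc);
+dnl	rp[n - 1] = up[n - 1] >> cnt;
+dnl	return retval;
+dnl
+dnl  The code computes tnc as 0 - cnt; RISC-V shift instructions use only
+dnl  the low six bits of the count, so -cnt behaves as 64 - cnt.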
+ +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp_arg',`a0') +define(`ap', `a1') +define(`n', `a2') +define(`cnt', `a3') + +define(`rp', `a4') +define(`tnc', `t5') +define(`i', `a7') + +ASM_START() +PROLOGUE(mpn_rshift) + mv rp, rp_arg + sub tnc, x0, cnt + srli i, n, 2 + + ld t0, 0(ap) + sll a0, t0, tnc + + andi t6, n, 1 + andi a5, n, 2 + bnez t6, L(bx1) + + srl t3, t0, cnt + ld t0, 8(ap) + addi i, i, -1 + bnez a5, L(b10) + addi rp, rp, -16 + j L(b0) +L(b10): addi ap, ap, 16 + bge i, x0, L(b2) +L(eq2): sll t4, t0, tnc + srl t2, t0, cnt + or t4, t3, t4 + sd t4, (rp) + sd t2, 8(rp) + ret + +L(bx1): srl t2, t0, cnt + bnez a5, L(b11) + bnez i, L(gt1) + sd t2, (rp) + ret +L(gt1): ld t0, 8(ap) + addi ap, ap, 8 + addi rp, rp, -8 + addi i, i, -1 + j L(b1) +L(b11): ld t0, 8(ap) + addi ap, ap, -8 + addi rp, rp, -24 + j L(b3) + +L(top): addi ap, ap, 32 + addi rp, rp, 32 + addi i, i, -1 +L(b2): sll t4, t0, tnc + srl t2, t0, cnt + ld t0, 0(ap) + or t4, t3, t4 + sd t4, 0(rp) +L(b1): sll t4, t0, tnc + srl t3, t0, cnt + ld t0, 8(ap) + or t4, t2, t4 + sd t4, 8(rp) +L(b0): sll t4, t0, tnc + srl t2, t0, cnt + ld t0, 16(ap) + or t4, t3, t4 + sd t4, 16(rp) +L(b3): sll t4, t0, tnc + srl t3, t0, cnt + ld t0, 24(ap) + or t4, t2, t4 + sd t4, 24(rp) + bnez i, L(top) + +L(end): sll t4, t0, tnc + srl t2, t0, cnt + or t4, t3, t4 + sd t4, 32(rp) + sd t2, 40(rp) + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/riscv/64/sec_tabselect.asm b/gmp-6.3.0/mpn/riscv/64/sec_tabselect.asm new file mode 100644 index 0000000..35211fd --- /dev/null +++ b/gmp-6.3.0/mpn/riscv/64/sec_tabselect.asm @@ -0,0 +1,140 @@ +dnl RISC-V/64 mpn_sec_tabselect + +dnl Copyright 2016 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
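+dnl
+dnl  sec_tabselect reads all nents table entries and keeps only the one at
+dnl  index `which', so the memory trace is independent of the secret index.
+dnl  In C terms (illustration only):
+dnl
+dnl	for (i = 0; i < n; i++)
+dnl	  rp[i] = 0;
+dnl	for (k = 0; k < nents; k++)
+dnl	  {
+dnl	    mask = -(mp_limb_t) (k == which);  /* all ones iff k == which */
+dnl	    for (i = 0; i < n; i++)
+dnl	      rp[i] |= tab[k * n + i] & mask;
+dnl	  }
+dnl
+dnl  Below, a counter starts at `which' and is decremented once per entry,
+dnl  so the sltu against the constant 1 yields the all-ones mask exactly
+dnl  once; the limb dimension is unrolled four ways.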
+ +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', `a0') +define(`tp', `a1') +define(`n', `a2') +define(`nents', `a3') +define(`which', `a4') + +define(`i', `a6') +define(`j', `a7') +define(`mask', `s0') +define(`k', `nents') +define(`one', `s3') + +ASM_START() +PROLOGUE(mpn_sec_tabselect) + addi sp,sp,-32 + sd s0,24(sp) + sd s1,16(sp) + sd s2,8(sp) + sd s3,0(sp) + + addi j, n, -4 + slli n, n, 3 + li one, 1 + + sub k, which, nents + blt j, zero, L(outer_end) +L(outer_top): + mv s2, tp + li t0, 0 + li t1, 0 + li t2, 0 + li t3, 0 + addi j, j, -4 + mv i, which + + ALIGN(16) +L(top): ld t4, 0(tp) + ld t5, 8(tp) + sltu mask, i, one + addi i, i, -1 + neg mask, mask + ld t6, 16(tp) + ld a5, 24(tp) + and t4, mask, t4 + and t5, mask, t5 + or t0, t4, t0 + or t1, t5, t1 + and t6, mask, t6 + and a5, mask, a5 + or t2, t6, t2 + or t3, a5, t3 + add tp, tp, n + bne i, k, L(top) + + sd t0, 0(rp) + sd t1, 8(rp) + sd t2, 16(rp) + sd t3, 24(rp) + add tp, s2, 32 + add rp, rp, 32 + bge j, zero, L(outer_top) +L(outer_end): + andi t0, n, 2*8 + beq t0, zero, L(b0x) +L(b1x): mv s2, tp + li t0, 0 + li t1, 0 + mv i, which + ALIGN(16) +L(tp2): ld t4, 0(tp) + ld t5, 8(tp) + sltu mask, i, one + neg mask, mask + addi i, i, -1 + and t4, mask, t4 + and t5, mask, t5 + or t0, t4, t0 + or t1, t5, t1 + add tp, tp, n + bne i, k, L(tp2) + sd t0, 0(rp) + sd t1, 8(rp) + addi tp, s2, 16 + addi rp, rp, 16 + +L(b0x): andi t0, n, 1*8 + beq t0, zero, L(b00) +L(b01): li t0, 0 + mv i, which + ALIGN(16) +L(tp1): ld t4, 0(tp) + sltu mask, i, one + neg mask, mask + addi i, i, -1 + and t4, mask, t4 + or t0, t4, t0 + add tp, tp, n + bne i, k, L(tp1) + sd t0, 0(rp) + +L(b00): ld s0,24(sp) + ld s1,16(sp) + ld s2,8(sp) + ld s3,0(sp) + addi sp,sp,32 + jr ra +EPILOGUE() diff --git a/gmp-6.3.0/mpn/rootrem.c b/gmp-6.3.0/mpn/rootrem.c new file mode 120000 index 0000000..ac052e8 --- /dev/null +++ b/gmp-6.3.0/mpn/rootrem.c @@ -0,0 +1 @@ +../mpn/generic/rootrem.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/rshift.asm b/gmp-6.3.0/mpn/rshift.asm new file mode 120000 index 0000000..7b0cd8b --- /dev/null +++ b/gmp-6.3.0/mpn/rshift.asm @@ -0,0 +1 @@ +../mpn/x86/p6/mmx/rshift.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/s390_32/README b/gmp-6.3.0/mpn/s390_32/README new file mode 100644 index 0000000..59519ba --- /dev/null +++ b/gmp-6.3.0/mpn/s390_32/README @@ -0,0 +1,37 @@ +All current (2001) S/390 and z/Architecture machines are single-issue, +but some newer machines have a deep pipeline. Software-pipelining is +therefore beneficial. + +* mpn_add_n, mpn_sub_n: Use code along the lines below. Two-way unrolling + would be adequate. + + mp_limb_t + mpn_add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) + { + mp_limb_t a, b, r, cy; + mp_size_t i; + mp_limb_t mm = -1; + + cy = 0; + up += n; + vp += n; + rp += n; + i = -n; + do + { + a = up[i]; + b = vp[i]; + r = a + b + cy; + rp[i] = r; + cy = (((a & b) | ((a | b) & (r ^ mm)))) >> 31; + i++; + } + while (i < 0); + return cy; + } + +* mpn_lshift, mpn_rshift: Use SLDL/SRDL, and two-way unrolling. + +* mpn_mul_1, mpn_addmul_1, mpn_submul_1: For machines with just signed + multiply (MR), use two loops, similar to the corresponding VAX or + POWER functions. Handle carry like for mpn_add_n. 
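+
+  The sign correction for the signed-multiply-only case works as follows:
+  interpreting a 32-bit limb u as signed subtracts 2^32 exactly when its
+  top bit is set, so the unsigned high product equals the signed high
+  product plus v when u is negative, plus u when v is negative; the low
+  half is unchanged.  A C sketch for one 32x32->64 product (illustration
+  only; assumes the usual two's complement unsigned-to-int conversion):
+
+  unsigned int
+  umulhi_from_signed (unsigned int u, unsigned int v)
+  {
+    long long p = (long long) (int) u * (int) v;  /* signed multiply */
+    unsigned int hi = (unsigned long long) p >> 32;
+    hi += (-(u >> 31)) & v;   /* u negative: add v to high half */
+    hi += (-(v >> 31)) & u;   /* v negative: add u to high half */
+    return hi;
+  }
+
+  The addmul_1.asm below uses this scheme: its .Loopp path (v >= 0)
+  applies only the u correction, while .Loopn (v < 0) also adds u
+  unconditionally.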
diff --git a/gmp-6.3.0/mpn/s390_32/addmul_1.asm b/gmp-6.3.0/mpn/s390_32/addmul_1.asm
new file mode 100644
index 0000000..97189a8
--- /dev/null
+++ b/gmp-6.3.0/mpn/s390_32/addmul_1.asm
@@ -0,0 +1,93 @@
+dnl S/390 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
+dnl result to a second limb vector.
+
+dnl Copyright 2001 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(`rp',2)
+define(`up',3)
+define(`n',4)
+define(`vlimb',5)
+define(`cylimb',7)
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ stm 6,7,24(15)
+ slr cylimb,cylimb # clear cylimb
+ ltr vlimb,vlimb
+ jnl .Loopp
+
+.Loopn: l 1,0(up) # load from u
+ lr 6,1 #
+ mr 0,vlimb # multiply signed
+ alr 0,6 # add ulimb to phi (v is negative)
+ sra 6,31 # make mask
+ nr 6,vlimb # 0 or vlimb
+ alr 0,6 # conditionally add vlimb to phi
+ alr 1,cylimb # add carry limb to plo
+ brc 8+4,+8 # branch if not carry
+ ahi 0,1 # increment phi
+ l 6,0(rp) # load r limb
+ alr 6,1 # add plo to r limb
+ brc 8+4,+8 # branch if not carry
+ ahi 0,1 # increment phi
+ lr cylimb,0 # new cylimb
+ st 6,0(rp) # store
+ la up,4(,up)
+ la rp,4(,rp)
+ brct n,.Loopn
+
+ lr 2,cylimb
+ lm 6,7,24(15)
+ br 14
+
+.Loopp: l 1,0(up) # load from u
+ lr 6,1 #
+ mr 0,vlimb # multiply signed
+ sra 6,31 # make mask
+ nr 6,vlimb # 0 or vlimb
+ alr 0,6 # conditionally add vlimb to phi
+ alr 1,cylimb # add carry limb to plo
+ brc 8+4,+8 # branch if not carry
+ ahi 0,1 # increment phi
+ l 6,0(rp) # load r limb
+ alr 6,1 # add plo to r limb
+ brc 8+4,+8 # branch if not carry
+ ahi 0,1 # increment phi
+ lr cylimb,0 # new cylimb
+ st 6,0(rp) # store
+ la up,4(,up)
+ la rp,4(,rp)
+ brct n,.Loopp
+
+ lr 2,cylimb
+ lm 6,7,24(15)
+ br 14
+EPILOGUE(mpn_addmul_1)
diff --git a/gmp-6.3.0/mpn/s390_32/copyd.asm b/gmp-6.3.0/mpn/s390_32/copyd.asm
new file mode 100644
index 0000000..ff252bc
--- /dev/null
+++ b/gmp-6.3.0/mpn/s390_32/copyd.asm
@@ -0,0 +1,145 @@
+dnl S/390-32 mpn_copyd
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb
+C z900 1.65
+C z990 1.125
+C z9 ?
+C z10 ?
+C z196 ?
+
+C FIXME:
+C * Avoid saving/restoring callee-saves registers for n < 3. This could be
+C done by setting rp=r1, up=r2, i=r0 and using r3,r4,r5 as scratch regs.
+C We could then use r3...r10 in the main loop.
+
+C INPUT PARAMETERS
+define(`rp_param', `%r2')
+define(`up_param', `%r3')
+define(`n', `%r4')
+
+define(`rp', `%r8')
+define(`up', `%r9')
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+ stm %r6, %r11, 24(%r15)
+
+ lr %r1, n
+ sll %r1, 2
+ la %r10, 8(n)
+ ahi %r1, -32
+ srl %r10, 3
+ lhi %r11, -32
+
+ la rp, 0(%r1,rp_param) C FIXME use lay on z990 and later
+ la up, 0(%r1,up_param) C FIXME use lay on z990 and later
+
+ lhi %r7, 7
+ nr %r7, n C n mod 8
+ chi %r7, 2
+ jh L(b34567)
+ chi %r7, 1
+ je L(b1)
+ jh L(b2)
+
+L(b0): brct %r10, L(top)
+ j L(end)
+
+L(b1): l %r0, 28(up)
+ ahi up, -4
+ st %r0, 28(rp)
+ ahi rp, -4
+ brct %r10, L(top)
+ j L(end)
+
+L(b2): lm %r0, %r1, 24(up)
+ ahi up, -8
+ stm %r0, %r1, 24(rp)
+ ahi rp, -8
+ brct %r10, L(top)
+ j L(end)
+
+L(b34567):
+ chi %r7, 4
+ jl L(b3)
+ je L(b4)
+ chi %r7, 6
+ je L(b6)
+ jh L(b7)
+
+L(b5): lm %r0, %r4, 12(up)
+ ahi up, -20
+ stm %r0, %r4, 12(rp)
+ ahi rp, -20
+ brct %r10, L(top)
+ j L(end)
+
+L(b3): lm %r0, %r2, 20(up)
+ ahi up, -12
+ stm %r0, %r2, 20(rp)
+ ahi rp, -12
+ brct %r10, L(top)
+ j L(end)
+
+L(b4): lm %r0, %r3, 16(up)
+ ahi up, -16
+ stm %r0, %r3, 16(rp)
+ ahi rp, -16
+ brct %r10, L(top)
+ j L(end)
+
+L(b6): lm %r0, %r5, 8(up)
+ ahi up, -24
+ stm %r0, %r5, 8(rp)
+ ahi rp, -24
+ brct %r10, L(top)
+ j L(end)
+
+L(b7): lm %r0, %r6, 4(up)
+ ahi up, -28
+ stm %r0, %r6, 4(rp)
+ ahi rp, -28
+ brct %r10, L(top)
+ j L(end)
+
+L(top): lm %r0, %r7, 0(up)
+ la up, 0(%r11,up)
+ stm %r0, %r7, 0(rp)
+ la rp, 0(%r11,rp)
+ brct %r10, L(top)
+
+L(end): lm %r6, %r11, 24(%r15)
+ br %r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/s390_32/copyi.asm b/gmp-6.3.0/mpn/s390_32/copyi.asm
new file mode 100644
index 0000000..1df32f1
--- /dev/null
+++ b/gmp-6.3.0/mpn/s390_32/copyi.asm
@@ -0,0 +1,69 @@
+dnl S/390-32 mpn_copyi
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb
+C z900 0.75
+C z990 0.375
+C z9 ?
+C z10 ?
+C z196 ?
+
+C NOTE
+C * This is based on GNU libc memcpy which was written by Martin Schwidefsky.
+
+C INPUT PARAMETERS
+define(`rp', `%r2')
+define(`up', `%r3')
+define(`n', `%r4')
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+ ltr %r4, %r4
+ sll %r4, 2
+ je L(rtn)
+ ahi %r4, -1
+ lr %r5, %r4
+ srl %r5, 8
+ ltr %r5, %r5 C < 256 bytes to copy?
+ je L(1)
+
+L(top): mvc 0(256, rp), 0(up)
+ la rp, 256(rp)
+ la up, 256(up)
+ brct %r5, L(top)
+
+L(1): bras %r5, L(2) C make r5 point to mvc insn
+ mvc 0(1, rp), 0(up)
+L(2): ex %r4, 0(%r5) C execute mvc with length ((n-1) mod 256)+1
+L(rtn): br %r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/s390_32/esame/addmul_1.asm b/gmp-6.3.0/mpn/s390_32/esame/addmul_1.asm
new file mode 100644
index 0000000..4375b74
--- /dev/null
+++ b/gmp-6.3.0/mpn/s390_32/esame/addmul_1.asm
@@ -0,0 +1,72 @@
+dnl S/390-32 mpn_addmul_1 for systems with MLR instruction
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C z900 18.5
+C z990 10
+C z9 ?
+C z10 ?
+C z196 ?
+
+C INPUT PARAMETERS
+define(`rp', `%r2')
+define(`up', `%r3')
+define(`n', `%r4')
+define(`v0', `%r5')
+
+define(`z', `%r9')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ stm %r9, %r12, 36(%r15)
+ lhi %r12, 0 C zero index register
+ ahi %r12, 0 C clear carry flag
+ lhi %r11, 0 C clear carry limb
+ lhi z, 0 C clear carry limb
+
+L(top): l %r1, 0(%r12,up)
+ l %r10, 0(%r12,rp)
+ mlr %r0, v0
+ alcr %r1, %r10
+ alcr %r0, z
+ alr %r1, %r11
+ lr %r11, %r0
+ st %r1, 0(%r12,rp)
+ la %r12, 4(%r12)
+ brct n, L(top)
+
+ lhi %r2, 0
+ alcr %r2, %r11
+
+ lm %r9, %r12, 36(%r15)
+ br %r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/s390_32/esame/aors_n.asm b/gmp-6.3.0/mpn/s390_32/esame/aors_n.asm
new file mode 100644
index 0000000..98b0dbc
--- /dev/null
+++ b/gmp-6.3.0/mpn/s390_32/esame/aors_n.asm
@@ -0,0 +1,137 @@
+dnl S/390-32 mpn_add_n and mpn_sub_n.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C z900 ?
+C z990 2.75-3 (fast for even n, slow for odd n)
+C z9 ?
+C z10 ?
+C z196 ?
+
+C TODO
+C * Optimise for small n
+C * Use r0 and save/restore one less register
+C * Using logops_n's v1 inner loop operand order makes the loop about 20%
+C faster, at the expense of highly alignment-dependent performance.
+
+C INPUT PARAMETERS
+define(`rp', `%r2')
+define(`up', `%r3')
+define(`vp', `%r4')
+define(`n', `%r5')
+
+ifdef(`OPERATION_add_n', `
+ define(ADSB, al)
+ define(ADSBCR, alcr)
+ define(ADSBC, alc)
+ define(RETVAL,`dnl
+ lhi %r2, 0
+ alcr %r2, %r2')
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+ define(ADSB, sl)
+ define(ADSBCR, slbr)
+ define(ADSBC, slb)
+ define(RETVAL,`dnl
+ slbr %r2, %r2
+ lcr %r2, %r2')
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+ stm %r6, %r8, 24(%r15)
+
+ ahi n, 3
+ lhi %r7, 3
+ lr %r1, n
+ srl %r1, 2
+ nr %r7, n C n mod 4
+ je L(b1)
+ chi %r7, 2
+ jl L(b2)
+ jne L(b0)
+
+L(b3): lm %r5, %r7, 0(up)
+ la up, 12(up)
+ ADSB %r5, 0(vp)
+ ADSBC %r6, 4(vp)
+ ADSBC %r7, 8(vp)
+ la vp, 12(vp)
+ stm %r5, %r7, 0(rp)
+ la rp, 12(rp)
+ brct %r1, L(top)
+ j L(end)
+
+L(b0): lm %r5, %r8, 0(up) C These redundant insns are no mistake,
+ la up, 16(up) C they are needed to make the main loop run
+ ADSB %r5, 0(vp) C fast for n = 0 (mod 4).
+ ADSBC %r6, 4(vp)
+ j L(m0)
+
+L(b1): l %r5, 0(up)
+ la up, 4(up)
+ ADSB %r5, 0(vp)
+ la vp, 4(vp)
+ st %r5, 0(rp)
+ la rp, 4(rp)
+ brct %r1, L(top)
+ j L(end)
+
+L(b2): lm %r5, %r6, 0(up)
+ la up, 8(up)
+ ADSB %r5, 0(vp)
+ ADSBC %r6, 4(vp)
+ la vp, 8(vp)
+ stm %r5, %r6, 0(rp)
+ la rp, 8(rp)
+ brct %r1, L(top)
+ j L(end)
+
+L(top): lm %r5, %r8, 0(up)
+ la up, 16(up)
+ ADSBC %r5, 0(vp)
+ ADSBC %r6, 4(vp)
+L(m0): ADSBC %r7, 8(vp)
+ ADSBC %r8, 12(vp)
+ la vp, 16(vp)
+ stm %r5, %r8, 0(rp)
+ la rp, 16(rp)
+ brct %r1, L(top)
+
+L(end): RETVAL
+ lm %r6, %r8, 24(%r15)
+ br %r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/s390_32/esame/aorslsh1_n.asm b/gmp-6.3.0/mpn/s390_32/esame/aorslsh1_n.asm
new file mode 100644
index 0000000..f2b222b
--- /dev/null
+++ b/gmp-6.3.0/mpn/s390_32/esame/aorslsh1_n.asm
@@ -0,0 +1,173 @@
+dnl S/390-32 mpn_addlsh1_n
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 9.25 +C z990 5 +C z9 ? +C z10 ? +C z196 ? + +C TODO +C * Optimise for small n +C * Compute RETVAL for sublsh1_n less stupidly + +C INPUT PARAMETERS +define(`rp', `%r2') +define(`up', `%r3') +define(`vp', `%r4') +define(`n', `%r5') + +ifdef(`OPERATION_addlsh1_n',` + define(ADDSUBC, alr) + define(ADDSUBE, alcr) + define(INITCY, `lhi %r13, -1') + define(RETVAL, `alr %r1, %r13 + lhi %r2, 2 + alr %r2, %r1') + define(func, mpn_addlsh1_n) +') +ifdef(`OPERATION_sublsh1_n',` + define(ADDSUBC, slr) + define(ADDSUBE, slbr) + define(INITCY, `lhi %r13, 0') + define(RETVAL, `slr %r1, %r13 + lhi %r2, 1 + alr %r2, %r1') + define(func, mpn_sublsh1_n) +') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n) + +ASM_START() +PROLOGUE(func) + stm %r6, %r13, 24(%r15) + + la %r0, 3(n) + lhi %r7, 3 + srl %r0, 2 + nr %r7, n C n mod 4 + je L(b0) + chi %r7, 2 + jl L(b1) + je L(b2) + +L(b3): lm %r5, %r7, 0(up) + la up, 12(up) + lm %r9, %r11, 0(vp) + la vp, 12(vp) + + alr %r9, %r9 + alcr %r10, %r10 + alcr %r11, %r11 + slbr %r1, %r1 + + ADDSUBC %r5, %r9 + ADDSUBE %r6, %r10 + ADDSUBE %r7, %r11 + slbr %r13, %r13 + + stm %r5, %r7, 0(rp) + la rp, 12(rp) + brct %r0, L(top) + j L(end) + +L(b0): lhi %r1, -1 + INITCY + j L(top) + +L(b1): l %r5, 0(up) + la up, 4(up) + l %r9, 0(vp) + la vp, 4(vp) + + alr %r9, %r9 + slbr %r1, %r1 + ADDSUBC %r5, %r9 + slbr %r13, %r13 + + st %r5, 0(rp) + la rp, 4(rp) + brct %r0, L(top) + j L(end) + +L(b2): lm %r5, %r6, 0(up) + la up, 8(up) + lm %r9, %r10, 0(vp) + la vp, 8(vp) + + alr %r9, %r9 + alcr %r10, %r10 + slbr %r1, %r1 + + ADDSUBC %r5, %r9 + ADDSUBE %r6, %r10 + slbr %r13, %r13 + + stm %r5, %r6, 0(rp) + la rp, 8(rp) + brct %r0, L(top) + j L(end) + +L(top): lm %r9, %r12, 0(vp) + la vp, 16(vp) + + ahi %r1, 1 C restore carry + + alcr %r9, %r9 + alcr %r10, %r10 + alcr %r11, %r11 + alcr %r12, %r12 + + slbr %r1, %r1 C save carry + + lm %r5, %r8, 0(up) + la up, 16(up) + + ahi %r13, 1 C restore carry + + ADDSUBE %r5, %r9 + ADDSUBE %r6, %r10 + ADDSUBE %r7, %r11 + ADDSUBE %r8, %r12 + + slbr %r13, %r13 + + stm %r5, %r8, 0(rp) + la rp, 16(rp) + brct %r0, L(top) + +L(end): + RETVAL + lm %r6, %r13, 24(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_32/esame/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/s390_32/esame/bdiv_dbm1c.asm new file mode 100644 index 0000000..568a2a4 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_32/esame/bdiv_dbm1c.asm @@ -0,0 +1,65 @@ +dnl S/390-32 mpn_bdiv_dbm1c for systems with MLR instruction. + +dnl Copyright 2011 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 14 +C z990 10 +C z9 ? +C z10 ? +C z196 ? + +C INPUT PARAMETERS +define(`qp', `%r2') +define(`up', `%r3') +define(`n', `%r4') +define(`bd', `%r5') +define(`cy', `%r6') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_bdiv_dbm1c) + stm %r6, %r7, 24(%r15) + lhi %r7, 0 C zero index register + +L(top): l %r1, 0(%r7,up) + mlr %r0, bd + slr %r6, %r1 + st %r6, 0(%r7,qp) + slbr %r6, %r0 + la %r7, 4(%r7) + brct n, L(top) + + lr %r2, %r6 + lm %r6, %r7, 24(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_32/esame/gmp-mparam.h b/gmp-6.3.0/mpn/s390_32/esame/gmp-mparam.h new file mode 100644 index 0000000..c0e5046 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_32/esame/gmp-mparam.h @@ -0,0 +1,177 @@ +/* S/390-32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000-2008-2011, 2014 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 4400 MHz IBM z196 running in 32-bit mode */ +/* FFT tuning limit = 0.5M */ +/* Generated by tuneup.c, 2017-01-02, gcc 4.9 */ + +#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 45 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 18 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 3 +#define USE_PREINV_DIVREM_1 0 +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 6 +#define BMOD_1_TO_MOD_1_THRESHOLD 0 /* always */ + +#define DIV_1_VS_MUL_1_PERCENT 320 + +#define MUL_TOOM22_THRESHOLD 12 +#define MUL_TOOM33_THRESHOLD 81 +#define MUL_TOOM44_THRESHOLD 130 +#define MUL_TOOM6H_THRESHOLD 173 +#define MUL_TOOM8H_THRESHOLD 260 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 83 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 86 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 112 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 18 +#define SQR_TOOM3_THRESHOLD 69 +#define SQR_TOOM4_THRESHOLD 178 +#define SQR_TOOM6_THRESHOLD 254 +#define SQR_TOOM8_THRESHOLD 406 + +#define MULMID_TOOM42_THRESHOLD 30 + +#define MULMOD_BNM1_THRESHOLD 12 +#define SQRMOD_BNM1_THRESHOLD 7 + +#define MUL_FFT_MODF_THRESHOLD 276 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 276, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ + { 9, 5}, { 19, 6}, { 13, 7}, { 7, 6}, \ + { 17, 7}, { 9, 6}, { 19, 7}, { 11, 6}, \ + { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \ + { 11, 7}, { 25, 8}, { 15, 7}, { 31, 8}, \ + { 19, 7}, { 39, 8}, { 23, 9}, { 15, 8}, \ + { 39, 9}, { 23,10}, { 15, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47,10}, \ + { 31, 9}, { 71, 8}, { 143, 9}, { 79,10}, \ + { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 7}, { 511, 9}, { 143,10}, { 79, 9}, \ + { 159, 8}, { 319, 9}, { 175, 8}, { 351,10}, \ + { 95, 9}, { 191, 8}, { 383,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \ + { 287, 8}, { 575,10}, { 159, 9}, { 319,10}, \ + { 175, 9}, { 351,11}, { 95,10}, { 191, 9}, \ + { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543, 8}, { 1087,10}, \ + { 287, 9}, { 575,11}, { 159,10}, { 351, 9}, \ + { 703, 8}, { 1407,11}, { 191,10}, { 415, 9}, \ + { 831,11}, { 223,10}, { 479, 9}, { 959, 8}, \ + { 1919,12}, { 4096,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 89 +#define MUL_FFT_THRESHOLD 2688 + +#define SQR_FFT_MODF_THRESHOLD 240 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 240, 5}, { 17, 6}, { 17, 7}, { 9, 6}, \ + { 19, 7}, { 11, 6}, { 23, 7}, { 13, 8}, \ + { 7, 7}, { 19, 8}, { 11, 7}, { 25, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \ + { 23, 9}, { 15, 8}, { 39, 9}, { 23,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 47,10}, \ + { 31, 9}, { 63, 8}, { 127, 9}, { 71, 8}, \ + { 143,10}, { 47,11}, { 31,10}, { 63, 9}, \ + { 127, 8}, { 255, 7}, { 511, 9}, { 143,10}, \ + { 79, 9}, { 159, 8}, { 319, 9}, { 175, 8}, \ + { 351, 7}, { 703,10}, { 95, 9}, { 191, 8}, \ + { 383, 9}, { 207, 8}, { 415,11}, { 
63,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \ + { 287, 8}, { 575,10}, { 159, 9}, { 319,10}, \ + { 175, 9}, { 351, 8}, { 703, 7}, { 1407,11}, \ + { 95,10}, { 191, 9}, { 383,10}, { 207, 9}, \ + { 415,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 351, 9}, { 703, 8}, { 1407,11}, { 191,10}, \ + { 415, 9}, { 831,11}, { 223,10}, { 479,12}, \ + { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 84 +#define SQR_FFT_THRESHOLD 1856 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 27 +#define MULLO_MUL_N_THRESHOLD 5240 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 65 +#define SQRLO_SQR_THRESHOLD 3470 + +#define DC_DIV_QR_THRESHOLD 32 +#define DC_DIVAPPR_Q_THRESHOLD 135 +#define DC_BDIV_QR_THRESHOLD 32 +#define DC_BDIV_Q_THRESHOLD 80 + +#define INV_MULMOD_BNM1_THRESHOLD 42 +#define INV_NEWTON_THRESHOLD 177 +#define INV_APPR_THRESHOLD 139 + +#define BINV_NEWTON_THRESHOLD 179 +#define REDC_1_TO_REDC_N_THRESHOLD 39 + +#define MU_DIV_QR_THRESHOLD 872 +#define MU_DIVAPPR_Q_THRESHOLD 998 +#define MUPI_DIV_QR_THRESHOLD 66 +#define MU_BDIV_QR_THRESHOLD 748 +#define MU_BDIV_Q_THRESHOLD 906 + +#define POWM_SEC_TABLE 9,34,257,946,2913 + +#define GET_STR_DC_THRESHOLD 10 +#define GET_STR_PRECOMPUTE_THRESHOLD 16 +#define SET_STR_DC_THRESHOLD 1045 +#define SET_STR_PRECOMPUTE_THRESHOLD 1800 + +#define FAC_DSC_THRESHOLD 77 +#define FAC_ODD_THRESHOLD 24 + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD_THRESHOLD 121 +#define HGCD_APPR_THRESHOLD 142 +#define HGCD_REDUCE_THRESHOLD 1679 +#define GCD_DC_THRESHOLD 389 +#define GCDEXT_DC_THRESHOLD 285 +#define JACOBI_BASE_METHOD 4 diff --git a/gmp-6.3.0/mpn/s390_32/esame/mul_1.asm b/gmp-6.3.0/mpn/s390_32/esame/mul_1.asm new file mode 100644 index 0000000..04be963 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_32/esame/mul_1.asm @@ -0,0 +1,66 @@ +dnl S/390-32 mpn_mul_1 for systems with MLR instruction + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 14 +C z990 9 +C z9 ? +C z10 ? +C z196 ? 
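+C
+C The loop below computes, per limb (a C model, illustration only;
+C 32-bit limbs):
+C
+C	unsigned long long p = (unsigned long long) up[i] * v0 + cl;
+C	rp[i] = (mp_limb_t) p;
+C	cl = p >> 32;
+C
+C MLR forms the 64-bit product; the carry limb lives in %r11, with one
+C extra bit carried between iterations in the condition code via alcr.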
+ +C INPUT PARAMETERS +define(`rp', `%r2') +define(`up', `%r3') +define(`n', `%r4') +define(`v0', `%r5') + +ASM_START() +PROLOGUE(mpn_mul_1) + stm %r11, %r12, 44(%r15) + lhi %r12, 0 C zero index register + ahi %r12, 0 C clear carry flag + lhi %r11, 0 C clear carry limb + +L(top): l %r1, 0(%r12,up) + mlr %r0, v0 + alcr %r1, %r11 + lr %r11, %r0 C copy high part to carry limb + st %r1, 0(%r12,rp) + la %r12, 4(%r12) + brct n, L(top) + + lhi %r2, 0 + alcr %r2, %r11 + + lm %r11, %r12, 44(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_32/esame/mul_basecase.asm b/gmp-6.3.0/mpn/s390_32/esame/mul_basecase.asm new file mode 100644 index 0000000..2c8138d --- /dev/null +++ b/gmp-6.3.0/mpn/s390_32/esame/mul_basecase.asm @@ -0,0 +1,130 @@ +dnl S/390-32/esame mpn_mul_basecase. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 ? +C z990 ? +C z9 ? +C z10 ? +C z196 ? + +C TODO +C * Perhaps add special case for un <= 2. +C * Replace loops by faster code. The mul_1 and addmul_1 loops could be sped +C up by about 10%. 
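+C
+C In C terms the function is one mul_1 pass for v[0] followed by vn-1
+C addmul_1 passes, each shifted up one limb (illustration only; requires
+C un >= vn >= 1):
+C
+C	rp[un] = mpn_mul_1 (rp, up, un, vp[0]);
+C	for (j = 1; j < vn; j++)
+C	  rp[un + j] = mpn_addmul_1 (rp + j, up, un, vp[j]);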
+ +C INPUT PARAMETERS +define(`rp', `%r2') +define(`up', `%r3') +define(`un', `%r4') +define(`vp', `%r5') +define(`vn', `%r6') + +define(`zero', `%r8') + +ASM_START() +PROLOGUE(mpn_mul_basecase) + chi un, 2 + jhe L(ge2) + +C un = vn = 1 + l %r1, 0(vp) + ml %r0, 0(up) + st %r1, 0(rp) + st %r0, 4(rp) + br %r14 + +L(ge2): C jne L(gen) + + +L(gen): +C mul_1 ======================================================================= + + stm %r6, %r12, 24(%r15) + lhi zero, 0 + ahi un, -1 + + l %r7, 0(vp) + l %r11, 0(up) + lhi %r12, 4 C init index register + mlr %r10, %r7 + lr %r9, un + st %r11, 0(rp) + cr %r15, %r15 C clear carry flag + +L(tm): l %r1, 0(%r12,up) + mlr %r0, %r7 + alcr %r1, %r10 + lr %r10, %r0 C copy high part to carry limb + st %r1, 0(%r12,rp) + la %r12, 4(%r12) + brct %r9, L(tm) + + alcr %r0, zero + st %r0, 0(%r12,rp) + +C addmul_1 loop =============================================================== + + ahi vn, -1 + je L(outer_end) +L(outer_loop): + + la rp, 4(rp) C rp += 1 + la vp, 4(vp) C up += 1 + l %r7, 0(vp) + l %r11, 0(up) + lhi %r12, 4 C init index register + mlr %r10, %r7 + lr %r9, un + al %r11, 0(rp) + st %r11, 0(rp) + +L(tam): l %r1, 0(%r12,up) + l %r11, 0(%r12,rp) + mlr %r0, %r7 + alcr %r1, %r11 + alcr %r0, zero + alr %r1, %r10 + lr %r10, %r0 + st %r1, 0(%r12,rp) + la %r12, 4(%r12) + brct %r9, L(tam) + + alcr %r0, zero + st %r0, 0(%r12,rp) + + brct vn, L(outer_loop) +L(outer_end): + + lm %r6, %r12, 24(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_32/esame/sqr_basecase.asm b/gmp-6.3.0/mpn/s390_32/esame/sqr_basecase.asm new file mode 100644 index 0000000..f45f87a --- /dev/null +++ b/gmp-6.3.0/mpn/s390_32/esame/sqr_basecase.asm @@ -0,0 +1,203 @@ +dnl S/390-32 mpn_sqr_basecase. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 ? +C z990 23 +C z9 ? +C z10 ? +C z196 ? + +C TODO +C * Clean up. +C * Stop iterating addmul_1 loop at latest for n = 2, implement longer tail. +C This will ask for basecase handling of n = 3. +C * Update counters and pointers more straightforwardly, possibly lowering +C register usage. +C * Should we use this allocation-free style for more sqr_basecase asm +C implementations? The only disadvantage is that it requires R != U. +C * Replace loops by faster code. The mul_1 and addmul_1 loops could be sped +C up by about 10%. The sqr_diag_addlsh1 loop could probably be sped up even +C more. 
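+C
+C The scheme in C, using public mpn calls (illustration only, n >= 2):
+C compute the strict upper triangle of cross products, double it, then
+C add the diagonal squares (the code below fuses the last two steps in
+C its sqr_diag_addlsh1 phase):
+C
+C	rp[n] = mpn_mul_1 (rp + 1, up + 1, n - 1, up[0]);
+C	for (i = 1; i < n - 1; i++)
+C	  rp[n + i] = mpn_addmul_1 (rp + 2*i + 1, up + i + 1, n - 1 - i, up[i]);
+C	rp[2*n - 1] = mpn_lshift (rp + 1, rp + 1, 2*n - 2, 1);
+C	rp[0] = 0;
+C	for (i = 0; i < n; i++)
+C	  {
+C	    cy = mpn_addmul_1 (rp + 2*i, up + i, 1, up[i]);
+C	    mpn_add_1 (rp + 2*i + 1, rp + 2*i + 1, 2*n - 2*i - 1, cy);
+C	  }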
+ +C INPUT PARAMETERS +define(`rp', `%r2') +define(`up', `%r3') +define(`n', `%r4') + +define(`zero', `%r8') +define(`rp_saved', `%r9') +define(`up_saved', `%r13') +define(`n_saved', `%r14') + +ASM_START() +PROLOGUE(mpn_sqr_basecase) + ahi n, -2 + jhe L(ge2) + +C n = 1 + l %r5, 0(up) + mlr %r4, %r5 + st %r5, 0(rp) + st %r4, 4(rp) + br %r14 + +L(ge2): jne L(gen) + +C n = 2 + stm %r6, %r8, 24(%r15) + lhi zero, 0 + + l %r5, 0(up) + mlr %r4, %r5 C u0 * u0 + l %r1, 4(up) + mlr %r0, %r1 C u1 * u1 + st %r5, 0(rp) + + l %r7, 0(up) + ml %r6, 4(up) C u0 * u1 + alr %r7, %r7 + alcr %r6, %r6 + alcr %r0, zero + + alr %r4, %r7 + alcr %r1, %r6 + alcr %r0, zero + st %r4, 4(rp) + st %r1, 8(rp) + st %r0, 12(rp) + + lm %r6, %r8, 24(%r15) + br %r14 + +L(gen): +C mul_1 ======================================================================= + + stm %r6, %r14, 24(%r15) + lhi zero, 0 + lr up_saved, up + lr rp_saved, rp + lr n_saved, n + + l %r6, 0(up) + l %r11, 4(up) + lhi %r12, 8 C init index register + mlr %r10, %r6 + lr %r5, n + st %r11, 4(rp) + cr %r15, %r15 C clear carry flag + +L(tm): l %r1, 0(%r12,up) + mlr %r0, %r6 + alcr %r1, %r10 + lr %r10, %r0 C copy high part to carry limb + st %r1, 0(%r12,rp) + la %r12, 4(%r12) + brct %r5, L(tm) + + alcr %r0, zero + st %r0, 0(%r12,rp) + +C addmul_1 loop =============================================================== + + ahi n, -1 + je L(outer_end) +L(outer_loop): + + la rp, 8(rp) C rp += 2 + la up, 4(up) C up += 1 + l %r6, 0(up) + l %r11, 4(up) + lhi %r12, 8 C init index register + mlr %r10, %r6 + lr %r5, n + al %r11, 4(rp) + st %r11, 4(rp) + +L(tam): l %r1, 0(%r12,up) + l %r7, 0(%r12,rp) + mlr %r0, %r6 + alcr %r1, %r7 + alcr %r0, zero + alr %r1, %r10 + lr %r10, %r0 + st %r1, 0(%r12,rp) + la %r12, 4(%r12) + brct %r5, L(tam) + + alcr %r0, zero + st %r0, 0(%r12,rp) + + brct n, L(outer_loop) +L(outer_end): + + l %r6, 4(up) + l %r1, 8(up) + lr %r7, %r0 C Same as: l %r7, 12(,rp) + mlr %r0, %r6 + alr %r1, %r7 + alcr %r0, zero + st %r1, 12(rp) + st %r0, 16(rp) + +C sqr_diag_addlsh1 ============================================================ + +define(`up', `up_saved') +define(`rp', `rp_saved') + la n, 1(n_saved) + + l %r1, 0(up) + mlr %r0, %r1 + st %r1, 0(rp) +C clr %r15, %r15 C clear carry (already clear per above) + +L(top): l %r11, 4(up) + la up, 4(up) + l %r6, 4(rp) + l %r7, 8(rp) + mlr %r10, %r11 + alcr %r6, %r6 + alcr %r7, %r7 + alcr %r10, zero C propagate carry to high product limb + alr %r6, %r0 + alcr %r7, %r11 + stm %r6, %r7, 4(rp) + la rp, 8(rp) + lr %r0, %r10 C copy carry limb + brct n, L(top) + + alcr %r0, zero + st %r0, 4(rp) + + lm %r6, %r14, 24(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_32/esame/submul_1.asm b/gmp-6.3.0/mpn/s390_32/esame/submul_1.asm new file mode 100644 index 0000000..a71e57e --- /dev/null +++ b/gmp-6.3.0/mpn/s390_32/esame/submul_1.asm @@ -0,0 +1,70 @@ +dnl S/390-32 mpn_submul_1 for systems with MLR instruction. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 20 +C z990 11 +C z9 ? +C z10 ? +C z196 ? + +C INPUT PARAMETERS +define(`rp', `%r2') +define(`up', `%r3') +define(`n', `%r4') +define(`v0', `%r5') + +ASM_START() +PROLOGUE(mpn_submul_1) + stm %r9, %r12, 36(%r15) + lhi %r12, 0 + slr %r11, %r11 + +L(top): l %r1, 0(%r12, up) + l %r10, 0(%r12, rp) + mlr %r0, v0 + slbr %r10, %r1 + slbr %r9, %r9 + slr %r0, %r9 C conditional incr + slr %r10, %r11 + lr %r11, %r0 + st %r10, 0(%r12, rp) + la %r12, 4(%r12) + brct %r4, L(top) + + lr %r2, %r11 + slbr %r9, %r9 + slr %r2, %r9 + + lm %r9, %r12, 36(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_32/gmp-mparam.h b/gmp-6.3.0/mpn/s390_32/gmp-mparam.h new file mode 100644 index 0000000..1aca74a --- /dev/null +++ b/gmp-6.3.0/mpn/s390_32/gmp-mparam.h @@ -0,0 +1,138 @@ +/* S/390-32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 770 MHz IBM z900 running in 32-bit mode, using just traditional insns */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 5 +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 5 +#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1U_TO_MOD_1_1_THRESHOLD 15 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 30 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 19 +#define MUL_TOOM33_THRESHOLD 114 +#define MUL_TOOM44_THRESHOLD 166 +#define MUL_TOOM6H_THRESHOLD 226 +#define MUL_TOOM8H_THRESHOLD 333 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 106 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113 + +#define SQR_BASECASE_THRESHOLD 7 +#define SQR_TOOM2_THRESHOLD 40 +#define SQR_TOOM3_THRESHOLD 126 +#define SQR_TOOM4_THRESHOLD 192 +#define SQR_TOOM6_THRESHOLD 246 +#define SQR_TOOM8_THRESHOLD 357 + +#define MULMID_TOOM42_THRESHOLD 28 + +#define MULMOD_BNM1_THRESHOLD 12 +#define SQRMOD_BNM1_THRESHOLD 18 + +#define MUL_FFT_MODF_THRESHOLD 244 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 244, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 8, 5}, { 17, 6}, { 13, 7}, { 7, 6}, \ + { 16, 7}, { 9, 6}, { 19, 7}, { 11, 6}, \ + { 23, 7}, { 13, 8}, { 7, 7}, { 19, 8}, \ + { 11, 7}, { 25, 9}, { 7, 8}, { 15, 7}, \ + { 33, 8}, { 19, 7}, { 39, 8}, { 23, 7}, \ + { 47, 8}, { 27, 9}, { 15, 8}, { 39, 9}, \ + { 23, 8}, { 47,10}, { 15, 9}, { 31, 8}, \ + { 63, 9}, { 39, 8}, { 79, 9}, { 47,10}, \ + { 31, 9}, { 63, 8}, { 127, 9}, { 71, 8}, \ + { 143, 9}, { 79,10}, { 47,11}, { 2048,12}, \ + { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 48 +#define MUL_FFT_THRESHOLD 2688 + +#define SQR_FFT_MODF_THRESHOLD 216 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 216, 5}, { 7, 4}, { 15, 5}, { 17, 6}, \ + { 13, 7}, { 7, 6}, { 17, 7}, { 9, 6}, \ + { 20, 7}, { 11, 6}, { 23, 7}, { 13, 8}, \ + { 7, 7}, { 19, 8}, { 11, 7}, { 25, 9}, \ + { 7, 8}, { 15, 7}, { 33, 8}, { 19, 7}, \ + { 39, 8}, { 23, 9}, { 15, 8}, { 39, 9}, \ + { 23, 8}, { 47,10}, { 15, 9}, { 31, 8}, \ + { 63, 9}, { 39, 8}, { 79, 9}, { 47,10}, \ + { 31, 9}, { 63, 8}, { 127, 9}, { 71, 8}, \ + { 143, 9}, { 79,10}, { 47,11}, { 2048,12}, \ + { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 44 +#define SQR_FFT_THRESHOLD 1856 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 61 +#define MULLO_MUL_N_THRESHOLD 5240 + +#define DC_DIV_QR_THRESHOLD 70 +#define DC_DIVAPPR_Q_THRESHOLD 234 +#define DC_BDIV_QR_THRESHOLD 59 +#define DC_BDIV_Q_THRESHOLD 137 + +#define INV_MULMOD_BNM1_THRESHOLD 36 +#define INV_NEWTON_THRESHOLD 327 +#define INV_APPR_THRESHOLD 268 + +#define BINV_NEWTON_THRESHOLD 324 +#define REDC_1_TO_REDC_N_THRESHOLD 63 + +#define MU_DIV_QR_THRESHOLD 1099 +#define MU_DIVAPPR_Q_THRESHOLD 1360 +#define MUPI_DIV_QR_THRESHOLD 138 +#define MU_BDIV_QR_THRESHOLD 889 +#define MU_BDIV_Q_THRESHOLD 1234 + +#define MATRIX22_STRASSEN_THRESHOLD 18 +#define HGCD_THRESHOLD 167 +#define GCD_DC_THRESHOLD 518 +#define GCDEXT_DC_THRESHOLD 378 +#define JACOBI_BASE_METHOD 2 + 
+#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 25 +#define SET_STR_DC_THRESHOLD 577 +#define SET_STR_PRECOMPUTE_THRESHOLD 1217 diff --git a/gmp-6.3.0/mpn/s390_32/logops_n.asm b/gmp-6.3.0/mpn/s390_32/logops_n.asm new file mode 100644 index 0000000..1f2cd2a --- /dev/null +++ b/gmp-6.3.0/mpn/s390_32/logops_n.asm @@ -0,0 +1,295 @@ +dnl S/390-32 logops. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb variant 1 variant 2 variant 3 +C rp!=up rp=up +C z900 ? ? ? ? +C z990 2.5 1 2.75 2.75 +C z9 ? ? ? +C z10 ? ? ? +C z196 ? ? ? + +C INPUT PARAMETERS +define(`rp', `%r2') +define(`up', `%r3') +define(`vp', `%r4') +define(`nn', `%r5') + +ifdef(`OPERATION_and_n',` + define(`func',`mpn_and_n') + define(`VARIANT_1') + define(`LOGOPC',`nc') + define(`LOGOP',`n')') +ifdef(`OPERATION_andn_n',` + define(`func',`mpn_andn_n') + define(`VARIANT_2') + define(`LOGOP',`n')') +ifdef(`OPERATION_nand_n',` + define(`func',`mpn_nand_n') + define(`VARIANT_3') + define(`LOGOP',`n')') +ifdef(`OPERATION_ior_n',` + define(`func',`mpn_ior_n') + define(`VARIANT_1') + define(`LOGOPC',`oc') + define(`LOGOP',`o')') +ifdef(`OPERATION_iorn_n',` + define(`func',`mpn_iorn_n') + define(`VARIANT_2') + define(`LOGOP',`o')') +ifdef(`OPERATION_nior_n',` + define(`func',`mpn_nior_n') + define(`VARIANT_3') + define(`LOGOP',`o')') +ifdef(`OPERATION_xor_n',` + define(`func',`mpn_xor_n') + define(`VARIANT_1') + define(`LOGOPC',`xc') + define(`LOGOP',`x')') +ifdef(`OPERATION_xnor_n',` + define(`func',`mpn_xnor_n') + define(`VARIANT_2') + define(`LOGOP',`x')') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +ASM_START() +PROLOGUE(func) +ifdef(`VARIANT_1',` + cr rp, up + jne L(normal) + + sll nn, 2 + ahi nn, -1 + lr %r1, nn + srl %r1, 8 + ltr %r1, %r1 C < 256 bytes to copy? 
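+C rp == up here, so the operation can run in place: whole 256-byte blocks
+C are handled with the storage-to-storage form (nc/oc/xc), and the EX at
+C L(2) below executes a one-byte template of the same insn with its length
+C field patched at run time, covering the last ((nn-1) mod 256)+1 bytes
+C (nn being the operand size in bytes at this point).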
+ je L(1) + +L(tp): LOGOPC 0(256, rp), 0(vp) + la rp, 256(rp) + la vp, 256(vp) + brct %r1, L(tp) + +L(1): bras %r1, L(2) C make r1 point to mvc insn + LOGOPC 0(1, rp), 0(vp) +L(2): ex nn, 0(%r1) C execute mvc with length ((nn-1) mod 256)+1 +L(rtn): br %r14 + + +L(normal): + stm %r6, %r8, 12(%r15) + ahi nn, 3 + lhi %r7, 3 + lr %r0, nn + srl %r0, 2 + nr %r7, nn C nn mod 4 + je L(b1) + chi %r7, 2 + jl L(b2) + jne L(top) + +L(b3): lm %r5, %r7, 0(up) + la up, 12(up) + LOGOP %r5, 0(vp) + LOGOP %r6, 4(vp) + LOGOP %r7, 8(vp) + stm %r5, %r7, 0(rp) + la rp, 12(rp) + la vp, 12(vp) + j L(mid) + +L(b1): l %r5, 0(up) + la up, 4(up) + LOGOP %r5, 0(vp) + st %r5, 0(rp) + la rp, 4(rp) + la vp, 4(vp) + j L(mid) + +L(b2): lm %r5, %r6, 0(up) + la up, 8(up) + LOGOP %r5, 0(vp) + LOGOP %r6, 4(vp) + stm %r5, %r6, 0(rp) + la rp, 8(rp) + la vp, 8(vp) + j L(mid) + +L(top): lm %r5, %r8, 0(up) + la up, 16(up) + LOGOP %r5, 0(vp) + LOGOP %r6, 4(vp) + LOGOP %r7, 8(vp) + LOGOP %r8, 12(vp) + stm %r5, %r8, 0(rp) + la rp, 16(rp) + la vp, 16(vp) +L(mid): brct %r0, L(top) + + lm %r6, %r8, 12(%r15) + br %r14 +') + +ifdef(`VARIANT_2',` + stm %r6, %r8, 12(%r15) + lhi %r1, -1 + + ahi nn, 3 + lhi %r7, 3 + lr %r0, nn + srl %r0, 2 + nr %r7, nn C nn mod 4 + je L(b1) + chi %r7, 2 + jl L(b2) + jne L(top) + +L(b3): lm %r5, %r7, 0(vp) + la vp, 12(vp) + xr %r5, %r1 + xr %r6, %r1 + xr %r7, %r1 + LOGOP %r5, 0(up) + LOGOP %r6, 4(up) + LOGOP %r7, 8(up) + stm %r5, %r7, 0(rp) + la rp, 12(rp) + la up, 12(up) + j L(mid) + +L(b1): l %r5, 0(vp) + la vp, 4(vp) + xr %r5, %r1 + LOGOP %r5, 0(up) + st %r5, 0(rp) + la rp, 4(rp) + la up, 4(up) + j L(mid) + +L(b2): lm %r5, %r6, 0(vp) + la vp, 8(vp) + xr %r5, %r1 + xr %r6, %r1 + LOGOP %r5, 0(up) + LOGOP %r6, 4(up) + stm %r5, %r6, 0(rp) + la rp, 8(rp) + la up, 8(up) + j L(mid) + +L(top): lm %r5, %r8, 0(vp) + la vp, 16(vp) + xr %r5, %r1 + xr %r6, %r1 + xr %r7, %r1 + xr %r8, %r1 + LOGOP %r5, 0(up) + LOGOP %r6, 4(up) + LOGOP %r7, 8(up) + LOGOP %r8, 12(up) + la up, 16(up) + stm %r5, %r8, 0(rp) + la rp, 16(rp) +L(mid): brct %r0, L(top) + + lm %r6, %r8, 12(%r15) + br %r14 +') + +ifdef(`VARIANT_3',` + stm %r6, %r8, 12(%r15) + lhi %r1, -1 + + ahi nn, 3 + lhi %r7, 3 + lr %r0, nn + srl %r0, 2 + nr %r7, nn C nn mod 4 + je L(b1) + chi %r7, 2 + jl L(b2) + jne L(top) + +L(b3): lm %r5, %r7, 0(vp) + la vp, 12(vp) + LOGOP %r5, 0(up) + LOGOP %r6, 4(up) + xr %r5, %r1 + xr %r6, %r1 + LOGOP %r7, 8(up) + xr %r7, %r1 + stm %r5, %r7, 0(rp) + la rp, 12(rp) + la up, 12(up) + j L(mid) + +L(b1): l %r5, 0(vp) + la vp, 4(vp) + LOGOP %r5, 0(up) + xr %r5, %r1 + st %r5, 0(rp) + la rp, 4(rp) + la up, 4(up) + j L(mid) + +L(b2): lm %r5, %r6, 0(vp) + la vp, 8(vp) + LOGOP %r5, 0(up) + LOGOP %r6, 4(up) + xr %r5, %r1 + xr %r6, %r1 + stm %r5, %r6, 0(rp) + la rp, 8(rp) + la up, 8(up) + j L(mid) + +L(top): lm %r5, %r8, 0(vp) + la vp, 16(vp) + LOGOP %r5, 0(up) + LOGOP %r6, 4(up) + xr %r5, %r1 + xr %r6, %r1 + LOGOP %r7, 8(up) + LOGOP %r8, 12(up) + xr %r7, %r1 + xr %r8, %r1 + stm %r5, %r8, 0(rp) + la up, 16(up) + la rp, 16(rp) +L(mid): brct %r0, L(top) + + lm %r6, %r8, 12(%r15) + br %r14 +') + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_32/lshift.asm b/gmp-6.3.0/mpn/s390_32/lshift.asm new file mode 100644 index 0000000..da7d76e --- /dev/null +++ b/gmp-6.3.0/mpn/s390_32/lshift.asm @@ -0,0 +1,144 @@ +dnl S/390-32 mpn_lshift. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 6 +C z990 3 +C z9 ? +C z10 ? +C z196 ? + +C TODO +C * + +C INPUT PARAMETERS +define(`rp', `%r2') +define(`up', `%r3') +define(`n', `%r4') +define(`cnt', `%r5') + +ASM_START() +PROLOGUE(mpn_lshift) + lr %r1, n + sll %r1, 2 + stm %r6, %r12, 24(%r15) + la up, 0(%r1,up) C put up near end of U + la rp, 0(%r1,rp) C put rp near end of R + ahi up, -20 + ahi rp, -16 + lhi %r8, 32 + sr %r8, cnt + l %r12, 16(up) + srl %r12, 0(%r8) C return value + lhi %r7, 3 + nr %r7, n + srl n, 2 + je L(b0) + chi %r7, 2 + jl L(b1) + je L(b2) + +L(b3): l %r10, 16(up) + l %r11, 12(up) + l %r9, 8(up) + ahi up, -8 + lr %r8, %r11 + sldl %r10, 0(cnt) + sldl %r8, 0(cnt) + st %r10, 12(rp) + st %r8, 8(rp) + ahi rp, -8 + ltr n, n + je L(end) + j L(top) + +L(b2): l %r10, 16(up) + l %r11, 12(up) + ahi up, -4 + sldl %r10, 0(cnt) + st %r10, 12(rp) + ahi rp, -4 + ltr n, n + je L(end) + j L(top) + +L(b1): ltr n, n + je L(end) + j L(top) + +L(b0): l %r10,16(up) + l %r8, 12(up) + l %r6, 8(up) + l %r0, 4(up) + ahi up, -12 + lr %r11, %r8 + lr %r9, %r6 + lr %r7, %r0 + sldl %r10,0(cnt) + sldl %r8, 0(cnt) + sldl %r6, 0(cnt) + st %r10, 12(rp) + st %r8, 8(rp) + st %r6, 4(rp) + ahi rp, -12 + ahi n, -1 + je L(end) + + ALIGN(8) +L(top): l %r10, 16(up) + l %r8, 12(up) + l %r6, 8(up) + l %r0, 4(up) + l %r1, 0(up) + lr %r11, %r8 + lr %r9, %r6 + lr %r7, %r0 + ahi up, -16 + sldl %r10, 0(cnt) + sldl %r8, 0(cnt) + sldl %r6, 0(cnt) + sldl %r0, 0(cnt) + st %r10, 12(rp) + st %r8, 8(rp) + st %r6, 4(rp) + st %r0, 0(rp) + ahi rp, -16 + brct n, L(top) + +L(end): l %r10, 16(up) + sll %r10, 0(cnt) + st %r10, 12(rp) + + lr %r2, %r12 + lm %r6, %r12, 24(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_32/lshiftc.asm b/gmp-6.3.0/mpn/s390_32/lshiftc.asm new file mode 100644 index 0000000..f601673 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_32/lshiftc.asm @@ -0,0 +1,156 @@ +dnl S/390-32 mpn_lshiftc. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 7 +C z990 3.375 +C z9 ? +C z10 ? +C z196 ? + +C TODO +C * + +C INPUT PARAMETERS +define(`rp', `%r2') +define(`up', `%r3') +define(`n', `%r4') +define(`cnt', `%r5') + +ASM_START() +PROLOGUE(mpn_lshiftc) + lr %r1, n + sll %r1, 2 + stm %r6, %r13, 24(%r15) + la up, 0(%r1,up) C put up near end of U + la rp, 0(%r1,rp) C put rp near end of R + ahi up, -20 + ahi rp, -16 + lhi %r8, 32 + sr %r8, cnt + l %r12, 16(up) + srl %r12, 0(%r8) C return value + lhi %r13, -1 + lhi %r7, 3 + nr %r7, n + srl n, 2 + je L(b0) + chi %r7, 2 + jl L(b1) + je L(b2) + +L(b3): l %r10, 16(up) + l %r11, 12(up) + l %r9, 8(up) + ahi up, -8 + lr %r8, %r11 + sldl %r10, 0(cnt) + sldl %r8, 0(cnt) + xr %r10, %r13 + xr %r8, %r13 + st %r10, 12(rp) + st %r8, 8(rp) + ahi rp, -8 + ltr n, n + je L(end) + j L(top) + +L(b2): l %r10, 16(up) + l %r11, 12(up) + ahi up, -4 + sldl %r10, 0(cnt) + xr %r10, %r13 + st %r10, 12(rp) + ahi rp, -4 + ltr n, n + je L(end) + j L(top) + +L(b1): ltr n, n + je L(end) + j L(top) + +L(b0): l %r10,16(up) + l %r8, 12(up) + l %r6, 8(up) + l %r0, 4(up) + ahi up, -12 + lr %r11, %r8 + lr %r9, %r6 + lr %r7, %r0 + sldl %r10,0(cnt) + sldl %r8, 0(cnt) + sldl %r6, 0(cnt) + xr %r10, %r13 + xr %r8, %r13 + xr %r6, %r13 + st %r10, 12(rp) + st %r8, 8(rp) + st %r6, 4(rp) + ahi rp, -12 + ahi n, -1 + je L(end) + + ALIGN(8) +L(top): l %r10, 16(up) + l %r8, 12(up) + l %r6, 8(up) + l %r0, 4(up) + l %r1, 0(up) + lr %r11, %r8 + lr %r9, %r6 + lr %r7, %r0 + ahi up, -16 + sldl %r10, 0(cnt) + sldl %r8, 0(cnt) + sldl %r6, 0(cnt) + sldl %r0, 0(cnt) + xr %r10, %r13 + xr %r8, %r13 + xr %r6, %r13 + xr %r0, %r13 + st %r10, 12(rp) + st %r8, 8(rp) + st %r6, 4(rp) + st %r0, 0(rp) + ahi rp, -16 + brct n, L(top) + +L(end): l %r10, 16(up) + sll %r10, 0(cnt) + xr %r10, %r13 + st %r10, 12(rp) + + lr %r2, %r12 + lm %r6, %r13, 24(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_32/mul_1.asm b/gmp-6.3.0/mpn/s390_32/mul_1.asm new file mode 100644 index 0000000..e3ad0c5 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_32/mul_1.asm @@ -0,0 +1,85 @@ +dnl S/390 mpn_mul_1 -- Multiply a limb vector with a limb and store the +dnl result in a second limb vector. + +dnl Copyright 2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(`rp',2)
+define(`up',3)
+define(`n',4)
+define(`vlimb',5)
+define(`cylimb',7)
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ stm 6,7,24(15)
+ slr cylimb,cylimb # clear cylimb
+ ltr vlimb,vlimb
+ jnl .Loopp
+
+.Loopn: l 1,0(up) # load from u
+ lr 6,1 #
+ mr 0,vlimb # multiply signed
+ alr 0,6 # add ulimb to phi
+ sra 6,31 # make mask
+ nr 6,vlimb # 0 or vlimb
+ alr 0,6 # conditionally add vlimb to phi
+ alr 1,cylimb # add carry limb to plo
+ brc 8+4,+8 # branch if not carry
+ ahi 0,1 # increment phi
+ lr cylimb,0 # new cylimb
+ st 1,0(rp) # store
+ la up,4(,up)
+ la rp,4(,rp)
+ brct n,.Loopn
+
+ lr 2,cylimb
+ lm 6,7,24(15)
+ br 14
+
+.Loopp: l 1,0(up) # load from u
+ lr 6,1 #
+ mr 0,vlimb # multiply signed
+ sra 6,31 # make mask
+ nr 6,vlimb # 0 or vlimb
+ alr 0,6 # conditionally add vlimb to phi
+ alr 1,cylimb # add carry limb to plo
+ brc 8+4,+8 # branch if not carry
+ ahi 0,1 # increment phi
+ lr cylimb,0 # new cylimb
+ st 1,0(rp) # store
+ la up,4(,up)
+ la rp,4(,rp)
+ brct n,.Loopp
+
+ lr 2,cylimb
+ lm 6,7,24(15)
+ br 14
+EPILOGUE(mpn_mul_1)
diff --git a/gmp-6.3.0/mpn/s390_32/rshift.asm b/gmp-6.3.0/mpn/s390_32/rshift.asm
new file mode 100644
index 0000000..5f2cf37
--- /dev/null
+++ b/gmp-6.3.0/mpn/s390_32/rshift.asm
@@ -0,0 +1,138 @@
+dnl S/390-32 mpn_rshift.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C z900 6
+C z990 3
+C z9 ?
+C z10 ?
+C z196 ?
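+
+C mpn_rshift computes {rp,n} = {up,n} >> cnt for 1 <= cnt <= 31 and returns
+C the out-shifted low bits in the top of the result limb.  Illustrative C
+C model of the semantics (not of the software-pipelined structure below):
+C
+C   retval = up[0] << (32 - cnt);
+C   for (i = 0; i < n - 1; i++)
+C     rp[i] = (up[i] >> cnt) | (up[i+1] << (32 - cnt));
+C   rp[n-1] = up[n-1] >> cnt;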
+
+C TODO
+C *
+
+C INPUT PARAMETERS
+define(`rp', `%r2')
+define(`up', `%r3')
+define(`n', `%r4')
+define(`cnt', `%r5')
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ stm %r6, %r12, 24(%r15)
+ lhi %r8, 32
+ sr %r8, cnt
+ l %r12, 0(up)
+ sll %r12, 0(%r8) C return value
+ lhi %r7, 3
+ nr %r7, n
+ srl n, 2
+ je L(b0)
+ chi %r7, 2
+ jl L(b1)
+ je L(b2)
+
+L(b3): l %r11, 0(up)
+ l %r10, 4(up)
+ l %r8, 8(up)
+ ahi up, 8
+ lr %r9, %r10
+ srdl %r10, 0(cnt)
+ srdl %r8, 0(cnt)
+ st %r11, 0(rp)
+ st %r9, 4(rp)
+ ahi rp, 8
+ ltr n, n
+ je L(end)
+ j L(top)
+
+L(b2): l %r11, 0(up)
+ l %r10, 4(up)
+ ahi up, 4
+ srdl %r10, 0(cnt)
+ st %r11, 0(rp)
+ ahi rp, 4
+ ltr n, n
+ je L(end)
+ j L(top)
+
+L(b1): ltr n, n
+ je L(end)
+ j L(top)
+
+L(b0): l %r11, 0(up)
+ l %r9, 4(up)
+ l %r7, 8(up)
+ l %r1, 12(up)
+ ahi up, 12
+ lr %r10, %r9
+ lr %r8, %r7
+ lr %r6, %r1
+ srdl %r10, 0(cnt)
+ srdl %r8, 0(cnt)
+ srdl %r6, 0(cnt)
+ st %r11, 0(rp)
+ st %r9, 4(rp)
+ st %r7, 8(rp)
+ ahi rp, 12
+ ahi n, -1
+ je L(end)
+
+ ALIGN(8)
+L(top): l %r11, 0(up)
+ l %r9, 4(up)
+ l %r7, 8(up)
+ l %r1, 12(up)
+ l %r0, 16(up)
+ lr %r10, %r9
+ lr %r8, %r7
+ lr %r6, %r1
+ ahi up, 16
+ srdl %r10, 0(cnt)
+ srdl %r8, 0(cnt)
+ srdl %r6, 0(cnt)
+ srdl %r0, 0(cnt)
+ st %r11, 0(rp)
+ st %r9, 4(rp)
+ st %r7, 8(rp)
+ st %r1, 12(rp)
+ ahi rp, 16
+ brct n, L(top)
+
+L(end): l %r11, 0(up)
+ srl %r11, 0(cnt)
+ st %r11, 0(rp)
+
+ lr %r2, %r12
+ lm %r6, %r12, 24(%r15)
+ br %r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/s390_32/sec_tabselect.asm b/gmp-6.3.0/mpn/s390_32/sec_tabselect.asm
new file mode 100644
index 0000000..c8aa25e
--- /dev/null
+++ b/gmp-6.3.0/mpn/s390_32/sec_tabselect.asm
@@ -0,0 +1,140 @@
+dnl S/390-32 mpn_sec_tabselect
+
+dnl Copyright 2021 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C z900 ?
+C z990 ?
+C z9 ?
+C z10 ?
+C z196 ?
+C z13 ?
+C z14 ?
+C z15 ?
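+
+C mpn_sec_tabselect must run in time independent of `which': every one of
+C the nents table entries is read and masked, and only the selected entry
+C contributes to the result.  Illustrative C model of the selection scheme
+C used below (accumulators zeroed, mask-and-add per entry):
+C
+C   for (i = 0; i < n; i++) rp[i] = 0;
+C   for (e = 0; e < nents; e++) {
+C     mask = -(mp_limb_t) (e == which);  /* all ones iff e == which */
+C     for (i = 0; i < n; i++)
+C       rp[i] += tab[e * n + i] & mask;
+C   }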
+ +dnl void +dnl mpn_sec_tabselect (volatile mp_limb_t *rp, volatile const mp_limb_t *tab, +dnl mp_size_t n, mp_size_t nents, mp_size_t which) + +define(`rp', `%r2') +define(`tp', `%r3') +define(`n', `%r4') +define(`nents', `%r5') +define(`which_arg',`%r6') C magicked to stack + +dnl r0 r1 r2 r3 r4 r5 r6 r7 +dnl r8 r9 r10 r11 r12 r13 r14 r15 + +define(`mask', `%r14') +define(`k', `%r1') +define(`which', `%r0') + +define(`FRAME', 32) + +ASM_START() +PROLOGUE(mpn_sec_tabselect) + stm %r5, %r15, 20(%r15) + ahi %r15, -FRAME + + sll n, 2 + msr %r5, n + st %r5, 16(%r15) C nents * n * LIMB_BYTES + + lr %r5, n + srl %r5, 2+2 + nr %r5, %r5 + je L(end4) +L(outer): + l which, eval(24+FRAME)(%r15) + l k, eval(20+FRAME)(%r15) C nents + lhi %r6, 0 + lhi %r7, 0 + lhi %r8, 0 + lhi %r9, 0 +L(tp4): lhi mask, 1 + slr which, mask + slbr mask, mask + lm %r10, %r13, 0(tp) + nr %r10, mask + nr %r11, mask + nr %r12, mask + nr %r13, mask + ar %r6, %r10 + ar %r7, %r11 + ar %r8, %r12 + ar %r9, %r13 + ar tp, n + brct k, L(tp4) + stm %r6, %r9, 0(rp) + ahi rp, 16 + sl tp, 16(%r15) + ahi tp, eval(4*4) + brct %r5, L(outer) +L(end4): + tmll n, 8 + je L(end2) + l which, eval(24+FRAME)(%r15) + l k, eval(20+FRAME)(%r15) C nents + lhi %r6, 0 + lhi %r7, 0 +L(tp2): lhi mask, 1 + slr which, mask + slbr mask, mask + lm %r10, %r11, 0(tp) + nr %r10, mask + nr %r11, mask + ar %r6, %r10 + ar %r7, %r11 + ar tp, n + brct k, L(tp2) + stm %r6, %r7, 0(rp) + ahi rp, 8 + sl tp, 16(%r15) + ahi tp, eval(2*4) +L(end2): + tmll n, 4 + je L(end1) + l which, eval(24+FRAME)(%r15) + l k, eval(20+FRAME)(%r15) C nents + lhi %r6, 0 +L(tp1): lhi mask, 1 + slr which, mask + slbr mask, mask + l %r10, 0(tp) + nr %r10, mask + ar %r6, %r10 + ar tp, n + brct k, L(tp1) + st %r6, 0(rp) +L(end1): + lm %r5, %r15, eval(20+FRAME)(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_32/submul_1.asm b/gmp-6.3.0/mpn/s390_32/submul_1.asm new file mode 100644 index 0000000..da7d849 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_32/submul_1.asm @@ -0,0 +1,93 @@ +dnl S/390 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the +dnl result from a second limb vector. + +dnl Copyright 2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
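+
+dnl Function semantics: {rp,n} -= {up,n} * vlimb, with the return value
+dnl being the most significant limb of the product plus the borrow-out of
+dnl the subtraction.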
+
+include(`../config.m4')
+
+define(`rp',2)
+define(`up',3)
+define(`n',4)
+define(`vlimb',5)
+define(`cylimb',7)
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ stm 6,7,24(15)
+ slr cylimb,cylimb # clear cylimb
+ ltr vlimb,vlimb
+ jnl .Loopp
+
+.Loopn: l 1,0(up) # load from u
+ lr 6,1 #
+ mr 0,vlimb # multiply signed
+ alr 0,6 # add ulimb to phi
+ sra 6,31 # make mask
+ nr 6,vlimb # 0 or vlimb
+ alr 0,6 # conditionally add vlimb to phi
+ alr 1,cylimb # add carry limb to plo
+ brc 8+4,+8 # branch if not carry
+ ahi 0,1 # increment phi
+ l 6,0(rp) # load r limb
+ slr 6,1 # subtract plo from r limb
+ brc 2+1,+8 # branch if not borrow
+ ahi 0,1 # increment phi
+ lr cylimb,0 # new cylimb
+ st 6,0(rp) # store
+ la up,4(,up)
+ la rp,4(,rp)
+ brct n,.Loopn
+
+ lr 2,cylimb
+ lm 6,7,24(15)
+ br 14
+
+.Loopp: l 1,0(up) # load from u
+ lr 6,1 #
+ mr 0,vlimb # multiply signed
+ sra 6,31 # make mask
+ nr 6,vlimb # 0 or vlimb
+ alr 0,6 # conditionally add vlimb to phi
+ alr 1,cylimb # add carry limb to plo
+ brc 8+4,+8 # branch if not carry
+ ahi 0,1 # increment phi
+ l 6,0(rp) # load r limb
+ slr 6,1 # subtract plo from r limb
+ brc 2+1,+8 # branch if not borrow
+ ahi 0,1 # increment phi
+ lr cylimb,0 # new cylimb
+ st 6,0(rp) # store
+ la up,4(,up)
+ la rp,4(,rp)
+ brct n,.Loopp
+
+ lr 2,cylimb
+ lm 6,7,24(15)
+ br 14
+EPILOGUE(mpn_submul_1)
diff --git a/gmp-6.3.0/mpn/s390_64/README b/gmp-6.3.0/mpn/s390_64/README
new file mode 100644
index 0000000..53702db
--- /dev/null
+++ b/gmp-6.3.0/mpn/s390_64/README
@@ -0,0 +1,88 @@
+Copyright 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
+
+
+
+There are 5 generations of 64-bit s390 processors: z900, z990, z9,
+z10, and z196. The current GMP code was optimised for the two oldest,
+z900 and z990.
+
+
+mpn_copyi
+
+This code makes use of a loop around MVC. It almost surely runs very
+close to optimally. A small improvement could be made by using one
+MVC for size 256 bytes; now we use two (we use an extra MVC when
+copying any multiple of 256 bytes).
+
+
+mpn_copyd
+
+We have tried several feed-in variants here: branch tree, jump table
+and computed goto. The fastest (on z990) turned out to be computed
+goto.
+
+An approach not tried is EX of LMG and STMG, modifying the register set
+on-the-fly. Using that trick, we could completely avoid using
+separate feed-in paths.
+
+
+mpn_lshift, mpn_rshift
+
+The current code runs at pipeline decode bandwidth on z990.
+
+
+mpn_add_n, mpn_sub_n
+
+The current code is 4-way unrolled. It should be unrolled more, at
+least 8x, in order to reach 2.5 c/l.
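+
+For reference, a portable C model of what mpn_add_n computes (an
+illustrative sketch of the semantics only; the assembly instead keeps
+the carry alive in the condition code with ALCGR/SLBGR):
+
+  mp_limb_t
+  mpn_add_n_model (mp_limb_t *rp, const mp_limb_t *up,
+                   const mp_limb_t *vp, mp_size_t n)
+  {
+    mp_limb_t cy = 0;
+    for (mp_size_t i = 0; i < n; i++)
+      {
+        mp_limb_t u = up[i], s = u + vp[i] + cy;
+        cy = (s < u) | (cy & (s == u));  /* carry out of this limb */
+        rp[i] = s;
+      }
+    return cy;
+  }
+
+The carry chain is the critical loop-carried dependency, which is why
+deeper unrolling helps: everything else can issue in parallel.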
+
+
+mpn_mul_1, mpn_addmul_1, mpn_submul_1
+
+The current code is very naive, but due to the non-pipelined nature of
+MLGR on z900 and z990, more sophisticated code would not gain much.
+
+On z10 one would need to cluster at least 4 MLGR together, in order to
+reduce stalling.
+
+On z196, one surely wants to use unrolling and pipelining, to perhaps
+reach around 12 c/l. A major issue here and on z10 is ALCGR's 3-cycle
+stalling.
+
+
+mpn_mul_2, mpn_addmul_2
+
+At least for older machines (z900, z990) with very slow MLGR, we
+should use Karatsuba's algorithm on 2-limb units, making mul_2 and
+addmul_2 the main multiplication primitives. The newer machines might
+benefit less from this approach, perhaps in particular z10, where MLGR
+clustering is more important.
+
+With Karatsuba, one could hope for around 16 cycles per accumulated
+128-bit cross product, on z990.
diff --git a/gmp-6.3.0/mpn/s390_64/addmul_1.asm b/gmp-6.3.0/mpn/s390_64/addmul_1.asm
new file mode 100644
index 0000000..84cca12
--- /dev/null
+++ b/gmp-6.3.0/mpn/s390_64/addmul_1.asm
@@ -0,0 +1,72 @@
+dnl S/390-64 mpn_addmul_1
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C z900 34
+C z990 23
+C z9 ?
+C z10 28
+C z196 ?
+
+C INPUT PARAMETERS
+define(`rp', `%r2')
+define(`up', `%r3')
+define(`n', `%r4')
+define(`v0', `%r5')
+
+define(`z', `%r9')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ stmg %r9, %r12, 72(%r15)
+ lghi %r12, 0 C zero index register
+ aghi %r12, 0 C clear carry flag
+ lghi %r11, 0 C clear carry limb
+ lghi z, 0 C keep register zero
+
+L(top): lg %r1, 0(%r12,up)
+ lg %r10, 0(%r12,rp)
+ mlgr %r0, v0
+ alcgr %r1, %r10
+ alcgr %r0, z
+ algr %r1, %r11
+ lgr %r11, %r0
+ stg %r1, 0(%r12,rp)
+ la %r12, 8(%r12)
+ brctg n, L(top)
+
+ lghi %r2, 0
+ alcgr %r2, %r11
+
+ lmg %r9, %r12, 72(%r15)
+ br %r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/s390_64/aorrlsh1_n.asm b/gmp-6.3.0/mpn/s390_64/aorrlsh1_n.asm
new file mode 100644
index 0000000..697259e
--- /dev/null
+++ b/gmp-6.3.0/mpn/s390_64/aorrlsh1_n.asm
@@ -0,0 +1,168 @@
+dnl S/390-64 mpn_addlsh1_n and mpn_rsblsh1_n.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 9 +C z990 4.75 +C z9 ? +C z10 11 +C z196 ? + +C TODO +C * Optimise for small n, avoid 'la' like in aors_n.asm. +C * Tune to reach 3.5 c/l. For addlsh1, we could let the main alcgr propagate +C carry to the lsh1 alcgr. +C * Compute RETVAL for sublsh1_n less stupidly. + +C INPUT PARAMETERS +define(`rp', `%r2') +define(`up', `%r3') +define(`vp', `%r4') +define(`n', `%r5') + +ifdef(`OPERATION_addlsh1_n',` + define(ADSB, alg) + define(ADSBC, alcg) + define(INITCY, `lghi %r9, -1') + define(RETVAL, `la %r2, 2(%r1,%r9)') + define(func, mpn_addlsh1_n) +') +ifdef(`OPERATION_rsblsh1_n',` + define(ADSB, slg) + define(ADSBC, slbg) + define(INITCY, `lghi %r9, 0') + define(RETVAL,`dnl + algr %r1, %r9 + lghi %r2, 1 + algr %r2, %r1') + define(func, mpn_rsblsh1_n) +') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n) + +ASM_START() +PROLOGUE(func) + stmg %r6, %r9, 48(%r15) + + aghi n, 3 + lghi %r7, 3 + srlg %r0, n, 2 + ngr %r7, n C n mod 4 + je L(b1) + cghi %r7, 2 + jl L(b2) + jne L(b0) + +L(b3): lmg %r5, %r7, 0(vp) + la vp, 24(vp) + + algr %r5, %r5 + alcgr %r6, %r6 + alcgr %r7, %r7 + slbgr %r1, %r1 + + ADSB %r5, 0(up) + ADSBC %r6, 8(up) + ADSBC %r7, 16(up) + la up, 24(up) + slbgr %r9, %r9 + + stmg %r5, %r7, 0(rp) + la rp, 24(rp) + brctg %r0, L(top) + j L(end) + +L(b0): lghi %r1, -1 + INITCY + j L(top) + +L(b1): lg %r5, 0(vp) + la vp, 8(vp) + + algr %r5, %r5 + slbgr %r1, %r1 + ADSB %r5, 0(up) + la up, 8(up) + slbgr %r9, %r9 + + stg %r5, 0(rp) + la rp, 8(rp) + brctg %r0, L(top) + j L(end) + +L(b2): lmg %r5, %r6, 0(vp) + la vp, 16(vp) + + algr %r5, %r5 + alcgr %r6, %r6 + slbgr %r1, %r1 + + ADSB %r5, 0(up) + ADSBC %r6, 8(up) + la up, 16(up) + slbgr %r9, %r9 + + stmg %r5, %r6, 0(rp) + la rp, 16(rp) + brctg %r0, L(top) + j L(end) + +L(top): lmg %r5, %r8, 0(vp) + la vp, 32(vp) + + aghi %r1, 1 C restore carry + + alcgr %r5, %r5 + alcgr %r6, %r6 + alcgr %r7, %r7 + alcgr %r8, %r8 + + slbgr %r1, %r1 C save carry + + aghi %r9, 1 C restore carry + + ADSBC %r5, 0(up) + ADSBC %r6, 8(up) + ADSBC %r7, 16(up) + ADSBC %r8, 24(up) + la up, 32(up) + + slbgr %r9, %r9 C save carry + + stmg %r5, %r8, 0(rp) + la rp, 32(rp) + brctg %r0, L(top) + +L(end): RETVAL + lmg %r6, %r9, 48(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_64/aors_n.asm b/gmp-6.3.0/mpn/s390_64/aors_n.asm new file mode 100644 index 0000000..a3c3ca7 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/aors_n.asm @@ -0,0 +1,136 @@ +dnl S/390-64 mpn_add_n and mpn_sub_n. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C z900 5.5
+C z990 3
+C z9 ?
+C z10 6
+C z196 ?
+
+C TODO
+C * Optimise for small n
+C * Use r0 and save/restore one less register
+C * Using logops_n's v1 inner loop operand order makes the loop about 20%
+C faster, at the expense of highly alignment-dependent performance.
+
+C INPUT PARAMETERS
+define(`rp', `%r2')
+define(`up', `%r3')
+define(`vp', `%r4')
+define(`n', `%r5')
+
+ifdef(`OPERATION_add_n', `
+ define(ADSB, alg)
+ define(ADSBCR, alcgr)
+ define(ADSBC, alcg)
+ define(RETVAL,`dnl
+ lghi %r2, 0
+ alcgr %r2, %r2')
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+ define(ADSB, slg)
+ define(ADSBCR, slbgr)
+ define(ADSBC, slbg)
+ define(RETVAL,`dnl
+ slbgr %r2, %r2
+ lcgr %r2, %r2')
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+ stmg %r6, %r8, 48(%r15)
+
+ aghi n, 3
+ lghi %r7, 3
+ srlg %r1, n, 2
+ ngr %r7, n C n mod 4
+ je L(b1)
+ cghi %r7, 2
+ jl L(b2)
+ jne L(b0)
+
+L(b3): lmg %r5, %r7, 0(up)
+ la up, 24(up)
+ ADSB %r5, 0(vp)
+ ADSBC %r6, 8(vp)
+ ADSBC %r7, 16(vp)
+ la vp, 24(vp)
+ stmg %r5, %r7, 0(rp)
+ la rp, 24(rp)
+ brctg %r1, L(top)
+ j L(end)
+
+L(b0): lmg %r5, %r8, 0(up) C These redundant insns are no mistake;
+ la up, 32(up) C they are needed to make the main loop
+ ADSB %r5, 0(vp) C run fast for n = 0 (mod 4).
+ ADSBC %r6, 8(vp)
+ j L(m0)
+
+L(b1): lg %r5, 0(up)
+ la up, 8(up)
+ ADSB %r5, 0(vp)
+ la vp, 8(vp)
+ stg %r5, 0(rp)
+ la rp, 8(rp)
+ brctg %r1, L(top)
+ j L(end)
+
+L(b2): lmg %r5, %r6, 0(up)
+ la up, 16(up)
+ ADSB %r5, 0(vp)
+ ADSBC %r6, 8(vp)
+ la vp, 16(vp)
+ stmg %r5, %r6, 0(rp)
+ la rp, 16(rp)
+ brctg %r1, L(top)
+ j L(end)
+
+L(top): lmg %r5, %r8, 0(up)
+ la up, 32(up)
+ ADSBC %r5, 0(vp)
+ ADSBC %r6, 8(vp)
+L(m0): ADSBC %r7, 16(vp)
+ ADSBC %r8, 24(vp)
+ la vp, 32(vp)
+ stmg %r5, %r8, 0(rp)
+ la rp, 32(rp)
+ brctg %r1, L(top)
+
+L(end): RETVAL
+ lmg %r6, %r8, 48(%r15)
+ br %r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/s390_64/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/s390_64/bdiv_dbm1c.asm
new file mode 100644
index 0000000..35e900a
--- /dev/null
+++ b/gmp-6.3.0/mpn/s390_64/bdiv_dbm1c.asm
@@ -0,0 +1,65 @@
+dnl S/390-64 mpn_bdiv_dbm1c
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 29 +C z990 22 +C z9 ? +C z10 19 +C z196 ? + +C INPUT PARAMETERS +define(`qp', `%r2') +define(`up', `%r3') +define(`n', `%r4') +define(`bd', `%r5') +define(`cy', `%r6') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_bdiv_dbm1c) + stmg %r6, %r7, 48(%r15) + lghi %r7, 0 C zero index register + +L(top): lg %r1, 0(%r7,up) + mlgr %r0, bd + slgr %r6, %r1 + stg %r6, 0(%r7,qp) + la %r7, 8(%r7) + slbgr %r6, %r0 + brctg n, L(top) + + lgr %r2, %r6 + lmg %r6, %r7, 48(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_64/copyd.asm b/gmp-6.3.0/mpn/s390_64/copyd.asm new file mode 100644 index 0000000..8631e19 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/copyd.asm @@ -0,0 +1,144 @@ +dnl S/390-64 mpn_copyd + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb +C z900 2.67 +C z990 1.5 +C z9 ? +C z10 1.8 +C z196 ? + +C FIXME: +C * Avoid saving/restoring callee-saves registers for n < 3. This could be +C done by setting rp=r1, up=r2, i=r0 and r3,r4,r5 for clock regs. +C We could then use r3...r10 in main loop. +C * Could we use some EX trick, modifying lmg/stmg, for the feed-in code? 
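+
+C mpn_copyd copies {up,n} to {rp,n} starting at the most significant limb,
+C i.e. the illustrative C model is
+C
+C   for (i = n - 1; i >= 0; i--)
+C     rp[i] = up[i];
+C
+C which makes it safe for overlapping operands with rp >= up (mpn_copyi
+C covers the rp <= up direction).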
+ +C INPUT PARAMETERS +define(`rp_param', `%r2') +define(`up_param', `%r3') +define(`n', `%r4') + +define(`rp', `%r8') +define(`up', `%r9') + +ASM_START() +PROLOGUE(mpn_copyd) + stmg %r6, %r11, 48(%r15) + + sllg %r1, n, 3 + la %r10, 8(n) + aghi %r1, -64 + srlg %r10, %r10, 3 + lghi %r11, -64 + + la rp, 0(%r1,rp_param) C FIXME use lay on z990 and later + la up, 0(%r1,up_param) C FIXME use lay on z990 and later + + lghi %r7, 7 + ngr %r7, n C n mod 8 + cghi %r7, 2 + jh L(b34567) + cghi %r7, 1 + je L(b1) + jh L(b2) + +L(b0): brctg %r10, L(top) + j L(end) + +L(b1): lg %r0, 56(up) + aghi up, -8 + stg %r0, 56(rp) + aghi rp, -8 + brctg %r10, L(top) + j L(end) + +L(b2): lmg %r0, %r1, 48(up) + aghi up, -16 + stmg %r0, %r1, 48(rp) + aghi rp, -16 + brctg %r10, L(top) + j L(end) + +L(b34567): + cghi %r7, 4 + jl L(b3) + je L(b4) + cghi %r7, 6 + je L(b6) + jh L(b7) + +L(b5): lmg %r0, %r4, 24(up) + aghi up, -40 + stmg %r0, %r4, 24(rp) + aghi rp, -40 + brctg %r10, L(top) + j L(end) + +L(b3): lmg %r0, %r2, 40(up) + aghi up, -24 + stmg %r0, %r2, 40(rp) + aghi rp, -24 + brctg %r10, L(top) + j L(end) + +L(b4): lmg %r0, %r3, 32(up) + aghi up, -32 + stmg %r0, %r3, 32(rp) + aghi rp, -32 + brctg %r10, L(top) + j L(end) + +L(b6): lmg %r0, %r5, 16(up) + aghi up, -48 + stmg %r0, %r5, 16(rp) + aghi rp, -48 + brctg %r10, L(top) + j L(end) + +L(b7): lmg %r0, %r6, 8(up) + aghi up, -56 + stmg %r0, %r6, 8(rp) + aghi rp, -56 + brctg %r10, L(top) + j L(end) + +L(top): lmg %r0, %r7, 0(up) + la up, 0(%r11,up) + stmg %r0, %r7, 0(rp) + la rp, 0(%r11,rp) + brctg %r10, L(top) + +L(end): lmg %r6, %r11, 48(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_64/copyi.asm b/gmp-6.3.0/mpn/s390_64/copyi.asm new file mode 100644 index 0000000..bfb8881 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/copyi.asm @@ -0,0 +1,68 @@ +dnl S/390-64 mpn_copyi + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb +C z900 1.25 +C z990 0.75 +C z9 ? +C z10 1 +C z196 ? + +C NOTE +C * This is based on GNU libc memcpy which was written by Martin Schwidefsky. + +C INPUT PARAMETERS +define(`rp', `%r2') +define(`up', `%r3') +define(`n', `%r4') + +ASM_START() +PROLOGUE(mpn_copyi) + ltgr %r4, %r4 + sllg %r4, %r4, 3 + je L(rtn) + aghi %r4, -1 + srlg %r5, %r4, 8 + ltgr %r5, %r5 C < 256 bytes to copy? 
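+C Whole 256-byte blocks are moved with MVC; the EX at L(2) below then
+C executes a one-byte template MVC whose length field is patched at run
+C time, moving the final ((8n-1) mod 256)+1 bytes (n in limbs) with a
+C single instruction.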
+ je L(1) + +L(top): mvc 0(256, rp), 0(up) + la rp, 256(rp) + la up, 256(up) + brctg %r5, L(top) + +L(1): bras %r5, L(2) C make r5 point to mvc insn + mvc 0(1, rp), 0(up) +L(2): ex %r4, 0(%r5) C execute mvc with length ((n-1) mod 256)+1 +L(rtn): br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_64/gmp-mparam.h b/gmp-6.3.0/mpn/s390_64/gmp-mparam.h new file mode 100644 index 0000000..062c3d2 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/gmp-mparam.h @@ -0,0 +1,181 @@ +/* S/390-64 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 4400 MHz z196 */ +/* Generated by tuneup.c, 2017-01-02, gcc 4.9 */ + +#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 14 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 15 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 31 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 2 +#define USE_PREINV_DIVREM_1 0 +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 10 +#define DIVEXACT_1_THRESHOLD 4 +#define BMOD_1_TO_MOD_1_THRESHOLD 0 /* always */ + +#define DIV_1_VS_MUL_1_PERCENT 317 + +#define MUL_TOOM22_THRESHOLD 14 +#define MUL_TOOM33_THRESHOLD 45 +#define MUL_TOOM44_THRESHOLD 121 +#define MUL_TOOM6H_THRESHOLD 177 +#define MUL_TOOM8H_THRESHOLD 260 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 78 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 81 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 118 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 13 +#define SQR_TOOM3_THRESHOLD 89 +#define SQR_TOOM4_THRESHOLD 242 +#define SQR_TOOM6_THRESHOLD 363 +#define SQR_TOOM8_THRESHOLD 482 + +#define MULMID_TOOM42_THRESHOLD 38 + +#define MULMOD_BNM1_THRESHOLD 9 +#define SQRMOD_BNM1_THRESHOLD 9 + +#define MUL_FFT_MODF_THRESHOLD 236 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 236, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \ + { 13, 7}, { 7, 6}, { 15, 7}, { 8, 6}, \ + { 17, 7}, { 17, 8}, { 9, 7}, { 19, 8}, \ + { 13, 9}, { 7, 8}, { 19, 9}, { 11, 8}, \ + { 25,10}, { 7, 9}, { 15, 8}, { 33, 9}, \ + { 19, 8}, { 39, 9}, { 
23,10}, { 15, 9}, \ + { 39,10}, { 23, 9}, { 47,11}, { 15,10}, \ + { 31, 9}, { 67,10}, { 39, 9}, { 79,10}, \ + { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255,10}, { 71, 9}, { 143, 8}, { 287, 7}, \ + { 575, 9}, { 159,11}, { 47,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \ + { 143, 9}, { 287,11}, { 79,10}, { 159, 9}, \ + { 319, 8}, { 639,10}, { 175, 9}, { 351, 8}, \ + { 703,11}, { 95,10}, { 191, 9}, { 383, 8}, \ + { 767,10}, { 207, 9}, { 415, 8}, { 831,10}, \ + { 223,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,11}, { 143,10}, { 287, 9}, { 575, 8}, \ + { 1151,11}, { 159,10}, { 319, 9}, { 639,11}, \ + { 175,10}, { 351, 9}, { 703, 8}, { 1407,12}, \ + { 95,11}, { 191,10}, { 383, 9}, { 767,11}, \ + { 207,10}, { 415, 9}, { 831,11}, { 223,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 99 +#define MUL_FFT_THRESHOLD 2240 + +#define SQR_FFT_MODF_THRESHOLD 220 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 220, 5}, { 7, 4}, { 15, 5}, { 13, 6}, \ + { 7, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ + { 13, 7}, { 7, 6}, { 15, 7}, { 8, 6}, \ + { 17, 7}, { 9, 6}, { 19, 7}, { 13, 8}, \ + { 7, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \ + { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ + { 19, 9}, { 11, 8}, { 23, 9}, { 15, 8}, \ + { 31, 9}, { 19, 8}, { 39, 9}, { 23,10}, \ + { 15, 9}, { 39,10}, { 23,11}, { 15,10}, \ + { 31, 9}, { 63,10}, { 47,11}, { 31,10}, \ + { 63, 9}, { 127, 8}, { 255,10}, { 71, 9}, \ + { 143, 8}, { 287,11}, { 47,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \ + { 143, 9}, { 287, 8}, { 575, 7}, { 1151,10}, \ + { 159, 9}, { 319, 8}, { 639,10}, { 175, 9}, \ + { 351, 8}, { 703,11}, { 95,10}, { 191, 9}, \ + { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,11}, { 143,10}, { 287, 9}, { 575, 8}, \ + { 1151,11}, { 159,10}, { 319, 9}, { 639,11}, \ + { 175,10}, { 351, 9}, { 703,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207,13}, { 8192,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 94 +#define SQR_FFT_THRESHOLD 1728 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 38 +#define MULLO_MUL_N_THRESHOLD 4392 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 54 +#define SQRLO_SQR_THRESHOLD 3176 + +#define DC_DIV_QR_THRESHOLD 42 +#define DC_DIVAPPR_Q_THRESHOLD 148 +#define DC_BDIV_QR_THRESHOLD 46 +#define DC_BDIV_Q_THRESHOLD 107 + +#define INV_MULMOD_BNM1_THRESHOLD 34 +#define INV_NEWTON_THRESHOLD 163 +#define INV_APPR_THRESHOLD 131 + +#define BINV_NEWTON_THRESHOLD 183 +#define REDC_1_TO_REDC_N_THRESHOLD 43 + +#define MU_DIV_QR_THRESHOLD 807 +#define MU_DIVAPPR_Q_THRESHOLD 942 +#define MUPI_DIV_QR_THRESHOLD 78 +#define MU_BDIV_QR_THRESHOLD 680 +#define MU_BDIV_Q_THRESHOLD 828 + +#define POWM_SEC_TABLE 3,35,285,1603 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 21 +#define SET_STR_DC_THRESHOLD 1391 +#define SET_STR_PRECOMPUTE_THRESHOLD 2872 + +#define FAC_DSC_THRESHOLD 151 +#define FAC_ODD_THRESHOLD 23 + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD_THRESHOLD 135 +#define HGCD_APPR_THRESHOLD 169 +#define HGCD_REDUCE_THRESHOLD 1437 +#define GCD_DC_THRESHOLD 469 +#define GCDEXT_DC_THRESHOLD 342 +#define JACOBI_BASE_METHOD 4 diff --git a/gmp-6.3.0/mpn/s390_64/invert_limb.asm b/gmp-6.3.0/mpn/s390_64/invert_limb.asm new file 
mode 100644 index 0000000..edcebdd --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/invert_limb.asm @@ -0,0 +1,94 @@ +dnl S/390-64 mpn_invert_limb + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 142 +C z990 86 +C z9 ? +C z10 120 +C z196 ? + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_invert_limb) + stg %r9, 72(%r15) + srlg %r9, %r2, 55 + agr %r9, %r9 + larl %r4, approx_tab-512 + srlg %r3, %r2, 24 + aghi %r3, 1 + lghi %r5, 1 + llgh %r4, 0(%r9, %r4) + sllg %r9, %r4, 11 + msgr %r4, %r4 + msgr %r4, %r3 + srlg %r4, %r4, 40 + aghi %r9, -1 + sgr %r9, %r4 + sllg %r0, %r9, 60 + sllg %r1, %r9, 13 + msgr %r9, %r9 + msgr %r9, %r3 + sgr %r0, %r9 + ngr %r5, %r2 + srlg %r4, %r2, 1 + srlg %r3, %r0, 47 + agr %r3, %r1 + agr %r4, %r5 + msgr %r4, %r3 + srlg %r1, %r3, 1 + lcgr %r5, %r5 + ngr %r1, %r5 + sgr %r1, %r4 + mlgr %r0, %r3 + srlg %r9, %r0, 1 + sllg %r4, %r3, 31 + agr %r4, %r9 + lgr %r1, %r4 + mlgr %r0, %r2 + algr %r1, %r2 + alcgr %r0, %r2 + lgr %r2, %r4 + sgr %r2, %r0 + lg %r9, 72(%r15) + br %r14 +EPILOGUE() + RODATA + ALIGN(2) +approx_tab: +forloop(i,256,512-1,dnl +` .word eval(0x7fd00/i) +')dnl +ASM_END() diff --git a/gmp-6.3.0/mpn/s390_64/logops_n.asm b/gmp-6.3.0/mpn/s390_64/logops_n.asm new file mode 100644 index 0000000..914cfb6 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/logops_n.asm @@ -0,0 +1,291 @@ +dnl S/390-64 logops. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb variant 1 variant 2 variant 3 +C rp!=up rp=up +C z900 4.5 2.25 5.5 5.5 +C z990 2.75 2 3.25 3.25 +C z9 ? ? ? +C z10 3.25 3.75 3.75 +C z196 ? ? ? + +C INPUT PARAMETERS +define(`rp', `%r2') +define(`up', `%r3') +define(`vp', `%r4') +define(`n', `%r5') + +ifdef(`OPERATION_and_n',` + define(`func',`mpn_and_n') + define(`VARIANT_1') + define(`LOGOPC',`nc') + define(`LOGOP',`ng')') +ifdef(`OPERATION_andn_n',` + define(`func',`mpn_andn_n') + define(`VARIANT_2') + define(`LOGOP',`ng')') +ifdef(`OPERATION_nand_n',` + define(`func',`mpn_nand_n') + define(`VARIANT_3') + define(`LOGOP',`ng')') +ifdef(`OPERATION_ior_n',` + define(`func',`mpn_ior_n') + define(`VARIANT_1') + define(`LOGOPC',`oc') + define(`LOGOP',`og')') +ifdef(`OPERATION_iorn_n',` + define(`func',`mpn_iorn_n') + define(`VARIANT_2') + define(`LOGOP',`og')') +ifdef(`OPERATION_nior_n',` + define(`func',`mpn_nior_n') + define(`VARIANT_3') + define(`LOGOP',`og')') +ifdef(`OPERATION_xor_n',` + define(`func',`mpn_xor_n') + define(`VARIANT_1') + define(`LOGOPC',`xc') + define(`LOGOP',`xg')') +ifdef(`OPERATION_xnor_n',` + define(`func',`mpn_xnor_n') + define(`VARIANT_2') + define(`LOGOP',`xg')') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +ASM_START() +PROLOGUE(func) +ifdef(`VARIANT_1',` + cgr rp, up + jne L(normal) + + sllg n, n, 3 + aghi n, -1 + srlg %r1, n, 8 + ltgr %r1, %r1 C < 256 bytes to copy? + je L(1) + +L(tp): LOGOPC 0(256, rp), 0(vp) + la rp, 256(rp) + la vp, 256(vp) + brctg %r1, L(tp) + +L(1): bras %r1, L(2) C make r1 point to mvc insn + LOGOPC 0(1, rp), 0(vp) +L(2): ex n, 0(%r1) C execute mvc with length ((n-1) mod 256)+1 +L(rtn): br %r14 + + +L(normal): + stmg %r6, %r8, 48(%r15) + aghi n, 3 + lghi %r7, 3 + srlg %r0, n, 2 + ngr %r7, n C n mod 4 + je L(b1) + cghi %r7, 2 + jl L(b2) + jne L(top) + +L(b3): lmg %r5, %r7, 0(up) + la up, 24(up) + LOGOP %r5, 0(vp) + LOGOP %r6, 8(vp) + LOGOP %r7, 16(vp) + stmg %r5, %r7, 0(rp) + la rp, 24(rp) + la vp, 24(vp) + j L(mid) + +L(b1): lg %r5, 0(up) + la up, 8(up) + LOGOP %r5, 0(vp) + stg %r5, 0(rp) + la rp, 8(rp) + la vp, 8(vp) + j L(mid) + +L(b2): lmg %r5, %r6, 0(up) + la up, 16(up) + LOGOP %r5, 0(vp) + LOGOP %r6, 8(vp) + stmg %r5, %r6, 0(rp) + la rp, 16(rp) + la vp, 16(vp) + j L(mid) + +L(top): lmg %r5, %r8, 0(up) + la up, 32(up) + LOGOP %r5, 0(vp) + LOGOP %r6, 8(vp) + LOGOP %r7, 16(vp) + LOGOP %r8, 24(vp) + stmg %r5, %r8, 0(rp) + la rp, 32(rp) + la vp, 32(vp) +L(mid): brctg %r0, L(top) + + lmg %r6, %r8, 48(%r15) + br %r14 +') + +ifdef(`VARIANT_2',` + stmg %r6, %r8, 48(%r15) + lghi %r1, -1 + + aghi n, 3 + lghi %r7, 3 + srlg %r0, n, 2 + ngr %r7, n C n mod 4 + je L(b1) + cghi %r7, 2 + jl L(b2) + jne L(top) + +L(b3): lmg %r5, %r7, 0(vp) + la vp, 24(vp) + xgr %r5, %r1 + xgr %r6, %r1 + xgr %r7, %r1 + LOGOP %r5, 0(up) + LOGOP %r6, 8(up) + LOGOP %r7, 16(up) + stmg %r5, %r7, 0(rp) + la rp, 24(rp) + la up, 24(up) + j L(mid) + +L(b1): lg %r5, 0(vp) + la vp, 8(vp) + xgr %r5, %r1 + LOGOP %r5, 0(up) + stg %r5, 0(rp) + la rp, 8(rp) + la up, 8(up) + j L(mid) + +L(b2): lmg %r5, %r6, 0(vp) + la vp, 16(vp) + xgr %r5, %r1 + xgr %r6, %r1 + LOGOP %r5, 0(up) + LOGOP %r6, 8(up) + stmg %r5, %r6, 0(rp) + la rp, 16(rp) + la up, 16(up) + j L(mid) + +L(top): lmg %r5, %r8, 0(vp) + la vp, 32(vp) + xgr %r5, %r1 + xgr %r6, %r1 + xgr %r7, %r1 + xgr %r8, %r1 + LOGOP %r5, 0(up) + LOGOP %r6, 8(up) + LOGOP %r7, 16(up) + LOGOP %r8, 24(up) + la up, 32(up) + stmg %r5, %r8, 0(rp) + la 
rp, 32(rp) +L(mid): brctg %r0, L(top) + + lmg %r6, %r8, 48(%r15) + br %r14 +') + +ifdef(`VARIANT_3',` + stmg %r6, %r8, 48(%r15) + lghi %r1, -1 + + aghi n, 3 + lghi %r7, 3 + srlg %r0, n, 2 + ngr %r7, n C n mod 4 + je L(b1) + cghi %r7, 2 + jl L(b2) + jne L(top) + +L(b3): lmg %r5, %r7, 0(vp) + la vp, 24(vp) + LOGOP %r5, 0(up) + LOGOP %r6, 8(up) + xgr %r5, %r1 + xgr %r6, %r1 + LOGOP %r7, 16(up) + xgr %r7, %r1 + stmg %r5, %r7, 0(rp) + la rp, 24(rp) + la up, 24(up) + j L(mid) + +L(b1): lg %r5, 0(vp) + la vp, 8(vp) + LOGOP %r5, 0(up) + xgr %r5, %r1 + stg %r5, 0(rp) + la rp, 8(rp) + la up, 8(up) + j L(mid) + +L(b2): lmg %r5, %r6, 0(vp) + la vp, 16(vp) + LOGOP %r5, 0(up) + LOGOP %r6, 8(up) + xgr %r5, %r1 + xgr %r6, %r1 + stmg %r5, %r6, 0(rp) + la rp, 16(rp) + la up, 16(up) + j L(mid) + +L(top): lmg %r5, %r8, 0(vp) + la vp, 32(vp) + LOGOP %r5, 0(up) + LOGOP %r6, 8(up) + xgr %r5, %r1 + xgr %r6, %r1 + LOGOP %r7, 16(up) + LOGOP %r8, 24(up) + xgr %r7, %r1 + xgr %r8, %r1 + stmg %r5, %r8, 0(rp) + la up, 32(up) + la rp, 32(rp) +L(mid): brctg %r0, L(top) + + lmg %r6, %r8, 48(%r15) + br %r14 +') + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_64/lshift.asm b/gmp-6.3.0/mpn/s390_64/lshift.asm new file mode 100644 index 0000000..4dae035 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/lshift.asm @@ -0,0 +1,196 @@ +dnl S/390-64 mpn_lshift. + +dnl Copyright 2011, 2012, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 7 +C z990 3 +C z9 ? +C z10 6 +C z196 ? + +C NOTES +C * This uses discrete loads and stores in a software pipeline. Using lmg and +C stmg is not faster. +C * One could assume more pipelining could approach 2.5 c/l, but we have not +C found any 8-way loop that runs better than the current 4-way loop. +C * Consider using the same feed-in code for 1 <= n <= 3 as for n mod 4, +C similarly to the x86_64 sqr_basecase feed-in. 
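+
+C mpn_lshift computes {rp,n} = {up,n} << cnt for 1 <= cnt <= 63, working
+C from the most significant limb downwards, and returns the out-shifted
+C high bits.  Illustrative C model:
+C
+C   retval = up[n-1] >> (64 - cnt);
+C   for (i = n - 1; i > 0; i--)
+C     rp[i] = (up[i] << cnt) | (up[i-1] >> (64 - cnt));
+C   rp[0] = up[0] << cnt;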
+ +C INPUT PARAMETERS +define(`rp', `%r2') +define(`up', `%r3') +define(`n', `%r4') +define(`cnt', `%r5') + +define(`tnc', `%r6') + +ASM_START() +PROLOGUE(mpn_lshift) + cghi n, 3 + jh L(gt1) + + stmg %r6, %r7, 48(%r15) + larl %r1, L(tab)-4 + lcgr tnc, cnt + sllg n, n, 2 + b 0(n,%r1) +L(tab): j L(n1) + j L(n2) + j L(n3) + +L(n1): lg %r1, 0(up) + sllg %r0, %r1, 0(cnt) + stg %r0, 0(rp) + srlg %r2, %r1, 0(tnc) + lg %r6, 48(%r15) C restoring r7 not needed + br %r14 + +L(n2): lg %r1, 8(up) + srlg %r4, %r1, 0(tnc) + sllg %r0, %r1, 0(cnt) + j L(cj) + +L(n3): lg %r1, 16(up) + srlg %r4, %r1, 0(tnc) + sllg %r0, %r1, 0(cnt) + lg %r1, 8(up) + srlg %r7, %r1, 0(tnc) + ogr %r7, %r0 + sllg %r0, %r1, 0(cnt) + stg %r7, 16(rp) +L(cj): lg %r1, 0(up) + srlg %r7, %r1, 0(tnc) + ogr %r7, %r0 + sllg %r0, %r1, 0(cnt) + stg %r7, 8(rp) + stg %r0, 0(rp) + lgr %r2, %r4 + lmg %r6, %r7, 48(%r15) + br %r14 + +L(gt1): stmg %r6, %r13, 48(%r15) + lcgr tnc, cnt C tnc = -cnt + + sllg %r1, n, 3 + srlg %r0, n, 2 C loop count + + agr up, %r1 C point up at end of U + agr rp, %r1 C point rp at end of R + aghi up, -56 + aghi rp, -40 + + lghi %r7, 3 + ngr %r7, n + je L(b0) + cghi %r7, 2 + jl L(b1) + je L(b2) + +L(b3): lg %r7, 48(up) + srlg %r9, %r7, 0(tnc) + sllg %r11, %r7, 0(cnt) + lg %r8, 40(up) + lg %r7, 32(up) + srlg %r4, %r8, 0(tnc) + sllg %r13, %r8, 0(cnt) + ogr %r11, %r4 + la rp, 16(rp) + j L(lm3) + +L(b2): lg %r8, 48(up) + lg %r7, 40(up) + srlg %r9, %r8, 0(tnc) + sllg %r13, %r8, 0(cnt) + la rp, 24(rp) + la up, 8(up) + j L(lm2) + +L(b1): lg %r7, 48(up) + srlg %r9, %r7, 0(tnc) + sllg %r11, %r7, 0(cnt) + lg %r8, 40(up) + lg %r7, 32(up) + srlg %r4, %r8, 0(tnc) + sllg %r10, %r8, 0(cnt) + ogr %r11, %r4 + la rp, 32(rp) + la up, 16(up) + j L(lm1) + +L(b0): lg %r8, 48(up) + lg %r7, 40(up) + srlg %r9, %r8, 0(tnc) + sllg %r10, %r8, 0(cnt) + la rp, 40(rp) + la up, 24(up) + j L(lm0) + + ALIGN(8) +L(top): srlg %r4, %r8, 0(tnc) + sllg %r13, %r8, 0(cnt) + ogr %r11, %r4 + stg %r10, 24(rp) +L(lm3): stg %r11, 16(rp) +L(lm2): srlg %r12, %r7, 0(tnc) + sllg %r11, %r7, 0(cnt) + lg %r8, 24(up) + lg %r7, 16(up) + ogr %r13, %r12 + srlg %r4, %r8, 0(tnc) + sllg %r10, %r8, 0(cnt) + ogr %r11, %r4 + stg %r13, 8(rp) +L(lm1): stg %r11, 0(rp) +L(lm0): srlg %r12, %r7, 0(tnc) + aghi rp, -32 + sllg %r11, %r7, 0(cnt) + lg %r8, 8(up) + lg %r7, 0(up) + aghi up, -32 + ogr %r10, %r12 + brctg %r0, L(top) + +L(end): srlg %r4, %r8, 0(tnc) + sllg %r13, %r8, 0(cnt) + ogr %r11, %r4 + stg %r10, 24(rp) + stg %r11, 16(rp) + srlg %r12, %r7, 0(tnc) + sllg %r11, %r7, 0(cnt) + ogr %r13, %r12 + stg %r13, 8(rp) + stg %r11, 0(rp) + lgr %r2, %r9 + + lmg %r6, %r13, 48(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_64/lshiftc.asm b/gmp-6.3.0/mpn/s390_64/lshiftc.asm new file mode 100644 index 0000000..92552d5 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/lshiftc.asm @@ -0,0 +1,207 @@ +dnl S/390-64 mpn_lshiftc. + +dnl Copyright 2011, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 9 +C z990 3.5 +C z9 ? +C z10 7 +C z196 ? + +C NOTES +C * See notes in lshift.asm. + +C INPUT PARAMETERS +define(`rp', `%r2') +define(`up', `%r3') +define(`n', `%r4') +define(`cnt', `%r5') + +define(`tnc', `%r6') + +ASM_START() +PROLOGUE(mpn_lshiftc) + cghi n, 3 + jh L(gt1) + + stmg %r6, %r8, 48(%r15) + larl %r1, L(tab)-4 + lcgr tnc, cnt + sllg n, n, 2 + lghi %r8, -1 + b 0(n,%r1) +L(tab): j L(n1) + j L(n2) + j L(n3) + +L(n1): lg %r1, 0(up) + sllg %r0, %r1, 0(cnt) + xgr %r0, %r8 + stg %r0, 0(rp) + srlg %r2, %r1, 0(tnc) + lmg %r6, %r8, 48(%r15) + br %r14 + +L(n2): lg %r1, 8(up) + srlg %r4, %r1, 0(tnc) + sllg %r0, %r1, 0(cnt) + j L(cj) + +L(n3): lg %r1, 16(up) + srlg %r4, %r1, 0(tnc) + sllg %r0, %r1, 0(cnt) + lg %r1, 8(up) + srlg %r7, %r1, 0(tnc) + ogr %r7, %r0 + sllg %r0, %r1, 0(cnt) + xgr %r7, %r8 + stg %r7, 16(rp) +L(cj): lg %r1, 0(up) + srlg %r7, %r1, 0(tnc) + ogr %r7, %r0 + sllg %r0, %r1, 0(cnt) + xgr %r7, %r8 + xgr %r0, %r8 + stg %r7, 8(rp) + stg %r0, 0(rp) + lgr %r2, %r4 + lmg %r6, %r8, 48(%r15) + br %r14 + +L(gt1): stmg %r6, %r14, 48(%r15) + lcgr tnc, cnt C tnc = -cnt + + sllg %r1, n, 3 + srlg %r0, n, 2 C loop count + + agr up, %r1 C point up at end of U + agr rp, %r1 C point rp at end of R + aghi up, -56 + aghi rp, -40 + + lghi %r7, 3 + lghi %r14, -1 + ngr %r7, n + je L(b0) + cghi %r7, 2 + jl L(b1) + je L(b2) + +L(b3): lg %r7, 48(up) + srlg %r9, %r7, 0(tnc) + sllg %r11, %r7, 0(cnt) + lg %r8, 40(up) + lg %r7, 32(up) + srlg %r4, %r8, 0(tnc) + sllg %r13, %r8, 0(cnt) + ogr %r11, %r4 + la rp, 16(rp) + xgr %r11, %r14 + j L(lm3) + +L(b2): lg %r8, 48(up) + lg %r7, 40(up) + srlg %r9, %r8, 0(tnc) + sllg %r13, %r8, 0(cnt) + la rp, 24(rp) + la up, 8(up) + j L(lm2) + +L(b1): lg %r7, 48(up) + srlg %r9, %r7, 0(tnc) + sllg %r11, %r7, 0(cnt) + lg %r8, 40(up) + lg %r7, 32(up) + srlg %r4, %r8, 0(tnc) + sllg %r10, %r8, 0(cnt) + ogr %r11, %r4 + la rp, 32(rp) + la up, 16(up) + xgr %r11, %r14 + j L(lm1) + +L(b0): lg %r8, 48(up) + lg %r7, 40(up) + srlg %r9, %r8, 0(tnc) + sllg %r10, %r8, 0(cnt) + la rp, 40(rp) + la up, 24(up) + j L(lm0) + + ALIGN(8) +L(top): srlg %r4, %r8, 0(tnc) + sllg %r13, %r8, 0(cnt) + ogr %r11, %r4 + xgr %r10, %r14 + xgr %r11, %r14 + stg %r10, 24(rp) +L(lm3): stg %r11, 16(rp) +L(lm2): srlg %r12, %r7, 0(tnc) + sllg %r11, %r7, 0(cnt) + lg %r8, 24(up) + lg %r7, 16(up) + ogr %r13, %r12 + srlg %r4, %r8, 0(tnc) + sllg %r10, %r8, 0(cnt) + ogr %r11, %r4 + xgr %r13, %r14 + xgr %r11, %r14 + stg %r13, 8(rp) +L(lm1): stg %r11, 0(rp) +L(lm0): srlg %r12, %r7, 0(tnc) + aghi rp, -32 + sllg %r11, %r7, 0(cnt) + lg %r8, 8(up) + lg %r7, 0(up) + aghi up, -32 + ogr %r10, %r12 + brctg %r0, L(top) + +L(end): srlg %r4, %r8, 0(tnc) + sllg %r13, %r8, 0(cnt) + ogr %r11, %r4 + xgr %r10, %r14 + xgr %r11, %r14 + stg %r10, 24(rp) + stg %r11, 16(rp) + srlg %r12, %r7, 0(tnc) + sllg %r11, %r7, 0(cnt) + ogr %r13, %r12 + xgr %r13, %r14 + xgr %r11, %r14 + stg %r13, 8(rp) + stg %r11, 0(rp) + lgr %r2, %r9 + + lmg %r6, %r14, 48(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_64/mod_34lsub1.asm 
b/gmp-6.3.0/mpn/s390_64/mod_34lsub1.asm new file mode 100644 index 0000000..fd40011 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/mod_34lsub1.asm @@ -0,0 +1,109 @@ +dnl S/390-64 mpn_mod_34lsub1 + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 5.8 +C z990 2 +C z9 ? +C z10 4.5 +C z196 ? + +C TODO +C * Optimise summation code, see x86_64. + +C INPUT PARAMETERS +define(`rp', `%r2') +define(`n', `%r3') + +ASM_START() +PROLOGUE(mpn_mod_34lsub1) + stmg %r7, %r12, 56(%r15) + lghi %r11, 0 + lghi %r12, 0 + lghi %r0, 0 + lghi %r8, 0 + lghi %r9, 0 + lghi %r10, 0 + lghi %r7, 0 + aghi %r3, -3 + jl .L3 + +L(top): alg %r0, 0(%r2) + alcg %r12, 8(%r2) + alcg %r11, 16(%r2) + alcgr %r8, %r7 + la %r2, 24(%r2) + aghi %r3, -3 + jnl L(top) + + lgr %r7, %r8 + srlg %r1, %r11, 16 + nihh %r7, 0 C 0xffffffffffff + agr %r7, %r1 + srlg %r8, %r8, 48 + agr %r7, %r8 + sllg %r11, %r11, 32 + nihh %r11, 0 + agr %r7, %r11 +.L3: + cghi %r3, -3 + je .L6 + alg %r0, 0(%r2) + alcgr %r10, %r10 + cghi %r3, -2 + je .L6 + alg %r12, 8(%r2) + alcgr %r9, %r9 +.L6: + srlg %r1, %r0, 48 + nihh %r0, 0 C 0xffffffffffff + agr %r0, %r1 + agr %r0, %r7 + srlg %r1, %r12, 32 + agr %r0, %r1 + srlg %r1, %r10, 32 + agr %r0, %r1 + llgfr %r12, %r12 + srlg %r1, %r9, 16 + sllg %r12, %r12, 16 + llgfr %r10, %r10 + agr %r0, %r1 + llill %r2, 65535 + agr %r0, %r12 + sllg %r10, %r10, 16 + ngr %r2, %r9 + agr %r0, %r10 + sllg %r2, %r2, 32 + agr %r2, %r0 + lmg %r7, %r12, 56(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_64/mul_1.asm b/gmp-6.3.0/mpn/s390_64/mul_1.asm new file mode 100644 index 0000000..a8f6da9 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/mul_1.asm @@ -0,0 +1,66 @@ +dnl S/390-64 mpn_mul_1 + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 29 +C z990 22 +C z9 ? +C z10 20 +C z196 ? + +C INPUT PARAMETERS +define(`rp', `%r2') +define(`up', `%r3') +define(`n', `%r4') +define(`v0', `%r5') + +ASM_START() +PROLOGUE(mpn_mul_1) + stmg %r11, %r12, 88(%r15) + lghi %r12, 0 C zero index register + aghi %r12, 0 C clear carry flag + lghi %r11, 0 C clear carry limb + +L(top): lg %r1, 0(%r12,up) + mlgr %r0, v0 + alcgr %r1, %r11 + lgr %r11, %r0 C copy high part to carry limb + stg %r1, 0(%r12,rp) + la %r12, 8(%r12) + brctg n, L(top) + + lghi %r2, 0 + alcgr %r2, %r11 + + lmg %r11, %r12, 88(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_64/mul_basecase.asm b/gmp-6.3.0/mpn/s390_64/mul_basecase.asm new file mode 100644 index 0000000..7d14ea9 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/mul_basecase.asm @@ -0,0 +1,130 @@ +dnl S/390-64 mpn_mul_basecase. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 ? +C z990 23 +C z9 ? +C z10 28 +C z196 ? + +C TODO +C * Perhaps add special case for un <= 2. +C * Replace loops by faster code. The mul_1 and addmul_1 loops could be sped +C up by about 10%. 
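+
+C REFERENCE
+C Overall structure of the code below as a plain C sketch: one mul_1 pass
+C writes the first partial-product row, then vn-1 addmul_1 passes
+C accumulate the remaining rows at successive limb offsets (the assembly
+C additionally special-cases un = vn = 1).  Illustrative only;
+C ref_mul_basecase is a made-up name, and the mpn_mul_1/mpn_addmul_1
+C calls stand for the inlined L(tm)/L(tam) loops.  Requires un >= vn >= 1.
+C
+C   void
+C   ref_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un,
+C                     mp_srcptr vp, mp_size_t vn)
+C   {
+C     mp_size_t i;
+C     rp[un] = mpn_mul_1 (rp, up, un, vp[0]);
+C     for (i = 1; i < vn; i++)
+C       rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);
+C   }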
+ +C INPUT PARAMETERS +define(`rp', `%r2') +define(`up', `%r3') +define(`un', `%r4') +define(`vp', `%r5') +define(`vn', `%r6') + +define(`zero', `%r8') + +ASM_START() +PROLOGUE(mpn_mul_basecase) + cghi un, 2 + jhe L(ge2) + +C un = vn = 1 + lg %r1, 0(vp) + mlg %r0, 0(up) + stg %r1, 0(rp) + stg %r0, 8(rp) + br %r14 + +L(ge2): C jne L(gen) + + +L(gen): +C mul_1 ======================================================================= + + stmg %r6, %r12, 48(%r15) + lghi zero, 0 + aghi un, -1 + + lg %r7, 0(vp) + lg %r11, 0(up) + lghi %r12, 8 C init index register + mlgr %r10, %r7 + lgr %r9, un + stg %r11, 0(rp) + cr %r15, %r15 C clear carry flag + +L(tm): lg %r1, 0(%r12,up) + mlgr %r0, %r7 + alcgr %r1, %r10 + lgr %r10, %r0 C copy high part to carry limb + stg %r1, 0(%r12,rp) + la %r12, 8(%r12) + brctg %r9, L(tm) + + alcgr %r0, zero + stg %r0, 0(%r12,rp) + +C addmul_1 loop =============================================================== + + aghi vn, -1 + je L(outer_end) +L(outer_loop): + + la rp, 8(rp) C rp += 1 + la vp, 8(vp) C up += 1 + lg %r7, 0(vp) + lg %r11, 0(up) + lghi %r12, 8 C init index register + mlgr %r10, %r7 + lgr %r9, un + alg %r11, 0(rp) + stg %r11, 0(rp) + +L(tam): lg %r1, 0(%r12,up) + lg %r11, 0(%r12,rp) + mlgr %r0, %r7 + alcgr %r1, %r11 + alcgr %r0, zero + algr %r1, %r10 + lgr %r10, %r0 + stg %r1, 0(%r12,rp) + la %r12, 8(%r12) + brctg %r9, L(tam) + + alcgr %r0, zero + stg %r0, 0(%r12,rp) + + brctg vn, L(outer_loop) +L(outer_end): + + lmg %r6, %r12, 48(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_64/rshift.asm b/gmp-6.3.0/mpn/s390_64/rshift.asm new file mode 100644 index 0000000..e870971 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/rshift.asm @@ -0,0 +1,195 @@ +dnl S/390-64 mpn_rshift. + +dnl Copyright 2011, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 7 +C z990 3 +C z9 ? +C z10 6 +C z196 ? + +C NOTES +C * See notes in lshift.asm. 
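+
+C REFERENCE
+C Plain C sketch of the contract mirrored by the pipelined loop below;
+C illustrative only, assuming 64-bit limbs and 1 <= cnt <= 63, and
+C ref_rshift is a made-up name.  Unlike lshift, this walks from the least
+C significant limb up, allowing the operands to overlap when rp <= up.
+C
+C   mp_limb_t
+C   ref_rshift (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, unsigned cnt)
+C   {
+C     unsigned tnc = 64 - cnt;              /* complementary shift count */
+C     mp_limb_t retval = up[0] << tnc;      /* bits shifted out at the bottom */
+C     mp_size_t i;
+C     for (i = 0; i < n - 1; i++)
+C       rp[i] = (up[i] >> cnt) | (up[i + 1] << tnc);
+C     rp[n - 1] = up[n - 1] >> cnt;
+C     return retval;
+C   }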
+ +C INPUT PARAMETERS +define(`rp', `%r2') +define(`up', `%r3') +define(`n', `%r4') +define(`cnt', `%r5') + +define(`tnc', `%r6') + +ASM_START() +PROLOGUE(mpn_rshift) + cghi n, 3 + jh L(gt1) + + stmg %r6, %r7, 48(%r15) + larl %r1, L(tab)-4 + lcgr tnc, cnt + sllg n, n, 2 + b 0(n,%r1) +L(tab): j L(n1) + j L(n2) + j L(n3) + +L(n1): lg %r1, 0(up) + srlg %r0, %r1, 0(cnt) + stg %r0, 0(rp) + sllg %r2, %r1, 0(tnc) + lg %r6, 48(%r15) C restoring r7 not needed + br %r14 + +L(n2): lg %r1, 0(up) + sllg %r4, %r1, 0(tnc) + srlg %r0, %r1, 0(cnt) + lg %r1, 8(up) + sllg %r7, %r1, 0(tnc) + ogr %r7, %r0 + srlg %r0, %r1, 0(cnt) + stg %r7, 0(rp) + stg %r0, 8(rp) + lgr %r2, %r4 + lmg %r6, %r7, 48(%r15) + br %r14 + + +L(n3): lg %r1, 0(up) + sllg %r4, %r1, 0(tnc) + srlg %r0, %r1, 0(cnt) + lg %r1, 8(up) + sllg %r7, %r1, 0(tnc) + ogr %r7, %r0 + srlg %r0, %r1, 0(cnt) + stg %r7, 0(rp) + lg %r1, 16(up) + sllg %r7, %r1, 0(tnc) + ogr %r7, %r0 + srlg %r0, %r1, 0(cnt) + stg %r7, 8(rp) + stg %r0, 16(rp) + lgr %r2, %r4 + lmg %r6, %r7, 48(%r15) + br %r14 + +L(gt1): stmg %r6, %r13, 48(%r15) + lcgr tnc, cnt C tnc = -cnt + + sllg %r1, n, 3 + srlg %r0, n, 2 C loop count + + lghi %r7, 3 + ngr %r7, n + je L(b0) + cghi %r7, 2 + jl L(b1) + je L(b2) + +L(b3): aghi rp, -8 + lg %r7, 0(up) + sllg %r9, %r7, 0(tnc) + srlg %r11, %r7, 0(cnt) + lg %r8, 8(up) + lg %r7, 16(up) + sllg %r4, %r8, 0(tnc) + srlg %r13, %r8, 0(cnt) + ogr %r11, %r4 + la up, 24(up) + j L(lm3) + +L(b2): aghi rp, -16 + lg %r8, 0(up) + lg %r7, 8(up) + sllg %r9, %r8, 0(tnc) + srlg %r13, %r8, 0(cnt) + la up, 16(up) + j L(lm2) + +L(b1): aghi rp, -24 + lg %r7, 0(up) + sllg %r9, %r7, 0(tnc) + srlg %r11, %r7, 0(cnt) + lg %r8, 8(up) + lg %r7, 16(up) + sllg %r4, %r8, 0(tnc) + srlg %r10, %r8, 0(cnt) + ogr %r11, %r4 + la up, 8(up) + j L(lm1) + +L(b0): aghi rp, -32 + lg %r8, 0(up) + lg %r7, 8(up) + sllg %r9, %r8, 0(tnc) + srlg %r10, %r8, 0(cnt) + j L(lm0) + + ALIGN(8) +L(top): sllg %r4, %r8, 0(tnc) + srlg %r13, %r8, 0(cnt) + ogr %r11, %r4 + stg %r10, 0(rp) +L(lm3): stg %r11, 8(rp) +L(lm2): sllg %r12, %r7, 0(tnc) + srlg %r11, %r7, 0(cnt) + lg %r8, 0(up) + lg %r7, 8(up) + ogr %r13, %r12 + sllg %r4, %r8, 0(tnc) + srlg %r10, %r8, 0(cnt) + ogr %r11, %r4 + stg %r13, 16(rp) +L(lm1): stg %r11, 24(rp) +L(lm0): sllg %r12, %r7, 0(tnc) + aghi rp, 32 + srlg %r11, %r7, 0(cnt) + lg %r8, 16(up) + lg %r7, 24(up) + aghi up, 32 + ogr %r10, %r12 + brctg %r0, L(top) + +L(end): sllg %r4, %r8, 0(tnc) + srlg %r13, %r8, 0(cnt) + ogr %r11, %r4 + stg %r10, 0(rp) + stg %r11, 8(rp) + sllg %r12, %r7, 0(tnc) + srlg %r11, %r7, 0(cnt) + ogr %r13, %r12 + stg %r13, 16(rp) + stg %r11, 24(rp) + lgr %r2, %r9 + + lmg %r6, %r13, 48(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_64/sec_tabselect.asm b/gmp-6.3.0/mpn/s390_64/sec_tabselect.asm new file mode 100644 index 0000000..2c97423 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/sec_tabselect.asm @@ -0,0 +1,139 @@ +dnl S/390-64 mpn_sec_tabselect + +dnl Copyright 2021 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 ? +C z990 ? +C z9 ? +C z10 ? +C z196 ? +C z13 ? +C z14 ? +C z15 1.6 + +dnl void +dnl mpn_sec_tabselect (volatile mp_limb_t *rp, volatile const mp_limb_t *tab, +dnl mp_size_t n, mp_size_t nents, mp_size_t which) + +define(`rp', `%r2') +define(`tp', `%r3') +define(`n', `%r4') +define(`nents', `%r5') +define(`which_arg',`%r6') C magicked to stack + +dnl r0 r1 r2 r3 r4 r5 r6 r7 +dnl r8 r9 r10 r11 r12 r13 r14 r15 + +define(`mask', `%r14') +define(`k', `%r1') +define(`which', `%r0') + +define(`FRAME', 64) + +ASM_START() +PROLOGUE(mpn_sec_tabselect) + stmg %r5, %r15, 40(%r15) + aghi %r15, -FRAME + + sllg n, n, 3 + msgr %r5, n + stg %r5, 16(%r15) C nents * n * LIMB_BYTES + + srlg %r5, n, 2+3 + ngr %r5, %r5 + je L(end4) +L(outer): + lg which, eval(48+FRAME)(%r15) + lg k, eval(40+FRAME)(%r15) C nents + lghi %r6, 0 + lghi %r7, 0 + lghi %r8, 0 + lghi %r9, 0 +L(tp4): lghi mask, 1 + slgr which, mask + slbgr mask, mask + lmg %r10, %r13, 0(tp) + ngr %r10, mask + ngr %r11, mask + ngr %r12, mask + ngr %r13, mask + agr %r6, %r10 + agr %r7, %r11 + agr %r8, %r12 + agr %r9, %r13 + agr tp, n + brctg k, L(tp4) + stmg %r6, %r9, 0(rp) + aghi rp, 32 + slg tp, 16(%r15) + aghi tp, eval(4*8) + brctg %r5, L(outer) +L(end4): + tmll n, 16 + je L(end2) + lg which, eval(48+FRAME)(%r15) + lg k, eval(40+FRAME)(%r15) C nents + lghi %r6, 0 + lghi %r7, 0 +L(tp2): lghi mask, 1 + slgr which, mask + slbgr mask, mask + lmg %r10, %r11, 0(tp) + ngr %r10, mask + ngr %r11, mask + agr %r6, %r10 + agr %r7, %r11 + agr tp, n + brctg k, L(tp2) + stmg %r6, %r7, 0(rp) + aghi rp, 16 + slg tp, 16(%r15) + aghi tp, eval(2*8) +L(end2): + tmll n, 8 + je L(end1) + lg which, eval(48+FRAME)(%r15) + lg k, eval(40+FRAME)(%r15) C nents + lghi %r6, 0 +L(tp1): lghi mask, 1 + slgr which, mask + slbgr mask, mask + lg %r10, 0(tp) + ngr %r10, mask + agr %r6, %r10 + agr tp, n + brctg k, L(tp1) + stg %r6, 0(rp) +L(end1): + lmg %r5, %r15, eval(40+FRAME)(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_64/sqr_basecase.asm b/gmp-6.3.0/mpn/s390_64/sqr_basecase.asm new file mode 100644 index 0000000..bf31bd5 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/sqr_basecase.asm @@ -0,0 +1,203 @@ +dnl S/390-64 mpn_sqr_basecase. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 ? +C z990 23 +C z9 ? +C z10 28 +C z196 ? + +C TODO +C * Clean up. +C * Stop iterating addmul_1 loop at latest for n = 2, implement longer tail. +C This will ask for basecase handling of n = 3. +C * Update counters and pointers more straightforwardly, possibly lowering +C register usage. +C * Should we use this allocation-free style for more sqr_basecase asm +C implementations? The only disadvantage is that it requires R != U. +C * Replace loops by faster code. The mul_1 and addmul_1 loops could be sped +C up by about 10%. The sqr_diag_addlsh1 loop could probably be sped up even +C more. + +C INPUT PARAMETERS +define(`rp', `%r2') +define(`up', `%r3') +define(`n', `%r4') + +define(`zero', `%r8') +define(`rp_saved', `%r9') +define(`up_saved', `%r13') +define(`n_saved', `%r14') + +ASM_START() +PROLOGUE(mpn_sqr_basecase) + aghi n, -2 + jhe L(ge2) + +C n = 1 + lg %r5, 0(up) + mlgr %r4, %r5 + stg %r5, 0(rp) + stg %r4, 8(rp) + br %r14 + +L(ge2): jne L(gen) + +C n = 2 + stmg %r6, %r8, 48(%r15) + lghi zero, 0 + + lg %r5, 0(up) + mlgr %r4, %r5 C u0 * u0 + lg %r1, 8(up) + mlgr %r0, %r1 C u1 * u1 + stg %r5, 0(rp) + + lg %r7, 0(up) + mlg %r6, 8(up) C u0 * u1 + algr %r7, %r7 + alcgr %r6, %r6 + alcgr %r0, zero + + algr %r4, %r7 + alcgr %r1, %r6 + alcgr %r0, zero + stg %r4, 8(rp) + stg %r1, 16(rp) + stg %r0, 24(rp) + + lmg %r6, %r8, 48(%r15) + br %r14 + +L(gen): +C mul_1 ======================================================================= + + stmg %r6, %r14, 48(%r15) + lghi zero, 0 + lgr up_saved, up + lgr rp_saved, rp + lgr n_saved, n + + lg %r6, 0(up) + lg %r11, 8(up) + lghi %r12, 16 C init index register + mlgr %r10, %r6 + lgr %r5, n + stg %r11, 8(rp) + cr %r15, %r15 C clear carry flag + +L(tm): lg %r1, 0(%r12,up) + mlgr %r0, %r6 + alcgr %r1, %r10 + lgr %r10, %r0 C copy high part to carry limb + stg %r1, 0(%r12,rp) + la %r12, 8(%r12) + brctg %r5, L(tm) + + alcgr %r0, zero + stg %r0, 0(%r12,rp) + +C addmul_1 loop =============================================================== + + aghi n, -1 + je L(outer_end) +L(outer_loop): + + la rp, 16(rp) C rp += 2 + la up, 8(up) C up += 1 + lg %r6, 0(up) + lg %r11, 8(up) + lghi %r12, 16 C init index register + mlgr %r10, %r6 + lgr %r5, n + alg %r11, 8(rp) + stg %r11, 8(rp) + +L(tam): lg %r1, 0(%r12,up) + lg %r7, 0(%r12,rp) + mlgr %r0, %r6 + alcgr %r1, %r7 + alcgr %r0, zero + algr %r1, %r10 + lgr %r10, %r0 + stg %r1, 0(%r12,rp) + la %r12, 8(%r12) + brctg %r5, L(tam) + + alcgr %r0, zero + stg %r0, 0(%r12,rp) + + brctg n, L(outer_loop) +L(outer_end): + + lg %r6, 8(up) + lg %r1, 16(up) + lgr %r7, %r0 C Same as: lg %r7, 24(,rp) + mlgr %r0, %r6 + algr %r1, %r7 + alcgr %r0, zero + stg %r1, 24(rp) + stg %r0, 32(rp) + +C sqr_diag_addlsh1 ============================================================ + +define(`up', `up_saved') +define(`rp', `rp_saved') + la n, 1(n_saved) + + lg %r1, 0(up) + mlgr %r0, %r1 + stg %r1, 0(rp) +C clr %r15, %r15 C clear carry (already clear per above) + +L(top): lg %r11, 8(up) + la up, 8(up) + lg %r6, 8(rp) + lg %r7, 16(rp) + mlgr %r10, %r11 + alcgr %r6, %r6 + alcgr %r7, %r7 + alcgr %r10, zero C propagate carry to high product limb + algr %r6, %r0 + alcgr %r7, %r11 + stmg %r6, %r7, 8(rp) + la rp, 16(rp) + lgr %r0, %r10 C copy carry limb + 
brctg n, L(top) + + alcgr %r0, zero + stg %r0, 8(rp) + + lmg %r6, %r14, 48(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_64/sublsh1_n.asm b/gmp-6.3.0/mpn/s390_64/sublsh1_n.asm new file mode 100644 index 0000000..50f127a --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/sublsh1_n.asm @@ -0,0 +1,169 @@ +dnl S/390-64 mpn_sublsh1_n + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 10 +C z990 5 +C z9 ? +C z10 12 +C z196 ? + +C TODO +C * Optimise for small n +C * Compute RETVAL for sublsh1_n less stupidly + +C INPUT PARAMETERS +define(`rp', `%r2') +define(`up', `%r3') +define(`vp', `%r4') +define(`n', `%r5') + +ifdef(`OPERATION_addlsh1_n',` + define(ADSBR, algr) + define(ADSBCR, alcgr) + define(INITCY, `lghi %r13, -1') + define(RETVAL, `la %r2, 2(%r1,%r13)') + define(func, mpn_addlsh1_n) +') +ifdef(`OPERATION_sublsh1_n',` + define(ADSBR, slgr) + define(ADSBCR, slbgr) + define(INITCY, `lghi %r13, 0') + define(RETVAL,`dnl + slgr %r1, %r13 + lghi %r2, 1 + algr %r2, %r1') + define(func, mpn_sublsh1_n) +') + +ASM_START() +PROLOGUE(mpn_sublsh1_n) + stmg %r6, %r13, 48(%r15) + + aghi n, 3 + lghi %r7, 3 + srlg %r0, n, 2 + ngr %r7, n C n mod 4 + je L(b1) + cghi %r7, 2 + jl L(b2) + jne L(b0) + +L(b3): lmg %r5, %r7, 0(up) + la up, 24(up) + lmg %r9, %r11, 0(vp) + la vp, 24(vp) + + algr %r9, %r9 + alcgr %r10, %r10 + alcgr %r11, %r11 + slbgr %r1, %r1 + + ADSBR %r5, %r9 + ADSBCR %r6, %r10 + ADSBCR %r7, %r11 + slbgr %r13, %r13 + + stmg %r5, %r7, 0(rp) + la rp, 24(rp) + brctg %r0, L(top) + j L(end) + +L(b0): lghi %r1, -1 + INITCY + j L(top) + +L(b1): lg %r5, 0(up) + la up, 8(up) + lg %r9, 0(vp) + la vp, 8(vp) + + algr %r9, %r9 + slbgr %r1, %r1 + ADSBR %r5, %r9 + slbgr %r13, %r13 + + stg %r5, 0(rp) + la rp, 8(rp) + brctg %r0, L(top) + j L(end) + +L(b2): lmg %r5, %r6, 0(up) + la up, 16(up) + lmg %r9, %r10, 0(vp) + la vp, 16(vp) + + algr %r9, %r9 + alcgr %r10, %r10 + slbgr %r1, %r1 + + ADSBR %r5, %r9 + ADSBCR %r6, %r10 + slbgr %r13, %r13 + + stmg %r5, %r6, 0(rp) + la rp, 16(rp) + brctg %r0, L(top) + j L(end) + +L(top): lmg %r9, %r12, 0(vp) + la vp, 32(vp) + + aghi %r1, 1 C restore carry + + alcgr %r9, %r9 + alcgr %r10, %r10 + alcgr %r11, %r11 + alcgr %r12, %r12 + + slbgr %r1, %r1 C save carry + + lmg %r5, %r8, 0(up) + la up, 32(up) + + aghi %r13, 1 C restore carry + + ADSBCR %r5, %r9 + ADSBCR %r6, %r10 + ADSBCR %r7, %r11 + ADSBCR %r8, %r12 + + slbgr %r13, %r13 C save carry + + stmg %r5, %r8, 0(rp) + la rp, 32(rp) + brctg %r0, L(top) + 
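+
+C The RETVAL sequence at L(end) below combines the doubling carry and the
+C subtraction borrow into the 0..2 return value.  As a plain C cross-check
+C of the function's contract (illustrative only; ref_sublsh1_n is a
+C made-up name, 64-bit limbs assumed):
+C
+C   mp_limb_t
+C   ref_sublsh1_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+C   {
+C     mp_limb_t shifted = 0, borrow = 0, twov, t, b1;
+C     mp_size_t i;
+C     for (i = 0; i < n; i++)
+C       {
+C         twov = (vp[i] << 1) | shifted;    /* form 2*vp, limb by limb */
+C         shifted = vp[i] >> 63;
+C         t = up[i] - twov;
+C         b1 = up[i] < twov;
+C         rp[i] = t - borrow;
+C         borrow = b1 + (t < borrow);
+C       }
+C     return borrow + shifted;              /* borrow out, 0, 1 or 2 */
+C   }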
+L(end): RETVAL + lmg %r6, %r13, 48(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_64/submul_1.asm b/gmp-6.3.0/mpn/s390_64/submul_1.asm new file mode 100644 index 0000000..3bb8b05 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/submul_1.asm @@ -0,0 +1,70 @@ +dnl S/390-64 mpn_submul_1 + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 35 +C z990 24 +C z9 ? +C z10 28 +C z196 ? + +C INPUT PARAMETERS +define(`rp', `%r2') +define(`up', `%r3') +define(`n', `%r4') +define(`v0', `%r5') + +ASM_START() +PROLOGUE(mpn_submul_1) + stmg %r9, %r12, 72(%r15) + lghi %r12, 0 + slgr %r11, %r11 + +L(top): lg %r1, 0(%r12,up) + lg %r10, 0(%r12,rp) + mlgr %r0, v0 + slbgr %r10, %r1 + slbgr %r9, %r9 + slgr %r0, %r9 C conditional incr + slgr %r10, %r11 + lgr %r11, %r0 + stg %r10, 0(%r12,rp) + la %r12, 8(%r12) + brctg %r4, L(top) + + lgr %r2, %r11 + slbgr %r9, %r9 + slgr %r2, %r9 + + lmg %r9, %r12, 72(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_64/z10/gmp-mparam.h b/gmp-6.3.0/mpn/s390_64/z10/gmp-mparam.h new file mode 100644 index 0000000..c3a9416 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/z10/gmp-mparam.h @@ -0,0 +1,233 @@ +/* S/390-64 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000-2011, 2014, 2015 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 4400 MHz IBM z10 */ +/* FFT tuning limit = 30 M */ +/* Generated by tuneup.c, 2015-10-09, gcc 4.8 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 3 +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 3 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 15 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 17 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 24 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD 2 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 48 + +#define MUL_TOOM22_THRESHOLD 9 +#define MUL_TOOM33_THRESHOLD 65 +#define MUL_TOOM44_THRESHOLD 94 +#define MUL_TOOM6H_THRESHOLD 129 +#define MUL_TOOM8H_THRESHOLD 187 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 65 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 61 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 62 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 64 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 85 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 11 +#define SQR_TOOM3_THRESHOLD 80 +#define SQR_TOOM4_THRESHOLD 118 +#define SQR_TOOM6_THRESHOLD 189 +#define SQR_TOOM8_THRESHOLD 236 + +#define MULMID_TOOM42_THRESHOLD 24 + +#define MULMOD_BNM1_THRESHOLD 7 +#define SQRMOD_BNM1_THRESHOLD 9 + +#define MUL_FFT_MODF_THRESHOLD 252 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 252, 5}, { 9, 6}, { 5, 5}, { 11, 6}, \ + { 6, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 13, 7}, { 7, 6}, { 15, 7}, { 13, 8}, \ + { 7, 7}, { 15, 8}, { 9, 7}, { 19, 8}, \ + { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ + { 15, 7}, { 31, 8}, { 19, 9}, { 11, 8}, \ + { 27,10}, { 7, 9}, { 15, 8}, { 31, 9}, \ + { 19, 8}, { 41, 9}, { 27,10}, { 15, 9}, \ + { 39,10}, { 23, 9}, { 47,11}, { 15,10}, \ + { 31, 9}, { 67,10}, { 39, 9}, { 79,10}, \ + { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255,10}, { 71, 9}, { 143, 8}, { 287, 7}, \ + { 575, 6}, { 1151,10}, { 79,11}, { 47,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511,10}, { 143,11}, { 79,10}, { 159, 9}, \ + { 319, 8}, { 639,10}, { 175, 8}, { 703,11}, \ + { 95,10}, { 191, 9}, { 383, 8}, { 767, 9}, \ + { 415, 8}, { 831, 7}, { 1663,10}, { 239, 9}, \ + { 479,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,11}, { 143,10}, { 287, 9}, { 575, 8}, \ + { 1151,10}, { 319, 9}, { 639,11}, { 175,10}, \ + { 351, 9}, { 703, 8}, { 1407, 7}, { 2815,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 415,11}, \ + { 223,10}, { 447, 9}, { 895,13}, { 63,11}, \ + { 255,10}, { 575, 9}, { 1151,12}, { 159,11}, \ + { 319,10}, { 639, 9}, { 1279,10}, { 703, 9}, \ + { 1407,12}, { 191,10}, { 767,11}, { 415,12}, \ + { 223,11}, { 447,10}, { 895,11}, { 479,13}, \ + { 127,12}, { 255,11}, { 511,12}, { 287,10}, \ + { 1151,12}, { 319,11}, { 703,10}, { 1407, 9}, \ + { 2815,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,10}, { 1663,12}, { 447,11}, { 895,10}, \ + { 1791, 9}, { 3583,12}, { 479,11}, { 959,10}, \ + { 1919, 9}, { 3839,12}, { 511, 9}, { 4095, 6}, \ + { 32767, 8}, { 8447,11}, { 1151,13}, { 319,12}, \ + { 639,10}, { 2559,12}, { 703,10}, { 2815,12}, \ + { 831,11}, { 1663,12}, { 895,11}, { 1791,12}, \ + { 959,11}, { 1919,14}, { 255,13}, { 511,11}, \ + { 2047,12}, { 1215,10}, { 4863,11}, { 2559,14}, \ + { 383,12}, { 1535,13}, { 831,12}, { 1663,13}, \ + { 
895,12}, { 1791,11}, { 3583,15}, { 255,14}, \ + { 511,13}, { 1151,14}, { 639,13}, { 1279,12}, \ + { 2559,13}, { 1407,12}, { 2815,14}, { 767,13}, \ + { 1663,10}, { 13311,14}, { 895,13}, { 1791,12}, \ + { 3583,13}, { 1919,12}, { 3839,10}, { 15359,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2559,14}, { 1407,13}, { 2815,15}, { 767,14}, \ + { 1791,13}, { 8192,14}, { 16384,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 200 +#define MUL_FFT_THRESHOLD 1728 + +#define SQR_FFT_MODF_THRESHOLD 212 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 212, 5}, { 7, 4}, { 15, 5}, { 9, 4}, \ + { 19, 6}, { 5, 5}, { 11, 6}, { 6, 5}, \ + { 13, 6}, { 7, 5}, { 15, 6}, { 9, 5}, \ + { 19, 6}, { 13, 7}, { 7, 6}, { 15, 7}, \ + { 9, 6}, { 19, 7}, { 13, 8}, { 7, 7}, \ + { 16, 8}, { 9, 7}, { 19, 8}, { 11, 7}, \ + { 23, 8}, { 13, 9}, { 7, 8}, { 19, 9}, \ + { 11, 8}, { 25,10}, { 7, 9}, { 15, 8}, \ + { 31, 9}, { 23,10}, { 15, 9}, { 39,10}, \ + { 23,11}, { 15,10}, { 31, 9}, { 63,10}, \ + { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255,10}, { 71, 9}, { 143, 8}, { 287, 7}, \ + { 575,11}, { 47,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \ + { 287, 8}, { 575, 7}, { 1151,11}, { 79,10}, \ + { 159, 9}, { 319,10}, { 175, 9}, { 351, 8}, \ + { 703, 7}, { 1407,10}, { 191, 9}, { 383,10}, \ + { 207,11}, { 111,10}, { 223,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,11}, { 143,10}, \ + { 287, 9}, { 575, 8}, { 1151,11}, { 159,10}, \ + { 319, 9}, { 639,11}, { 175,10}, { 351, 9}, \ + { 703, 8}, { 1407,11}, { 191,10}, { 383,11}, \ + { 207,10}, { 415,11}, { 223,10}, { 447,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 287,10}, { 575, 9}, { 1151,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 351,10}, { 703, 9}, \ + { 1407,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,12}, { 223,11}, { 447,10}, { 895, 9}, \ + { 1791,13}, { 127,12}, { 255,11}, { 511,12}, \ + { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \ + { 319,11}, { 639,12}, { 351,11}, { 703,10}, \ + { 1407,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,10}, { 1663,12}, { 447,11}, \ + { 895,10}, { 1791,14}, { 127,13}, { 255,12}, \ + { 511,11}, { 1023,10}, { 2047,11}, { 1151,12}, \ + { 607,13}, { 319,11}, { 1279, 9}, { 5119, 8}, \ + { 10751, 4}, { 172031, 7}, { 22015,11}, { 1407,10}, \ + { 2943, 8}, { 11775, 9}, { 6143,12}, { 831, 8}, \ + { 13311,11}, { 1791,14}, { 255,11}, { 2047,13}, \ + { 575,12}, { 1151,13}, { 639,12}, { 1279,13}, \ + { 703,12}, { 1407,11}, { 2815,12}, { 1471, 9}, \ + { 11775,13}, { 767,12}, { 1535,13}, { 831,12}, \ + { 1663,13}, { 895,11}, { 3583,13}, { 959,12}, \ + { 1919,10}, { 7679, 9}, { 15359,11}, { 3967,14}, \ + { 511,13}, { 1151,12}, { 2303,13}, { 1215,14}, \ + { 639,13}, { 1279,12}, { 2559,14}, { 767,13}, \ + { 1663,14}, { 895,15}, { 511,13}, { 2047,14}, \ + { 1279,13}, { 2815,15}, { 767,14}, { 1791,13}, \ + { 3583,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 201 +#define SQR_FFT_THRESHOLD 1344 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 33 +#define MULLO_MUL_N_THRESHOLD 2586 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 63 +#define SQRLO_SQR_THRESHOLD 2663 + +#define DC_DIV_QR_THRESHOLD 37 +#define DC_DIVAPPR_Q_THRESHOLD 143 +#define DC_BDIV_QR_THRESHOLD 37 +#define DC_BDIV_Q_THRESHOLD 86 + +#define 
INV_MULMOD_BNM1_THRESHOLD 16 +#define INV_NEWTON_THRESHOLD 147 +#define INV_APPR_THRESHOLD 141 + +#define BINV_NEWTON_THRESHOLD 141 +#define REDC_1_TO_REDC_N_THRESHOLD 39 + +#define MU_DIV_QR_THRESHOLD 807 +#define MU_DIVAPPR_Q_THRESHOLD 807 +#define MUPI_DIV_QR_THRESHOLD 81 +#define MU_BDIV_QR_THRESHOLD 654 +#define MU_BDIV_Q_THRESHOLD 792 + +#define POWM_SEC_TABLE 1,28,163,1083,2111 + +#define GET_STR_DC_THRESHOLD 19 +#define GET_STR_PRECOMPUTE_THRESHOLD 33 +#define SET_STR_DC_THRESHOLD 898 +#define SET_STR_PRECOMPUTE_THRESHOLD 2031 + +#define FAC_DSC_THRESHOLD 372 +#define FAC_ODD_THRESHOLD 23 + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD_THRESHOLD 105 +#define HGCD_APPR_THRESHOLD 111 +#define HGCD_REDUCE_THRESHOLD 1137 +#define GCD_DC_THRESHOLD 285 +#define GCDEXT_DC_THRESHOLD 210 +#define JACOBI_BASE_METHOD 4 diff --git a/gmp-6.3.0/mpn/s390_64/z13/addmul_1.asm b/gmp-6.3.0/mpn/s390_64/z13/addmul_1.asm new file mode 100644 index 0000000..2b00612 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/z13/addmul_1.asm @@ -0,0 +1,173 @@ +dnl S/390-64 mpn_addmul_1 and mpn_addmul_1c. +dnl Based on C code contributed by Marius Hillenbrand. + +dnl Copyright 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +dnl TODO +dnl * Schedule vlvgp away from mlgr; that saves 20% of the run time. +dnl * Perhaps use vp[0]/vp[1] in innerloop instead preloading v0/v1. + +C cycles/limb +C z900 - +C z990 - +C z9 - +C z10 - +C z196 - +C z12 ? +C z13 ? +C z14 ? 
+C z15 2.55 + + +define(`rp', `%r2') +define(`ap', `%r3') +define(`an', `%r4') +define(`b0', `%r5') +define(`cy', `%r6') + +define(`idx', `%r4') + +ASM_START() + +PROLOGUE(mpn_addmul_1c) + stmg %r6, %r13, 48(%r15) + j L(ent) +EPILOGUE() + +PROLOGUE(mpn_addmul_1) + stmg %r6, %r13, 48(%r15) + lghi %r6, 0 +L(ent): vzero %v0 + vzero %v2 + srlg %r11, an, 2 + + tmll an, 1 + je L(bx0) +L(bx1): tmll an, 2 + jne L(b11) + +L(b01): lghi idx, -24 + vleg %v2, 0(rp), 1 + lg %r13, 0(ap) + vzero %v4 + mlgr %r12, b0 + algr %r13, %r6 + lghi %r6, 0 + alcgr %r12, %r6 + vlvgg %v4, %r13, 1 + vaq %v2, %v2, %v4 + vsteg %v2, 0(rp), 1 + vmrhg %v2, %v2, %v2 + cgije %r11, 0, L(1) + j L(cj0) + +L(b11): lghi idx, -8 + vleg %v2, 0(rp), 1 + lg %r9, 0(ap) + vzero %v4 + mlgr %r8, b0 + algr %r9, %r6 + lghi %r6, 0 + alcgr %r8, %r6 + vlvgg %v4, %r9, 1 + vaq %v2, %v2, %v4 + vsteg %v2, 0(rp), 1 + vmrhg %v2, %v2, %v2 + j L(cj1) + +L(bx0): tmll an, 2 + jne L(b10) +L(b00): lghi idx, -32 + lgr %r12, %r6 +L(cj0): lg %r1, 32(idx, ap) + lg %r9, 40(idx, ap) + mlgr %r0, b0 + mlgr %r8, b0 + vlvgp %v6, %r0, %r1 + vlvgp %v7, %r9, %r12 + j L(mid) + +L(b10): lghi idx, -16 + lgr %r8, %r6 +L(cj1): lg %r7, 16(idx, ap) + lg %r13, 24(idx, ap) + mlgr %r6, b0 + mlgr %r12, b0 + vlvgp %v6, %r6, %r7 + vlvgp %v7, %r13, %r8 + cgije %r11, 0, L(end) + +L(top): lg %r1, 32(idx, ap) + lg %r9, 40(idx, ap) + mlgr %r0, b0 + mlgr %r8, b0 + vl %v1, 16(idx, rp), 3 + vpdi %v1, %v1, %v1, 4 + vacq %v5, %v6, %v1, %v0 + vacccq %v0, %v6, %v1, %v0 + vacq %v3, %v5, %v7, %v2 + vacccq %v2, %v5, %v7, %v2 + vpdi %v3, %v3, %v3, 4 + vst %v3, 16(idx, rp), 3 + vlvgp %v6, %r0, %r1 + vlvgp %v7, %r9, %r12 +L(mid): lg %r7, 48(idx, ap) + lg %r13, 56(idx, ap) + mlgr %r6, b0 + mlgr %r12, b0 + vl %v4, 32(idx, rp), 3 + vpdi %v4, %v4, %v4, 4 + vacq %v5, %v6, %v4, %v0 + vacccq %v0, %v6, %v4, %v0 + vacq %v1, %v5, %v7, %v2 + vacccq %v2, %v5, %v7, %v2 + vpdi %v1, %v1, %v1, 4 + vst %v1, 32(idx, rp), 3 + vlvgp %v6, %r6, %r7 + vlvgp %v7, %r13, %r8 + la idx, 32(idx) + brctg %r11, L(top) + +L(end): vl %v1, 16(idx, rp), 3 + vpdi %v1, %v1, %v1, 4 + vacq %v5, %v6, %v1, %v0 + vacccq %v0, %v6, %v1, %v0 + vacq %v3, %v5, %v7, %v2 + vacccq %v2, %v5, %v7, %v2 + vpdi %v3, %v3, %v3, 4 + vst %v3, 16(idx, rp), 3 + + vag %v2, %v0, %v2 +L(1): vlgvg %r2, %v2, 1 + algr %r2, %r12 + lmg %r6, %r13, 48(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_64/z13/addmul_1.c b/gmp-6.3.0/mpn/s390_64/z13/addmul_1.c new file mode 100644 index 0000000..022e5ed --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/z13/addmul_1.c @@ -0,0 +1,358 @@ +/* Addmul_1 / mul_1 for IBM z13 and later + Contributed by Marius Hillenbrand + +Copyright 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "s390_64/z13/common-vec.h" + +#undef FUNCNAME + +#ifdef DO_INLINE +# ifdef OPERATION_addmul_1 +# define ADD +# define FUNCNAME inline_addmul_1 +# elif defined(OPERATION_mul_1) +# define FUNCNAME inline_mul_1 +# endif + +#else +# ifdef OPERATION_addmul_1 +# define ADD +# define FUNCNAME mpn_addmul_1 +# elif defined(OPERATION_mul_1) +# define FUNCNAME mpn_mul_1 +# endif +#endif + +#ifdef DO_INLINE +static inline mp_limb_t +FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb) + __attribute__ ((always_inline)); + +static inline +#endif +mp_limb_t +FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb) +{ + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_INCR_P(rp, s1p, n)); + + /* Combine 64x64 multiplication into GPR pairs (MLGR) with 128-bit adds in + VRs (using each VR as a single 128-bit accumulator). + The inner loop is unrolled to four limbs, with two blocks of four + multiplications each. Since the MLGR operation operates on even/odd GPR + pairs, pin the products appropriately. */ + + /* products as GPR pairs */ + register mp_limb_t p0_high asm("r0"); + register mp_limb_t p0_low asm("r1"); + + register mp_limb_t p1_high asm("r8"); + register mp_limb_t p1_low asm("r9"); + + register mp_limb_t p2_high asm("r6"); + register mp_limb_t p2_low asm("r7"); + + register mp_limb_t p3_high asm("r10"); + register mp_limb_t p3_low asm("r11"); + + /* carry flag for 128-bit add in VR for first carry chain */ + vec_t carry_vec0 = { .dw = vec_splat_u64 (0) }; + mp_limb_t carry_limb = 0; + +#ifdef ADD + /* 2nd carry flag for 2nd carry chain with addmul */ + vec_t carry_vec1 = { .dw = vec_splat_u64 (0) }; + vec_t sum0; + vec_t rp0_addend, rp1_addend; + rp0_addend.dw = vec_splat_u64 (0); + rp1_addend.dw = vec_splat_u64 (0); +#endif + vec_t sum1; + + vec_t carry_prod = { .dw = vec_splat_u64 (0) }; + + /* The scalar multiplications compete with pointer and index increments for + * issue ports. Thus, increment the loop index in the middle of the loop so + * that the operations for the next iteration's multiplications can be + * loaded in time (looks horrible, yet helps performance) and make sure we + * use addressing with base reg + index reg + immediate displacement + * (so that only the single index needs incrementing, instead of multiple + * pointers). */ +#undef LOOP_ADVANCE +#undef IDX_OFFSET + +#define LOOP_ADVANCE 4 * sizeof (mp_limb_t) +#define IDX_OFFSET (LOOP_ADVANCE) + register ssize_t idx = 0 - IDX_OFFSET; + + /* + * branch-on-count implicitly hint to the branch prediction as taken, while + * compare-and-branch hints as not taken. currently, using branch-on-count + * has a performance advantage, but it is not clear that it is generally the + * better choice (e.g., branch-on-count requires decrementing the separate + * counter). so, allow switching the loop condition to enable either + * category of branch instructions: + * - idx is less than an upper bound, for compare-and-branch + * - iteration counter greater than zero, for branch-on-count + */ +#define BRCTG +#ifdef BRCTG + ssize_t iterations = (size_t)n / 4; +#else + ssize_t const idx_bound = n * sizeof (mp_limb_t) - IDX_OFFSET; +#endif + + /* products will be transferred into VRs before adding up. + * see main loop below for comments on accumulation scheme. 
*/ + vec_t product0, product1, product2; + + product0.dw = vec_splat_u64 (0); + + switch ((size_t)n % 4) + { + case 0: + break; + + case 1: + idx = 1 * sizeof (mp_limb_t) - IDX_OFFSET; + + p3_low = s1p[0]; + s390_umul_ppmm (p3_high, p3_low, s2limb); + +#ifdef ADD + rp0_addend.dw[1] = rp[0]; + product0.dw[1] = p3_low; + + sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw); + carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0); + + rp[0] = sum0.dw[1]; +#else + rp[0] = p3_low; +#endif + + carry_limb = p3_high; + break; + + case 2: + p0_low = s1p[0]; + p3_low = s1p[1]; + idx = 2 * sizeof (mp_limb_t) - IDX_OFFSET; + + s390_double_umul_ppmm (p0_high, p0_low, p3_high, p3_low, s2limb); + + carry_prod.dw[0] = p3_low; + + product0.dw = vec_load_2di_as_pair (p0_high, p0_low); + + carry_limb = p3_high; + +#ifdef ADD + rp0_addend = vec_load_elements_reversed (rp, 0); + sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw); + carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw); + + sum1.sw = vec_add_u128 (sum0.sw, product0.sw); + carry_vec1.sw = vec_addc_u128 (sum0.sw, product0.sw); +#else + sum1.sw = vec_add_u128 (carry_prod.sw, product0.sw); + carry_vec0.sw = vec_addc_u128 (carry_prod.sw, product0.sw); +#endif + + vec_store_elements_reversed (rp, 0, sum1); + + break; + + case 3: + idx = 3 * sizeof (mp_limb_t) - IDX_OFFSET; + + p0_low = s1p[0]; + s390_umul_ppmm (p0_high, p0_low, s2limb); + +#ifdef ADD + rp0_addend.dw[1] = rp[0]; + product0.dw[1] = p0_low; + + sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw); + carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0); + + rp[0] = sum0.dw[1]; +#else + rp[0] = p0_low; +#endif + carry_limb = p0_high; + + p1_low = s1p[1]; + p3_low = s1p[2]; + + s390_double_umul_ppmm (p1_high, p1_low, p3_high, p3_low, s2limb); + + carry_prod.dw = vec_load_2di_as_pair (p3_low, carry_limb); + product1.dw = vec_load_2di_as_pair (p1_high, p1_low); + carry_limb = p3_high; + +#ifdef ADD + rp0_addend = vec_load_elements_reversed (rp, 8); + sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw); + carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw); + + sum1.sw = vec_adde_u128 (sum0.sw, product1.sw, carry_vec1.sw); + carry_vec1.sw = vec_addec_u128 (sum0.sw, product1.sw, carry_vec1.sw); +#else + sum1.sw = vec_adde_u128 (carry_prod.sw, product1.sw, carry_vec0.sw); + carry_vec0.sw + = vec_addec_u128 (carry_prod.sw, product1.sw, carry_vec0.sw); +#endif + vec_store_elements_reversed (rp, 8, sum1); + break; + } + +#ifdef BRCTG + for (; iterations > 0; iterations--) + { +#else + while (idx < idx_bound) + { +#endif + vec_t overlap_addend0; + vec_t overlap_addend1; + + /* The 64x64->128 MLGR multiplies two factors in GPRs and stores the + * result in a GPR pair. One of the factors is taken from the GPR pair + * and overwritten. + * To reuse factors, it turned out cheaper to load limbs multiple times + * than copying GPR contents. Enforce that and the use of addressing by + * base + index gpr + immediate displacement via inline asm. 
+ */ + ASM_LOADGPR (p0_low, s1p, idx, 0 + IDX_OFFSET); + ASM_LOADGPR (p1_low, s1p, idx, 8 + IDX_OFFSET); + ASM_LOADGPR (p2_low, s1p, idx, 16 + IDX_OFFSET); + ASM_LOADGPR (p3_low, s1p, idx, 24 + IDX_OFFSET); + + /* + * accumulate products as follows (for addmul): + * | rp[i+3] | rp[i+2] | rp[i+1] | rp[i] | + * p0_high | p0_low | + * p1_high | p1_low | carry-limb in + * p2_high | p2_low | + * c-limb out <- p3_high | p3_low | + * | < 128-bit VR > < 128-bit VR > + * + * < rp1_addend > < rp0_addend > + * carry-chain 0 <- + <- + <- carry_vec0[127] + * < product1 > < product0 > + * carry-chain 1 <- + <- + <- carry_vec1[127] + * < overlap_addend1 > < overlap_addend0 > + * + * note that a 128-bit add with carry in + out is built from two insns + * - vec_adde_u128 (vacq) provides sum + * - vec_addec_u128 (vacccq) provides the new carry bit + */ + + s390_double_umul_ppmm (p0_high, p0_low, p1_high, p1_low, s2limb); + + /* + * "barrier" to enforce scheduling loads for all limbs and first round + * of MLGR before anything else. + */ + asm volatile(""); + + product0.dw = vec_load_2di_as_pair (p0_high, p0_low); + +#ifdef ADD + rp0_addend = vec_load_elements_reversed_idx (rp, idx, 0 + IDX_OFFSET); + rp1_addend = vec_load_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET); +#endif + /* increment loop index to unblock dependant loads of limbs for the next + * iteration (see above at #define LOOP_ADVANCE) */ + idx += LOOP_ADVANCE; + + s390_double_umul_ppmm (p2_high, p2_low, p3_high, p3_low, s2limb); + + overlap_addend0.dw = vec_load_2di_as_pair (p1_low, carry_limb); + asm volatile(""); + +#ifdef ADD + sum0.sw = vec_adde_u128 (product0.sw, rp0_addend.sw, carry_vec0.sw); + sum1.sw = vec_adde_u128 (sum0.sw, overlap_addend0.sw, carry_vec1.sw); + + carry_vec0.sw + = vec_addec_u128 (product0.sw, rp0_addend.sw, carry_vec0.sw); + carry_vec1.sw + = vec_addec_u128 (sum0.sw, overlap_addend0.sw, carry_vec1.sw); +#else + sum1.sw = vec_adde_u128 (product0.sw, overlap_addend0.sw, carry_vec0.sw); + carry_vec0.sw + = vec_addec_u128 (product0.sw, overlap_addend0.sw, carry_vec0.sw); +#endif + + asm volatile(""); + product2.dw = vec_load_2di_as_pair (p2_high, p2_low); + overlap_addend1.dw = vec_load_2di_as_pair (p3_low, p1_high); + + vec_t sum4; + +#ifdef ADD + vec_t sum3; + sum3.sw = vec_adde_u128 (product2.sw, rp1_addend.sw, carry_vec0.sw); + sum4.sw = vec_adde_u128 (sum3.sw, overlap_addend1.sw, carry_vec1.sw); + + carry_vec0.sw + = vec_addec_u128 (product2.sw, rp1_addend.sw, carry_vec0.sw); + carry_vec1.sw + = vec_addec_u128 (sum3.sw, overlap_addend1.sw, carry_vec1.sw); +#else + sum4.sw = vec_adde_u128 (product2.sw, overlap_addend1.sw, carry_vec0.sw); + carry_vec0.sw + = vec_addec_u128 (product2.sw, overlap_addend1.sw, carry_vec0.sw); +#endif + vec_store_elements_reversed_idx (rp, idx, IDX_OFFSET - LOOP_ADVANCE, + sum1); + vec_store_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET - LOOP_ADVANCE, + sum4); + + carry_limb = p3_high; + } + +#ifdef ADD + carry_vec0.dw += carry_vec1.dw; + carry_limb += carry_vec0.dw[1]; +#else + carry_limb += carry_vec0.dw[1]; +#endif + + return carry_limb; +} + +#undef OPERATION_addmul_1 +#undef OPERATION_mul_1 +#undef FUNCNAME +#undef ADD diff --git a/gmp-6.3.0/mpn/s390_64/z13/aormul_2.c b/gmp-6.3.0/mpn/s390_64/z13/aormul_2.c new file mode 100644 index 0000000..9a69fc3 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/z13/aormul_2.c @@ -0,0 +1,476 @@ +/* Addmul_2 / mul_2 for IBM z13 or later + +Copyright 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#include "s390_64/z13/common-vec.h" + +#undef FUNCNAME + +#ifdef DO_INLINE +# ifdef OPERATION_addmul_2 +# define ADD +# define FUNCNAME inline_addmul_2 +# elif defined(OPERATION_mul_2) +# define FUNCNAME inline_mul_2 +# else +# error Missing define for operation to perform +# endif +#else +# ifdef OPERATION_addmul_2 +# define ADD +# define FUNCNAME mpn_addmul_2 +# elif defined(OPERATION_mul_2) +# define FUNCNAME mpn_mul_2 +# else +# error Missing define for operation to perform +# endif +#endif + +#ifdef DO_INLINE +static inline mp_limb_t +FUNCNAME (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, const mp_limb_t *vp) + __attribute__ ((always_inline)); + +static inline +#endif +mp_limb_t +FUNCNAME (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, + const mp_limb_t *vp) +{ + + /* Combine 64x64 multiplication into GPR pairs (MLGR) with 128-bit adds in + VRs (using each VR as a single 128-bit accumulator). + The inner loop is unrolled to four limbs, with two blocks of four + multiplications each. Since the MLGR operation operates on even/odd GPR + pairs, pin the products appropriately. */ + + register mp_limb_t p0_high asm("r0"); + register mp_limb_t p0_low asm("r1"); + + register mp_limb_t p1_high asm("r8"); + register mp_limb_t p1_low asm("r9"); + + register mp_limb_t p2_high asm("r6"); + register mp_limb_t p2_low asm("r7"); + + register mp_limb_t p3_high asm("r10"); + register mp_limb_t p3_low asm("r11"); + + vec_t carry_prod = { .dw = vec_splat_u64 (0) }; + vec_t zero = { .dw = vec_splat_u64 (0) }; + + /* two carry-bits for the 128-bit VR adds - stored in VRs */ +#ifdef ADD + vec_t carry_vec0 = { .dw = vec_splat_u64 (0) }; +#endif + vec_t carry_vec1 = { .dw = vec_splat_u64 (0) }; + + vec_t tmp; + + vec_t sum0, sum1; + + /* products transferred into VRs for accumulating there */ + vec_t pv0, pv3; + vec_t pv1_low, pv1_high, pv2_low, pv2_high; + vec_t low, middle, high; +#ifdef ADD + vec_t rp0, rp1; +#endif + + register mp_limb_t v0 asm("r12"); + register mp_limb_t v1 asm("r5"); + v0 = vp[0]; + v1 = vp[1]; + + /* The scalar multiplications compete with pointer and index increments for + * issue ports. Thus, increment the loop index in the middle of the loop so + * that the operations for the next iteration's multiplications can be + * loaded in time (looks horrible, yet helps performance) and make sure we + * use addressing with base reg + index reg + immediate displacement + * (so that only the single index needs incrementing, instead of multiple + * pointers). 
*/ +#undef LOOP_ADVANCE +#define LOOP_ADVANCE (4 * sizeof (mp_limb_t)) +#define IDX_OFFSET (LOOP_ADVANCE) + + register ssize_t idx = 0 - IDX_OFFSET; +#ifdef BRCTG + ssize_t iterations = (size_t)n / 4; +#else + ssize_t const idx_bound = n * sizeof (mp_limb_t) - IDX_OFFSET; +#endif + + /* + * To minimize latency in the carry chain, accumulate in VRs with 128-bit + * adds with carry in and out. As a downside, these require two insns for + * each add - one to calculate the sum, one to deliver the carry out. + * To reduce the overall number of insns to execute, combine adding up + * product limbs such that there cannot be a carry out and one (for mul) or + * two (for addmul) adds with carry chains. + * + * Since (2^64-1) * (2^64-1) = (2^128-1) - 2 * (2^64-1), we can add two + * limbs into each 128-bit product without causing carry out. + * + * For each block of 2 limbs * 2 limbs + * + * | u[i] * v[0] (p2) | + * | u[i] * v[1] (p0) | + * | u[i+1] * v[0](p1) | + * | u[i+1] * v[1](p3) | + * < 128 bits > < 128 bits > + * + * we can begin accumulating with "simple" carry-oblivious 128-bit adds: + * - p0 + low limb of p1 + * + high limb of p2 + * and combine resulting low limb with p2's low limb + * - p3 + high limb of p1 + * + high limb of sum above + * ... which will will result in two 128-bit limbs to be fed into the carry + * chain(s). + * Overall, that scheme saves instructions and improves performance, despite + * slightly increasing latency between multiplications and carry chain (yet + * not in the carry chain). + */ + +#define LOAD_LOW_LIMB(VEC, LIMB) \ + do \ + { \ + asm("vzero\t%[vec]\n\t" \ + "vlvgg\t%[vec],%[limb],1" \ + : [vec] "=v"(VEC) \ + : [limb] "r"(LIMB)); \ + } \ + while (0) + + /* for the 128-bit adds in the carry chain, to calculate a + b + carry-in we + * need paired vec_adde_u128 (delivers sum) and vec_addec_u128 (delivers new + * carry) */ +#define ADD_UP2_CARRY_INOUT(SUMIDX, CARRYIDX, ADDEND1, ADDEND2) \ + do \ + { \ + sum##SUMIDX.sw \ + = vec_adde_u128 (ADDEND1.sw, ADDEND2.sw, carry_vec##CARRYIDX.sw); \ + carry_vec##CARRYIDX.sw \ + = vec_addec_u128 (ADDEND1.sw, ADDEND2.sw, carry_vec##CARRYIDX.sw); \ + } \ + while (0) + +#define ADD_UP_CARRY_INOUT(SUMIDX, ADDEND1, ADDEND2) \ + ADD_UP2_CARRY_INOUT (SUMIDX, SUMIDX, ADDEND1, ADDEND2) + + /* variant without carry-in for prologue */ +#define ADD_UP2_CARRY_OUT(SUMIDX, CARRYIDX, ADDEND1, ADDEND2) \ + do \ + { \ + sum##SUMIDX.sw = vec_add_u128 (ADDEND1.sw, ADDEND2.sw); \ + carry_vec##CARRYIDX.sw = vec_addc_u128 (ADDEND1.sw, ADDEND2.sw); \ + } \ + while (0) + +#define ADD_UP_CARRY_OUT(SUMIDX, ADDEND1, ADDEND2) \ + ADD_UP2_CARRY_OUT (SUMIDX, SUMIDX, ADDEND1, ADDEND2) + + /* prologue for 4x-unrolled main loop */ + switch ((size_t)n % 4) + { + case 1: + ASM_LOADGPR_BASE (p0_low, up, 0); + ASM_LOADGPR_BASE (p1_low, up, 0); + s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v0, v1); + carry_prod.dw = vec_load_2di_as_pair (p1_high, p1_low); + +/* gcc tries to be too clever and vlr from a reg that is already zero. vzero is + * cheaper. 
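+ * (vlvgg replaces only doubleword element 1 of the VR; the preceding vzero
+ * is what guarantees the register reads as the 128-bit value 0:LIMB.)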
*/ +# define NEW_CARRY(VEC, LIMB) \ + do \ + { \ + asm("vzero\t%[vec]\n\t" \ + "vlvgg\t%[vec],%[limb],1" \ + : [vec] "=v"(VEC) \ + : [limb] "r"(LIMB)); \ + } \ + while (0) + + NEW_CARRY (tmp, p0_high); + + carry_prod.sw = vec_add_u128 (carry_prod.sw, tmp.sw); +#ifdef ADD + carry_vec1.dw[1] = __builtin_add_overflow (rp[0], p0_low, rp); +#else + rp[0] = p0_low; +#endif + idx += sizeof (mp_limb_t); + break; + + case 2: + ASM_LOADGPR_BASE (p0_low, up, 0); + ASM_LOADGPR_BASE (p1_low, up, 8); + ASM_LOADGPR_BASE (p2_low, up, 0); + ASM_LOADGPR_BASE (p3_low, up, 8); + + asm("" + : "=r"(p0_low), "=r"(p2_low) + : "r"(p3_low), "0"(p0_low), "r"(p1_low), "1"(p2_low)); + s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0); + s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1); + + pv0.dw = vec_load_2di_as_pair (p0_high, p0_low); + LOAD_LOW_LIMB (pv1_low, p1_low); + LOAD_LOW_LIMB (pv1_high, p1_high); + pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw); + LOAD_LOW_LIMB (pv2_high, p2_high); + pv3.dw = vec_load_2di_as_pair (p3_high, p3_low); + LOAD_LOW_LIMB (pv2_low, p2_low); + pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw); + middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw); + low.dw = vec_permi (middle.dw, pv2_low.dw, 3); + middle.dw = vec_permi (zero.dw, middle.dw, 0); + high.sw = vec_add_u128 (middle.sw, pv3.sw); +#ifdef ADD + rp0 = vec_load_elements_reversed (rp, 0); + ADD_UP_CARRY_OUT (0, rp0, carry_prod); +#else + sum0 = carry_prod; +#endif + ADD_UP_CARRY_OUT (1, sum0, low); + vec_store_elements_reversed (rp, 0, sum1); + carry_prod = high; + + idx += 2 * sizeof (mp_limb_t); + break; + + case 3: + ASM_LOADGPR_BASE (p0_low, up, 0); + ASM_LOADGPR_BASE (p1_low, up, 0); + s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v0, v1); + carry_prod.dw = vec_load_2di_as_pair (p1_high, p1_low); + NEW_CARRY (tmp, p0_high); + carry_prod.sw = vec_add_u128 (carry_prod.sw, tmp.sw); + +#ifdef ADD + carry_vec1.dw[1] = __builtin_add_overflow (rp[0], p0_low, rp); +#else + rp[0] = p0_low; +#endif + + ASM_LOADGPR_BASE (p0_low, up, 8); + ASM_LOADGPR_BASE (p1_low, up, 16); + ASM_LOADGPR_BASE (p2_low, up, 8); + ASM_LOADGPR_BASE (p3_low, up, 16); + + asm("" + : "=r"(p0_low), "=r"(p2_low) + : "r"(p3_low), "0"(p0_low), "r"(p1_low), "1"(p2_low)); + s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0); + s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1); + + pv0.dw = vec_load_2di_as_pair (p0_high, p0_low); + + LOAD_LOW_LIMB (pv1_low, p1_low); + LOAD_LOW_LIMB (pv1_high, p1_high); + + pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw); + LOAD_LOW_LIMB (pv2_high, p2_high); + pv3.dw = vec_load_2di_as_pair (p3_high, p3_low); + + LOAD_LOW_LIMB (pv2_low, p2_low); + + pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw); + middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw); + + low.dw = vec_permi (middle.dw, pv2_low.dw, 3); + middle.dw = vec_permi (zero.dw, middle.dw, 0); + high.sw = vec_add_u128 (middle.sw, pv3.sw); + +#ifdef ADD + vec_t rp0 = vec_load_elements_reversed (rp, 8); + ADD_UP_CARRY_OUT (0, rp0, carry_prod); +#else + sum0 = carry_prod; +#endif + ADD_UP_CARRY_INOUT (1, sum0, low); + + vec_store_elements_reversed (rp, 8, sum1); + + carry_prod = high; + + idx += 3 * sizeof (mp_limb_t); + break; + } + + /* + * branch-on-count implicitly hint to the branch prediction as taken, while + * compare-and-branch hints as not taken. 
currently, using branch-on-count + * has a performance advantage, but it is not clear that it is generally + * the better choice (e.g., branch-on-count requires decrementing the + * separate counter). so, allow switching the loop condition to enable + * either category of branch instructions: + * - idx is less than an upper bound, for compare-and-branch + * - iteration counter greater than zero, for branch-on-count + */ +#ifdef BRCTG + for (; iterations > 0; iterations--) + { +#else + while (idx < idx_bound) + { +#endif + /* The 64x64->128 MLGR multiplies two factors in GPRs and stores the + * result in a GPR pair. One of the factors is taken from the GPR pair + * and overwritten. + * To reuse factors, it turned out cheaper to load limbs multiple times + * than copying GPR contents. Enforce that and the use of addressing by + * base + index gpr + immediate displacement via inline asm. + */ + ASM_LOADGPR (p0_low, up, idx, 0 + IDX_OFFSET); + ASM_LOADGPR (p1_low, up, idx, 8 + IDX_OFFSET); + ASM_LOADGPR (p2_low, up, idx, 0 + IDX_OFFSET); + ASM_LOADGPR (p3_low, up, idx, 8 + IDX_OFFSET); + + s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0); + + pv0.dw = vec_load_2di_as_pair (p0_high, p0_low); + + LOAD_LOW_LIMB (pv1_low, p1_low); + LOAD_LOW_LIMB (pv1_high, p1_high); + + s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1); + + pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw); + LOAD_LOW_LIMB (pv2_high, p2_high); + pv3.dw = vec_load_2di_as_pair (p3_high, p3_low); + + LOAD_LOW_LIMB (pv2_low, p2_low); + + ASM_LOADGPR (p0_low, up, idx, 16 + IDX_OFFSET); + ASM_LOADGPR (p1_low, up, idx, 24 + IDX_OFFSET); + ASM_LOADGPR (p2_low, up, idx, 16 + IDX_OFFSET); + ASM_LOADGPR (p3_low, up, idx, 24 + IDX_OFFSET); + + idx += LOOP_ADVANCE; + + /* + * "barrier" to enforce scheduling the index increment before the second + * block of multiplications. not required for clang. + */ +#ifndef __clang__ + asm("" + : "=r"(idx), "=r"(p0_high), "=r"(p2_high) + : "0"(idx), "1"(p0_high), "2"(p2_high)); +#endif + + s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0); + s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1); + + /* + * "barrier" to enforce scheduling all MLGRs first, before any adding + * up. note that clang produces better code without. 
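+ *
+ * (the asm body is empty and emits no instruction; tying the outputs to
+ * the same registers as the inputs only adds dependency edges that the
+ * compiler's scheduler has to respect.)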
+ */ +#ifndef __clang__ + asm("" + : "=v"(pv0.sw), "=v"(pv3.sw) + : "1"(pv3.sw), "0"(pv0.sw), "r"(p0_high), "r"(p2_high)); +#endif + + pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw); + middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw); + + low.dw = vec_permi (middle.dw, pv2_low.dw, + 3); /* least-significant doubleword from both vectors */ + middle.dw = vec_permi (zero.dw, middle.dw, 0); + high.sw = vec_add_u128 (middle.sw, pv3.sw); + +#ifdef ADD + rp0 = vec_load_elements_reversed_idx (rp, idx, + 0 + IDX_OFFSET - LOOP_ADVANCE); + ADD_UP_CARRY_INOUT (0, rp0, carry_prod); +#else + sum0 = carry_prod; +#endif + ADD_UP_CARRY_INOUT (1, sum0, low); + + vec_store_elements_reversed_idx (rp, idx, 0 + IDX_OFFSET - LOOP_ADVANCE, + sum1); + + carry_prod = high; + + vec_t pv0_2, pv3_2; + vec_t pv1_low_2, pv1_high_2, pv2_low_2, pv2_high_2; + vec_t low_2, middle_2, high_2; + vec_t sum2, sum3; + + pv0_2.dw = vec_load_2di_as_pair (p0_high, p0_low); + LOAD_LOW_LIMB (pv1_low_2, p1_low); + LOAD_LOW_LIMB (pv1_high_2, p1_high); + + pv0_2.sw = vec_add_u128 (pv0_2.sw, pv1_low_2.sw); + LOAD_LOW_LIMB (pv2_high_2, p2_high); + pv3_2.dw = vec_load_2di_as_pair (p3_high, p3_low); + pv3_2.sw = vec_add_u128 (pv3_2.sw, pv1_high_2.sw); + middle_2.sw = vec_add_u128 (pv0_2.sw, pv2_high_2.sw); + + LOAD_LOW_LIMB (pv2_low_2, p2_low); + low_2.dw + = vec_permi (middle_2.dw, pv2_low_2.dw, + 3); /* least-significant doubleword from both vectors */ + middle_2.dw = vec_permi (zero.dw, middle_2.dw, 0); + high_2.sw = vec_add_u128 (middle_2.sw, pv3_2.sw); + + /* + * another "barrier" to influence scheduling. (also helps in clang) + */ + asm("" : : "v"(pv0_2.sw), "r"(p2_high), "r"(p3_high), "v"(pv3_2.sw)); + +#ifdef ADD + rp1 = vec_load_elements_reversed_idx (rp, idx, + 16 + IDX_OFFSET - LOOP_ADVANCE); + ADD_UP2_CARRY_INOUT (2, 0, rp1, carry_prod); +#else + sum2 = carry_prod; +#endif + ADD_UP2_CARRY_INOUT (3, 1, sum2, low_2); + + vec_store_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET - LOOP_ADVANCE, + sum3); + + carry_prod = high_2; + } + +#ifdef ADD + sum0.sw = vec_adde_u128 (carry_prod.sw, carry_vec0.sw, carry_vec1.sw); +#else + sum0.sw = vec_add_u128 (carry_prod.sw, carry_vec1.sw); +#endif + + *(mp_ptr) (((char *)rp) + idx + 0 + IDX_OFFSET) = (mp_limb_t)sum0.dw[1]; + + return (mp_limb_t)sum0.dw[0]; +} diff --git a/gmp-6.3.0/mpn/s390_64/z13/common-vec.h b/gmp-6.3.0/mpn/s390_64/z13/common-vec.h new file mode 100644 index 0000000..a59e6ee --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/z13/common-vec.h @@ -0,0 +1,175 @@ +/* Common vector helpers and macros for IBM z13 and later + +Copyright 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. 
If not,
+see https://www.gnu.org/licenses/. */
+
+#ifndef __S390_64_Z13_COMMON_VEC_H
+#define __S390_64_Z13_COMMON_VEC_H
+
+#include <unistd.h>
+#include <vecintrin.h>
+
+/*
+ * Vector intrinsics use vector element types that kind-of make sense for the
+ * specific operation (e.g., vec_permi permutes doublewords). To use VRs
+ * interchangeably with different intrinsics, typedef the two variants and wrap
+ * them in a union.
+ */
+#define VLEN_BYTES 16
+typedef unsigned long long v2di __attribute__ ((vector_size (VLEN_BYTES)));
+typedef unsigned char v16qi __attribute__ ((vector_size (VLEN_BYTES)));
+
+/*
+ * The Z vector intrinsics use vectors with different element types (e.g.,
+ * v16qi for the 128-bit adds and v2di for vec_permi).
+ */
+union vec
+{
+  v2di dw;
+  v16qi sw;
+};
+
+typedef union vec vec_t;
+
+/*
+ * single-instruction combine of two GPRs into a VR
+ */
+static inline v2di
+vec_load_2di_as_pair (unsigned long a, unsigned long b)
+{
+  v2di res;
+  __asm__("vlvgp\t%0,%1,%2" : "=v"(res) : "r"(a), "r"(b));
+  return res;
+}
+
+/*
+ * 64x64 mult where caller needs to care about proper register allocation:
+ * multiply xl with m1, treating both as unsigned, and place the result in
+ * xh:xl.
+ * mlgr operates on register pairs, so xh must be an even gpr followed by xl
+ */
+#define s390_umul_ppmm(xh, xl, m1)                                           \
+  do                                                                         \
+    {                                                                        \
+      asm("mlgr\t%0,%3" : "=r"(xh), "=r"(xl) : "%1"(xl), "r"(m1));           \
+    }                                                                        \
+  while (0);
+
+/*
+ * two 64x64 multiplications, scheduled so that they will dispatch and issue to
+ * different sides: each mlgr is dispatched alone in an instruction group and
+ * subsequent groups will issue on different execution sides.
+ * there is a variant where both products use the same multiplicand and one
+ * that uses two different multiplicands. constraints from s390_umul_ppmm apply
+ * here.
+ */
+#define s390_double_umul_ppmm(X0H, X0L, X1H, X1L, MX)                        \
+  do                                                                         \
+    {                                                                        \
+      asm("mlgr\t%[x0h],%[mx]\n\t"                                           \
+          "mlgr\t%[x1h],%[mx]"                                               \
+          : [x0h] "=&r"(X0H), [x0l] "=&r"(X0L), [x1h] "=r"(X1H),             \
+            [x1l] "=r"(X1L)                                                  \
+          : "[x0l]"(X0L), "[x1l]"(X1L), [mx] "r"(MX));                       \
+    }                                                                        \
+  while (0);
+
+#define s390_double_umul_ppmm_distinct(X0H, X0L, X1H, X1L, MX0, MX1)         \
+  do                                                                         \
+    {                                                                        \
+      asm("mlgr\t%[x0h],%[mx0]\n\t"                                          \
+          "mlgr\t%[x1h],%[mx1]"                                              \
+          : [x0h] "=&r"(X0H), [x0l] "=&r"(X0L), [x1h] "=r"(X1H),             \
+            [x1l] "=r"(X1L)                                                  \
+          : "[x0l]"(X0L), "[x1l]"(X1L), [mx0] "r"(MX0), [mx1] "r"(MX1));     \
+    }                                                                        \
+  while (0);
+
+#define ASM_LOADGPR_BASE(DST, BASE, OFFSET)                                  \
+  asm volatile("lg\t%[r],%[off](%[b])"                                       \
+               : [r] "=r"(DST)                                               \
+               : [b] "a"(BASE), [off] "L"(OFFSET)                            \
+               : "memory");
+
+#define ASM_LOADGPR(DST, BASE, INDEX, OFFSET)                                \
+  asm volatile("lg\t%[r],%[off](%[b],%[x])"                                  \
+               : [r] "=r"(DST)                                               \
+               : [b] "a"(BASE), [x] "a"(INDEX), [off] "L"(OFFSET)            \
+               : "memory");
+
+/*
+ * Load a vector register from memory and swap the two 64-bit doubleword
+ * elements.
+ */
+static inline vec_t
+vec_load_elements_reversed_idx (mp_limb_t const *base, ssize_t const index,
+                                ssize_t const offset)
+{
+  vec_t res;
+  char *ptr = (char *)base;
+
+  res.sw = *(v16qi *)(ptr + index + offset);
+  res.dw = vec_permi (res.dw, res.dw, 2);
+
+  return res;
+}
+
+static inline vec_t
+vec_load_elements_reversed (mp_limb_t const *base, ssize_t const offset)
+{
+  return vec_load_elements_reversed_idx (base, 0, offset);
+}
+
+/*
+ * Store a vector register to memory and swap the two 64-bit doubleword
+ * elements.
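+ * This pairs with the swap done on load: in memory, limbs are stored
+ * least-significant first, while in the VR they form a single 128-bit
+ * value with dw[0] the high and dw[1] the low doubleword.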
+ */ +static inline void +vec_store_elements_reversed_idx (mp_limb_t *base, ssize_t const index, + ssize_t const offset, vec_t vec) +{ + char *ptr = (char *)base; + + vec.dw = vec_permi (vec.dw, vec.dw, 2); + *(v16qi *)(ptr + index + offset) = vec.sw; +} + +static inline void +vec_store_elements_reversed (mp_limb_t *base, ssize_t const offset, vec_t vec) +{ + vec_store_elements_reversed_idx (base, 0, offset, vec); +} + +#define ASM_VZERO(VEC) \ + do \ + { \ + asm("vzero\t%[vec]" : [vec] "=v"(VEC)); \ + } \ + while (0) + +#endif diff --git a/gmp-6.3.0/mpn/s390_64/z13/gmp-mparam.h b/gmp-6.3.0/mpn/s390_64/z13/gmp-mparam.h new file mode 100644 index 0000000..50e7f39 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/z13/gmp-mparam.h @@ -0,0 +1,162 @@ +/* S/390-64 for IBM z13 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +#define HAVE_NATIVE_mpn_addmul_2 1 +#define HAVE_NATIVE_mpn_mul_2 1 + +/* Generated by tuneup.c, 2021-07-30, gcc 10.2 */ + +#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 17 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 15 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 5 +#define USE_PREINV_DIVREM_1 0 +#define DIV_QR_1N_PI1_METHOD 3 +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 996 +#define DIVEXACT_1_THRESHOLD 4 +#define BMOD_1_TO_MOD_1_THRESHOLD 0 /* always */ + +#define DIV_1_VS_MUL_1_PERCENT 404 + +#define MUL_TOOM22_THRESHOLD 23 +#define MUL_TOOM33_THRESHOLD 94 +#define MUL_TOOM44_THRESHOLD 166 +#define MUL_TOOM6H_THRESHOLD 286 +#define MUL_TOOM8H_THRESHOLD 626 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 113 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 143 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 145 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 12 +#define SQR_TOOM3_THRESHOLD 84 +#define SQR_TOOM4_THRESHOLD 234 +#define SQR_TOOM6_THRESHOLD 318 +#define SQR_TOOM8_THRESHOLD 478 + +#define MULMID_TOOM42_THRESHOLD 42 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 7 + +#define MUL_FFT_MODF_THRESHOLD 332 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 332, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 47,11}, { 2048,12}, { 4096,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 47 +#define MUL_FFT_THRESHOLD 2752 + +#define SQR_FFT_MODF_THRESHOLD 240 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 240, 5}, { 8, 4}, { 17, 5}, { 13, 6}, \ + { 7, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ + { 9, 5}, { 19, 6}, { 15, 7}, { 8, 6}, \ + { 17, 7}, { 9, 6}, { 19, 7}, { 10, 6}, \ + { 21, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \ + { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ + { 21, 9}, { 11, 8}, { 23, 9}, { 15, 8}, \ + { 31, 9}, { 19, 8}, { 39, 9}, { 23,10}, \ + { 15, 9}, { 39,10}, { 23,11}, { 15,10}, \ + { 31, 9}, { 63,10}, { 47,11}, { 2048,12}, \ + { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 52 +#define SQR_FFT_THRESHOLD 1856 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 25 +#define MULLO_MUL_N_THRESHOLD 5397 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 396 +#define SQRLO_SQR_THRESHOLD 3704 + +#define DC_DIV_QR_THRESHOLD 15 +#define DC_DIVAPPR_Q_THRESHOLD 50 +#define DC_BDIV_QR_THRESHOLD 66 +#define 
DC_BDIV_Q_THRESHOLD 202 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define INV_NEWTON_THRESHOLD 29 +#define INV_APPR_THRESHOLD 13 + +#define BINV_NEWTON_THRESHOLD 312 +#define REDC_1_TO_REDC_2_THRESHOLD 79 +#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ + +#define MU_DIV_QR_THRESHOLD 979 +#define MU_DIVAPPR_Q_THRESHOLD 979 +#define MUPI_DIV_QR_THRESHOLD 13 +#define MU_BDIV_QR_THRESHOLD 942 +#define MU_BDIV_Q_THRESHOLD 1367 + +#define POWM_SEC_TABLE 3,19,215,1730 + +#define GET_STR_DC_THRESHOLD 10 +#define GET_STR_PRECOMPUTE_THRESHOLD 15 +#define SET_STR_DC_THRESHOLD 882 +#define SET_STR_PRECOMPUTE_THRESHOLD 2520 + +#define FAC_DSC_THRESHOLD 228 +#define FAC_ODD_THRESHOLD 24 + +#define MATRIX22_STRASSEN_THRESHOLD 19 +#define HGCD2_DIV1_METHOD 1 +#define HGCD_THRESHOLD 61 +#define HGCD_APPR_THRESHOLD 51 +#define HGCD_REDUCE_THRESHOLD 1962 +#define GCD_DC_THRESHOLD 217 +#define GCDEXT_DC_THRESHOLD 263 +#define JACOBI_BASE_METHOD 4 + diff --git a/gmp-6.3.0/mpn/s390_64/z13/hamdist.asm b/gmp-6.3.0/mpn/s390_64/z13/hamdist.asm new file mode 100644 index 0000000..81c5174 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/z13/hamdist.asm @@ -0,0 +1,76 @@ +dnl S/390-64 mpn_hamdist + +dnl Copyright 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 - +C z990 - +C z9 - +C z10 - +C z196 - +C z12 ? +C z13 ? +C z14 ? +C z15 ? + +define(`ap', `%r2') +define(`bp', `%r3') +define(`n', `%r4') + +ASM_START() +PROLOGUE(mpn_hamdist) + vzero %v30 + tmll n, 1 + srlg n, n, 1 + je L(top) + +L(odd): vllezg %v16, 0(ap) + vllezg %v17, 0(bp) + vx %v16, %v16, %v17 + vpopct %v30, %v16, 3 + la ap, 8(ap) + la bp, 8(bp) + clgije n, 0, L(end) + +L(top): vl %v16, 0(ap), 3 + vl %v17, 0(bp), 3 + vx %v16, %v16, %v17 + vpopct %v20, %v16, 3 + vag %v30, %v30, %v20 + la ap, 16(ap) + la bp, 16(bp) + brctg n, L(top) + +L(end): vzero %v29 + vsumqg %v30, %v30, %v29 + vlgvg %r2, %v30, 1(%r0) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_64/z13/mul_1.asm b/gmp-6.3.0/mpn/s390_64/z13/mul_1.asm new file mode 100644 index 0000000..04eb718 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/z13/mul_1.asm @@ -0,0 +1,149 @@ +dnl S/390-64 mpn_mul_1 and mpn_mul_1c. +dnl Based on C code contributed by Marius Hillenbrand. + +dnl Copyright 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +dnl TODO +dnl * Schedule vlvgp away from mlgr; that saves 20% of the run time. +dnl * Perhaps use vp[0]/vp[1] in innerloop instead preloading v0/v1. + +C cycles/limb +C z900 - +C z990 - +C z9 - +C z10 - +C z196 - +C z12 ? +C z13 ? +C z14 ? +C z15 2.25 + + +define(`rp', `%r2') +define(`ap', `%r3') +define(`an', `%r4') +define(`b0', `%r5') +define(`cy', `%r6') + +define(`idx', `%r4') + +ASM_START() + +PROLOGUE(mpn_mul_1c) + stmg %r6, %r13, 48(%r15) + j L(ent) +EPILOGUE() + +PROLOGUE(mpn_mul_1) + stmg %r6, %r13, 48(%r15) + lghi %r6, 0 +L(ent): vzero %v2 + srlg %r11, an, 2 + + tmll an, 1 + je L(bx0) +L(bx1): tmll an, 2 + jne L(b11) + +L(b01): lghi idx, -24 + lg %r13, 0(ap) + mlgr %r12, b0 + algr %r13, %r6 + lghi %r6, 0 + alcgr %r12, %r6 + stg %r13, 0(rp) + cgije %r11, 0, L(1) + j L(cj0) + +L(b11): lghi idx, -8 + lg %r9, 0(ap) + mlgr %r8, b0 + algr %r9, %r6 + lghi %r6, 0 + alcgr %r8, %r6 + stg %r9, 0(rp) + j L(cj1) + +L(bx0): tmll an, 2 + jne L(b10) +L(b00): lghi idx, -32 + lgr %r12, %r6 +L(cj0): lg %r1, 32(idx, ap) + lg %r9, 40(idx, ap) + mlgr %r0, b0 + mlgr %r8, b0 + vlvgp %v6, %r0, %r1 + vlvgp %v7, %r9, %r12 + j L(mid) + +L(b10): lghi idx, -16 + lgr %r8, %r6 +L(cj1): lg %r7, 16(idx, ap) + lg %r13, 24(idx, ap) + mlgr %r6, b0 + mlgr %r12, b0 + vlvgp %v6, %r6, %r7 + vlvgp %v7, %r13, %r8 + cgije %r11, 0, L(end) + +L(top): lg %r1, 32(idx, ap) + lg %r9, 40(idx, ap) + mlgr %r0, b0 + mlgr %r8, b0 + vacq %v3, %v6, %v7, %v2 + vacccq %v2, %v6, %v7, %v2 + vpdi %v3, %v3, %v3, 4 + vst %v3, 16(idx, rp), 3 + vlvgp %v6, %r0, %r1 + vlvgp %v7, %r9, %r12 +L(mid): lg %r7, 48(idx, ap) + lg %r13, 56(idx, ap) + mlgr %r6, b0 + mlgr %r12, b0 + vacq %v1, %v6, %v7, %v2 + vacccq %v2, %v6, %v7, %v2 + vpdi %v1, %v1, %v1, 4 + vst %v1, 32(idx, rp), 3 + vlvgp %v6, %r6, %r7 + vlvgp %v7, %r13, %r8 + la idx, 32(idx) + brctg %r11, L(top) + +L(end): vacq %v3, %v6, %v7, %v2 + vacccq %v2, %v6, %v7, %v2 + vpdi %v3, %v3, %v3, 4 + vst %v3, 16(idx, rp), 3 + +L(1): vlgvg %r2, %v2, 1 + agr %r2, %r12 + lmg %r6, %r13, 48(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_64/z13/mul_1.c b/gmp-6.3.0/mpn/s390_64/z13/mul_1.c new file mode 100644 index 0000000..7584dc8 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/z13/mul_1.c @@ -0,0 +1,31 @@ +/* mul_1 for IBM z13 or later + +Copyright 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "s390_64/z13/addmul_1.c" diff --git a/gmp-6.3.0/mpn/s390_64/z13/mul_2.asm b/gmp-6.3.0/mpn/s390_64/z13/mul_2.asm new file mode 100644 index 0000000..ec61201 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/z13/mul_2.asm @@ -0,0 +1,121 @@ +dnl S/390-64 mpn_mul_2 + +dnl Copyright 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 - +C z990 - +C z9 - +C z10 ? +C z196 ? +C z12 ? +C z13 ? +C z14 ? 
+C z15		 2.8
+
+
+define(`rp', `%r2')
+define(`up', `%r3')
+define(`un', `%r4')
+define(`vp', `%r5')
+
+define(`idx', `%r12')
+define(`v0', `%r11')
+define(`v1', `%r5')
+
+ASM_START()
+PROLOGUE(mpn_mul_2)
+	stmg	%r6, %r12, 48(%r15)
+
+	vzero	%v27
+	vzero	%v28
+	vzero	%v29
+	vzero	%v30
+	lghi	%r10, 0
+	lg	v0, 0(vp)
+	lg	v1, 8(vp)
+	tmll	un, 1
+	srlg	un, un, 1
+	je	L(evn)
+
+L(odd):	lg	%r7, 0(up)
+	mlgr	%r6, v0		C W2 W1
+	lg	%r1, 0(up)
+	stg	%r7, 0(rp)
+	lghi	idx, 8
+dnl	clgije	un, 0, L(end)
+	j	L(top)
+
+L(evn):	lghi	%r6, 0
+	lghi	idx, 0
+	lghi	%r1, 0
+
+L(top):	lg	%r9, 0(idx, up)
+	mlgr	%r0, v1		C W2 W1
+	mlgr	%r8, v1		C W3 W2
+	vlvgp	%v22, %r0, %r1	C W2 W1
+	vlvgp	%v23, %r9, %r6	C W2 W1
+	lg	%r1, 0(idx, up)
+	lg	%r7, 8(idx, up)
+	mlgr	%r0, v0		C W2 W1
+	mlgr	%r6, v0		C W3 W2
+	vlvgp	%v20, %r0, %r1	C W2 W1
+	vlvgp	%v21, %r7, %r10	C W2 W1
+	vacq	%v24, %v22, %v23, %v27	C
+	vacccq	%v27, %v22, %v23, %v27	C carry critical path 1
+	vacq	%v23, %v24, %v20, %v28	C
+	vacccq	%v28, %v24, %v20, %v28	C carry critical path 2
+	vacq	%v20, %v23, %v21, %v29	C
+	vacccq	%v29, %v23, %v21, %v29	C carry critical path 3
+	vpdi	%v20, %v20, %v20, 4
+	lg	%r1, 8(idx, up)
+	vst	%v20, 0(idx, rp), 3
+	lgr	%r10, %r8
+	la	idx, 16(idx)
+	brctg	un, L(top)
+
+L(end):	mlgr	%r0, v1
+	algr	%r1, %r6
+	alcgr	%r0, un
+	algr	%r1, %r8
+	alcgr	%r0, un
+	vag	%v27, %v27, %v28
+	vag	%v29, %v29, %v30
+	vag	%v27, %v27, %v29
+	vlgvg	%r10, %v27, 1
+	algr	%r1, %r10
+	stg	%r1, 0(idx, rp)
+	alcgr	%r0, un
+	lgr	%r2, %r0
+
+	lmg	%r6, %r12, 48(%r15)
+	br	%r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/s390_64/z13/mul_basecase.asm b/gmp-6.3.0/mpn/s390_64/z13/mul_basecase.asm
new file mode 100644
index 0000000..0de1150
--- /dev/null
+++ b/gmp-6.3.0/mpn/s390_64/z13/mul_basecase.asm
@@ -0,0 +1,264 @@
+dnl S/390-64 mpn_mul_basecase.
+
+dnl Copyright 2023 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C INPUT PARAMETERS
+define(`rp', `%r2')
+define(`ap', `%r3')
+define(`an', `%r4')	C 32
+define(`bp', `%r5')	C 40
+define(`bn', `%r6')	C 48
+
+define(`idx', `%r14')
+define(`b0', `%r10')
+
+dnl live in addmul_1:
+dnl r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r13 r14
+dnl xx xx rp ap an bp xx xx xx xx b0  i   xx  xx  idx
+dnl stack: bn
+
+dnl TODO
+dnl * Have mul_1 start without initial (un mod 4) separation, instead handle
+dnl   it after the loop.  Then fall into 4 separate addmul_1 loops.
+dnl * Streamline handling of bn, an, %r11 to reduce the # of memops.
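+
+dnl The two macros below each expand a complete mul_1/addmul_1 loop at the
+dnl use site, with pushdef/popdef keeping their local labels distinct.  As
+dnl a rough C model of the resulting structure (a sketch, not GMP code):
+dnl
+dnl	rp[an] = mpn_mul_1 (rp, ap, an, bp[0]);
+dnl	for (i = 1; i < bn; i++)
+dnl	  rp[an + i] = mpn_addmul_1 (rp + i, ap, an, bp[i]);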
+ +define(`MUL_1',` +pushdef(`L', +defn(`L')$1`'_m1) + vzero %v2 + srlg %r11, %r0, 2 + + tmll %r0, 1 + je L(bx0) +L(bx1): tmll %r0, 2 + jne L(b11) + +L(b01): lghi idx, -24 + lg %r13, 0(ap) + mlgr %r12, b0 + stg %r13, 0(rp) + cgijne %r11, 0, L(cj0) + +L(1): stg %r12, 8(rp) + lmg %r6, %r14, 48(%r15) + br %r14 + +L(b11): lghi idx, -8 + lg %r9, 0(ap) + mlgr %r8, b0 + stg %r9, 0(rp) + j L(cj1) + +L(bx0): tmll %r0, 2 + jne L(b10) +L(b00): lghi idx, -32 + lghi %r12, 0 +L(cj0): lg %r1, 32(idx, ap) + lg %r9, 40(idx, ap) + mlgr %r0, b0 + mlgr %r8, b0 + vlvgp %v6, %r0, %r1 + vlvgp %v7, %r9, %r12 + j L(mid) + +L(b10): lghi idx, -16 + lghi %r8, 0 +L(cj1): lg %r7, 16(idx, ap) + lg %r13, 24(idx, ap) + mlgr %r6, b0 + mlgr %r12, b0 + vlvgp %v6, %r6, %r7 + vlvgp %v7, %r13, %r8 + cgije %r11, 0, L(end) + +L(top): lg %r1, 32(idx, ap) + lg %r9, 40(idx, ap) + mlgr %r0, b0 + mlgr %r8, b0 + vacq %v3, %v6, %v7, %v2 + vacccq %v2, %v6, %v7, %v2 + vpdi %v3, %v3, %v3, 4 + vst %v3, 16(idx, rp), 3 + vlvgp %v6, %r0, %r1 + vlvgp %v7, %r9, %r12 +L(mid): lg %r7, 48(idx, ap) + lg %r13, 56(idx, ap) + mlgr %r6, b0 + mlgr %r12, b0 + vacq %v1, %v6, %v7, %v2 + vacccq %v2, %v6, %v7, %v2 + vpdi %v1, %v1, %v1, 4 + vst %v1, 32(idx, rp), 3 + vlvgp %v6, %r6, %r7 + vlvgp %v7, %r13, %r8 + la idx, 32(idx) + brctg %r11, L(top) + +L(end): vacq %v3, %v6, %v7, %v2 + vacccq %v2, %v6, %v7, %v2 + vpdi %v3, %v3, %v3, 4 + vst %v3, 16(idx, rp), 3 + + vlgvg %r0, %v2, 1 + algr %r0, %r12 + stg %r0, 32(idx, rp) +popdef(`L') +') + +define(`ADDMUL_1',` +pushdef(`L', +defn(`L')$1`'_am1) + vzero %v0 + vzero %v2 + srlg %r11, %r0, 2 + + tmll %r0, 1 + je L(bx0) +L(bx1): tmll %r0, 2 + jne L(b11) + +L(b01): lghi idx, -24 + vleg %v2, 0(rp), 1 + lg %r13, 0(ap) + vzero %v4 + mlgr %r12, b0 + vlvgg %v4, %r13, 1 + vaq %v2, %v2, %v4 + vsteg %v2, 0(rp), 1 + vmrhg %v2, %v2, %v2 + j L(cj0) + +L(b11): lghi idx, -8 + vleg %v2, 0(rp), 1 + lg %r9, 0(ap) + vzero %v4 + mlgr %r8, b0 + vlvgg %v4, %r9, 1 + vaq %v2, %v2, %v4 + vsteg %v2, 0(rp), 1 + vmrhg %v2, %v2, %v2 + j L(cj1) + +L(bx0): tmll %r0, 2 + jne L(b10) +L(b00): lghi idx, -32 + lghi %r12, 0 +L(cj0): lg %r1, 32(idx, ap) + lg %r9, 40(idx, ap) + mlgr %r0, b0 + mlgr %r8, b0 + vlvgp %v6, %r0, %r1 + vlvgp %v7, %r9, %r12 + j L(mid) + +L(b10): lghi idx, -16 + lghi %r8, 0 +L(cj1): lg %r7, 16(idx, ap) + lg %r13, 24(idx, ap) + mlgr %r6, b0 + mlgr %r12, b0 + vlvgp %v6, %r6, %r7 + vlvgp %v7, %r13, %r8 + cgije %r11, 0, L(end) + +L(top): lg %r1, 32(idx, ap) + lg %r9, 40(idx, ap) + mlgr %r0, b0 + mlgr %r8, b0 + vl %v1, 16(idx, rp), 3 + vpdi %v1, %v1, %v1, 4 + vacq %v5, %v6, %v1, %v0 + vacccq %v0, %v6, %v1, %v0 + vacq %v3, %v5, %v7, %v2 + vacccq %v2, %v5, %v7, %v2 + vpdi %v3, %v3, %v3, 4 + vst %v3, 16(idx, rp), 3 + vlvgp %v6, %r0, %r1 + vlvgp %v7, %r9, %r12 +L(mid): lg %r7, 48(idx, ap) + lg %r13, 56(idx, ap) + mlgr %r6, b0 + mlgr %r12, b0 + vl %v4, 32(idx, rp), 3 + vpdi %v4, %v4, %v4, 4 + vacq %v5, %v6, %v4, %v0 + vacccq %v0, %v6, %v4, %v0 + vacq %v1, %v5, %v7, %v2 + vacccq %v2, %v5, %v7, %v2 + vpdi %v1, %v1, %v1, 4 + vst %v1, 32(idx, rp), 3 + vlvgp %v6, %r6, %r7 + vlvgp %v7, %r13, %r8 + la idx, 32(idx) + brctg %r11, L(top) + +L(end): vl %v1, 16(idx, rp), 3 + vpdi %v1, %v1, %v1, 4 + vacq %v5, %v6, %v1, %v0 + vacccq %v0, %v6, %v1, %v0 + vacq %v3, %v5, %v7, %v2 + vacccq %v2, %v5, %v7, %v2 + vpdi %v3, %v3, %v3, 4 + vst %v3, 16(idx, rp), 3 + + vag %v2, %v0, %v2 + vlgvg %r0, %v2, 1 + algr %r0, %r12 + stg %r0, 32(idx, rp) +popdef(`L') +') + + +ASM_START() + +PROLOGUE(mpn_mul_basecase) + stmg %r4, %r14, 32(%r15) + + lgr %r4, bn + + lg %r0, 
32(%r15) + lg b0, 0(bp) + MUL_1() C implicitly pass r0 = an + + aghi %r4, -1 + je L(end) +L(top): lg %r0, 32(%r15) + la bp, 8(bp) + la rp, 8(rp) + lg b0, 0(bp) + ADDMUL_1() C implicitly pass r0 = an + brctg %r4, L(top) + +L(end): lmg %r6, %r14, 48(%r15) + br %r14 +EPILOGUE() + .section .note.GNU-stack diff --git a/gmp-6.3.0/mpn/s390_64/z13/mul_basecase.c b/gmp-6.3.0/mpn/s390_64/z13/mul_basecase.c new file mode 100644 index 0000000..f1b7160 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/z13/mul_basecase.c @@ -0,0 +1,124 @@ +/* mpn_mul_basecase for IBM z13 and later -- Internal routine to multiply two + natural numbers of length m and n. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + +Copyright 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include + +#include "gmp-impl.h" + +/* Note: we explicitly inline all mul and addmul routines here to reduce the + * number of branches in prologues of unrolled functions. That comes at the + cost of duplicating common loop bodies in object code. */ +#define DO_INLINE + +/* + * tweak loop conditions in addmul subroutines to enable use of + * branch-relative-on-count (BRCTG) instructions, which currently results in + * better performance. + */ +#define BRCTG + +#include "s390_64/z13/common-vec.h" + +#define OPERATION_mul_1 +#include "s390_64/z13/addmul_1.c" +#undef OPERATION_mul_1 + +#define OPERATION_addmul_1 +#include "s390_64/z13/addmul_1.c" +#undef OPERATION_addmul_1 + +#define OPERATION_mul_2 +#include "s390_64/z13/aormul_2.c" +#undef OPERATION_mul_2 + +#define OPERATION_addmul_2 +#include "s390_64/z13/aormul_2.c" +#undef OPERATION_addmul_2 + +void +mpn_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr vp, + mp_size_t vn) +{ + ASSERT (un >= vn); + ASSERT (vn >= 1); + ASSERT (!MPN_OVERLAP_P (rp, un + vn, up, un)); + ASSERT (!MPN_OVERLAP_P (rp, un + vn, vp, vn)); + + /* The implementations of (add)mul_1/2 are 4x-unrolled. Pull out the branch + * for un%4 and inline specific variants. 
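+ * Each case of the switch below makes un % 4 known on that path, giving
+ * the compiler the chance to specialize the inlined copies for that
+ * residue rather than re-testing it at run time.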
*/ + +#define BRANCH_FOR_MOD(N) \ + do \ + { \ + if (vn >= 2) \ + { \ + rp[un + 1] = inline_mul_2 (rp, up, un, vp); \ + rp += 2, vp += 2, vn -= 2; \ + } \ + else \ + { \ + rp[un] = inline_mul_1 (rp, up, un, vp[0]); \ + return; \ + } \ + \ + while (vn >= 2) \ + { \ + rp[un + 2 - 1] = inline_addmul_2 (rp, up, un, vp); \ + rp += 2, vp += 2, vn -= 2; \ + } \ + \ + while (vn >= 1) \ + { \ + rp[un] = inline_addmul_1 (rp, up, un, vp[0]); \ + rp += 1, vp += 1, vn -= 1; \ + } \ + } \ + while (0); + + switch (((size_t)un) % 4) + { + case 0: + BRANCH_FOR_MOD (0); + break; + case 1: + BRANCH_FOR_MOD (1); + break; + case 2: + BRANCH_FOR_MOD (2); + break; + case 3: + BRANCH_FOR_MOD (3); + break; + } +} diff --git a/gmp-6.3.0/mpn/s390_64/z13/popcount.asm b/gmp-6.3.0/mpn/s390_64/z13/popcount.asm new file mode 100644 index 0000000..35b1fc4 --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/z13/popcount.asm @@ -0,0 +1,69 @@ +dnl S/390-64 mpn_popcount + +dnl Copyright 2023 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C z900 - +C z990 - +C z9 - +C z10 - +C z196 - +C z12 ? +C z13 ? +C z14 ? +C z15 ? + +define(`ap', `%r2') +define(`n', `%r3') + +ASM_START() +PROLOGUE(mpn_popcount) + vzero %v30 + tmll n, 1 + srlg n, n, 1 + je L(top) + +L(odd): vllezg %v16, 0(ap) + vpopct %v30, %v16, 3 + la ap, 8(ap) + clgije n, 0, L(end) + +L(top): vl %v16, 0(ap), 3 + vpopct %v20, %v16, 3 + vag %v30, %v30, %v20 + la ap, 16(ap) + brctg n, L(top) + +L(end): vzero %v29 + vsumqg %v30, %v30, %v29 + vlgvg %r2, %v30, 1(%r0) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/s390_64/z13/sqr_basecase.c b/gmp-6.3.0/mpn/s390_64/z13/sqr_basecase.c new file mode 100644 index 0000000..91dc47c --- /dev/null +++ b/gmp-6.3.0/mpn/s390_64/z13/sqr_basecase.c @@ -0,0 +1,82 @@ +/* mpn_sqr_basecase -- Internal routine to square a natural number of length n. + This is a place-holder for z13 to suppress the use of the plain z/arch code. + FIXME: This should really be written in assembly with outer-loop early exit. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + + +Copyright 1991-1994, 1996, 1997, 2000-2005, 2008, 2010, 2011, 2017, 2023 Free +Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/. */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void
+mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un)
+{
+  mp_limb_t u0;
+  mp_limb_t cin;
+
+  u0 = up[0];
+  umul_ppmm (cin, rp[0], u0, u0);
+  ++rp;
+
+  if (--un) {
+    u0 = u0 << 1;
+    up += 1;
+
+    rp[un] = mpn_mul_1c (rp, up, un, u0, cin);
+
+    for (;;) {
+      mp_limb_t ci, x0, c0, hi, lo, x1, c1;
+
+      u0 = up[0];
+      ci = -(up[-1] >> (GMP_NUMB_BITS-1)) & u0; // correction term
+      x0 = rp[1] + ci;
+      c0 = x0 < ci;
+
+      umul_ppmm (hi, lo, u0, u0);
+      x1 = x0 + lo;
+      c1 = x1 < lo;
+      cin = hi + c0 + c1;
+      rp[1] = x1;
+      rp += 2;
+
+      if (--un == 0) break;
+      u0 = (up[-1] >> (GMP_NUMB_BITS-1)) + (u0 << 1);
+      up += 1;
+
+      rp[un] = mpn_addmul_1c (rp, up, un, u0, cin);
+    }
+  }
+
+  rp[0] = cin;
+}
diff --git a/gmp-6.3.0/mpn/s390_64/z13/submul_1.asm b/gmp-6.3.0/mpn/s390_64/z13/submul_1.asm
new file mode 100644
index 0000000..64f0628
--- /dev/null
+++ b/gmp-6.3.0/mpn/s390_64/z13/submul_1.asm
@@ -0,0 +1,168 @@
+dnl S/390-64 mpn_submul_1
+dnl Based on C code contributed by Marius Hillenbrand.
+
+dnl Copyright 2023 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl TODO
+dnl * Schedule vlvgp away from mlgr; that saves 20% of the run time.
+dnl * Perhaps use vp[0]/vp[1] in innerloop instead of preloading v0/v1.
+
+C            cycles/limb
+C z900		 -
+C z990		 -
+C z9		 -
+C z10		 -
+C z196		 -
+C z12		 ?
+C z13		 ?
+C z14		 ?
+C z15 2.55 + + +define(`rp', `%r2') +define(`ap', `%r3') +define(`an', `%r4') +define(`b0', `%r5') +define(`cy', `%r6') + +define(`idx', `%r4') + +ASM_START() + +PROLOGUE(mpn_submul_1) + stmg %r6, %r13, 48(%r15) +L(ent): vzero %v0 + vone %v2 + srlg %r11, an, 2 + + tmll an, 1 + je L(bx0) +L(bx1): tmll an, 2 + jne L(b11) + +L(b01): lghi idx, -24 + vleg %v2, 0(rp), 1 + lg %r13, 0(ap) + vzero %v4 + mlgr %r12, b0 + vlvgg %v4, %r13, 1 + vsq %v2, %v2, %v4 + vsteg %v2, 0(rp), 1 + vmrhg %v2, %v2, %v2 + cgije %r11, 0, L(1) + j L(cj0) + +L(b11): lghi idx, -8 + vleg %v2, 0(rp), 1 + lg %r9, 0(ap) + vzero %v4 + mlgr %r8, b0 + vlvgg %v4, %r9, 1 + vsq %v2, %v2, %v4 + vsteg %v2, 0(rp), 1 + vmrhg %v2, %v2, %v2 + j L(cj1) + +L(bx0): tmll an, 2 + jne L(b10) +L(b00): lghi idx, -32 + lghi %r12, 0 +L(cj0): lg %r1, 32(idx, ap) + lg %r9, 40(idx, ap) + mlgr %r0, b0 + mlgr %r8, b0 + vlvgp %v6, %r0, %r1 + vlvgp %v7, %r9, %r12 + j L(mid) + +L(b10): lghi idx, -16 + lghi %r8, 0 +L(cj1): lg %r7, 16(idx, ap) + lg %r13, 24(idx, ap) + mlgr %r6, b0 + mlgr %r12, b0 + vlvgp %v6, %r6, %r7 + vlvgp %v7, %r13, %r8 + cgije %r11, 0, L(end) + +L(top): lg %r1, 32(idx, ap) + lg %r9, 40(idx, ap) + mlgr %r0, b0 + mlgr %r8, b0 + vl %v1, 16(idx, rp), 3 + vpdi %v1, %v1, %v1, 4 + vacq %v5, %v6, %v7, %v0 + vacccq %v0, %v6, %v7, %v0 + vsbiq %v3, %v1, %v5, %v2 + vsbcbiq %v2, %v1, %v5, %v2 + vpdi %v3, %v3, %v3, 4 + vst %v3, 16(idx, rp), 3 + vlvgp %v6, %r0, %r1 + vlvgp %v7, %r9, %r12 +L(mid): lg %r7, 48(idx, ap) + lg %r13, 56(idx, ap) + mlgr %r6, b0 + mlgr %r12, b0 + vl %v4, 32(idx, rp), 3 + vpdi %v4, %v4, %v4, 4 + vacq %v5, %v6, %v7, %v0 + vacccq %v0, %v6, %v7, %v0 + vsbiq %v1, %v4, %v5, %v2 + vsbcbiq %v2, %v4, %v5, %v2 + vpdi %v1, %v1, %v1, 4 + vst %v1, 32(idx, rp), 3 + vlvgp %v6, %r6, %r7 + vlvgp %v7, %r13, %r8 + la idx, 32(idx) + brctg %r11, L(top) + +L(end): vl %v1, 16(idx, rp), 3 + vpdi %v1, %v1, %v1, 4 + vacq %v5, %v6, %v7, %v0 + vacccq %v0, %v6, %v7, %v0 + vsbiq %v3, %v1, %v5, %v2 + vsbcbiq %v2, %v1, %v5, %v2 + vpdi %v3, %v3, %v3, 4 + vst %v3, 16(idx, rp), 3 + + vsg %v2, %v0, %v2 + vlgvg %r2, %v2, 1 + algr %r2, %r12 + aghi %r2, 1 + lmg %r6, %r13, 48(%r15) + br %r14 +L(1): vsg %v2, %v0, %v2 + vlgvg %r2, %v2, 1 + algr %r2, %r12 + aghi %r2, -1 + lmg %r6, %r13, 48(%r15) + br %r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sbpi1_bdiv_q.c b/gmp-6.3.0/mpn/sbpi1_bdiv_q.c new file mode 120000 index 0000000..de362a3 --- /dev/null +++ b/gmp-6.3.0/mpn/sbpi1_bdiv_q.c @@ -0,0 +1 @@ +../mpn/generic/sbpi1_bdiv_q.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sbpi1_bdiv_qr.c b/gmp-6.3.0/mpn/sbpi1_bdiv_qr.c new file mode 120000 index 0000000..a0d6b8e --- /dev/null +++ b/gmp-6.3.0/mpn/sbpi1_bdiv_qr.c @@ -0,0 +1 @@ +../mpn/generic/sbpi1_bdiv_qr.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sbpi1_bdiv_r.c b/gmp-6.3.0/mpn/sbpi1_bdiv_r.c new file mode 120000 index 0000000..8af23e7 --- /dev/null +++ b/gmp-6.3.0/mpn/sbpi1_bdiv_r.c @@ -0,0 +1 @@ +../mpn/generic/sbpi1_bdiv_r.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sbpi1_div_q.c b/gmp-6.3.0/mpn/sbpi1_div_q.c new file mode 120000 index 0000000..87ee304 --- /dev/null +++ b/gmp-6.3.0/mpn/sbpi1_div_q.c @@ -0,0 +1 @@ +../mpn/generic/sbpi1_div_q.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sbpi1_div_qr.c b/gmp-6.3.0/mpn/sbpi1_div_qr.c new file mode 120000 index 0000000..0de92a8 --- /dev/null +++ b/gmp-6.3.0/mpn/sbpi1_div_qr.c @@ -0,0 +1 @@ +../mpn/generic/sbpi1_div_qr.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sbpi1_divappr_q.c b/gmp-6.3.0/mpn/sbpi1_divappr_q.c 
new file mode 120000 index 0000000..49f2346 --- /dev/null +++ b/gmp-6.3.0/mpn/sbpi1_divappr_q.c @@ -0,0 +1 @@ +../mpn/generic/sbpi1_divappr_q.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/scan0.c b/gmp-6.3.0/mpn/scan0.c new file mode 120000 index 0000000..4b2f7b1 --- /dev/null +++ b/gmp-6.3.0/mpn/scan0.c @@ -0,0 +1 @@ +../mpn/generic/scan0.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/scan1.c b/gmp-6.3.0/mpn/scan1.c new file mode 120000 index 0000000..3f51890 --- /dev/null +++ b/gmp-6.3.0/mpn/scan1.c @@ -0,0 +1 @@ +../mpn/generic/scan1.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sec_add_1.c b/gmp-6.3.0/mpn/sec_add_1.c new file mode 120000 index 0000000..5fde851 --- /dev/null +++ b/gmp-6.3.0/mpn/sec_add_1.c @@ -0,0 +1 @@ +../mpn/generic/sec_aors_1.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sec_div_qr.c b/gmp-6.3.0/mpn/sec_div_qr.c new file mode 120000 index 0000000..7e1f1b2 --- /dev/null +++ b/gmp-6.3.0/mpn/sec_div_qr.c @@ -0,0 +1 @@ +../mpn/generic/sec_div.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sec_div_r.c b/gmp-6.3.0/mpn/sec_div_r.c new file mode 120000 index 0000000..7e1f1b2 --- /dev/null +++ b/gmp-6.3.0/mpn/sec_div_r.c @@ -0,0 +1 @@ +../mpn/generic/sec_div.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sec_invert.c b/gmp-6.3.0/mpn/sec_invert.c new file mode 120000 index 0000000..66841bf --- /dev/null +++ b/gmp-6.3.0/mpn/sec_invert.c @@ -0,0 +1 @@ +../mpn/generic/sec_invert.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sec_mul.c b/gmp-6.3.0/mpn/sec_mul.c new file mode 120000 index 0000000..9a72c93 --- /dev/null +++ b/gmp-6.3.0/mpn/sec_mul.c @@ -0,0 +1 @@ +../mpn/generic/sec_mul.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sec_pi1_div_qr.c b/gmp-6.3.0/mpn/sec_pi1_div_qr.c new file mode 120000 index 0000000..920657a --- /dev/null +++ b/gmp-6.3.0/mpn/sec_pi1_div_qr.c @@ -0,0 +1 @@ +../mpn/generic/sec_pi1_div.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sec_pi1_div_r.c b/gmp-6.3.0/mpn/sec_pi1_div_r.c new file mode 120000 index 0000000..920657a --- /dev/null +++ b/gmp-6.3.0/mpn/sec_pi1_div_r.c @@ -0,0 +1 @@ +../mpn/generic/sec_pi1_div.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sec_powm.c b/gmp-6.3.0/mpn/sec_powm.c new file mode 120000 index 0000000..e00403c --- /dev/null +++ b/gmp-6.3.0/mpn/sec_powm.c @@ -0,0 +1 @@ +../mpn/generic/sec_powm.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sec_sqr.c b/gmp-6.3.0/mpn/sec_sqr.c new file mode 120000 index 0000000..4becde2 --- /dev/null +++ b/gmp-6.3.0/mpn/sec_sqr.c @@ -0,0 +1 @@ +../mpn/generic/sec_sqr.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sec_sub_1.c b/gmp-6.3.0/mpn/sec_sub_1.c new file mode 120000 index 0000000..5fde851 --- /dev/null +++ b/gmp-6.3.0/mpn/sec_sub_1.c @@ -0,0 +1 @@ +../mpn/generic/sec_aors_1.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sec_tabselect.asm b/gmp-6.3.0/mpn/sec_tabselect.asm new file mode 120000 index 0000000..796e3b1 --- /dev/null +++ b/gmp-6.3.0/mpn/sec_tabselect.asm @@ -0,0 +1 @@ +../mpn/x86/mmx/sec_tabselect.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/set_str.c b/gmp-6.3.0/mpn/set_str.c new file mode 120000 index 0000000..9386455 --- /dev/null +++ b/gmp-6.3.0/mpn/set_str.c @@ -0,0 +1 @@ +../mpn/generic/set_str.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sh/add_n.asm b/gmp-6.3.0/mpn/sh/add_n.asm new file mode 100644 index 0000000..79d17d0 --- /dev/null +++ b/gmp-6.3.0/mpn/sh/add_n.asm @@ -0,0 +1,59 @@ +dnl SH mpn_add_n -- Add 
two limb vectors of the same length > 0 and store sum +dnl in a third limb vector. + +dnl Copyright 1995, 1997, 2000, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +C rp r4 +C up r5 +C vp r6 +C n r7 + +changecom(blah) C disable # to make all C comments below work + +ASM_START() +PROLOGUE(mpn_add_n) + mov #0,r3 C clear cy save reg + +L(top): mov.l @r5+,r1 + mov.l @r6+,r2 + shlr r3 C restore cy + addc r2,r1 + movt r3 C save cy + mov.l r1,@r4 + dt r7 + bf.s L(top) + add #4,r4 + + rts + mov r3,r0 C return carry-out from most significant limb +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sh/sh2/addmul_1.asm b/gmp-6.3.0/mpn/sh/sh2/addmul_1.asm new file mode 100644 index 0000000..c914b29 --- /dev/null +++ b/gmp-6.3.0/mpn/sh/sh2/addmul_1.asm @@ -0,0 +1,65 @@ +dnl SH2 mpn_addmul_1 -- Multiply a limb vector with a limb and add the result +dnl to a second limb vector. + +dnl Copyright 1995, 2000, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
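+
+dnl Per limb, the loop below computes (a C-style sketch of the idea, with
+dnl cy_limb in r2 and T the carry flag):
+dnl
+dnl	umul_ppmm (hi, lo, *s1_ptr, s2_limb);
+dnl	lo += cy_limb;    cy_limb = hi + carry_out;
+dnl	lo += *res_ptr;   cy_limb += carry_out;
+dnl	*res_ptr = lo;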
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r4 +C s1_ptr r5 +C size r6 +C s2_limb r7 + +changecom(blah) C disable # to make all C comments below work + +ASM_START() +PROLOGUE(mpn_addmul_1) + mov #0,r2 C cy_limb = 0 + mov #0,r0 C Keep r0 = 0 for entire loop + clrt + +L(top): mov.l @r5+,r3 + dmulu.l r3,r7 + sts macl,r1 + addc r2,r1 C lo_prod += old cy_limb + sts mach,r2 C new cy_limb = hi_prod + mov.l @r4,r3 + addc r0,r2 C cy_limb += T, T = 0 + addc r3,r1 + addc r0,r2 C cy_limb += T, T = 0 + dt r6 + mov.l r1,@r4 + bf.s L(top) + add #4,r4 + + rts + mov r2,r0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sh/sh2/mul_1.asm b/gmp-6.3.0/mpn/sh/sh2/mul_1.asm new file mode 100644 index 0000000..83548a6 --- /dev/null +++ b/gmp-6.3.0/mpn/sh/sh2/mul_1.asm @@ -0,0 +1,62 @@ +dnl SH2 mpn_mul_1 -- Multiply a limb vector with a limb and store the result +dnl in a second limb vector. + +dnl Copyright 1995, 2000, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r4 +C s1_ptr r5 +C size r6 +C s2_limb r7 + +changecom(blah) C disable # to make all C comments below work + +ASM_START() +PROLOGUE(mpn_mul_1) + mov #0,r2 C cy_limb = 0 + mov #0,r0 C Keep r0 = 0 for entire loop + clrt + +L(top): mov.l @r5+,r3 + dmulu.l r3,r7 + sts macl,r1 + addc r2,r1 + sts mach,r2 + addc r0,r2 C propagate carry to cy_limb (dt clobbers T) + dt r6 + mov.l r1,@r4 + bf.s L(top) + add #4,r4 + + rts + mov r2,r0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sh/sh2/submul_1.asm b/gmp-6.3.0/mpn/sh/sh2/submul_1.asm new file mode 100644 index 0000000..bef2abd --- /dev/null +++ b/gmp-6.3.0/mpn/sh/sh2/submul_1.asm @@ -0,0 +1,65 @@ +dnl SH2 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the +dnl result from a second limb vector. + +dnl Copyright 1995, 2000, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r4 +C s1_ptr r5 +C size r6 +C s2_limb r7 + +changecom(blah) C disable # to make all C comments below work + +ASM_START() +PROLOGUE(mpn_submul_1) + mov #0,r2 C cy_limb = 0 + mov #0,r0 C Keep r0 = 0 for entire loop + clrt + +L(top): mov.l @r5+,r3 + dmulu.l r3,r7 + sts macl,r1 + addc r2,r1 C lo_prod += old cy_limb + sts mach,r2 C new cy_limb = hi_prod + mov.l @r4,r3 + addc r0,r2 C cy_limb += T, T = 0 + subc r1,r3 + addc r0,r2 C cy_limb += T, T = 0 + dt r6 + mov.l r3,@r4 + bf.s L(top) + add #4,r4 + + rts + mov r2,r0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sh/sub_n.asm b/gmp-6.3.0/mpn/sh/sub_n.asm new file mode 100644 index 0000000..465bc80 --- /dev/null +++ b/gmp-6.3.0/mpn/sh/sub_n.asm @@ -0,0 +1,59 @@ +dnl SH mpn_sub_n -- Subtract two limb vectors of the same length > 0 and store +dnl difference in a third limb vector. + +dnl Copyright 1995, 1997, 2000, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +C rp r4 +C up r5 +C vp r6 +C n r7 + +changecom(blah) C disable # to make all C comments below work + +ASM_START() +PROLOGUE(mpn_sub_n) + mov #0,r3 C clear cy save reg + +L(top): mov.l @r5+,r1 + mov.l @r6+,r2 + shlr r3 C restore cy + subc r2,r1 + movt r3 C save cy + mov.l r1,@r4 + dt r7 + bf.s L(top) + add #4,r4 + + rts + mov r3,r0 C return carry-out from most significant limb +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sizeinbase.c b/gmp-6.3.0/mpn/sizeinbase.c new file mode 120000 index 0000000..c6a7cd8 --- /dev/null +++ b/gmp-6.3.0/mpn/sizeinbase.c @@ -0,0 +1 @@ +../mpn/generic/sizeinbase.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sparc32/README b/gmp-6.3.0/mpn/sparc32/README new file mode 100644 index 0000000..f2dd116 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/README @@ -0,0 +1,71 @@ +Copyright 1996, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+This directory contains mpn functions for various SPARC chips.  Code that
+runs only on version 8 SPARC implementations is in the v8 subdirectory.
+
+RELEVANT OPTIMIZATION ISSUES
+
+  Load and Store timing
+
+On most early SPARC implementations, an ST instruction takes multiple
+cycles, while a STD takes just a single cycle more than an ST.  For the CPUs
+in SPARCstation I and II, the times are 3 and 4 cycles, respectively.
+Therefore, combining two ST instructions into a STD when possible is a
+significant optimization.
+
+Later SPARC implementations have single-cycle ST.
+
+A SuperSPARC can perform just one memory instruction per cycle, even though
+up to two integer instructions can execute in its pipeline.  For programs
+with so many memory operations that there are not enough non-memory
+operations to issue alongside them, using LDD and STD when possible helps.
+
+UltraSPARC-1/2 has very slow integer multiplication.  In the v9
+subdirectory, we therefore use floating-point multiplication.
+
+STATUS
+
+1. On a SuperSPARC, mpn_lshift and mpn_rshift run at 3 cycles/limb, or 2.5
+   cycles/limb asymptotically.  We could optimize speed for special counts
+   by using ADDXCC.
+
+2. On a SuperSPARC, mpn_add_n and mpn_sub_n run at 2.5 cycles/limb, or 2
+   cycles/limb asymptotically.
+
+3. mpn_mul_1 runs at what is believed to be optimal speed.
+
+4. On SuperSPARC, mpn_addmul_1 and mpn_submul_1 could both be improved by a
+   cycle by avoiding one of the add instructions.  See a29k/addmul_1.
+
+The speed of the code for other SPARC implementations is uncertain.
diff --git a/gmp-6.3.0/mpn/sparc32/add_n.asm b/gmp-6.3.0/mpn/sparc32/add_n.asm
new file mode 100644
index 0000000..8549195
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/add_n.asm
@@ -0,0 +1,245 @@
+dnl  SPARC mpn_add_n -- Add two limb vectors of the same length > 0 and store
+dnl  sum in a third limb vector.
+
+dnl  Copyright 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +define(res_ptr,%o0) +define(s1_ptr,%o1) +define(s2_ptr,%o2) +define(n,%o3) + +ASM_START() +PROLOGUE(mpn_add_n) + xor s2_ptr,res_ptr,%g1 + andcc %g1,4,%g0 + bne L(1) C branch if alignment differs + nop +C ** V1a ** +L(0): andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0 + be L(v1) C if no, branch + nop +C Add least significant limb separately to align res_ptr and s2_ptr + ld [s1_ptr],%g4 + add s1_ptr,4,s1_ptr + ld [s2_ptr],%g2 + add s2_ptr,4,s2_ptr + add n,-1,n + addcc %g4,%g2,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr +L(v1): addx %g0,%g0,%o4 C save cy in register + cmp n,2 C if n < 2 ... + bl L(end2) C ... branch to tail code + subcc %g0,%o4,%g0 C restore cy + + ld [s1_ptr+0],%g4 + addcc n,-10,n + ld [s1_ptr+4],%g1 + ldd [s2_ptr+0],%g2 + blt L(fin1) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop1): + addxcc %g4,%g2,%o4 + ld [s1_ptr+8],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+12],%g1 + ldd [s2_ptr+8],%g2 + std %o4,[res_ptr+0] + addxcc %g4,%g2,%o4 + ld [s1_ptr+16],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+20],%g1 + ldd [s2_ptr+16],%g2 + std %o4,[res_ptr+8] + addxcc %g4,%g2,%o4 + ld [s1_ptr+24],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+28],%g1 + ldd [s2_ptr+24],%g2 + std %o4,[res_ptr+16] + addxcc %g4,%g2,%o4 + ld [s1_ptr+32],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+36],%g1 + ldd [s2_ptr+32],%g2 + std %o4,[res_ptr+24] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop1) + subcc %g0,%o4,%g0 C restore cy + +L(fin1): + addcc n,8-2,n + blt L(end1) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 2 limbs until less than 2 limbs remain +L(loope1): + addxcc %g4,%g2,%o4 + ld [s1_ptr+8],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+12],%g1 + ldd [s2_ptr+8],%g2 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope1) + subcc %g0,%o4,%g0 C restore cy +L(end1): + addxcc %g4,%g2,%o4 + addxcc %g1,%g3,%o5 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + + andcc n,1,%g0 + be L(ret1) + subcc %g0,%o4,%g0 C restore cy +C Add last limb + ld [s1_ptr+8],%g4 + ld [s2_ptr+8],%g2 + addxcc %g4,%g2,%o4 + st %o4,[res_ptr+8] + +L(ret1): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb + +L(1): xor s1_ptr,res_ptr,%g1 + andcc %g1,4,%g0 + bne L(2) + nop +C ** V1b ** + mov s2_ptr,%g1 + mov s1_ptr,s2_ptr + b L(0) + mov %g1,s1_ptr + +C ** V2 ** +C If we come here, the alignment of s1_ptr and res_ptr as well as the +C alignment of s2_ptr and res_ptr differ. Since there are only two ways +C things can be aligned (that we care about) we now know that the alignment +C of s1_ptr and s2_ptr are the same. + +L(2): cmp n,1 + be L(jone) + nop + andcc s1_ptr,4,%g0 C s1_ptr unaligned? 
Side effect: cy=0 + be L(v2) C if no, branch + nop +C Add least significant limb separately to align s1_ptr and s2_ptr + ld [s1_ptr],%g4 + add s1_ptr,4,s1_ptr + ld [s2_ptr],%g2 + add s2_ptr,4,s2_ptr + add n,-1,n + addcc %g4,%g2,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr + +L(v2): addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + blt L(fin2) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop2): + ldd [s1_ptr+0],%g2 + ldd [s2_ptr+0],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+0] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+4] + ldd [s1_ptr+8],%g2 + ldd [s2_ptr+8],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+8] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+12] + ldd [s1_ptr+16],%g2 + ldd [s2_ptr+16],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+16] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+20] + ldd [s1_ptr+24],%g2 + ldd [s2_ptr+24],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+24] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+28] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop2) + subcc %g0,%o4,%g0 C restore cy + +L(fin2): + addcc n,8-2,n + blt L(end2) + subcc %g0,%o4,%g0 C restore cy +L(loope2): + ldd [s1_ptr+0],%g2 + ldd [s2_ptr+0],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+0] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+4] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope2) + subcc %g0,%o4,%g0 C restore cy +L(end2): + andcc n,1,%g0 + be L(ret2) + subcc %g0,%o4,%g0 C restore cy +C Add last limb +L(jone): + ld [s1_ptr],%g4 + ld [s2_ptr],%g2 + addxcc %g4,%g2,%o4 + st %o4,[res_ptr] + +L(ret2): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb +EPILOGUE(mpn_add_n) diff --git a/gmp-6.3.0/mpn/sparc32/addmul_1.asm b/gmp-6.3.0/mpn/sparc32/addmul_1.asm new file mode 100644 index 0000000..92d5d78 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/addmul_1.asm @@ -0,0 +1,155 @@ +dnl SPARC mpn_addmul_1 -- Multiply a limb vector with a limb and add the +dnl result to a second limb vector. + +dnl Copyright 1992-1994, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_addmul_1) + C Make S1_PTR and RES_PTR point at the end of their blocks + C and put (- 4 x SIZE) in index/loop counter. 
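+	C The index register %o2 is made negative so that a single add both
+	C steps the loop counter toward zero and advances the effective
+	C addresses.  In C terms (editorial sketch of the addressing idiom):
+	C
+	C   up += n; rp += n;
+	C   for (i = -4*n; i != 0; i += 4)   /* i counts bytes */
+	C     ... up[i/4] ... rp[i/4] ...
+	C
+	C so the pointers themselves stay fixed inside the loop.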
+ sll %o2,2,%o2 + add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval + add %o1,%o2,%o1 + sub %g0,%o2,%o2 + + cmp %o3,0xfff + bgu L(large) + nop + + ld [%o1+%o2],%o5 + mov 0,%o0 + b L(0) + add %o4,-4,%o4 +L(loop0): + addcc %o5,%g1,%g1 + ld [%o1+%o2],%o5 + addx %o0,%g0,%o0 + st %g1,[%o4+%o2] +L(0): wr %g0,%o3,%y + sra %o5,31,%g2 + and %o3,%g2,%g2 + andcc %g1,0,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,0,%g1 + sra %g1,20,%g4 + sll %g1,12,%g1 + rd %y,%g3 + srl %g3,20,%g3 + or %g1,%g3,%g1 + + addcc %g1,%o0,%g1 + addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb + addcc %o2,4,%o2 C loop counter + bne L(loop0) + ld [%o4+%o2],%o5 + + addcc %o5,%g1,%g1 + addx %o0,%g0,%o0 + retl + st %g1,[%o4+%o2] + +L(large): + ld [%o1+%o2],%o5 + mov 0,%o0 + sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0 + b L(1) + add %o4,-4,%o4 +L(loop): + addcc %o5,%g3,%g3 + ld [%o1+%o2],%o5 + addx %o0,%g0,%o0 + st %g3,[%o4+%o2] +L(1): wr %g0,%o5,%y + and %o5,%g4,%g2 + andcc %g0,%g0,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%g0,%g1 + rd %y,%g3 + addcc %g3,%o0,%g3 + addx %g2,%g1,%o0 + addcc %o2,4,%o2 + bne L(loop) + ld [%o4+%o2],%o5 + + addcc %o5,%g3,%g3 + addx %o0,%g0,%o0 + retl + st %g3,[%o4+%o2] +EPILOGUE(mpn_addmul_1) diff --git a/gmp-6.3.0/mpn/sparc32/gmp-mparam.h b/gmp-6.3.0/mpn/sparc32/gmp-mparam.h new file mode 100644 index 0000000..a3bc612 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/gmp-mparam.h @@ -0,0 +1,67 @@ +/* SPARC v7 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +/* Generated by tuneup.c, 2002-03-13, gcc 2.95, Weitek 8701 */ + +#define MUL_TOOM22_THRESHOLD 8 +#define MUL_TOOM33_THRESHOLD 466 + +#define SQR_BASECASE_THRESHOLD 4 +#define SQR_TOOM2_THRESHOLD 16 +#define SQR_TOOM3_THRESHOLD 258 + +#define DIV_SB_PREINV_THRESHOLD 4 +#define DIV_DC_THRESHOLD 28 +#define POWM_THRESHOLD 28 + +#define GCD_ACCEL_THRESHOLD 3 +#define JACOBI_BASE_METHOD 2 + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 4 +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define USE_PREINV_DIVREM_1 1 +#define USE_PREINV_MOD_1 1 +#define DIVREM_2_THRESHOLD 0 /* always */ +#define DIVEXACT_1_THRESHOLD 120 +#define MODEXACT_1_ODD_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define GET_STR_DC_THRESHOLD 21 +#define GET_STR_PRECOMPUTE_THRESHOLD 25 +#define SET_STR_THRESHOLD 1012 + +#define MUL_FFT_TABLE { 272, 672, 1152, 3584, 10240, 24576, 0 } +#define MUL_FFT_MODF_THRESHOLD 264 +#define MUL_FFT_THRESHOLD 2304 + +#define SQR_FFT_TABLE { 304, 736, 1152, 3584, 10240, 24576, 0 } +#define SQR_FFT_MODF_THRESHOLD 248 +#define SQR_FFT_THRESHOLD 2304 diff --git a/gmp-6.3.0/mpn/sparc32/lshift.asm b/gmp-6.3.0/mpn/sparc32/lshift.asm new file mode 100644 index 0000000..8321343 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/lshift.asm @@ -0,0 +1,105 @@ +dnl SPARC mpn_lshift -- Shift a number left. + +dnl Copyright 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
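+dnl  The shift proceeds from the most significant limb downwards, which
+dnl  permits operand overlap with rp >= up.  Each result limb combines two
+dnl  adjacent source limbs; in C (editorial sketch, valid for 1 <= cnt < 32):
+dnl
+dnl    retval = up[n-1] >> (32 - cnt);     /* bits shifted out the top */
+dnl    for (i = n - 1; i > 0; i--)
+dnl      rp[i] = (up[i] << cnt) | (up[i-1] >> (32 - cnt));
+dnl    rp[0] = up[0] << cnt;
+dnl    return retval;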
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr %o0 +C src_ptr %o1 +C size %o2 +C cnt %o3 + +ASM_START() +PROLOGUE(mpn_lshift) + sll %o2,2,%g1 + add %o1,%g1,%o1 C make %o1 point at end of src + ld [%o1-4],%g2 C load first limb + sub %g0,%o3,%o5 C negate shift count + add %o0,%g1,%o0 C make %o0 point at end of res + add %o2,-1,%o2 + andcc %o2,4-1,%g4 C number of limbs in first loop + srl %g2,%o5,%g1 C compute function result + be L(0) C if multiple of 4 limbs, skip first loop + st %g1,[%sp+80] + + sub %o2,%g4,%o2 C adjust count for main loop + +L(loop0): + ld [%o1-8],%g3 + add %o0,-4,%o0 + add %o1,-4,%o1 + addcc %g4,-1,%g4 + sll %g2,%o3,%o4 + srl %g3,%o5,%g1 + mov %g3,%g2 + or %o4,%g1,%o4 + bne L(loop0) + st %o4,[%o0+0] + +L(0): tst %o2 + be L(end) + nop + +L(loop): + ld [%o1-8],%g3 + add %o0,-16,%o0 + addcc %o2,-4,%o2 + sll %g2,%o3,%o4 + srl %g3,%o5,%g1 + + ld [%o1-12],%g2 + sll %g3,%o3,%g4 + or %o4,%g1,%o4 + st %o4,[%o0+12] + srl %g2,%o5,%g1 + + ld [%o1-16],%g3 + sll %g2,%o3,%o4 + or %g4,%g1,%g4 + st %g4,[%o0+8] + srl %g3,%o5,%g1 + + ld [%o1-20],%g2 + sll %g3,%o3,%g4 + or %o4,%g1,%o4 + st %o4,[%o0+4] + srl %g2,%o5,%g1 + + add %o1,-16,%o1 + or %g4,%g1,%g4 + bne L(loop) + st %g4,[%o0+0] + +L(end): sll %g2,%o3,%g2 + st %g2,[%o0-4] + retl + ld [%sp+80],%o0 +EPILOGUE(mpn_lshift) diff --git a/gmp-6.3.0/mpn/sparc32/mul_1.asm b/gmp-6.3.0/mpn/sparc32/mul_1.asm new file mode 100644 index 0000000..42b4168 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/mul_1.asm @@ -0,0 +1,146 @@ +dnl SPARC mpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright 1992-1994, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_mul_1) + C Make S1_PTR and RES_PTR point at the end of their blocks + C and put (- 4 x SIZE) in index/loop counter. 
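+	C mulscc performs one multiply step, consuming one bit of the Y
+	C register per step and treating its register operand as signed.  The
+	C code below therefore needs only 12 steps when s2_limb <= 0xfff, and
+	C the full 32 steps at L(large) otherwise.  The unsigned product is
+	C recovered by a correction term (editorial sketch, large path;
+	C read the first term as a signed widening product):
+	C
+	C   u*v  =  u * (int32_t) v  +  ((v >> 31) ? ((uint64_t) u << 32) : 0)
+	C
+	C %g4 holds the all-ones mask for the v-negative case, and %g2 = u & %g4
+	C is the amount folded into the high word.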
+ sll %o2,2,%o2 + add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval + add %o1,%o2,%o1 + sub %g0,%o2,%o2 + + cmp %o3,0xfff + bgu L(large) + nop + + ld [%o1+%o2],%o5 + mov 0,%o0 + b L(0) + add %o4,-4,%o4 +L(loop0): + st %g1,[%o4+%o2] +L(0): wr %g0,%o3,%y + sra %o5,31,%g2 + and %o3,%g2,%g2 + andcc %g1,0,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,0,%g1 + sra %g1,20,%g4 + sll %g1,12,%g1 + rd %y,%g3 + srl %g3,20,%g3 + or %g1,%g3,%g1 + + addcc %g1,%o0,%g1 + addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb + addcc %o2,4,%o2 C loop counter + bne,a L(loop0) + ld [%o1+%o2],%o5 + + retl + st %g1,[%o4+%o2] + + +L(large): + ld [%o1+%o2],%o5 + mov 0,%o0 + sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0 + b L(1) + add %o4,-4,%o4 +L(loop): + st %g3,[%o4+%o2] +L(1): wr %g0,%o5,%y + and %o5,%g4,%g2 C g2 = S1_LIMB iff S2_LIMB < 0, else 0 + andcc %g0,%g0,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%g0,%g1 + rd %y,%g3 + addcc %g3,%o0,%g3 + addx %g2,%g1,%o0 C add sign-compensation and cy to hi limb + addcc %o2,4,%o2 C loop counter + bne,a L(loop) + ld [%o1+%o2],%o5 + + retl + st %g3,[%o4+%o2] +EPILOGUE(mpn_mul_1) diff --git a/gmp-6.3.0/mpn/sparc32/rshift.asm b/gmp-6.3.0/mpn/sparc32/rshift.asm new file mode 100644 index 0000000..e155476 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/rshift.asm @@ -0,0 +1,102 @@ +dnl SPARC mpn_rshift -- Shift a number right. + +dnl Copyright 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
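+dnl  The mirror image of lshift: work from the least significant limb
+dnl  upwards, which permits operand overlap with rp <= up.  In C (editorial
+dnl  sketch, valid for 1 <= cnt < 32):
+dnl
+dnl    retval = up[0] << (32 - cnt);       /* bits shifted out the bottom */
+dnl    for (i = 0; i < n - 1; i++)
+dnl      rp[i] = (up[i] >> cnt) | (up[i+1] << (32 - cnt));
+dnl    rp[n-1] = up[n-1] >> cnt;
+dnl    return retval;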
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr %o0 +C src_ptr %o1 +C size %o2 +C cnt %o3 + +ASM_START() +PROLOGUE(mpn_rshift) + ld [%o1],%g2 C load first limb + sub %g0,%o3,%o5 C negate shift count + add %o2,-1,%o2 + andcc %o2,4-1,%g4 C number of limbs in first loop + sll %g2,%o5,%g1 C compute function result + be L(0) C if multiple of 4 limbs, skip first loop + st %g1,[%sp+80] + + sub %o2,%g4,%o2 C adjust count for main loop + +L(loop0): + ld [%o1+4],%g3 + add %o0,4,%o0 + add %o1,4,%o1 + addcc %g4,-1,%g4 + srl %g2,%o3,%o4 + sll %g3,%o5,%g1 + mov %g3,%g2 + or %o4,%g1,%o4 + bne L(loop0) + st %o4,[%o0-4] + +L(0): tst %o2 + be L(end) + nop + +L(loop): + ld [%o1+4],%g3 + add %o0,16,%o0 + addcc %o2,-4,%o2 + srl %g2,%o3,%o4 + sll %g3,%o5,%g1 + + ld [%o1+8],%g2 + srl %g3,%o3,%g4 + or %o4,%g1,%o4 + st %o4,[%o0-16] + sll %g2,%o5,%g1 + + ld [%o1+12],%g3 + srl %g2,%o3,%o4 + or %g4,%g1,%g4 + st %g4,[%o0-12] + sll %g3,%o5,%g1 + + ld [%o1+16],%g2 + srl %g3,%o3,%g4 + or %o4,%g1,%o4 + st %o4,[%o0-8] + sll %g2,%o5,%g1 + + add %o1,16,%o1 + or %g4,%g1,%g4 + bne L(loop) + st %g4,[%o0-4] + +L(end): srl %g2,%o3,%g2 + st %g2,[%o0-0] + retl + ld [%sp+80],%o0 +EPILOGUE(mpn_rshift) diff --git a/gmp-6.3.0/mpn/sparc32/sparc-defs.m4 b/gmp-6.3.0/mpn/sparc32/sparc-defs.m4 new file mode 100644 index 0000000..fff0ff8 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/sparc-defs.m4 @@ -0,0 +1,97 @@ +divert(-1) + +dnl m4 macros for SPARC assembler (32 and 64 bit). + + +dnl Copyright 2002, 2011, 2013, 2017, 2021 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +changecom(;) dnl cannot use default # since that's used in REGISTER decls + + +dnl Usage: REGISTER(reg,attr) +dnl +dnl Give a ".register reg,attr" directive, if the assembler supports it. +dnl HAVE_REGISTER comes from the GMP_ASM_SPARC_REGISTER configure test. + +define(REGISTER, +m4_assert_numargs(2) +m4_assert_defined(`HAVE_REGISTER') +`ifelse(HAVE_REGISTER,yes, +`.register `$1',`$2'')') + + +C Testing mechanism for running newer code on older processors +ifdef(`FAKE_T3',` + include_mpn(`sparc64/ultrasparct3/missing.m4') +',` + define(`addxccc', ``addxccc' $1, $2, $3') + define(`addxc', ``addxc' $1, $2, $3') + define(`umulxhi', ``umulxhi' $1, $2, $3') + define(`lzcnt', ``lzd' $1, $2') +') + +dnl Usage: LEA64(symbol,reg,pic_reg) +dnl +dnl Use whatever 64-bit code sequence is appropriate to load "symbol" into +dnl register "reg", potentially using register "pic_reg" to perform the +dnl calculations. 
+dnl +dnl Caveat: We used to use the setx pseudo insn here, but some GNU/Linux +dnl releases causes invalid code or relocs for that. +dnl +dnl Optimisation 1: Use thunk call instead of RDPC which causes pipeline +dnl replay for some sparcs. +dnl +dnl Optimisation 2: Do the two symbol building sequences in parallel instead +dnl of one after the other. That might need one more scratch register. + +define(LEA64, +m4_assert_numargs(3) +`ifdef(`PIC',` + rd %pc, %`$2' + sethi %hi(_GLOBAL_OFFSET_TABLE_+4), %`$3' + add %`$3', %lo(_GLOBAL_OFFSET_TABLE_+8), %`$3' + add %`$2', %`$3', %`$3' +ifelse(HAVE_GDOP,yes,` + sethi %gdop_hix22(`$1'), %`$2' + xor %`$2', %gdop_lox10(`$1'), %`$2' + ldx [%`$3' + %`$2'], %`$2', %gdop(`$1') +',` + sethi %hi(`$1'), %`$2' + or %`$2', %lo(`$1'), %`$2' + ldx [%`$3' + %`$2'], %`$2' +')',` + sethi %h44(`$1'), %`$2' + or %`$2', %m44(`$1'), %`$2' + sllx %`$2', 12, %`$2' + or %`$2', %l44(`$1'), %$2 +')') + +divert diff --git a/gmp-6.3.0/mpn/sparc32/sub_n.asm b/gmp-6.3.0/mpn/sparc32/sub_n.asm new file mode 100644 index 0000000..24a576d --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/sub_n.asm @@ -0,0 +1,335 @@ +dnl SPARC mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +dnl store difference in a third limb vector. + +dnl Copyright 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +define(res_ptr,%o0) +define(s1_ptr,%o1) +define(s2_ptr,%o2) +define(n,%o3) + +ASM_START() +PROLOGUE(mpn_sub_n) + xor s2_ptr,res_ptr,%g1 + andcc %g1,4,%g0 + bne L(1) C branch if alignment differs + nop +C ** V1a ** + andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0 + be L(v1) C if no, branch + nop +C Add least significant limb separately to align res_ptr and s2_ptr + ld [s1_ptr],%g4 + add s1_ptr,4,s1_ptr + ld [s2_ptr],%g2 + add s2_ptr,4,s2_ptr + add n,-1,n + subcc %g4,%g2,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr +L(v1): addx %g0,%g0,%o4 C save cy in register + cmp n,2 C if n < 2 ... + bl L(end2) C ... 
branch to tail code + subcc %g0,%o4,%g0 C restore cy + + ld [s1_ptr+0],%g4 + addcc n,-10,n + ld [s1_ptr+4],%g1 + ldd [s2_ptr+0],%g2 + blt L(fin1) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop1): + subxcc %g4,%g2,%o4 + ld [s1_ptr+8],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+12],%g1 + ldd [s2_ptr+8],%g2 + std %o4,[res_ptr+0] + subxcc %g4,%g2,%o4 + ld [s1_ptr+16],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+20],%g1 + ldd [s2_ptr+16],%g2 + std %o4,[res_ptr+8] + subxcc %g4,%g2,%o4 + ld [s1_ptr+24],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+28],%g1 + ldd [s2_ptr+24],%g2 + std %o4,[res_ptr+16] + subxcc %g4,%g2,%o4 + ld [s1_ptr+32],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+36],%g1 + ldd [s2_ptr+32],%g2 + std %o4,[res_ptr+24] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop1) + subcc %g0,%o4,%g0 C restore cy + +L(fin1): + addcc n,8-2,n + blt L(end1) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 2 limbs until less than 2 limbs remain +L(loope1): + subxcc %g4,%g2,%o4 + ld [s1_ptr+8],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+12],%g1 + ldd [s2_ptr+8],%g2 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope1) + subcc %g0,%o4,%g0 C restore cy +L(end1): + subxcc %g4,%g2,%o4 + subxcc %g1,%g3,%o5 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + + andcc n,1,%g0 + be L(ret1) + subcc %g0,%o4,%g0 C restore cy +C Add last limb + ld [s1_ptr+8],%g4 + ld [s2_ptr+8],%g2 + subxcc %g4,%g2,%o4 + st %o4,[res_ptr+8] + +L(ret1): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb + +L(1): xor s1_ptr,res_ptr,%g1 + andcc %g1,4,%g0 + bne L(2) + nop +C ** V1b ** + andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0 + be L(v1b) C if no, branch + nop +C Add least significant limb separately to align res_ptr and s1_ptr + ld [s2_ptr],%g4 + add s2_ptr,4,s2_ptr + ld [s1_ptr],%g2 + add s1_ptr,4,s1_ptr + add n,-1,n + subcc %g2,%g4,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr +L(v1b): addx %g0,%g0,%o4 C save cy in register + cmp n,2 C if n < 2 ... + bl L(end2) C ... 
branch to tail code + subcc %g0,%o4,%g0 C restore cy + + ld [s2_ptr+0],%g4 + addcc n,-10,n + ld [s2_ptr+4],%g1 + ldd [s1_ptr+0],%g2 + blt L(fin1b) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop1b): + subxcc %g2,%g4,%o4 + ld [s2_ptr+8],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+12],%g1 + ldd [s1_ptr+8],%g2 + std %o4,[res_ptr+0] + subxcc %g2,%g4,%o4 + ld [s2_ptr+16],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+20],%g1 + ldd [s1_ptr+16],%g2 + std %o4,[res_ptr+8] + subxcc %g2,%g4,%o4 + ld [s2_ptr+24],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+28],%g1 + ldd [s1_ptr+24],%g2 + std %o4,[res_ptr+16] + subxcc %g2,%g4,%o4 + ld [s2_ptr+32],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+36],%g1 + ldd [s1_ptr+32],%g2 + std %o4,[res_ptr+24] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop1b) + subcc %g0,%o4,%g0 C restore cy + +L(fin1b): + addcc n,8-2,n + blt L(end1b) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 2 limbs until less than 2 limbs remain +L(loope1b): + subxcc %g2,%g4,%o4 + ld [s2_ptr+8],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+12],%g1 + ldd [s1_ptr+8],%g2 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope1b) + subcc %g0,%o4,%g0 C restore cy +L(end1b): + subxcc %g2,%g4,%o4 + subxcc %g3,%g1,%o5 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + + andcc n,1,%g0 + be L(ret1b) + subcc %g0,%o4,%g0 C restore cy +C Add last limb + ld [s2_ptr+8],%g4 + ld [s1_ptr+8],%g2 + subxcc %g2,%g4,%o4 + st %o4,[res_ptr+8] + +L(ret1b): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb + +C ** V2 ** +C If we come here, the alignment of s1_ptr and res_ptr as well as the +C alignment of s2_ptr and res_ptr differ. Since there are only two ways +C things can be aligned (that we care about) we now know that the alignment +C of s1_ptr and s2_ptr are the same. + +L(2): cmp n,1 + be L(jone) + nop + andcc s1_ptr,4,%g0 C s1_ptr unaligned? 
Side effect: cy=0 + be L(v2) C if no, branch + nop +C Add least significant limb separately to align s1_ptr and s2_ptr + ld [s1_ptr],%g4 + add s1_ptr,4,s1_ptr + ld [s2_ptr],%g2 + add s2_ptr,4,s2_ptr + add n,-1,n + subcc %g4,%g2,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr + +L(v2): addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + blt L(fin2) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop2): + ldd [s1_ptr+0],%g2 + ldd [s2_ptr+0],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+0] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+4] + ldd [s1_ptr+8],%g2 + ldd [s2_ptr+8],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+8] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+12] + ldd [s1_ptr+16],%g2 + ldd [s2_ptr+16],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+16] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+20] + ldd [s1_ptr+24],%g2 + ldd [s2_ptr+24],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+24] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+28] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop2) + subcc %g0,%o4,%g0 C restore cy + +L(fin2): + addcc n,8-2,n + blt L(end2) + subcc %g0,%o4,%g0 C restore cy +L(loope2): + ldd [s1_ptr+0],%g2 + ldd [s2_ptr+0],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+0] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+4] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope2) + subcc %g0,%o4,%g0 C restore cy +L(end2): + andcc n,1,%g0 + be L(ret2) + subcc %g0,%o4,%g0 C restore cy +C Add last limb +L(jone): + ld [s1_ptr],%g4 + ld [s2_ptr],%g2 + subxcc %g4,%g2,%o4 + st %o4,[res_ptr] + +L(ret2): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb +EPILOGUE(mpn_sub_n) diff --git a/gmp-6.3.0/mpn/sparc32/submul_1.asm b/gmp-6.3.0/mpn/sparc32/submul_1.asm new file mode 100644 index 0000000..73f9377 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/submul_1.asm @@ -0,0 +1,155 @@ +dnl SPARC mpn_submul_1 -- Multiply a limb vector with a limb and subtract +dnl the result from a second limb vector. + +dnl Copyright 1992-1994, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_submul_1) + C Make S1_PTR and RES_PTR point at the end of their blocks + C and put (- 4 x SIZE) in index/loop counter. 
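+	C Net effect of the loop below, in C (editorial sketch; the same
+	C small/large split on s2_limb as in mul_1 applies):
+	C
+	C   cy = 0;
+	C   for (i = 0; i < n; i++) {
+	C     uint64_t p = (uint64_t) up[i] * v + cy;
+	C     mp_limb_t lo = (mp_limb_t) p;
+	C     cy = (mp_limb_t) (p >> 32) + (rp[i] < lo);  /* borrow joins cy */
+	C     rp[i] -= lo;
+	C   }
+	C   return cy;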
+ sll %o2,2,%o2 + add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval + add %o1,%o2,%o1 + sub %g0,%o2,%o2 + + cmp %o3,0xfff + bgu L(large) + nop + + ld [%o1+%o2],%o5 + mov 0,%o0 + b L(0) + add %o4,-4,%o4 +L(loop0): + subcc %o5,%g1,%g1 + ld [%o1+%o2],%o5 + addx %o0,%g0,%o0 + st %g1,[%o4+%o2] +L(0): wr %g0,%o3,%y + sra %o5,31,%g2 + and %o3,%g2,%g2 + andcc %g1,0,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,0,%g1 + sra %g1,20,%g4 + sll %g1,12,%g1 + rd %y,%g3 + srl %g3,20,%g3 + or %g1,%g3,%g1 + + addcc %g1,%o0,%g1 + addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb + addcc %o2,4,%o2 C loop counter + bne L(loop0) + ld [%o4+%o2],%o5 + + subcc %o5,%g1,%g1 + addx %o0,%g0,%o0 + retl + st %g1,[%o4+%o2] + +L(large): + ld [%o1+%o2],%o5 + mov 0,%o0 + sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0 + b L(1) + add %o4,-4,%o4 +L(loop): + subcc %o5,%g3,%g3 + ld [%o1+%o2],%o5 + addx %o0,%g0,%o0 + st %g3,[%o4+%o2] +L(1): wr %g0,%o5,%y + and %o5,%g4,%g2 + andcc %g0,%g0,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%g0,%g1 + rd %y,%g3 + addcc %g3,%o0,%g3 + addx %g2,%g1,%o0 + addcc %o2,4,%o2 + bne L(loop) + ld [%o4+%o2],%o5 + + subcc %o5,%g3,%g3 + addx %o0,%g0,%o0 + retl + st %g3,[%o4+%o2] +EPILOGUE(mpn_submul_1) diff --git a/gmp-6.3.0/mpn/sparc32/udiv.asm b/gmp-6.3.0/mpn/sparc32/udiv.asm new file mode 100644 index 0000000..cbc24b1 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/udiv.asm @@ -0,0 +1,147 @@ +dnl SPARC v7 __udiv_qrnnd division support, used from longlong.h. +dnl This is for v7 CPUs with a floating-point unit. + +dnl Copyright 1993, 1994, 1996, 2000, 2021 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
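+dnl  Strategy: bias the inputs by 2^31 (v7 lacks an unsigned-to-double
+dnl  conversion), form a double-precision estimate of the quotient with the
+dnl  FPU, then multiply back with the integer mulscc sequence and nudge the
+dnl  estimate by at most one.  Roughly, in C (editorial sketch):
+dnl
+dnl    q = (uint32_t) ((double) n1n0 / (double) d);   /* FPU estimate */
+dnl    r = n1n0 - (uint64_t) q * d;                   /* exact check  */
+dnl    if ((int64_t) r < 0) { q--; r += d; }          /* one too high */
+dnl    else if (r >= d)     { q++; r -= d; }          /* one too low  */
+dnl    *rem_ptr = r;  return q;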
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr i0 +C n1 i1 +C n0 i2 +C d i3 + +ASM_START() +PROLOGUE(mpn_udiv_qrnnd) + save %sp,-104,%sp + sethi %hi(0x80000000),%g1 + + sethi %hi(0x41e00000),%i4 + mov 0,%i5 + std %i4,[%fp-8] + ldd [%fp-8],%f12 C 0r2147483648 + faddd %f12,%f12,%f8 C 0r4294967296 + + mov %i0,%i5 + + sub %i1,%g1,%l0 + sub %i2,%g1,%l1 + std %l0,[%fp-8] + ldd [%fp-8],%f10 + + fitod %f10,%f4 + faddd %f4,%f12,%f4 + + fitod %f11,%f2 + faddd %f2,%f12,%f2 + + fmuld %f4,%f8,%f6 + + sub %i3,%g1,%l2 + st %l2,[%fp-8] + faddd %f6,%f2,%f2 + ld [%fp-8],%f10 + fitod %f10,%f4 + faddd %f4,%f12,%f4 + + fdivd %f2,%f4,%f2 + fcmped %f2,%f12 + nop + fbge,a L(1) + fsubd %f2,%f12,%f2 + fdtoi %f2,%f2 + st %f2,[%fp-8] + b L(2) + ld [%fp-8],%i4 +L(1): + fdtoi %f2,%f2 + st %f2,[%fp-8] + ld [%fp-8],%i4 + add %i4,%g1,%i4 +L(2): + wr %g0,%i4,%y + sra %i3,31,%g2 + and %i4,%g2,%g2 + andcc %g0,0,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,0,%g1 + add %g1,%g2,%i0 + rd %y,%g3 + subcc %i2,%g3,%o7 + subxcc %i1,%i0,%g0 + be L(3) + cmp %o7,%i3 + + add %i4,-1,%i0 + add %o7,%i3,%o7 + st %o7,[%i5] + ret + restore +L(3): + blu L(4) + mov %i4,%i0 + add %i4,1,%i0 + sub %o7,%i3,%o7 +L(4): + st %o7,[%i5] + ret + restore +EPILOGUE(mpn_udiv_qrnnd) diff --git a/gmp-6.3.0/mpn/sparc32/udiv_nfp.asm b/gmp-6.3.0/mpn/sparc32/udiv_nfp.asm new file mode 100644 index 0000000..ebbb820 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/udiv_nfp.asm @@ -0,0 +1,202 @@ +dnl SPARC v7 __udiv_qrnnd division support, used from longlong.h. +dnl This is for v7 CPUs without a floating-point unit. + +dnl Copyright 1993, 1994, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
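+dnl  Without an FPU, the quotient is developed one bit per step by shift and
+dnl  conditional subtract, unrolled four steps per branch.  The two symmetric
+dnl  label chains L(p*)/L(n*) keep the running remainder in %o1 or %o4, so no
+dnl  restoring step is needed, and the quotient bits accumulate complemented
+dnl  in %o2 (hence the final xnor).  The plain restoring equivalent, in C
+dnl  (editorial sketch of the d < 2^31 path):
+dnl
+dnl    for (i = 0; i < 32; i++) {
+dnl      n1 = (n1 << 1) | (n0 >> 31);          /* bring down one bit */
+dnl      n0 <<= 1;
+dnl      if (n1 >= d) { n1 -= d; n0 |= 1; }    /* record quotient bit */
+dnl    }
+dnl    *rem_ptr = n1;  return n0;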
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr o0 +C n1 o1 +C n0 o2 +C d o3 + +ASM_START() +PROLOGUE(mpn_udiv_qrnnd) + tst %o3 + bneg L(largedivisor) + mov 8,%g1 + + b L(p1) + addxcc %o2,%o2,%o2 + +L(plop): + bcc L(n1) + addxcc %o2,%o2,%o2 +L(p1): addx %o1,%o1,%o1 + subcc %o1,%o3,%o4 + bcc L(n2) + addxcc %o2,%o2,%o2 +L(p2): addx %o1,%o1,%o1 + subcc %o1,%o3,%o4 + bcc L(n3) + addxcc %o2,%o2,%o2 +L(p3): addx %o1,%o1,%o1 + subcc %o1,%o3,%o4 + bcc L(n4) + addxcc %o2,%o2,%o2 +L(p4): addx %o1,%o1,%o1 + addcc %g1,-1,%g1 + bne L(plop) + subcc %o1,%o3,%o4 + bcc L(n5) + addxcc %o2,%o2,%o2 +L(p5): st %o1,[%o0] + retl + xnor %g0,%o2,%o0 + +L(nlop): + bcc L(p1) + addxcc %o2,%o2,%o2 +L(n1): addx %o4,%o4,%o4 + subcc %o4,%o3,%o1 + bcc L(p2) + addxcc %o2,%o2,%o2 +L(n2): addx %o4,%o4,%o4 + subcc %o4,%o3,%o1 + bcc L(p3) + addxcc %o2,%o2,%o2 +L(n3): addx %o4,%o4,%o4 + subcc %o4,%o3,%o1 + bcc L(p4) + addxcc %o2,%o2,%o2 +L(n4): addx %o4,%o4,%o4 + addcc %g1,-1,%g1 + bne L(nlop) + subcc %o4,%o3,%o1 + bcc L(p5) + addxcc %o2,%o2,%o2 +L(n5): st %o4,[%o0] + retl + xnor %g0,%o2,%o0 + +L(largedivisor): + and %o2,1,%o5 C %o5 = n0 & 1 + + srl %o2,1,%o2 + sll %o1,31,%g2 + or %g2,%o2,%o2 C %o2 = lo(n1n0 >> 1) + srl %o1,1,%o1 C %o1 = hi(n1n0 >> 1) + + and %o3,1,%g2 + srl %o3,1,%g3 C %g3 = floor(d / 2) + add %g3,%g2,%g3 C %g3 = ceil(d / 2) + + b L(Lp1) + addxcc %o2,%o2,%o2 + +L(Lplop): + bcc L(Ln1) + addxcc %o2,%o2,%o2 +L(Lp1): addx %o1,%o1,%o1 + subcc %o1,%g3,%o4 + bcc L(Ln2) + addxcc %o2,%o2,%o2 +L(Lp2): addx %o1,%o1,%o1 + subcc %o1,%g3,%o4 + bcc L(Ln3) + addxcc %o2,%o2,%o2 +L(Lp3): addx %o1,%o1,%o1 + subcc %o1,%g3,%o4 + bcc L(Ln4) + addxcc %o2,%o2,%o2 +L(Lp4): addx %o1,%o1,%o1 + addcc %g1,-1,%g1 + bne L(Lplop) + subcc %o1,%g3,%o4 + bcc L(Ln5) + addxcc %o2,%o2,%o2 +L(Lp5): add %o1,%o1,%o1 C << 1 + tst %g2 + bne L(oddp) + add %o5,%o1,%o1 + st %o1,[%o0] + retl + xnor %g0,%o2,%o0 + +L(Lnlop): + bcc L(Lp1) + addxcc %o2,%o2,%o2 +L(Ln1): addx %o4,%o4,%o4 + subcc %o4,%g3,%o1 + bcc L(Lp2) + addxcc %o2,%o2,%o2 +L(Ln2): addx %o4,%o4,%o4 + subcc %o4,%g3,%o1 + bcc L(Lp3) + addxcc %o2,%o2,%o2 +L(Ln3): addx %o4,%o4,%o4 + subcc %o4,%g3,%o1 + bcc L(Lp4) + addxcc %o2,%o2,%o2 +L(Ln4): addx %o4,%o4,%o4 + addcc %g1,-1,%g1 + bne L(Lnlop) + subcc %o4,%g3,%o1 + bcc L(Lp5) + addxcc %o2,%o2,%o2 +L(Ln5): add %o4,%o4,%o4 C << 1 + tst %g2 + bne L(oddn) + add %o5,%o4,%o4 + st %o4,[%o0] + retl + xnor %g0,%o2,%o0 + +L(oddp): + xnor %g0,%o2,%o2 + C q' in %o2. r' in %o1 + addcc %o1,%o2,%o1 + bcc L(Lp6) + addx %o2,0,%o2 + sub %o1,%o3,%o1 +L(Lp6): subcc %o1,%o3,%g0 + bcs L(Lp7) + subx %o2,-1,%o2 + sub %o1,%o3,%o1 +L(Lp7): st %o1,[%o0] + retl + mov %o2,%o0 + +L(oddn): + xnor %g0,%o2,%o2 + C q' in %o2. r' in %o4 + addcc %o4,%o2,%o4 + bcc L(Ln6) + addx %o2,0,%o2 + sub %o4,%o3,%o4 +L(Ln6): subcc %o4,%o3,%g0 + bcs L(Ln7) + subx %o2,-1,%o2 + sub %o4,%o3,%o4 +L(Ln7): st %o4,[%o0] + retl + mov %o2,%o0 +EPILOGUE(mpn_udiv_qrnnd) diff --git a/gmp-6.3.0/mpn/sparc32/ultrasparct1/add_n.asm b/gmp-6.3.0/mpn/sparc32/ultrasparct1/add_n.asm new file mode 100644 index 0000000..c781596 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/ultrasparct1/add_n.asm @@ -0,0 +1,70 @@ +dnl SPARC T1 32-bit mpn_add_n. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', %o0) +define(`ap', %o1) +define(`bp', %o2) +define(`n', %o3) +define(`cy', %o4) + +define(`i', %o3) + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc) + +ASM_START() +PROLOGUE(mpn_add_nc) + b L(ent) + srl cy, 0, cy C strip any bogus high bits +EPILOGUE() + +PROLOGUE(mpn_add_n) + mov 0, cy +L(ent): srl n, 0, n C strip any bogus high bits + sll n, 2, n + add ap, n, ap + add bp, n, bp + add rp, n, rp + neg n, i + +L(top): lduw [ap+i], %g1 + lduw [bp+i], %g2 + add %g1, %g2, %g3 + add %g3, cy, %g3 + stw %g3, [rp+i] + add i, 4, i + brnz i, L(top) + srlx %g3, 32, cy + + retl + mov cy, %o0 C return value +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc32/ultrasparct1/addmul_1.asm b/gmp-6.3.0/mpn/sparc32/ultrasparct1/addmul_1.asm new file mode 100644 index 0000000..89da186 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/ultrasparct1/addmul_1.asm @@ -0,0 +1,90 @@ +dnl SPARC T1 32-bit mpn_addmul_1. + +dnl Contributed to the GNU project by David Miller. + +dnl Copyright 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
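+dnl  On the T-series a 64-bit mulx gives the full 32x32 product in one
+dnl  instruction, so the loop below handles two limbs per pass and the carry
+dnl  moves as a plain shift.  Per limb (editorial sketch):
+dnl
+dnl    uint64_t acc = (uint64_t) up[i] * v0 + rp[i] + cy;
+dnl    rp[i] = (uint32_t) acc;                 /* stw of the low half */
+dnl    cy    = acc >> 32;                      /* srlx 32 carries on  */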
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC T1: 24 +C UltraSPARC T2: 19 +C UltraSPARC T3: 19 +C UltraSPARC T4: 5 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`v0', `%i3') + +ASM_START() +PROLOGUE(mpn_addmul_1) + save %sp, -96, %sp + srl n, 0, %o4 + srl v0, 0, %g1 + subcc %o4, 1, %o4 + be L(final_one) + clr %o5 + +L(top): lduw [up+0], %l0 + lduw [rp+0], %l2 + lduw [up+4], %l1 + lduw [rp+4], %l3 + mulx %l0, %g1, %g3 + add up, 8, up + mulx %l1, %g1, %o3 + sub %o4, 2, %o4 + add rp, 8, rp + add %l2, %g3, %g3 + add %o5, %g3, %g3 + stw %g3, [rp-8] + srlx %g3, 32, %o5 + add %l3, %o3, %o3 + add %o5, %o3, %o3 + stw %o3, [rp-4] + brgz %o4, L(top) + srlx %o3, 32, %o5 + + brlz,pt %o4, L(done) + nop + +L(final_one): + lduw [up+0], %l0 + lduw [rp+0], %l2 + mulx %l0, %g1, %g3 + add %l2, %g3, %g3 + add %o5, %g3, %g3 + stw %g3, [rp+0] + srlx %g3, 32, %o5 + +L(done): + ret + restore %o5, 0, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc32/ultrasparct1/gmp-mparam.h b/gmp-6.3.0/mpn/sparc32/ultrasparct1/gmp-mparam.h new file mode 100644 index 0000000..6f9d5a4 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/ultrasparct1/gmp-mparam.h @@ -0,0 +1,153 @@ +/* UltraSPARC T 32-bit gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 3 +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 9 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 10 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 21 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 22 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 35 + +#define MUL_TOOM22_THRESHOLD 14 +#define MUL_TOOM33_THRESHOLD 98 +#define MUL_TOOM44_THRESHOLD 166 +#define MUL_TOOM6H_THRESHOLD 226 +#define MUL_TOOM8H_THRESHOLD 333 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 139 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 98 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 120 + +#define SQR_BASECASE_THRESHOLD 6 +#define SQR_TOOM2_THRESHOLD 34 +#define SQR_TOOM3_THRESHOLD 110 +#define SQR_TOOM4_THRESHOLD 178 +#define SQR_TOOM6_THRESHOLD 240 +#define SQR_TOOM8_THRESHOLD 333 + +#define MULMID_TOOM42_THRESHOLD 22 + +#define MULMOD_BNM1_THRESHOLD 9 +#define SQRMOD_BNM1_THRESHOLD 13 + +#define MUL_FFT_MODF_THRESHOLD 280 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 280, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ + { 9, 5}, { 19, 6}, { 13, 7}, { 7, 6}, \ + { 17, 7}, { 9, 6}, { 20, 7}, { 11, 6}, \ + { 23, 7}, { 13, 8}, { 7, 7}, { 21, 8}, \ + { 11, 7}, { 25, 9}, { 7, 8}, { 15, 7}, \ + { 33, 8}, { 19, 7}, { 41, 8}, { 23, 7}, \ + { 49, 8}, { 27, 9}, { 15, 8}, { 31, 7}, \ + { 63, 8}, { 39, 9}, { 23, 8}, { 47,10}, \ + { 15, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47,10}, { 31, 9}, { 79,10}, \ + { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255,10}, { 79, 9}, { 159, 8}, { 319,10}, \ + { 95, 9}, { 191, 8}, { 383,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \ + { 159, 9}, { 319,10}, { 175,11}, { 95,10}, \ + { 191, 9}, { 383,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 66 +#define MUL_FFT_THRESHOLD 3712 + +#define SQR_FFT_MODF_THRESHOLD 240 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 240, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ + { 13, 7}, { 7, 6}, { 17, 7}, { 9, 6}, \ + { 20, 7}, { 11, 6}, { 23, 7}, { 13, 8}, \ + { 7, 7}, { 19, 8}, { 11, 7}, { 25, 9}, \ + { 7, 8}, { 15, 7}, { 33, 8}, { 19, 7}, \ + { 39, 8}, { 23, 7}, { 47, 8}, { 27, 9}, \ + { 15, 8}, { 39, 9}, { 23, 8}, { 47,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 39, 8}, \ + { 79, 9}, { 47,10}, { 31, 9}, { 63, 8}, \ + { 127, 9}, { 71, 8}, { 143, 9}, { 79,10}, \ + { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 143,10}, { 79, 9}, { 159, 8}, \ + { 319, 9}, { 175,10}, { 95, 9}, { 191, 8}, \ + { 383, 9}, { 207,11}, { 63,10}, { 127, 9}, \ + { 255,10}, { 143, 9}, { 287,10}, { 159, 9}, \ + { 319,10}, { 175,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 70 +#define SQR_FFT_THRESHOLD 2624 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 51 +#define MULLO_MUL_N_THRESHOLD 6633 + +#define DC_DIV_QR_THRESHOLD 51 +#define DC_DIVAPPR_Q_THRESHOLD 202 +#define DC_BDIV_QR_THRESHOLD 47 +#define DC_BDIV_Q_THRESHOLD 124 + +#define INV_MULMOD_BNM1_THRESHOLD 26 +#define INV_NEWTON_THRESHOLD 266 +#define INV_APPR_THRESHOLD 222 + +#define BINV_NEWTON_THRESHOLD 296 +#define 
REDC_1_TO_REDC_N_THRESHOLD 59 + +#define MU_DIV_QR_THRESHOLD 1334 +#define MU_DIVAPPR_Q_THRESHOLD 1499 +#define MUPI_DIV_QR_THRESHOLD 116 +#define MU_BDIV_QR_THRESHOLD 1057 +#define MU_BDIV_Q_THRESHOLD 1334 + +#define POWM_SEC_TABLE 6,35,213,724,2618 + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD_THRESHOLD 84 +#define HGCD_APPR_THRESHOLD 101 +#define HGCD_REDUCE_THRESHOLD 1437 +#define GCD_DC_THRESHOLD 372 +#define GCDEXT_DC_THRESHOLD 253 +#define JACOBI_BASE_METHOD 2 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 27 +#define SET_STR_DC_THRESHOLD 399 +#define SET_STR_PRECOMPUTE_THRESHOLD 885 + +#define FAC_DSC_THRESHOLD 179 +#define FAC_ODD_THRESHOLD 29 diff --git a/gmp-6.3.0/mpn/sparc32/ultrasparct1/mul_1.asm b/gmp-6.3.0/mpn/sparc32/ultrasparct1/mul_1.asm new file mode 100644 index 0000000..0239cd2 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/ultrasparct1/mul_1.asm @@ -0,0 +1,83 @@ +dnl SPARC T1 32-bit mpn_mul_1. + +dnl Contributed to the GNU project by David Miller. + +dnl Copyright 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T1: 20 +C UltraSPARC T2: 18 +C UltraSPARC T3: 18 +C UltraSPARC T4: 4 + +C INPUT PARAMETERS +define(`rp', `%o0') +define(`up', `%o1') +define(`n', `%o2') +define(`v0', `%o3') + +ASM_START() +PROLOGUE(mpn_mul_1) + srl n, 0, n + srl v0, 0, v0 + subcc n, 1, n + be L(final_one) + clr %o5 + +L(top): lduw [up+0], %g1 + lduw [up+4], %g2 + mulx %g1, v0, %g3 + add up, 8, up + mulx %g2, v0, %o4 + sub n, 2, n + add rp, 8, rp + add %o5, %g3, %g3 + stw %g3, [rp-8] + srlx %g3, 32, %o5 + add %o5, %o4, %o4 + stw %o4, [rp-4] + brgz n, L(top) + srlx %o4, 32, %o5 + + brlz,pt n, L(done) + nop + +L(final_one): + lduw [up+0], %g1 + mulx %g1, v0, %g3 + add %o5, %g3, %g3 + stw %g3, [rp+0] + srlx %g3, 32, %o5 + +L(done): + retl + mov %o5, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc32/ultrasparct1/sqr_diagonal.asm b/gmp-6.3.0/mpn/sparc32/ultrasparct1/sqr_diagonal.asm new file mode 100644 index 0000000..3b906ef --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/ultrasparct1/sqr_diagonal.asm @@ -0,0 +1,55 @@ +dnl SPARC T1 32-bit mpn_sqr_diagonal. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', `%o0') +define(`up', `%o1') +define(`n', `%o2') + +ASM_START() +PROLOGUE(mpn_sqr_diagonal) + deccc n C n-- + nop + +L(top): lduw [up+0], %g1 + add up, 4, up C up++ + mulx %g1, %g1, %g3 + stw %g3, [rp+0] + srlx %g3, 32, %g4 + stw %g4, [rp+4] + add rp, 8, rp C rp += 2 + bnz %icc, L(top) + deccc n C n-- + + retl + nop +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc32/ultrasparct1/sub_n.asm b/gmp-6.3.0/mpn/sparc32/ultrasparct1/sub_n.asm new file mode 100644 index 0000000..946bc3f --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/ultrasparct1/sub_n.asm @@ -0,0 +1,70 @@ +dnl SPARC T1 32-bit mpn_sub_n. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
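The mpn_sub_n below recovers the borrow with `srlx %g3, 63, cy`: lduw zero-extends both limbs, so the 64-bit difference goes negative exactly when a borrow occurs, and the sign bit is the borrow. A minimal C sketch of that recurrence (illustrative, not GMP code):

#include <stdint.h>

uint32_t sub_n_model (uint32_t *rp, const uint32_t *ap,
                      const uint32_t *bp, long n)
{
  uint64_t bw = 0;                  /* borrow, always 0 or 1 */
  for (long i = 0; i < n; i++)
    {
      uint64_t d = (uint64_t) ap[i] - bp[i] - bw;
      rp[i] = (uint32_t) d;         /* low 32 bits of the difference */
      bw = d >> 63;                 /* sign bit doubles as borrow out */
    }
  return (uint32_t) bw;
}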
+ +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', %o0) +define(`ap', %o1) +define(`bp', %o2) +define(`n', %o3) +define(`cy', %o4) + +define(`i', %o3) + +MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc) + +ASM_START() +PROLOGUE(mpn_sub_nc) + b L(ent) + srl cy, 0, cy C strip any bogus high bits +EPILOGUE() + +PROLOGUE(mpn_sub_n) + mov 0, cy +L(ent): srl n, 0, n C strip any bogus high bits + sll n, 2, n + add ap, n, ap + add bp, n, bp + add rp, n, rp + neg n, i + +L(top): lduw [ap+i], %g1 + lduw [bp+i], %g2 + sub %g1, %g2, %g3 + sub %g3, cy, %g3 + stw %g3, [rp+i] + add i, 4, i + brnz i, L(top) + srlx %g3, 63, cy + + retl + mov cy, %o0 C return value +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc32/ultrasparct1/submul_1.asm b/gmp-6.3.0/mpn/sparc32/ultrasparct1/submul_1.asm new file mode 100644 index 0000000..8920070 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/ultrasparct1/submul_1.asm @@ -0,0 +1,91 @@ +dnl SPARC T1 32-bit mpn_submul_1. + +dnl Contributed to the GNU project by David Miller. + +dnl Copyright 2010, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T1: 24 +C UltraSPARC T2: 19 +C UltraSPARC T3: 19 +C UltraSPARC T4: 5 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`v0', `%i3') + +ASM_START() +PROLOGUE(mpn_submul_1) + save %sp, -96, %sp + srl n, 0, %o4 + srl v0, 0, %g1 + subcc %o4, 1, %o4 + be L(final_one) + subcc %g0, 0, %o5 + +L(top): lduw [up+0], %l0 + lduw [rp+0], %l2 + lduw [up+4], %l1 + lduw [rp+4], %l3 + mulx %l0, %g1, %g3 + add up, 8, up + mulx %l1, %g1, %o3 + sub %o4, 2, %o4 + add rp, 8, rp + addx %o5, %g3, %g3 + srlx %g3, 32, %o5 + subcc %l2, %g3, %g3 + stw %g3, [rp-8] + addx %o5, %o3, %o3 + srlx %o3, 32, %o5 + subcc %l3, %o3, %o3 + brgz %o4, L(top) + stw %o3, [rp-4] + + brlz,pt %o4, L(done) + nop + +L(final_one): + lduw [up+0], %l0 + lduw [rp+0], %l2 + mulx %l0, %g1, %g3 + addx %o5, %g3, %g3 + srlx %g3, 32, %o5 + subcc %l2, %g3, %g3 + stw %g3, [rp+0] + +L(done): + addx %o5, 0, %o5 + ret + restore %o5, 0, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc32/umul.asm b/gmp-6.3.0/mpn/sparc32/umul.asm new file mode 100644 index 0000000..3a20b95 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/umul.asm @@ -0,0 +1,77 @@ +dnl SPARC mpn_umul_ppmm -- support for longlong.h for non-gcc. + +dnl Copyright 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_umul_ppmm) + wr %g0,%o1,%y + sra %o2,31,%g2 C Don't move this insn + and %o1,%g2,%g2 C Don't move this insn + andcc %g0,0,%g1 C Don't move this insn + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,0,%g1 + rd %y,%g3 + st %g3,[%o0] + retl + add %g1,%g2,%o0 +EPILOGUE(mpn_umul_ppmm) diff --git a/gmp-6.3.0/mpn/sparc32/v8/addmul_1.asm b/gmp-6.3.0/mpn/sparc32/v8/addmul_1.asm new file mode 100644 index 0000000..0bf1b24 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v8/addmul_1.asm @@ -0,0 +1,109 @@ +dnl SPARC v8 mpn_addmul_1 -- Multiply a limb vector with a limb and +dnl add the result to a second limb vector. + +dnl Copyright 1992-1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
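Both the mulscc-based mpn_umul_ppmm just above and the v8 addmul_1 below rest on the same primitive: a 32x32->64 multiply whose two halves are split across a general register and the %y register. A hedged C sketch of that contract, store the low word, return the high word (the signature is illustrative; the file comment above says this backs longlong.h):

#include <stdint.h>

uint32_t umul_ppmm_model (uint32_t *lowptr, uint32_t u, uint32_t v)
{
  uint64_t p = (uint64_t) u * v;   /* full 32x32 -> 64 product */
  *lowptr = (uint32_t) p;          /* low half, stored via [%o0] */
  return (uint32_t) (p >> 32);     /* high half, the return value */
}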
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_addmul_1) + ld [%o1+0],%o4 + andcc %o2,1,%g0 + be L(bx0) + andcc %o2,2,%g0 +L(bx1): be L(01) + orcc %g0,%g0,%g2 +L(b11): add %o0,-8,%o0 + b L(11) + add %o1,-8,%o1 +L(bx0): be L(b00) + orcc %g0,%g0,%g2 +L(b10): add %o0,-12,%o0 + b L(10) + add %o1,4,%o1 +L(b00): add %o0,-4,%o0 + b L(00) + add %o1,-4,%o1 + +L(top): addcc %g3,%g2,%g3 C 1 + ld [%o1+4],%o4 C 2 + rd %y,%g2 C 1 + addx %g0,%g2,%g2 + ld [%o0+0],%g1 C 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+0] C 1 +L(00): umul %o4,%o3,%g3 C 2 + ld [%o0+4],%g1 C 2 + addxcc %g3,%g2,%g3 C 2 + ld [%o1+8],%o4 C 3 + rd %y,%g2 C 2 + addx %g0,%g2,%g2 + nop + addcc %g1,%g3,%g3 + st %g3,[%o0+4] C 2 +L(11): umul %o4,%o3,%g3 C 3 + addxcc %g3,%g2,%g3 C 3 + ld [%o1+12],%o4 C 4 + rd %y,%g2 C 3 + add %o1,16,%o1 + addx %g0,%g2,%g2 + ld [%o0+8],%g1 C 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+8] C 3 +L(10): umul %o4,%o3,%g3 C 4 + addxcc %g3,%g2,%g3 C 4 + ld [%o1+0],%o4 C 1 + rd %y,%g2 C 4 + addx %g0,%g2,%g2 + ld [%o0+12],%g1 C 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+12] C 4 + add %o0,16,%o0 + addx %g0,%g2,%g2 +L(01): addcc %o2,-4,%o2 + bg L(top) + umul %o4,%o3,%g3 C 1 + + addcc %g3,%g2,%g3 C 4 + rd %y,%g2 C 4 + addx %g0,%g2,%g2 + ld [%o0+0],%g1 C 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+0] C 4 + + retl + addx %g0,%g2,%o0 +EPILOGUE(mpn_addmul_1) diff --git a/gmp-6.3.0/mpn/sparc32/v8/gmp-mparam.h b/gmp-6.3.0/mpn/sparc32/v8/gmp-mparam.h new file mode 100644 index 0000000..e57897b --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v8/gmp-mparam.h @@ -0,0 +1,73 @@ +/* SPARC v8 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* Generated by tuneup.c, 2004-02-07, gcc 2.95 */ + +#define MUL_TOOM22_THRESHOLD 10 +#define MUL_TOOM33_THRESHOLD 65 + +#define SQR_BASECASE_THRESHOLD 4 +#define SQR_TOOM2_THRESHOLD 18 +#define SQR_TOOM3_THRESHOLD 65 + +#define DIV_SB_PREINV_THRESHOLD 5 +#define DIV_DC_THRESHOLD 24 +#define POWM_THRESHOLD 38 + +#define HGCD_THRESHOLD 69 +#define GCD_ACCEL_THRESHOLD 3 +#define GCD_DC_THRESHOLD 498 +#define JACOBI_BASE_METHOD 2 + +#define DIVREM_1_NORM_THRESHOLD 6 +#define DIVREM_1_UNNORM_THRESHOLD 11 +#define MOD_1_NORM_THRESHOLD 5 +#define MOD_1_UNNORM_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 +#define USE_PREINV_MOD_1 1 +#define DIVREM_2_THRESHOLD 0 /* always */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define MODEXACT_1_ODD_THRESHOLD 4 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 23 +#define SET_STR_THRESHOLD 1679 + +#define MUL_FFT_TABLE { 272, 672, 1152, 2560, 10240, 24576, 0 } +#define MUL_FFT_MODF_THRESHOLD 264 +#define MUL_FFT_THRESHOLD 1792 + +#define SQR_FFT_TABLE { 304, 672, 1152, 3584, 10240, 24576, 0 } +#define SQR_FFT_MODF_THRESHOLD 264 +#define SQR_FFT_THRESHOLD 1728 diff --git a/gmp-6.3.0/mpn/sparc32/v8/mul_1.asm b/gmp-6.3.0/mpn/sparc32/v8/mul_1.asm new file mode 100644 index 0000000..d03a0e6 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v8/mul_1.asm @@ -0,0 +1,93 @@ +dnl SPARC v8 mpn_mul_1 -- Multiply a limb vector with a single limb and +dnl store the product in a second limb vector. + +dnl Copyright 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
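Like addmul_1 earlier, the mpn_mul_1 below is unrolled four-way and entered mid-loop through the L(bx0)/L(bx1) branch chain according to n mod 4, after pre-biasing the pointers. A structured C analogue of that dispatch in the Duff's-device style (a sketch under the same 32-bit-limb assumptions; requires n >= 1, as GMP's mpn entry points do):

#include <stdint.h>

uint32_t mul_1_dispatch_sketch (uint32_t *rp, const uint32_t *up,
                                long n, uint32_t v)
{
  uint64_t cy = 0;            /* running carry, always < 2^32 */
  switch (n & 3)              /* plays the role of the andcc/be chain */
    {
      do {
    case 0: cy += (uint64_t) *up++ * v; *rp++ = (uint32_t) cy; cy >>= 32;
    case 3: cy += (uint64_t) *up++ * v; *rp++ = (uint32_t) cy; cy >>= 32;
    case 2: cy += (uint64_t) *up++ * v; *rp++ = (uint32_t) cy; cy >>= 32;
    case 1: cy += (uint64_t) *up++ * v; *rp++ = (uint32_t) cy; cy >>= 32;
      } while ((n -= 4) > 0);
    }
  return (uint32_t) cy;       /* carry out of the last limb */
}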
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_mul_1) + ld [%o1+0],%o4 + andcc %o2,1,%g0 + be L(bx0) + andcc %o2,2,%g0 +L(bx1): be L(01) + orcc %g0,%g0,%g2 +L(b11): add %o0,-8,%o0 + b L(11) + add %o1,-8,%o1 +L(bx0): be L(b00) + orcc %g0,%g0,%g2 +L(b10): add %o0,-12,%o0 + b L(10) + add %o1,4,%o1 +L(b00): add %o0,-4,%o0 + b L(00) + add %o1,-4,%o1 + +L(top): addcc %g3,%g2,%g3 C 1 + ld [%o1+4],%o4 C 2 + st %g3,[%o0+0] C 1 + rd %y,%g2 C 1 +L(00): umul %o4,%o3,%g3 C 2 + addxcc %g3,%g2,%g3 C 2 + ld [%o1+8],%o4 C 3 + st %g3,[%o0+4] C 2 + rd %y,%g2 C 2 +L(11): umul %o4,%o3,%g3 C 3 + addxcc %g3,%g2,%g3 C 3 + ld [%o1+12],%o4 C 4 + add %o1,16,%o1 + st %g3,[%o0+8] C 3 + rd %y,%g2 C 3 +L(10): umul %o4,%o3,%g3 C 4 + addxcc %g3,%g2,%g3 C 4 + ld [%o1+0],%o4 C 1 + st %g3,[%o0+12] C 4 + add %o0,16,%o0 + rd %y,%g2 C 4 + addx %g0,%g2,%g2 +L(01): addcc %o2,-4,%o2 + bg L(top) + umul %o4,%o3,%g3 C 1 + + addcc %g3,%g2,%g3 C 4 + st %g3,[%o0+0] C 4 + rd %y,%g2 C 4 + + retl + addx %g0,%g2,%o0 +EPILOGUE(mpn_mul_1) diff --git a/gmp-6.3.0/mpn/sparc32/v8/submul_1.asm b/gmp-6.3.0/mpn/sparc32/v8/submul_1.asm new file mode 100644 index 0000000..187314e --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v8/submul_1.asm @@ -0,0 +1,67 @@ +dnl SPARC v8 mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright 1992-1994, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_submul_1) + sub %g0,%o2,%o2 C negate ... + sll %o2,2,%o2 C ... and scale size + sub %o1,%o2,%o1 C o1 is offset s1_ptr + sub %o0,%o2,%g1 C g1 is offset res_ptr + + mov 0,%o0 C clear cy_limb + +L(loop): + ld [%o1+%o2],%o4 + ld [%g1+%o2],%g2 + umul %o4,%o3,%o5 + rd %y,%g3 + addcc %o5,%o0,%o5 + addx %g3,0,%o0 + subcc %g2,%o5,%g2 + addx %o0,0,%o0 + st %g2,[%g1+%o2] + + addcc %o2,4,%o2 + bne L(loop) + nop + + retl + nop +EPILOGUE(mpn_submul_1) diff --git a/gmp-6.3.0/mpn/sparc32/v8/supersparc/gmp-mparam.h b/gmp-6.3.0/mpn/sparc32/v8/supersparc/gmp-mparam.h new file mode 100644 index 0000000..1ac9239 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v8/supersparc/gmp-mparam.h @@ -0,0 +1,73 @@ +/* SuperSPARC gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* Generated by tuneup.c, 2004-02-10, gcc 3.3 */ + +#define MUL_TOOM22_THRESHOLD 14 +#define MUL_TOOM33_THRESHOLD 81 + +#define SQR_BASECASE_THRESHOLD 5 +#define SQR_TOOM2_THRESHOLD 28 +#define SQR_TOOM3_THRESHOLD 86 + +#define DIV_SB_PREINV_THRESHOLD 0 /* always */ +#define DIV_DC_THRESHOLD 26 +#define POWM_THRESHOLD 79 + +#define HGCD_THRESHOLD 97 +#define GCD_ACCEL_THRESHOLD 3 +#define GCD_DC_THRESHOLD 470 +#define JACOBI_BASE_METHOD 2 + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 3 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 3 +#define USE_PREINV_DIVREM_1 1 +#define USE_PREINV_MOD_1 1 +#define DIVREM_2_THRESHOLD 0 /* always */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define MODEXACT_1_ODD_THRESHOLD 0 /* always */ + +#define GET_STR_DC_THRESHOLD 19 +#define GET_STR_PRECOMPUTE_THRESHOLD 34 +#define SET_STR_THRESHOLD 3524 + +#define MUL_FFT_TABLE { 304, 800, 1408, 3584, 10240, 24576, 0 } +#define MUL_FFT_MODF_THRESHOLD 264 +#define MUL_FFT_THRESHOLD 2304 + +#define SQR_FFT_TABLE { 336, 800, 1408, 3584, 10240, 24576, 0 } +#define SQR_FFT_MODF_THRESHOLD 280 +#define SQR_FFT_THRESHOLD 2304 diff --git a/gmp-6.3.0/mpn/sparc32/v8/supersparc/udiv.asm b/gmp-6.3.0/mpn/sparc32/v8/supersparc/udiv.asm new file mode 100644 index 0000000..12f66ce --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v8/supersparc/udiv.asm @@ -0,0 +1,131 @@ +dnl SuperSPARC mpn_udiv_qrnnd division support, used from longlong.h. +dnl This is for SuperSPARC only, to compensate for its semi-functional +dnl udiv instruction. + +dnl Copyright 1993, 1994, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr i0 +C n1 i1 +C n0 i2 +C d i3 + +ASM_START() + +ifdef(`PIC', +` TEXT +L(getpc): + retl + nop') + + TEXT + ALIGN(8) +L(C0): .double 0r4294967296 +L(C1): .double 0r2147483648 + +PROLOGUE(mpn_udiv_qrnnd) + save %sp,-104,%sp + st %i1,[%fp-8] + ld [%fp-8],%f10 + +ifdef(`PIC', +`L(pc): call L(getpc) C put address of this insn in %o7 + ldd [%o7+L(C0)-L(pc)],%f8', +` sethi %hi(L(C0)),%o7 + ldd [%o7+%lo(L(C0))],%f8') + + fitod %f10,%f4 + cmp %i1,0 + bge L(248) + mov %i0,%i5 + faddd %f4,%f8,%f4 +L(248): + st %i2,[%fp-8] + ld [%fp-8],%f10 + fmuld %f4,%f8,%f6 + cmp %i2,0 + bge L(249) + fitod %f10,%f2 + faddd %f2,%f8,%f2 +L(249): + st %i3,[%fp-8] + faddd %f6,%f2,%f2 + ld [%fp-8],%f10 + cmp %i3,0 + bge L(250) + fitod %f10,%f4 + faddd %f4,%f8,%f4 +L(250): + fdivd %f2,%f4,%f2 + +ifdef(`PIC', +` ldd [%o7+L(C1)-L(pc)],%f4', +` sethi %hi(L(C1)),%o7 + ldd [%o7+%lo(L(C1))],%f4') + + fcmped %f2,%f4 + nop + fbge,a L(251) + fsubd %f2,%f4,%f2 + fdtoi %f2,%f2 + st %f2,[%fp-8] + b L(252) + ld [%fp-8],%i4 +L(251): + fdtoi %f2,%f2 + st %f2,[%fp-8] + ld [%fp-8],%i4 + sethi %hi(-2147483648),%g2 + xor %i4,%g2,%i4 +L(252): + umul %i3,%i4,%g3 + rd %y,%i0 + subcc %i2,%g3,%o7 + subxcc %i1,%i0,%g0 + be L(253) + cmp %o7,%i3 + + add %i4,-1,%i0 + add %o7,%i3,%o7 + st %o7,[%i5] + ret + restore +L(253): + blu L(246) + mov %i4,%i0 + add %i4,1,%i0 + sub %o7,%i3,%o7 +L(246): + st %o7,[%i5] + ret + restore +EPILOGUE(mpn_udiv_qrnnd) diff --git a/gmp-6.3.0/mpn/sparc32/v8/udiv.asm b/gmp-6.3.0/mpn/sparc32/v8/udiv.asm new file mode 100644 index 0000000..12f66ce --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v8/udiv.asm @@ -0,0 +1,131 @@ +dnl SuperSPARC mpn_udiv_qrnnd division support, used from longlong.h. +dnl This is for SuperSPARC only, to compensate for its semi-functional +dnl udiv instruction. + +dnl Copyright 1993, 1994, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
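The routine below (a byte-identical copy of the SuperSPARC file above, as the matching index hash 12f66ce shows) estimates the quotient in double precision and then corrects it by at most one step. A hedged C model of the idea only; the real code additionally biases by 2^31 around fdtoi, which is a signed conversion:

#include <stdint.h>

/* Sketch; assumes n1 < d so the quotient fits in 32 bits, and assumes
   the FP estimate lands within 1 of the true quotient. */
uint32_t udiv_qrnnd_model (uint32_t *remp, uint32_t n1, uint32_t n0,
                           uint32_t d)
{
  double num = (double) n1 * 4294967296.0 + (double) n0;
  uint64_t q = (uint64_t) (num / (double) d);  /* FP quotient estimate */
  if (q > 0xffffffffu)
    q = 0xffffffffu;                           /* clamp rounding overshoot */

  uint64_t n = ((uint64_t) n1 << 32) | n0;
  uint64_t prod = q * d;                       /* exact, like umul + rd %y */

  if (prod > n)                                /* one too large: step down */
    { q--; prod -= d; }
  else if (n - prod >= d)                      /* one too small: step up */
    { q++; prod += d; }

  *remp = (uint32_t) (n - prod);               /* remainder via rem_ptr */
  return (uint32_t) q;
}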
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr	i0
+C n1		i1
+C n0		i2
+C d		i3
+
+ASM_START()
+
+ifdef(`PIC',
+`	TEXT
+L(getpc):
+	retl
+	nop')
+
+	TEXT
+	ALIGN(8)
+L(C0):	.double 0r4294967296
+L(C1):	.double 0r2147483648
+
+PROLOGUE(mpn_udiv_qrnnd)
+	save	%sp,-104,%sp
+	st	%i1,[%fp-8]
+	ld	[%fp-8],%f10
+
+ifdef(`PIC',
+`L(pc):	call	L(getpc)		C put address of this insn in %o7
+	ldd	[%o7+L(C0)-L(pc)],%f8',
+`	sethi	%hi(L(C0)),%o7
+	ldd	[%o7+%lo(L(C0))],%f8')
+
+	fitod	%f10,%f4
+	cmp	%i1,0
+	bge	L(248)
+	mov	%i0,%i5
+	faddd	%f4,%f8,%f4
+L(248):
+	st	%i2,[%fp-8]
+	ld	[%fp-8],%f10
+	fmuld	%f4,%f8,%f6
+	cmp	%i2,0
+	bge	L(249)
+	fitod	%f10,%f2
+	faddd	%f2,%f8,%f2
+L(249):
+	st	%i3,[%fp-8]
+	faddd	%f6,%f2,%f2
+	ld	[%fp-8],%f10
+	cmp	%i3,0
+	bge	L(250)
+	fitod	%f10,%f4
+	faddd	%f4,%f8,%f4
+L(250):
+	fdivd	%f2,%f4,%f2
+
+ifdef(`PIC',
+`	ldd	[%o7+L(C1)-L(pc)],%f4',
+`	sethi	%hi(L(C1)),%o7
+	ldd	[%o7+%lo(L(C1))],%f4')
+
+	fcmped	%f2,%f4
+	nop
+	fbge,a	L(251)
+	fsubd	%f2,%f4,%f2
+	fdtoi	%f2,%f2
+	st	%f2,[%fp-8]
+	b	L(252)
+	ld	[%fp-8],%i4
+L(251):
+	fdtoi	%f2,%f2
+	st	%f2,[%fp-8]
+	ld	[%fp-8],%i4
+	sethi	%hi(-2147483648),%g2
+	xor	%i4,%g2,%i4
+L(252):
+	umul	%i3,%i4,%g3
+	rd	%y,%i0
+	subcc	%i2,%g3,%o7
+	subxcc	%i1,%i0,%g0
+	be	L(253)
+	cmp	%o7,%i3
+
+	add	%i4,-1,%i0
+	add	%o7,%i3,%o7
+	st	%o7,[%i5]
+	ret
+	restore
+L(253):
+	blu	L(246)
+	mov	%i4,%i0
+	add	%i4,1,%i0
+	sub	%o7,%i3,%o7
+L(246):
+	st	%o7,[%i5]
+	ret
+	restore
+EPILOGUE(mpn_udiv_qrnnd)
diff --git a/gmp-6.3.0/mpn/sparc32/v8/umul.asm b/gmp-6.3.0/mpn/sparc32/v8/umul.asm
new file mode 100644
index 0000000..1a2e84b
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v8/umul.asm
@@ -0,0 +1,40 @@
+dnl  SPARC v8 mpn_umul_ppmm -- support for longlong.h for non-gcc.
+
+dnl  Copyright 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+	umul	%o1,%o2,%g2
+	st	%g2,[%o0]
+	retl
+	rd	%y,%o0
+EPILOGUE(mpn_umul_ppmm)
diff --git a/gmp-6.3.0/mpn/sparc32/v9/README b/gmp-6.3.0/mpn/sparc32/v9/README
new file mode 100644
index 0000000..9b39713
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc32/v9/README
@@ -0,0 +1,4 @@
+Code for SPARC processors implementing version 9 of the SPARC architecture.
+This code is for systems that don't preserve the full 64-bit contents of
+integer registers at context switch.  For other systems (such as Solaris 7 or
+later) use the code in ../../sparc64.
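The v9 add_n that follows keeps 32-bit limbs in 64-bit registers: lduw zero-extends each load (which, per the README above, is all these systems guarantee), a limb sum plus carry fits in 33 bits, and `srlx %g4,32,cy` recovers the carry. A C model of the loop invariant (a sketch, not GMP code):

#include <stdint.h>

uint32_t add_n_model (uint32_t *rp, const uint32_t *s1p,
                      const uint32_t *s2p, long n)
{
  uint64_t cy = 0;                        /* carry, 0 or 1 */
  for (long i = 0; i < n; i++)
    {
      uint64_t s = (uint64_t) s1p[i] + s2p[i] + cy;  /* <= 2^33 - 1 */
      rp[i] = (uint32_t) s;               /* low 32 bits, as stw does */
      cy = s >> 32;                       /* bit 32 is the carry out */
    }
  return (uint32_t) cy;
}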
diff --git a/gmp-6.3.0/mpn/sparc32/v9/add_n.asm b/gmp-6.3.0/mpn/sparc32/v9/add_n.asm new file mode 100644 index 0000000..7bd5974 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v9/add_n.asm @@ -0,0 +1,129 @@ +dnl SPARC mpn_add_n -- Add two limb vectors of the same length > 0 and store +dnl sum in a third limb vector. + +dnl Copyright 2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +define(rp,%o0) +define(s1p,%o1) +define(s2p,%o2) +define(n,%o3) +define(cy,%g1) + +C This code uses 64-bit operations on `o' and `g' registers. It doesn't +C require that `o' registers' upper 32 bits are preserved by the operating +C system, but if they are not, they must be zeroed. That is indeed what +C happens at least on Slowaris 2.5 and 2.6. + +C On UltraSPARC 1 and 2, this code runs at 3 cycles/limb from the Dcache and at +C about 10 cycles/limb from the Ecache. + +ASM_START() +PROLOGUE(mpn_add_n) + lduw [s1p+0],%o4 + lduw [s2p+0],%o5 + addcc n,-2,n + bl,pn %icc,L(end1) + lduw [s1p+4],%g2 + lduw [s2p+4],%g3 + be,pn %icc,L(end2) + mov 0,cy + + .align 16 +L(loop): + add %o4,%o5,%g4 + add rp,8,rp + lduw [s1p+8],%o4 + fitod %f0,%f2 +C --- + add cy,%g4,%g4 + addcc n,-1,n + lduw [s2p+8],%o5 + fitod %f0,%f2 +C --- + srlx %g4,32,cy + add s2p,8,s2p + stw %g4,[rp-8] + be,pn %icc,L(exito)+4 +C --- + add %g2,%g3,%g4 + addcc n,-1,n + lduw [s1p+12],%g2 + fitod %f0,%f2 +C --- + add cy,%g4,%g4 + add s1p,8,s1p + lduw [s2p+4],%g3 + fitod %f0,%f2 +C --- + srlx %g4,32,cy + bne,pt %icc,L(loop) + stw %g4,[rp-4] +C --- +L(exite): + add %o4,%o5,%g4 + add cy,%g4,%g4 + srlx %g4,32,cy + stw %g4,[rp+0] + add %g2,%g3,%g4 + add cy,%g4,%g4 + stw %g4,[rp+4] + retl + srlx %g4,32,%o0 + +L(exito): + add %g2,%g3,%g4 + add cy,%g4,%g4 + srlx %g4,32,cy + stw %g4,[rp-4] + add %o4,%o5,%g4 + add cy,%g4,%g4 + stw %g4,[rp+0] + retl + srlx %g4,32,%o0 + +L(end1): + add %o4,%o5,%g4 + stw %g4,[rp+0] + retl + srlx %g4,32,%o0 + +L(end2): + add %o4,%o5,%g4 + srlx %g4,32,cy + stw %g4,[rp+0] + add %g2,%g3,%g4 + add cy,%g4,%g4 + stw %g4,[rp+4] + retl + srlx %g4,32,%o0 +EPILOGUE(mpn_add_n) diff --git a/gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm b/gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm new file mode 100644 index 0000000..2adf7a8 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v9/addmul_1.asm @@ -0,0 +1,306 @@ +dnl SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add +dnl the result to a second limb vector. + +dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C Algorithm: We use two floating-point multiplies per limb product, with the +C invariant v operand split into two 16-bit pieces, and the u operand split +C into 32-bit pieces. We convert the two 48-bit products and transfer them to +C the integer unit. + +C cycles/limb +C UltraSPARC 1&2: 6.5 +C UltraSPARC 3: ? + +C Possible optimizations: +C 1. Combine 32-bit memory operations into 64-bit operations. Since we're +C memory bandwidth limited, this could save 1.5 cycles/limb. +C 2. Unroll the inner loop. Since we already use alternate temporary areas, +C it is very straightforward to unroll, using an exit branch midways. +C Unrolling would allow deeper scheduling which could improve speed for L2 +C cache case. +C 3. For mpn_mul_1: Use more alternating temp areas. The std'es and ldx'es +C aren't sufficiently apart-scheduled with just two temp areas. +C 4. Specialize for particular v values. If its upper 16 bits are zero, we +C could save many operations. 
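Concretely, one limb step of the scheme described above can be modelled in C as follows. The helper name is hypothetical; the real loop pipelines these operations across iterations and bounces the products through an aligned scratch area rather than calling a function per limb:

#include <stdint.h>

void fp_addmul_step (uint32_t *rp_i, uint32_t u, uint32_t v, uint32_t *cy)
{
  double du = (double) u;                                 /* exact: 32 bits */
  uint64_t p16 = (uint64_t) (du * (double) (v >> 16));    /* < 2^48, exact */
  uint64_t p0  = (uint64_t) (du * (double) (v & 0xffff)); /* < 2^48, exact */
  uint64_t p = (p16 << 16) + p0 + *rp_i + *cy;  /* reassembled integer sum */
  *rp_i = (uint32_t) p;                         /* new rp[i] */
  *cy   = (uint32_t) (p >> 32);                 /* carry into next limb */
}

Both products stay below 2^48, so they are exact in a double's 53-bit significand; that is why splitting only the invariant v into 16-bit pieces suffices.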
+ +C INPUT PARAMETERS +C rp i0 +C up i1 +C n i2 +C v i3 + +define(`FSIZE',224) + +ASM_START() +PROLOGUE(mpn_addmul_1) + add %sp, -FSIZE, %sp + sethi %hi(0xffff), %g1 + srl %o3, 16, %g2 + or %g1, %lo(0xffff), %g1 + and %o3, %g1, %g1 + stx %g1, [%sp+104] + stx %g2, [%sp+112] + ldd [%sp+104], %f6 + ldd [%sp+112], %f8 + fxtod %f6, %f6 + fxtod %f8, %f8 + ld [%sp+104], %f10 C zero f10 + + mov 0, %g3 C cy = 0 + +define(`fanop', `fitod %f18, %f0') C A quasi nop running in the FA pipe + + add %sp, 160, %o5 C point in scratch area + and %o5, -32, %o5 C align at 0 (mod 32) in scratch area + + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_two_or_more + fxtod %f10, %f2 + + fmuld %f2, %f8, %f16 + fmuld %f2, %f6, %f4 + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+16] + std %f12, [%o5+24] + ldx [%o5+16], %g2 C p16 + ldx [%o5+24], %g1 C p0 + lduw [%o0], %g5 C read rp[i] + b .L1 + add %o0, -16, %o0 + + .align 16 +.L_two_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fmuld %f2, %f8, %f16 + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_three_or_more + fxtod %f10, %f2 + + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+0] + std %f12, [%o5+8] + lduw [%o0], %g5 C read rp[i] + ldx [%o5+16], %g2 C p16 + ldx [%o5+24], %g1 C p0 + b .L2 + add %o0, -12, %o0 + + .align 16 +.L_three_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_four_or_more + fxtod %f10, %f2 + + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 + fdtox %f16, %f14 + ldx [%o5+16], %g2 C p16 + fdtox %f4, %f12 + ldx [%o5+24], %g1 C p0 + std %f14, [%o5+16] + std %f12, [%o5+24] + lduw [%o0], %g5 C read rp[i] + b .L3 + add %o0, -8, %o0 + + .align 16 +.L_four_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_five_or_more + fxtod %f10, %f2 + + fdtox %f16, %f14 + ldx [%o5+16], %g2 C p16 + fdtox %f4, %f12 + ldx [%o5+24], %g1 C p0 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + lduw [%o0], %g5 C read rp[i] + b .L4 + add %o0, -4, %o0 + + .align 16 +.L_five_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 + ldx [%o5+16], %g2 C p16 + fdtox %f4, %f12 + ldx [%o5+24], %g1 C p0 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + lduw [%o0], %g5 C read rp[i] + bne,pt %icc, .Loop + fxtod %f10, %f2 + b,a .L5 + +C BEGIN MAIN LOOP + .align 16 +C -- 0 +.Loop: nop + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 +C -- 1 + sllx %g2, 16, %g4 C (p16 << 16) + add %o0, 4, %o0 C rp++ + ldx [%o5+0], %g2 C p16 + fdtox %f4, %f12 +C -- 2 + nop + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + fanop +C -- 3 + nop + add %g3, %g4, %g4 C p += cy + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 +C -- 4 + nop + add %g5, %g4, %g4 C p += rp[i] + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 +C -- 5 + xor %o5, 16, %o5 C alternate scratch variables + add %o1, 4, %o1 C up++ + stw %g4, [%o0-4] + 
fanop +C -- 6 + srlx %g4, 32, %g3 C new cy + lduw [%o0], %g5 C read rp[i] + bne,pt %icc, .Loop + fxtod %f10, %f2 +C END MAIN LOOP + +.L5: fdtox %f16, %f14 + sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + fdtox %f4, %f12 + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g4, %g3, %g4 C p += cy + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 + add %g5, %g4, %g4 C p += rp[i] + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 + xor %o5, 16, %o5 + stw %g4, [%o0+0] + srlx %g4, 32, %g3 C new cy + lduw [%o0+4], %g5 C read rp[i] + +.L4: fdtox %f16, %f14 + sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + fdtox %f4, %f12 + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g3, %g4, %g4 C p += cy + std %f14, [%o5+0] + add %g5, %g4, %g4 C p += rp[i] + std %f12, [%o5+8] + xor %o5, 16, %o5 + stw %g4, [%o0+4] + srlx %g4, 32, %g3 C new cy + lduw [%o0+8], %g5 C read rp[i] + +.L3: sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g3, %g4, %g4 C p += cy + add %g5, %g4, %g4 C p += rp[i] + xor %o5, 16, %o5 + stw %g4, [%o0+8] + srlx %g4, 32, %g3 C new cy + lduw [%o0+12], %g5 C read rp[i] + +.L2: sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g3, %g4, %g4 C p += cy + add %g5, %g4, %g4 C p += rp[i] + stw %g4, [%o0+12] + srlx %g4, 32, %g3 C new cy + lduw [%o0+16], %g5 C read rp[i] + +.L1: sllx %g2, 16, %g4 C (p16 << 16) + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + add %g3, %g4, %g4 C p += cy + add %g5, %g4, %g4 C p += rp[i] + stw %g4, [%o0+16] + srlx %g4, 32, %g3 C new cy + + mov %g3, %o0 + retl + sub %sp, -FSIZE, %sp +EPILOGUE(mpn_addmul_1) diff --git a/gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h b/gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h new file mode 100644 index 0000000..f909e2c --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v9/gmp-mparam.h @@ -0,0 +1,204 @@ +/* SPARC v9 32-bit gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2004, 2009-2011, 2014 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 1593 MHz ultrasparc3 running Solaris 10 (swift.nada.kth.se) */ +/* FFT tuning limit = 25000000 */ +/* Generated by tuneup.c, 2014-03-16, gcc 3.4 */ + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 4 +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 13 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 12 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 32 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD 4 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 28 +#define MUL_TOOM33_THRESHOLD 43 +#define MUL_TOOM44_THRESHOLD 126 +#define MUL_TOOM6H_THRESHOLD 161 +#define MUL_TOOM8H_THRESHOLD 208 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 80 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 85 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 55 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 72 + +#define SQR_BASECASE_THRESHOLD 4 +#define SQR_TOOM2_THRESHOLD 64 +#define SQR_TOOM3_THRESHOLD 85 +#define SQR_TOOM4_THRESHOLD 152 +#define SQR_TOOM6_THRESHOLD 185 +#define SQR_TOOM8_THRESHOLD 324 + +#define MULMID_TOOM42_THRESHOLD 64 + +#define MULMOD_BNM1_THRESHOLD 12 +#define SQRMOD_BNM1_THRESHOLD 16 + +#define MUL_FFT_MODF_THRESHOLD 288 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 288, 5}, { 9, 4}, { 19, 5}, { 11, 6}, \ + { 6, 5}, { 14, 6}, { 8, 5}, { 17, 6}, \ + { 9, 5}, { 20, 6}, { 13, 7}, { 7, 6}, \ + { 16, 7}, { 9, 6}, { 19, 7}, { 11, 6}, \ + { 23, 7}, { 13, 8}, { 7, 7}, { 15, 6}, \ + { 31, 7}, { 19, 8}, { 11, 7}, { 23, 9}, \ + { 7, 8}, { 15, 7}, { 31, 8}, { 19, 7}, \ + { 39, 8}, { 27, 9}, { 15, 8}, { 31, 7}, \ + { 63, 8}, { 39, 9}, { 23, 8}, { 47,10}, \ + { 15, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47,10}, { 31, 9}, { 71, 8}, \ + { 143, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 135, 8}, { 271, 9}, \ + { 143, 8}, { 287,10}, { 79, 9}, { 175,10}, \ + { 95, 9}, { 191, 8}, { 383,10}, { 111,11}, \ + { 63,10}, { 143, 9}, { 287, 8}, { 575,10}, \ + { 175,11}, { 95,10}, { 191, 9}, { 415, 8}, \ + { 831,12}, { 63,11}, { 127,10}, { 287, 9}, \ + { 575,11}, { 159,10}, { 351, 9}, { 703,11}, \ + { 191,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447, 9}, { 895, 8}, { 1791,12}, { 127,11}, \ + { 287,10}, { 607, 9}, { 1215, 8}, { 2431,11}, \ + { 319, 9}, { 1279,11}, { 351,12}, { 191,11}, \ + { 415,10}, { 831,11}, { 447,10}, { 895, 9}, \ + { 1791,11}, { 479,13}, { 127,12}, { 255,11}, \ + { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \ + { 703,12}, { 383,11}, { 831,12}, { 447,11}, \ + { 895,10}, { 1791,11}, { 959,13}, { 255,12}, \ + { 575,11}, { 1215,10}, { 2431,12}, { 703,13}, \ + { 383,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1087,11}, { 2175,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1407,11}, { 2943,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1151,12}, { 2431,13}, \ + { 1407,14}, { 767,13}, { 1791,15}, { 511,14}, \ + { 1023,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 143 +#define MUL_FFT_THRESHOLD 2240 + +#define SQR_FFT_MODF_THRESHOLD 244 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 244, 5}, { 8, 4}, { 
17, 5}, { 17, 6}, \ + { 9, 5}, { 19, 6}, { 17, 7}, { 9, 6}, \ + { 20, 7}, { 11, 6}, { 23, 7}, { 13, 8}, \ + { 7, 7}, { 19, 8}, { 11, 7}, { 25, 9}, \ + { 7, 8}, { 15, 7}, { 33, 8}, { 19, 7}, \ + { 39, 8}, { 23, 9}, { 15, 8}, { 39, 9}, \ + { 23,10}, { 15, 9}, { 31, 8}, { 63, 9}, \ + { 47,10}, { 31, 9}, { 63, 8}, { 127, 9}, \ + { 71, 8}, { 143, 7}, { 287, 9}, { 79,10}, \ + { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 143, 8}, { 287,10}, { 79, 9}, \ + { 159, 8}, { 319, 9}, { 175, 8}, { 351, 7}, \ + { 703,10}, { 95, 9}, { 191, 8}, { 383, 9}, \ + { 207, 8}, { 415, 9}, { 223,11}, { 63,10}, \ + { 127, 9}, { 271,10}, { 143, 9}, { 287, 8}, \ + { 575,10}, { 159, 9}, { 319,10}, { 175, 9}, \ + { 351, 8}, { 703,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207, 9}, { 415, 8}, { 831,10}, \ + { 223,12}, { 63,11}, { 127,10}, { 271, 9}, \ + { 543,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 351, 9}, { 703, 8}, \ + { 1407,11}, { 191,10}, { 415, 9}, { 831,11}, \ + { 223,10}, { 447, 9}, { 895,10}, { 479,12}, \ + { 127,11}, { 255,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 319,10}, { 639,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 415,10}, { 831,11}, \ + { 447,10}, { 895, 9}, { 1791,13}, { 127,12}, \ + { 255,11}, { 575,12}, { 319,11}, { 703,10}, \ + { 1407,12}, { 383,11}, { 831,12}, { 447,11}, \ + { 959,10}, { 1919, 9}, { 3839,13}, { 255,12}, \ + { 575,11}, { 1151,12}, { 703,11}, { 1407,13}, \ + { 383,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1407,13}, \ + { 767,12}, { 1599,13}, { 895,12}, { 1919,14}, \ + { 511,13}, { 1151,12}, { 2431,13}, { 1407,12}, \ + { 2815,14}, { 767,13}, { 1535,12}, { 3071,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2431,14}, \ + { 1279,13}, { 2943,12}, { 5887,14}, { 16384,15}, \ + { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 153 +#define SQR_FFT_THRESHOLD 2112 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 144 +#define MULLO_MUL_N_THRESHOLD 4292 + +#define DC_DIV_QR_THRESHOLD 74 +#define DC_DIVAPPR_Q_THRESHOLD 406 +#define DC_BDIV_QR_THRESHOLD 63 +#define DC_BDIV_Q_THRESHOLD 363 + +#define INV_MULMOD_BNM1_THRESHOLD 108 +#define INV_NEWTON_THRESHOLD 351 +#define INV_APPR_THRESHOLD 303 + +#define BINV_NEWTON_THRESHOLD 354 +#define REDC_1_TO_REDC_N_THRESHOLD 61 + +#define MU_DIV_QR_THRESHOLD 998 +#define MU_DIVAPPR_Q_THRESHOLD 1099 +#define MUPI_DIV_QR_THRESHOLD 118 +#define MU_BDIV_QR_THRESHOLD 807 +#define MU_BDIV_Q_THRESHOLD 979 + +#define POWM_SEC_TABLE 3,22,127,624,779,2351 + +#define MATRIX22_STRASSEN_THRESHOLD 7 +#define HGCD_THRESHOLD 90 +#define HGCD_APPR_THRESHOLD 123 +#define HGCD_REDUCE_THRESHOLD 1494 +#define GCD_DC_THRESHOLD 283 +#define GCDEXT_DC_THRESHOLD 192 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 27 +#define SET_STR_DC_THRESHOLD 290 +#define SET_STR_PRECOMPUTE_THRESHOLD 634 + +#define FAC_DSC_THRESHOLD 156 +#define FAC_ODD_THRESHOLD 25 diff --git a/gmp-6.3.0/mpn/sparc32/v9/mul_1.asm b/gmp-6.3.0/mpn/sparc32/v9/mul_1.asm new file mode 100644 index 0000000..40aeffa --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v9/mul_1.asm @@ -0,0 +1,287 @@ +dnl SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C Algorithm: We use two floating-point multiplies per limb product, with the +C invariant v operand split into two 16-bit pieces, and the u operand split +C into 32-bit pieces. We convert the two 48-bit products and transfer them to +C the integer unit. + +C cycles/limb +C UltraSPARC 1&2: 6.5 +C UltraSPARC 3: ? + +C Possible optimizations: +C 1. Combine 32-bit memory operations into 64-bit operations. Since we're +C memory bandwidth limited, this could save 1.5 cycles/limb. +C 2. Unroll the inner loop. Since we already use alternate temporary areas, +C it is very straightforward to unroll, using an exit branch midways. +C Unrolling would allow deeper scheduling which could improve speed for L2 +C cache case. +C 3. For mpn_mul_1: Use more alternating temp areas. The std'es and ldx'es +C aren't sufficiently apart-scheduled with just two temp areas. +C 4. Specialize for particular v values. If its upper 16 bits are zero, we +C could save many operations. 
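The comment above repeats the exactness claim from addmul_1: each fmuld multiplies a 32-bit u word by a 16-bit piece of v, so no product needs more than 48 significand bits and the fdtox conversions lose nothing. A standalone check of that bound (illustrative only):

#include <assert.h>
#include <stdint.h>

int main (void)
{
  /* Largest product of a 32-bit value and a 16-bit piece of v. */
  uint64_t max = (uint64_t) 0xffffffffu * 0xffffu;  /* 2^48-2^32-2^16+1 */
  assert (max < ((uint64_t) 1 << 53));    /* fits the 53-bit significand */

  double d = (double) 0xffffffffu * (double) 0xffffu;
  assert ((uint64_t) d == max);           /* the round trip is exact */
  return 0;
}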
+ +C INPUT PARAMETERS +C rp i0 +C up i1 +C n i2 +C v i3 + +define(`FSIZE',224) + +ASM_START() +PROLOGUE(mpn_mul_1) + add %sp, -FSIZE, %sp + sethi %hi(0xffff), %g1 + srl %o3, 16, %g2 + or %g1, %lo(0xffff), %g1 + and %o3, %g1, %g1 + stx %g1, [%sp+104] + stx %g2, [%sp+112] + ldd [%sp+104], %f6 + ldd [%sp+112], %f8 + fxtod %f6, %f6 + fxtod %f8, %f8 + ld [%sp+104], %f10 C zero f10 + + mov 0, %g3 C cy = 0 + +define(`fanop', `fitod %f18, %f0') C A quasi nop running in the FA pipe + + add %sp, 160, %o5 C point in scratch area + and %o5, -32, %o5 C align at 0 (mod 32) in scratch area + + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_two_or_more + fxtod %f10, %f2 + + fmuld %f2, %f8, %f16 + fmuld %f2, %f6, %f4 + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+16] + std %f12, [%o5+24] + ldx [%o5+16], %g2 C p16 + ldx [%o5+24], %g1 C p0 + b .L1 + add %o0, -16, %o0 + + .align 16 +.L_two_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fmuld %f2, %f8, %f16 + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_three_or_more + fxtod %f10, %f2 + + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+0] + std %f12, [%o5+8] + ldx [%o5+16], %g2 C p16 + ldx [%o5+24], %g1 C p0 + b .L2 + add %o0, -12, %o0 + + .align 16 +.L_three_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_four_or_more + fxtod %f10, %f2 + + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 + fdtox %f16, %f14 + ldx [%o5+16], %g2 C p16 + fdtox %f4, %f12 + ldx [%o5+24], %g1 C p0 + std %f14, [%o5+16] + std %f12, [%o5+24] + b .L3 + add %o0, -8, %o0 + + .align 16 +.L_four_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_five_or_more + fxtod %f10, %f2 + + fdtox %f16, %f14 + ldx [%o5+16], %g2 C p16 + fdtox %f4, %f12 + ldx [%o5+24], %g1 C p0 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + b .L4 + add %o0, -4, %o0 + + .align 16 +.L_five_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 + ldx [%o5+16], %g2 C p16 + fdtox %f4, %f12 + ldx [%o5+24], %g1 C p0 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + bne,pt %icc, .Loop + fxtod %f10, %f2 + b,a .L5 + +C BEGIN MAIN LOOP + .align 16 +C -- 0 +.Loop: nop + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 +C -- 1 + sllx %g2, 16, %g4 C (p16 << 16) + add %o0, 4, %o0 C rp++ + ldx [%o5+0], %g2 C p16 + fdtox %f4, %f12 +C -- 2 + nop + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + fanop +C -- 3 + nop + add %g3, %g4, %g4 C p += cy + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 +C -- 4 + srlx %g4, 32, %g3 C new cy + add %o1, 4, %o1 C up++ + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 +C -- 5 + xor %o5, 16, %o5 C alternate scratch variables + stw %g4, [%o0-4] + bne,pt %icc, .Loop + fxtod %f10, %f2 +C END MAIN LOOP + +.L5: fdtox %f16, %f14 + sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + fdtox %f4, %f12 + add %g1, 
%g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g4, %g3, %g4 C p += cy + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 + xor %o5, 16, %o5 + stw %g4, [%o0+0] + srlx %g4, 32, %g3 C new cy + +.L4: fdtox %f16, %f14 + sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + fdtox %f4, %f12 + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g3, %g4, %g4 C p += cy + std %f14, [%o5+0] + std %f12, [%o5+8] + xor %o5, 16, %o5 + stw %g4, [%o0+4] + srlx %g4, 32, %g3 C new cy + +.L3: sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g3, %g4, %g4 C p += cy + xor %o5, 16, %o5 + stw %g4, [%o0+8] + srlx %g4, 32, %g3 C new cy + +.L2: sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g3, %g4, %g4 C p += cy + stw %g4, [%o0+12] + srlx %g4, 32, %g3 C new cy + +.L1: sllx %g2, 16, %g4 C (p16 << 16) + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + add %g3, %g4, %g4 C p += cy + stw %g4, [%o0+16] + srlx %g4, 32, %g3 C new cy + + mov %g3, %o0 + retl + sub %sp, -FSIZE, %sp +EPILOGUE(mpn_mul_1) diff --git a/gmp-6.3.0/mpn/sparc32/v9/sqr_diagonal.asm b/gmp-6.3.0/mpn/sparc32/v9/sqr_diagonal.asm new file mode 100644 index 0000000..e024279 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v9/sqr_diagonal.asm @@ -0,0 +1,462 @@ +dnl SPARC v9 32-bit mpn_sqr_diagonal. + +dnl Copyright 2001, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +C rp i0 +C up i1 +C n i2 + +C This code uses a very deep software pipeline, due to the need for moving data +C forth and back between the integer registers and floating-point registers. +C +C A VIS variant of this code would make the pipeline less deep, since the +C masking now done in the integer unit could take place in the floating-point +C unit using the FAND instruction. It would be possible to save several cycles +C too. +C +C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and +C not much slower from the Ecache. It would perhaps be possible to shave off +C one cycle, but not easily. We cannot do better than 10 cycles/limb with the +C used instructions, since we have 10 memory operations per limb. But a VIS +C variant could run three cycles faster than the corresponding non-VIS code. 
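+
+C In portable-C terms, mpn_sqr_diagonal squares each 32-bit limb into two
+C result limbs (a hedged sketch with our own names, assuming the <stdint.h>
+C types):
+C
+C   void sqr_diagonal_model (uint32_t *rp, const uint32_t *up, long n)
+C   {
+C     for (long i = 0; i < n; i++)
+C       {
+C         uint64_t p = (uint64_t) up[i] * up[i];
+C         rp[2*i]   = (uint32_t) p;            /* low half */
+C         rp[2*i+1] = (uint32_t) (p >> 32);    /* high half */
+C       }
+C   }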
+ +C This is non-pipelined code showing the algorithm: +C +C .Loop: +C lduw [up+0],%g4 C 00000000hhhhllll +C sllx %g4,16,%g3 C 0000hhhhllll0000 +C or %g3,%g4,%g2 C 0000hhhhXXXXllll +C andn %g2,%g5,%g2 C 0000hhhh0000llll +C stx %g2,[%fp+80] +C ldd [%fp+80],%f0 +C fitod %f0,%f4 C hi16 +C fitod %f1,%f6 C lo16 +C ld [up+0],%f9 +C fxtod %f8,%f2 +C fmuld %f2,%f4,%f4 +C fmuld %f2,%f6,%f6 +C fdtox %f4,%f4 +C fdtox %f6,%f6 +C std %f4,[%fp-24] +C std %f6,[%fp-16] +C ldx [%fp-24],%g2 +C ldx [%fp-16],%g1 +C sllx %g2,16,%g2 +C add %g2,%g1,%g1 +C stw %g1,[rp+0] +C srlx %g1,32,%l0 +C stw %l0,[rp+4] +C add up,4,up +C subcc n,1,n +C bne,pt %icc,.Loop +C add rp,8,rp + +define(`fanop',`fitod %f12,%f10') dnl A quasi nop running in the FA pipe + +ASM_START() + + TEXT + ALIGN(4) +.Lnoll: + .word 0 + +PROLOGUE(mpn_sqr_diagonal) + save %sp,-256,%sp + +ifdef(`PIC', +`.Lpc: rd %pc,%o7 + ld [%o7+.Lnoll-.Lpc],%f8', +` sethi %hi(.Lnoll),%g1 + ld [%g1+%lo(.Lnoll)],%f8') + + sethi %hi(0xffff0000),%g5 + add %i1,-8,%i1 + + lduw [%i1+8],%g4 + add %i1,4,%i1 C s1_ptr++ + sllx %g4,16,%g3 C 0000hhhhllll0000 + or %g3,%g4,%g2 C 0000hhhhXXXXllll + subcc %i2,1,%i2 + bne,pt %icc,.L_grt_1 + andn %g2,%g5,%g2 C 0000hhhh0000llll + + add %i1,4,%i1 C s1_ptr++ + stx %g2,[%fp+80] + ld [%i1],%f9 + ldd [%fp+80],%f0 + fxtod %f8,%f2 + fitod %f0,%f4 + fitod %f1,%f6 + fmuld %f2,%f4,%f4 + fmuld %f2,%f6,%f6 + fdtox %f4,%f4 + fdtox %f6,%f6 + std %f4,[%fp-24] + std %f6,[%fp-16] + + add %fp, 80, %l3 + add %fp, -24, %l4 + add %fp, 72, %l5 + b .L1 + add %fp, -40, %l6 + +.L_grt_1: + stx %g2,[%fp+80] + lduw [%i1+8],%g4 + add %i1,4,%i1 C s1_ptr++ + sllx %g4,16,%g3 C 0000hhhhllll0000 + or %g3,%g4,%g2 C 0000hhhhXXXXllll + subcc %i2,1,%i2 + bne,pt %icc,.L_grt_2 + andn %g2,%g5,%g2 C 0000hhhh0000llll + + stx %g2,[%fp+72] + ld [%i1],%f9 + add %i1,4,%i1 C s1_ptr++ + ldd [%fp+80],%f0 + fxtod %f8,%f2 + fitod %f0,%f4 + fitod %f1,%f6 + fmuld %f2,%f4,%f4 + ld [%i1],%f9 + fmuld %f2,%f6,%f6 + ldd [%fp+72],%f0 + fdtox %f4,%f4 + fdtox %f6,%f6 + std %f4,[%fp-24] + fxtod %f8,%f2 + std %f6,[%fp-16] + fitod %f0,%f4 + fitod %f1,%f6 + fmuld %f2,%f4,%f4 + fmuld %f2,%f6,%f6 + fdtox %f4,%f4 + + add %fp, 72, %l3 + add %fp, -40, %l4 + add %fp, 80, %l5 + b .L2 + add %fp, -24, %l6 + +.L_grt_2: + stx %g2,[%fp+72] + lduw [%i1+8],%g4 + ld [%i1],%f9 + add %i1,4,%i1 C s1_ptr++ + ldd [%fp+80],%f0 + sllx %g4,16,%g3 C 0000hhhhllll0000 + or %g3,%g4,%g2 C 0000hhhhXXXXllll + subcc %i2,1,%i2 + fxtod %f8,%f2 + bne,pt %icc,.L_grt_3 + andn %g2,%g5,%g2 C 0000hhhh0000llll + + stx %g2,[%fp+80] + fitod %f0,%f4 + fitod %f1,%f6 + fmuld %f2,%f4,%f4 + ld [%i1],%f9 + fmuld %f2,%f6,%f6 + add %i1,4,%i1 C s1_ptr++ + ldd [%fp+72],%f0 + fdtox %f4,%f4 + fdtox %f6,%f6 + std %f4,[%fp-24] + fxtod %f8,%f2 + std %f6,[%fp-16] + fitod %f0,%f4 + fitod %f1,%f6 + fmuld %f2,%f4,%f4 + ld [%i1],%f9 + add %fp, 80, %l3 + fmuld %f2,%f6,%f6 + add %fp, -24, %l4 + ldd [%fp+80],%f0 + add %fp, 72, %l5 + fdtox %f4,%f4 + b .L3 + add %fp, -40, %l6 + +.L_grt_3: + stx %g2,[%fp+80] + fitod %f0,%f4 + lduw [%i1+8],%g4 + fitod %f1,%f6 + fmuld %f2,%f4,%f4 + ld [%i1],%f9 + fmuld %f2,%f6,%f6 + add %i1,4,%i1 C s1_ptr++ + ldd [%fp+72],%f0 + fdtox %f4,%f4 + sllx %g4,16,%g3 C 0000hhhhllll0000 + fdtox %f6,%f6 + or %g3,%g4,%g2 C 0000hhhhXXXXllll + subcc %i2,1,%i2 + std %f4,[%fp-24] + fxtod %f8,%f2 + std %f6,[%fp-16] + bne,pt %icc,.L_grt_4 + andn %g2,%g5,%g2 C 0000hhhh0000llll + + stx %g2,[%fp+72] + fitod %f0,%f4 + fitod %f1,%f6 + add %fp, 72, %l3 + fmuld %f2,%f4,%f4 + add %fp, -40, %l4 + ld [%i1],%f9 + fmuld %f2,%f6,%f6 + add %i1,4,%i1 C s1_ptr++ + 
ldd [%fp+80],%f0 + add %fp, 80, %l5 + fdtox %f4,%f4 + b .L4 + add %fp, -24, %l6 + +.L_grt_4: + stx %g2,[%fp+72] + fitod %f0,%f4 + lduw [%i1+8],%g4 + fitod %f1,%f6 + fmuld %f2,%f4,%f4 + ld [%i1],%f9 + fmuld %f2,%f6,%f6 + add %i1,4,%i1 C s1_ptr++ + ldd [%fp+80],%f0 + fdtox %f4,%f4 + sllx %g4,16,%g3 C 0000hhhhllll0000 + fdtox %f6,%f6 + or %g3,%g4,%g2 C 0000hhhhXXXXllll + subcc %i2,1,%i2 + std %f4,[%fp-40] + fxtod %f8,%f2 + std %f6,[%fp-32] + be,pn %icc,.L5 + andn %g2,%g5,%g2 C 0000hhhh0000llll + + b,a .Loop + + .align 16 +C --- LOOP BEGIN +.Loop: nop + nop + stx %g2,[%fp+80] + fitod %f0,%f4 +C --- + nop + nop + lduw [%i1+8],%g4 + fitod %f1,%f6 +C --- + nop + nop + ldx [%fp-24],%g2 C p16 + fanop +C --- + nop + nop + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f4,%f4 +C --- + sllx %g2,16,%g2 C align p16 + add %i0,8,%i0 C res_ptr++ + ld [%i1],%f9 + fmuld %f2,%f6,%f6 +C --- + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i1,4,%i1 C s1_ptr++ + ldd [%fp+72],%f0 + fanop +C --- + srlx %g1,32,%l0 + nop + stw %g1,[%i0-8] + fdtox %f4,%f4 +C --- + sllx %g4,16,%g3 C 0000hhhhllll0000 + nop + stw %l0,[%i0-4] + fdtox %f6,%f6 +C --- + or %g3,%g4,%g2 C 0000hhhhXXXXllll + subcc %i2,1,%i2 + std %f4,[%fp-24] + fxtod %f8,%f2 +C --- + std %f6,[%fp-16] + andn %g2,%g5,%g2 C 0000hhhh0000llll + be,pn %icc,.Lend + fanop +C --- LOOP MIDDLE + nop + nop + stx %g2,[%fp+72] + fitod %f0,%f4 +C --- + nop + nop + lduw [%i1+8],%g4 + fitod %f1,%f6 +C --- + nop + nop + ldx [%fp-40],%g2 C p16 + fanop +C --- + nop + nop + ldx [%fp-32],%g1 C p0 + fmuld %f2,%f4,%f4 +C --- + sllx %g2,16,%g2 C align p16 + add %i0,8,%i0 C res_ptr++ + ld [%i1],%f9 + fmuld %f2,%f6,%f6 +C --- + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i1,4,%i1 C s1_ptr++ + ldd [%fp+80],%f0 + fanop +C --- + srlx %g1,32,%l0 + nop + stw %g1,[%i0-8] + fdtox %f4,%f4 +C --- + sllx %g4,16,%g3 C 0000hhhhllll0000 + nop + stw %l0,[%i0-4] + fdtox %f6,%f6 +C --- + or %g3,%g4,%g2 C 0000hhhhXXXXllll + subcc %i2,1,%i2 + std %f4,[%fp-40] + fxtod %f8,%f2 +C --- + std %f6,[%fp-32] + andn %g2,%g5,%g2 C 0000hhhh0000llll + bne,pt %icc,.Loop + fanop +C --- LOOP END + +.L5: add %fp, 80, %l3 + add %fp, -24, %l4 + add %fp, 72, %l5 + b .Ltail + add %fp, -40, %l6 + +.Lend: add %fp, 72, %l3 + add %fp, -40, %l4 + add %fp, 80, %l5 + add %fp, -24, %l6 +.Ltail: stx %g2,[%l3] + fitod %f0,%f4 + fitod %f1,%f6 + ldx [%l4],%g2 C p16 + ldx [%l4+8],%g1 C p0 + fmuld %f2,%f4,%f4 + sllx %g2,16,%g2 C align p16 + add %i0,8,%i0 C res_ptr++ + ld [%i1],%f9 + fmuld %f2,%f6,%f6 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i1,4,%i1 C s1_ptr++ + ldd [%l5],%f0 + srlx %g1,32,%l0 + stw %g1,[%i0-8] + fdtox %f4,%f4 + stw %l0,[%i0-4] +.L4: fdtox %f6,%f6 + std %f4,[%l4] + fxtod %f8,%f2 + std %f6,[%l4+8] + + fitod %f0,%f4 + fitod %f1,%f6 + ldx [%l6],%g2 C p16 + ldx [%l6+8],%g1 C p0 + fmuld %f2,%f4,%f4 + sllx %g2,16,%g2 C align p16 + add %i0,8,%i0 C res_ptr++ + ld [%i1],%f9 + fmuld %f2,%f6,%f6 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + ldd [%l3],%f0 + srlx %g1,32,%l0 + stw %g1,[%i0-8] + fdtox %f4,%f4 + stw %l0,[%i0-4] +.L3: fdtox %f6,%f6 + std %f4,[%l6] + fxtod %f8,%f2 + std %f6,[%l6+8] + + fitod %f0,%f4 + fitod %f1,%f6 + ldx [%l4],%g2 C p16 + ldx [%l4+8],%g1 C p0 + fmuld %f2,%f4,%f4 + sllx %g2,16,%g2 C align p16 + add %i0,8,%i0 C res_ptr++ + fmuld %f2,%f6,%f6 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + srlx %g1,32,%l0 + stw %g1,[%i0-8] + fdtox %f4,%f4 + stw %l0,[%i0-4] +.L2: fdtox %f6,%f6 + std %f4,[%l4] + std %f6,[%l4+8] + + ldx [%l6],%g2 C p16 + ldx [%l6+8],%g1 C p0 + sllx %g2,16,%g2 C align p16 + add %i0,8,%i0 C res_ptr++ + add 
%g2,%g1,%g1 C add p16 to p0 (ADD1) + srlx %g1,32,%l0 + stw %g1,[%i0-8] + stw %l0,[%i0-4] + +.L1: ldx [%l4],%g2 C p16 + ldx [%l4+8],%g1 C p0 + sllx %g2,16,%g2 C align p16 + add %i0,8,%i0 C res_ptr++ + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + srlx %g1,32,%l0 + stw %g1,[%i0-8] + stw %l0,[%i0-4] + + ret + restore %g0,%g0,%o0 + +EPILOGUE(mpn_sqr_diagonal) diff --git a/gmp-6.3.0/mpn/sparc32/v9/sub_n.asm b/gmp-6.3.0/mpn/sparc32/v9/sub_n.asm new file mode 100644 index 0000000..636c73b --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v9/sub_n.asm @@ -0,0 +1,129 @@ +dnl SPARC mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +dnl store difference in a third limb vector. + +dnl Copyright 2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +define(rp,%o0) +define(s1p,%o1) +define(s2p,%o2) +define(n,%o3) +define(cy,%g1) + +C This code uses 64-bit operations on `o' and `g' registers. It doesn't +C require that `o' registers' upper 32 bits are preserved by the operating +C system, but if they are not, they must be zeroed. That is indeed what +C happens at least on Slowaris 2.5 and 2.6. + +C On UltraSPARC 1 and 2, this code runs at 3 cycles/limb from the Dcache and at +C about 10 cycles/limb from the Ecache. 
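+
+C A hedged C model of the borrow scheme used below (our own names, assuming
+C the <stdint.h> types): each difference is computed on zero-extended 64-bit
+C values and the borrow is recovered from the sign bit, which is what the
+C srlx %g4,63 instructions do.
+C
+C   uint32_t sub_n_model (uint32_t *rp, const uint32_t *s1p,
+C                         const uint32_t *s2p, long n)
+C   {
+C     uint64_t cy = 0;                  /* borrow, 0 or 1 */
+C     for (long i = 0; i < n; i++)
+C       {
+C         uint64_t d = (uint64_t) s1p[i] - s2p[i] - cy;
+C         rp[i] = (uint32_t) d;
+C         cy = d >> 63;                 /* borrow out, from the sign bit */
+C       }
+C     return (uint32_t) cy;
+C   }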
+ +ASM_START() +PROLOGUE(mpn_sub_n) + lduw [s1p+0],%o4 + lduw [s2p+0],%o5 + addcc n,-2,n + bl,pn %icc,L(end1) + lduw [s1p+4],%g2 + lduw [s2p+4],%g3 + be,pn %icc,L(end2) + mov 0,cy + + .align 16 +L(loop): + sub %o4,%o5,%g4 + add rp,8,rp + lduw [s1p+8],%o4 + fitod %f0,%f2 +C --- + sub %g4,cy,%g4 + addcc n,-1,n + lduw [s2p+8],%o5 + fitod %f0,%f2 +C --- + srlx %g4,63,cy + add s2p,8,s2p + stw %g4,[rp-8] + be,pn %icc,L(exito)+4 +C --- + sub %g2,%g3,%g4 + addcc n,-1,n + lduw [s1p+12],%g2 + fitod %f0,%f2 +C --- + sub %g4,cy,%g4 + add s1p,8,s1p + lduw [s2p+4],%g3 + fitod %f0,%f2 +C --- + srlx %g4,63,cy + bne,pt %icc,L(loop) + stw %g4,[rp-4] +C --- +L(exite): + sub %o4,%o5,%g4 + sub %g4,cy,%g4 + srlx %g4,63,cy + stw %g4,[rp+0] + sub %g2,%g3,%g4 + sub %g4,cy,%g4 + stw %g4,[rp+4] + retl + srlx %g4,63,%o0 + +L(exito): + sub %g2,%g3,%g4 + sub %g4,cy,%g4 + srlx %g4,63,cy + stw %g4,[rp-4] + sub %o4,%o5,%g4 + sub %g4,cy,%g4 + stw %g4,[rp+0] + retl + srlx %g4,63,%o0 + +L(end1): + sub %o4,%o5,%g4 + stw %g4,[rp+0] + retl + srlx %g4,63,%o0 + +L(end2): + sub %o4,%o5,%g4 + srlx %g4,63,cy + stw %g4,[rp+0] + sub %g2,%g3,%g4 + sub %g4,cy,%g4 + stw %g4,[rp+4] + retl + srlx %g4,63,%o0 +EPILOGUE(mpn_sub_n) diff --git a/gmp-6.3.0/mpn/sparc32/v9/submul_1.asm b/gmp-6.3.0/mpn/sparc32/v9/submul_1.asm new file mode 100644 index 0000000..92d0ce7 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v9/submul_1.asm @@ -0,0 +1,316 @@ +dnl SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C Algorithm: We use two floating-point multiplies per limb product, with the +C invariant v operand split into two 16-bit pieces, and the u operand split +C into 32-bit pieces. We convert the two 48-bit products and transfer them to +C the integer unit. + +C cycles/limb +C UltraSPARC 1&2: 6.5 +C UltraSPARC 3: ? + +C Possible optimizations: +C 1. Combine 32-bit memory operations into 64-bit operations. Since we're +C memory bandwidth limited, this could save 1.5 cycles/limb. +C 2. Unroll the inner loop. Since we already use alternate temporary areas, +C it is very straightforward to unroll, using an exit branch midways. +C Unrolling would allow deeper scheduling which could improve speed for L2 +C cache case. +C 3. For mpn_mul_1: Use more alternating temp areas. The std'es and ldx'es +C aren't sufficiently apart-scheduled with just two temp areas. +C 4. 
Specialize for particular v values. If its upper 16 bits are zero, we +C could save many operations. + +C INPUT PARAMETERS +C rp i0 +C up i1 +C n i2 +C v i3 + +define(`FSIZE',224) + +ASM_START() +PROLOGUE(mpn_submul_1) + add %sp, -FSIZE, %sp + sethi %hi(0xffff), %g1 + srl %o3, 16, %g2 + or %g1, %lo(0xffff), %g1 + and %o3, %g1, %g1 + stx %g1, [%sp+104] + stx %g2, [%sp+112] + ldd [%sp+104], %f6 + ldd [%sp+112], %f8 + fxtod %f6, %f6 + fxtod %f8, %f8 + ld [%sp+104], %f10 C zero f10 + + mov 0, %g3 C cy = 0 + +define(`fanop', `fitod %f18, %f0') C A quasi nop running in the FA pipe + + add %sp, 160, %o5 C point in scratch area + and %o5, -32, %o5 C align at 0 (mod 32) in scratch area + + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_two_or_more + fxtod %f10, %f2 + + fmuld %f2, %f8, %f16 + fmuld %f2, %f6, %f4 + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+16] + std %f12, [%o5+24] + ldx [%o5+16], %g2 C p16 + ldx [%o5+24], %g1 C p0 + lduw [%o0], %g5 C read rp[i] + b .L1 + add %o0, -16, %o0 + + .align 16 +.L_two_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fmuld %f2, %f8, %f16 + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_three_or_more + fxtod %f10, %f2 + + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+0] + std %f12, [%o5+8] + lduw [%o0], %g5 C read rp[i] + ldx [%o5+16], %g2 C p16 + ldx [%o5+24], %g1 C p0 + b .L2 + add %o0, -12, %o0 + + .align 16 +.L_three_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_four_or_more + fxtod %f10, %f2 + + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 + fdtox %f16, %f14 + ldx [%o5+16], %g2 C p16 + fdtox %f4, %f12 + ldx [%o5+24], %g1 C p0 + std %f14, [%o5+16] + std %f12, [%o5+24] + lduw [%o0], %g5 C read rp[i] + b .L3 + add %o0, -8, %o0 + + .align 16 +.L_four_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 + fdtox %f4, %f12 + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + bne,pt %icc, .L_five_or_more + fxtod %f10, %f2 + + fdtox %f16, %f14 + ldx [%o5+16], %g2 C p16 + fdtox %f4, %f12 + ldx [%o5+24], %g1 C p0 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + lduw [%o0], %g5 C read rp[i] + b .L4 + add %o0, -4, %o0 + + .align 16 +.L_five_or_more: + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 + ldx [%o5+16], %g2 C p16 + fdtox %f4, %f12 + ldx [%o5+24], %g1 C p0 + std %f14, [%o5+16] + fmuld %f2, %f8, %f16 + std %f12, [%o5+24] + fmuld %f2, %f6, %f4 + add %o1, 4, %o1 C up++ + lduw [%o0], %g5 C read rp[i] + bne,pt %icc, .Loop + fxtod %f10, %f2 + b,a .L5 + +C BEGIN MAIN LOOP + .align 16 +C -- 0 +.Loop: sub %g0, %g3, %g3 + subcc %o2, 1, %o2 + ld [%o1], %f11 C read up[i] + fdtox %f16, %f14 +C -- 1 + sllx %g2, 16, %g4 C (p16 << 16) + add %o0, 4, %o0 C rp++ + ldx [%o5+0], %g2 C p16 + fdtox %f4, %f12 +C -- 2 + srl %g3, 0, %g3 C zero most significant 32 bits + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + fanop +C -- 3 + nop + add %g3, %g4, %g4 C p += cy + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 +C -- 4 + nop + sub %g5, %g4, %g4 
C p += rp[i] + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 +C -- 5 + xor %o5, 16, %o5 C alternate scratch variables + add %o1, 4, %o1 C up++ + stw %g4, [%o0-4] + fanop +C -- 6 + srlx %g4, 32, %g3 C new cy + lduw [%o0], %g5 C read rp[i] + bne,pt %icc, .Loop + fxtod %f10, %f2 +C END MAIN LOOP + +.L5: sub %g0, %g3, %g3 + fdtox %f16, %f14 + sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + fdtox %f4, %f12 + srl %g3, 0, %g3 C zero most significant 32 bits + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g4, %g3, %g4 C p += cy + std %f14, [%o5+0] + fmuld %f2, %f8, %f16 + sub %g5, %g4, %g4 C p += rp[i] + std %f12, [%o5+8] + fmuld %f2, %f6, %f4 + xor %o5, 16, %o5 + stw %g4, [%o0+0] + srlx %g4, 32, %g3 C new cy + lduw [%o0+4], %g5 C read rp[i] + + sub %g0, %g3, %g3 +.L4: fdtox %f16, %f14 + sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + fdtox %f4, %f12 + srl %g3, 0, %g3 C zero most significant 32 bits + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g3, %g4, %g4 C p += cy + std %f14, [%o5+0] + sub %g5, %g4, %g4 C p += rp[i] + std %f12, [%o5+8] + xor %o5, 16, %o5 + stw %g4, [%o0+4] + srlx %g4, 32, %g3 C new cy + lduw [%o0+8], %g5 C read rp[i] + + sub %g0, %g3, %g3 +.L3: sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + srl %g3, 0, %g3 C zero most significant 32 bits + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g3, %g4, %g4 C p += cy + sub %g5, %g4, %g4 C p += rp[i] + xor %o5, 16, %o5 + stw %g4, [%o0+8] + srlx %g4, 32, %g3 C new cy + lduw [%o0+12], %g5 C read rp[i] + + sub %g0, %g3, %g3 +.L2: sllx %g2, 16, %g4 C (p16 << 16) + ldx [%o5+0], %g2 C p16 + srl %g3, 0, %g3 C zero most significant 32 bits + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + ldx [%o5+8], %g1 C p0 + add %g3, %g4, %g4 C p += cy + sub %g5, %g4, %g4 C p += rp[i] + stw %g4, [%o0+12] + srlx %g4, 32, %g3 C new cy + lduw [%o0+16], %g5 C read rp[i] + + sub %g0, %g3, %g3 +.L1: sllx %g2, 16, %g4 C (p16 << 16) + srl %g3, 0, %g3 C zero most significant 32 bits + add %g1, %g4, %g4 C p = p0 + (p16 << 16) + add %g3, %g4, %g4 C p += cy + sub %g5, %g4, %g4 C p += rp[i] + stw %g4, [%o0+16] + srlx %g4, 32, %g3 C new cy + + sub %g0, %g3, %o0 + retl + sub %sp, -FSIZE, %sp +EPILOGUE(mpn_submul_1) diff --git a/gmp-6.3.0/mpn/sparc32/v9/udiv.asm b/gmp-6.3.0/mpn/sparc32/v9/udiv.asm new file mode 100644 index 0000000..61dde97 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc32/v9/udiv.asm @@ -0,0 +1,52 @@ +dnl SPARC v9 32-bit mpn_udiv_qrnnd - division support for longlong.h. + +dnl Copyright 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr o0 +C n1 o1 +C n0 o2 +C d o3 + +ASM_START() +PROLOGUE(mpn_udiv_qrnnd) + sllx %o1, 32, %g1 C shift upper dividend limb + srl %o2, 0, %g2 C zero extend lower dividend limb + srl %o3, 0, %g3 C zero extend divisor + or %g2, %g1, %g1 C assemble 64-bit dividend + udivx %g1, %g3, %g1 + mulx %g1, %g3, %g4 + sub %g2, %g4, %g2 + st %g2, [%o0] C store remainder + retl + mov %g1, %o0 C return quotient +EPILOGUE(mpn_udiv_qrnnd) diff --git a/gmp-6.3.0/mpn/sparc64/README b/gmp-6.3.0/mpn/sparc64/README new file mode 100644 index 0000000..e2c051a --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/README @@ -0,0 +1,125 @@ +Copyright 1997, 1999-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + +This directory contains mpn functions for 64-bit V9 SPARC + +RELEVANT OPTIMIZATION ISSUES + +Notation: + IANY = shift/add/sub/logical/sethi + IADDLOG = add/sub/logical/sethi + MEM = ld*/st* + FA = fadd*/fsub*/f*to*/fmov* + FM = fmul* + +UltraSPARC can issue four instructions per cycle, with these restrictions: +* Two IANY instructions, but only one of these may be a shift. If there is a + shift and an IANY instruction, the shift must precede the IANY instruction. +* One FA. +* One FM. +* One branch. +* One MEM. +* IANY/IADDLOG/MEM must be insn 1, 2, or 3 in an issue bundle. Taken branches + should not be in slot 4, since that makes the delay insn come from separate + bundle. +* If two IANY/IADDLOG instructions are to be executed in the same cycle and one + of these is setting the condition codes, that instruction must be the second + one. + +To summarize, ignoring branches, these are the bundles that can reach the peak +execution speed: + +insn1 iany iany mem iany iany mem iany iany mem +insn2 iaddlog mem iany mem iaddlog iany mem iaddlog iany +insn3 mem iaddlog iaddlog fa fa fa fm fm fm +insn4 fa/fm fa/fm fa/fm fm fm fm fa fa fa + +The 64-bit integer multiply instruction mulx takes from 5 cycles to 35 cycles, +depending on the position of the most significant bit of the first source +operand. When used for 32x32->64 multiplication, it needs 20 cycles. +Furthermore, it stalls the processor while executing. We stay away from that +instruction, and instead use floating-point operations. + +Floating-point add and multiply units are fully pipelined. The latency for +UltraSPARC-1/2 is 3 cycles and for UltraSPARC-3 it is 4 cycles. 
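+
+As a hedged illustration of the floating-point scheme (ours, not code from
+this tree): a full 64x64->128 product can be assembled from exact double
+products of 16-bit slices of v, since each 32x16 partial product fits in a
+double's 53-bit mantissa. The 128-bit type is used only for brevity here;
+the real code keeps the pieces apart and sums them in the integer unit.
+
+  #include <stdint.h>
+
+  void umul_ppmm_fp_model (uint64_t *hi, uint64_t *lo, uint64_t u, uint64_t v)
+  {
+    double u_lo = (double) (uint32_t) u;        /* low 32 bits of u */
+    double u_hi = (double) (u >> 32);           /* high 32 bits of u */
+    unsigned __int128 acc = 0;
+    for (int k = 0; k < 4; k++)                 /* four 16-bit pieces of v */
+      {
+        double vk = (double) ((v >> (16 * k)) & 0xffff);
+        acc += (unsigned __int128) (uint64_t) (u_lo * vk) << (16 * k);
+        acc += (unsigned __int128) (uint64_t) (u_hi * vk) << (16 * k + 32);
+      }
+    *lo = (uint64_t) acc;
+    *hi = (uint64_t) (acc >> 64);
+  }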
+
+Integer conditional move instructions cannot dual-issue with other integer
+instructions. No conditional move can issue 1-5 cycles after a load. (This
+might have been fixed for UltraSPARC-3.)
+
+The UltraSPARC-3 pipeline is very similar to that of UltraSPARC-1/2, but is
+somewhat slower. Branches execute slower, and there may be other new stalls.
+Integer multiply, however, doesn't stall the entire CPU and also has a much
+lower latency. But it's still not pipelined, and thus useless for our needs.
+
+STATUS
+
+* mpn_lshift, mpn_rshift: The current code runs at 2.0 cycles/limb on
+  UltraSPARC-1/2 and 2.65 on UltraSPARC-3. For UltraSPARC-1/2, the IEU0
+  functional unit is saturated with shifts.
+
+* mpn_add_n, mpn_sub_n: The current code runs at 4 cycles/limb on
+  UltraSPARC-1/2 and 4.5 cycles/limb on UltraSPARC-3. The 4-instruction
+  recurrence is the speed limiter.
+
+* mpn_addmul_1: The current code runs at 14 cycles/limb asymptotically on
+  UltraSPARC-1/2 and 17.5 cycles/limb on UltraSPARC-3. On UltraSPARC-1/2, the
+  code sustains 4 instructions/cycle. It might be possible to invent a better
+  way of summing the intermediate 49-bit operands, but it is unlikely that it
+  will save enough instructions to save an entire cycle.
+
+  The load-use of the u operand is not scheduled far enough ahead for good L2
+  cache performance. The UltraSPARC-1/2 L1 cache is direct mapped, and since
+  we use temporary stack slots that will conflict with the u and r operands,
+  we miss to L2 very often. The load-use of the std/ldx pairs via the stack
+  is perhaps over-scheduled.
+
+  It would be possible to save two instructions: (1) The mov could be avoided
+  if the std/ldx were scheduled less aggressively. (2) The ldx of the r
+  operand could be split into two ld instructions, saving the shifts/masks.
+
+  It should be possible to reach 14 cycles/limb for UltraSPARC-3 if the fp
+  operations were rescheduled for this processor's 4-cycle latency.
+
+* mpn_mul_1: The current code is a straightforward edit of the mpn_addmul_1
+  code. It would be possible to shave one or two cycles from it, with some
+  labour.
+
+* mpn_submul_1: Simpleminded code just calling mpn_mul_1 + mpn_sub_n. This
+  means that it runs at 18 cycles/limb on UltraSPARC-1/2 and 23 cycles/limb on
+  UltraSPARC-3. It would be possible to either match the mpn_addmul_1
+  performance, or in the worst case use one more instruction group.
+
+* US1/US2 cache conflict resolution: The direct-mapped L1 data cache of
+  US1/US2 is a problem for mul_1, addmul_1 (and a prospective submul_1). We
+  should allocate a larger cache area, and put the stack temp area in a place
+  that doesn't cause cache conflicts.
diff --git a/gmp-6.3.0/mpn/sparc64/copyd.asm b/gmp-6.3.0/mpn/sparc64/copyd.asm
new file mode 100644
index 0000000..ab105d3
--- /dev/null
+++ b/gmp-6.3.0/mpn/sparc64/copyd.asm
@@ -0,0 +1,89 @@
+dnl SPARC v9 mpn_copyd -- Copy a limb vector, decrementing.
+
+dnl Copyright 1999-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 2 +C UltraSPARC 3: 2.5 +C UltraSPARC T1: 17 +C UltraSPARC T3: 6 +C UltraSPARC T4/T5: 2 + +C INPUT PARAMETERS +C rptr %o0 +C sptr %o1 +C n %o2 + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_copyd) + sllx %o2,3,%g1 + add %g1,%o0,%o0 + add %g1,%o1,%o1 + addcc %o2,-8,%o2 + bl,pt %xcc,L(end01234567) + nop +L(loop1): + ldx [%o1-8],%g1 + ldx [%o1-16],%g2 + ldx [%o1-24],%g3 + ldx [%o1-32],%g4 + ldx [%o1-40],%g5 + ldx [%o1-48],%o3 + ldx [%o1-56],%o4 + ldx [%o1-64],%o5 + add %o1,-64,%o1 + stx %g1,[%o0-8] + stx %g2,[%o0-16] + stx %g3,[%o0-24] + stx %g4,[%o0-32] + stx %g5,[%o0-40] + stx %o3,[%o0-48] + stx %o4,[%o0-56] + stx %o5,[%o0-64] + addcc %o2,-8,%o2 + bge,pt %xcc,L(loop1) + add %o0,-64,%o0 +L(end01234567): + addcc %o2,8,%o2 + bz,pn %xcc,L(end) + nop +L(loop2): + ldx [%o1-8],%g1 + add %o1,-8,%o1 + addcc %o2,-1,%o2 + stx %g1,[%o0-8] + bg,pt %xcc,L(loop2) + add %o0,-8,%o0 +L(end): retl + nop +EPILOGUE(mpn_copyd) diff --git a/gmp-6.3.0/mpn/sparc64/copyi.asm b/gmp-6.3.0/mpn/sparc64/copyi.asm new file mode 100644 index 0000000..45663dc --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/copyi.asm @@ -0,0 +1,86 @@ +dnl SPARC v9 mpn_copyi -- Copy a limb vector, incrementing. + +dnl Copyright 1999-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
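+
+dnl A hedged C picture of these two entry points (our own sketch): mpn_copyd
+dnl above copies from the high limb downwards, so it is safe when the
+dnl destination overlaps the source at higher addresses; mpn_copyi below is
+dnl the increment-order mirror image, safe for overlap at lower addresses.
+dnl The real loops move eight limbs per iteration plus a residual loop.
+dnl
+dnl   void copyd_model (uint64_t *rp, const uint64_t *sp, long n)
+dnl   {
+dnl     for (long i = n - 1; i >= 0; i--)   /* high limb first */
+dnl       rp[i] = sp[i];
+dnl   }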
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 2 +C UltraSPARC 3: 2.5 +C UltraSPARC T1: 17 +C UltraSPARC T3: 6 +C UltraSPARC T4/T5: 2 + +C INPUT PARAMETERS +C rptr %o0 +C sptr %o1 +C n %o2 + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_copyi) + addcc %o2,-8,%o2 + bl,pt %xcc,L(end01234567) + nop +L(loop1): + ldx [%o1+0],%g1 + ldx [%o1+8],%g2 + ldx [%o1+16],%g3 + ldx [%o1+24],%g4 + ldx [%o1+32],%g5 + ldx [%o1+40],%o3 + ldx [%o1+48],%o4 + ldx [%o1+56],%o5 + add %o1,64,%o1 + stx %g1,[%o0+0] + stx %g2,[%o0+8] + stx %g3,[%o0+16] + stx %g4,[%o0+24] + stx %g5,[%o0+32] + stx %o3,[%o0+40] + stx %o4,[%o0+48] + stx %o5,[%o0+56] + addcc %o2,-8,%o2 + bge,pt %xcc,L(loop1) + add %o0,64,%o0 +L(end01234567): + addcc %o2,8,%o2 + bz,pn %xcc,L(end) + nop +L(loop2): + ldx [%o1+0],%g1 + add %o1,8,%o1 + addcc %o2,-1,%o2 + stx %g1,[%o0+0] + bg,pt %xcc,L(loop2) + add %o0,8,%o0 +L(end): retl + nop +EPILOGUE(mpn_copyi) diff --git a/gmp-6.3.0/mpn/sparc64/dive_1.c b/gmp-6.3.0/mpn/sparc64/dive_1.c new file mode 100644 index 0000000..4264f29 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/dive_1.c @@ -0,0 +1,161 @@ +/* UltraSPARC 64 mpn_divexact_1 -- mpn by limb exact division. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2000, 2001, 2003, 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#include "mpn/sparc64/sparc64.h" + + +/* 64-bit divisor 32-bit divisor + cycles/limb cycles/limb + (approx) (approx) + Ultrasparc 2i: 110 70 +*/ + + +/* There are two key ideas here to reduce mulx's. Firstly when the divisor + is 32-bits the high of q*d can be calculated without the two 32x32->64 + cross-products involving the high 32-bits of the divisor, that being zero + of course. Secondly umul_ppmm_lowequal and umul_ppmm_half_lowequal save + one mulx (each) knowing the low of q*d is equal to the input limb l. + + For size==1, a simple udivx is used. This is faster than calculating an + inverse. + + For a 32-bit divisor and small sizes, an attempt was made at a simple + udivx loop (two per 64-bit limb), but it turned out to be slower than + mul-by-inverse. At size==2 the inverse is about 260 cycles total + compared to a udivx at 291. Perhaps the latter would suit when size==2 + but the high 32-bits of the second limb is zero (saving one udivx), but + it doesn't seem worth a special case just for that. 
*/ + +void +mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor) +{ + mp_limb_t inverse, s, s_next, c, l, ls, q; + unsigned rshift, lshift; + mp_limb_t lshift_mask; + mp_limb_t divisor_h; + + ASSERT (size >= 1); + ASSERT (divisor != 0); + ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size)); + ASSERT_MPN (src, size); + ASSERT_LIMB (divisor); + + s = *src++; /* src low limb */ + size--; + if (size == 0) + { + *dst = s / divisor; + return; + } + + if ((divisor & 1) == 0) + { + count_trailing_zeros (rshift, divisor); + divisor >>= rshift; + lshift = 64 - rshift; + + lshift_mask = MP_LIMB_T_MAX; + } + else + { + rshift = 0; + + /* rshift==0 means no shift, so must mask out other part in this case */ + lshift = 0; + lshift_mask = 0; + } + + binvert_limb (inverse, divisor); + + c = 0; + divisor_h = HIGH32 (divisor); + + if (divisor_h == 0) + { + /* 32-bit divisor */ + do + { + s_next = *src++; + ls = (s >> rshift) | ((s_next << lshift) & lshift_mask); + s = s_next; + + SUBC_LIMB (c, l, ls, c); + + q = l * inverse; + *dst++ = q; + + umul_ppmm_half_lowequal (l, q, divisor, l); + c += l; + + size--; + } + while (size != 0); + + ls = s >> rshift; + l = ls - c; + q = l * inverse; + *dst = q; + } + else + { + /* 64-bit divisor */ + mp_limb_t divisor_l = LOW32 (divisor); + do + { + s_next = *src++; + ls = (s >> rshift) | ((s_next << lshift) & lshift_mask); + s = s_next; + + SUBC_LIMB (c, l, ls, c); + + q = l * inverse; + *dst++ = q; + + umul_ppmm_lowequal (l, q, divisor, divisor_h, divisor_l, l); + c += l; + + size--; + } + while (size != 0); + + ls = s >> rshift; + l = ls - c; + q = l * inverse; + *dst = q; + } +} diff --git a/gmp-6.3.0/mpn/sparc64/divrem_1.c b/gmp-6.3.0/mpn/sparc64/divrem_1.c new file mode 100644 index 0000000..ac94565 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/divrem_1.c @@ -0,0 +1,242 @@ +/* UltraSparc 64 mpn_divrem_1 -- mpn by limb division. + +Copyright 1991, 1993, 1994, 1996, 1998-2001, 2003 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#include "mpn/sparc64/sparc64.h" + + +/* 64-bit divisor 32-bit divisor + cycles/limb cycles/limb + (approx) (approx) + integer fraction integer fraction + Ultrasparc 2i: 160 160 122 96 +*/ + + +/* 32-bit divisors are treated in special case code. This requires 4 mulx + per limb instead of 8 in the general case. + + For big endian systems we need HALF_ENDIAN_ADJ included in the src[i] + addressing, to get the two halves of each limb read in the correct order. + This is kept in an adj variable. 
Doing that measures about 4 c/l faster + than just writing HALF_ENDIAN_ADJ(i) in the integer loop. The latter + shouldn't be 6 cycles worth of work, but perhaps it doesn't schedule well + (on gcc 3.2.1 at least). The fraction loop doesn't seem affected, but we + still use a variable since that ought to work out best. */ + +mp_limb_t +mpn_divrem_1 (mp_ptr qp_limbptr, mp_size_t xsize_limbs, + mp_srcptr ap_limbptr, mp_size_t size_limbs, mp_limb_t d_limb) +{ + mp_size_t total_size_limbs; + mp_size_t i; + + ASSERT (xsize_limbs >= 0); + ASSERT (size_limbs >= 0); + ASSERT (d_limb != 0); + /* FIXME: What's the correct overlap rule when xsize!=0? */ + ASSERT (MPN_SAME_OR_SEPARATE_P (qp_limbptr + xsize_limbs, + ap_limbptr, size_limbs)); + + total_size_limbs = size_limbs + xsize_limbs; + if (UNLIKELY (total_size_limbs == 0)) + return 0; + + /* udivx is good for total_size==1, and no need to bother checking + limb 0); /* because always even */ + qp[size + HALF_ENDIAN_ADJ(1)] = 0; + } + + /* Skip a division if high < divisor (high quotient 0). Testing + here before before normalizing will still skip as often as + possible. */ + if (n1 < d_limb) + { + r = n1; + size--; + qp[size + HALF_ENDIAN_ADJ(size)] = 0; + total_size--; + if (total_size == 0) + return r; + } + } + + count_leading_zeros_32 (norm, d_limb); + norm -= 32; + d_limb <<= norm; + r <<= norm; + + norm_rshift = 32 - norm; + norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF); + + invert_half_limb (dinv_limb, d_limb); + + if (LIKELY (size != 0)) + { + i = size - 1; + adj = HALF_ENDIAN_ADJ (i); + n1 = ap[i + adj]; + adj = -adj; + r |= ((n1 >> norm_rshift) & norm_rmask); + for ( ; i > 0; i--) + { + n0 = ap[i-1 + adj]; + adj = -adj; + nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask); + udiv_qrnnd_half_preinv (q, r, r, nshift, d_limb, dinv_limb); + qp[i + adj] = q; + n1 = n0; + } + nshift = n1 << norm; + udiv_qrnnd_half_preinv (q, r, r, nshift, d_limb, dinv_limb); + qp[0 + HALF_ENDIAN_ADJ(0)] = q; + } + qp -= xsize; + adj = HALF_ENDIAN_ADJ (0); + for (i = xsize-1; i >= 0; i--) + { + udiv_qrnnd_half_preinv (q, r, r, 0, d_limb, dinv_limb); + adj = -adj; + qp[i + adj] = q; + } + + return r >> norm; + } + else + { + mp_srcptr ap; + mp_ptr qp; + mp_size_t size, xsize, total_size; + mp_limb_t d, n1, n0, q, r, dinv, nshift, norm_rmask; + int norm, norm_rshift; + + ap = ap_limbptr; + qp = qp_limbptr; + size = size_limbs; + xsize = xsize_limbs; + total_size = total_size_limbs; + d = d_limb; + + qp += total_size; /* above high limb */ + r = 0; /* initial remainder */ + + if (LIKELY (size != 0)) + { + /* Skip a division if high < divisor (high quotient 0). Testing + here before before normalizing will still skip as often as + possible. */ + n1 = ap[size-1]; + if (n1 < d) + { + r = n1; + *--qp = 0; + total_size--; + if (total_size == 0) + return r; + size--; + } + } + + count_leading_zeros (norm, d); + d <<= norm; + r <<= norm; + + norm_rshift = GMP_LIMB_BITS - norm; + norm_rmask = (norm == 0 ? 
0 : ~CNST_LIMB(0)); + + invert_limb (dinv, d); + + if (LIKELY (size != 0)) + { + n1 = ap[size-1]; + r |= ((n1 >> norm_rshift) & norm_rmask); + for (i = size-2; i >= 0; i--) + { + n0 = ap[i]; + nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask); + udiv_qrnnd_preinv (q, r, r, nshift, d, dinv); + *--qp = q; + n1 = n0; + } + nshift = n1 << norm; + udiv_qrnnd_preinv (q, r, r, nshift, d, dinv); + *--qp = q; + } + for (i = 0; i < xsize; i++) + { + udiv_qrnnd_preinv (q, r, r, CNST_LIMB(0), d, dinv); + *--qp = q; + } + return r >> norm; + } +} diff --git a/gmp-6.3.0/mpn/sparc64/gcd_11.asm b/gmp-6.3.0/mpn/sparc64/gcd_11.asm new file mode 100644 index 0000000..2dd200d --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/gcd_11.asm @@ -0,0 +1,87 @@ +dnl SPARC64 mpn_gcd_11. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for SPARC by Torbjörn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011-2013, 2021 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C UltraSPARC 1&2: 5.1 +C UltraSPARC 3: 5.0 +C UltraSPARC T1: 11.4 +C UltraSPARC T3: 10 +C UltraSPARC T4: 6 +C Numbers measured with: speed -CD -s32-64 -t32 mpn_gcd_1 + +C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. + +deflit(MAXSHIFT, 7) +deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) + + RODATA + TYPE(ctz_table,object) +ctz_table: + .byte MAXSHIFT +forloop(i,1,MASK, +` .byte m4_count_trailing_zeros(i) +') + SIZE(ctz_table,.-ctz_table) + +define(`u0', `%o0') +define(`v0', `%o1') + +ASM_START() +PROLOGUE(mpn_gcd_11) + LEA64(ctz_table, o5, g4) + b L(odd) + mov u0, %o4 + + ALIGN(16) +L(top): movcc %xcc, %o4, v0 C v = min(u,v) + movcc %xcc, %o2, %o0 C u = |v - u] +L(mid): ldub [%o5+%g1], %g5 C + brz,pn %g1, L(shift_alot) C + srlx %o0, %g5, %o4 C new u, odd +L(odd): subcc v0, %o4, %o2 C v - u, set flags for branch and movcc + sub %o4, v0, %o0 C u - v + bnz,pt %xcc, L(top) C + and %o2, MASK, %g1 C extract low MAXSHIFT bits from (v-u) + + retl + mov v0, %o0 + +L(shift_alot): + mov %o4, %o0 + b L(mid) + and %o4, MASK, %g1 C +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/gmp-mparam.h b/gmp-6.3.0/mpn/sparc64/gmp-mparam.h new file mode 100644 index 0000000..5ac2c46 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/gmp-mparam.h @@ -0,0 +1,139 @@ +/* Sparc64 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2004, 2006, 2008-2010 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 500 MHz ultrasparc2 running GNU/Linux */ + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 4 +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 3 +#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1U_TO_MOD_1_1_THRESHOLD 22 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 27 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ +#define USE_PREINV_DIVREM_1 1 +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 30 +#define MUL_TOOM33_THRESHOLD 187 +#define MUL_TOOM44_THRESHOLD 278 +#define MUL_TOOM6H_THRESHOLD 278 +#define MUL_TOOM8H_THRESHOLD 357 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 201 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 199 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 154 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 107 + +#define SQR_BASECASE_THRESHOLD 13 +#define SQR_TOOM2_THRESHOLD 69 +#define SQR_TOOM3_THRESHOLD 116 +#define SQR_TOOM4_THRESHOLD 336 +#define SQR_TOOM6_THRESHOLD 336 +#define SQR_TOOM8_THRESHOLD 454 + +#define MULMOD_BNM1_THRESHOLD 17 +#define SQRMOD_BNM1_THRESHOLD 23 + +#define MUL_FFT_MODF_THRESHOLD 248 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 248, 5}, { 9, 4}, { 19, 6}, { 5, 5}, \ + { 15, 6}, { 8, 5}, { 17, 6}, { 21, 7}, \ + { 19, 8}, { 11, 7}, { 25, 8}, { 15, 7}, \ + { 31, 8}, { 27, 9}, { 15, 8}, { 33, 9}, \ + { 19, 8}, { 39, 9}, { 27,10}, { 15, 9}, \ + { 39,10}, { 23, 9}, { 47,11}, { 15,10}, \ + { 31, 9}, { 67,10}, { 39, 9}, { 79,10}, \ + { 47,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255,10}, { 71, 9}, { 143, 8}, { 287,10}, \ + { 79,11}, { 47,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 50 +#define MUL_FFT_THRESHOLD 1984 + +#define SQR_FFT_MODF_THRESHOLD 236 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 236, 5}, { 8, 4}, { 17, 5}, { 19, 6}, \ + { 10, 5}, { 21, 6}, { 19, 7}, { 10, 6}, \ + { 21, 7}, { 21, 8}, { 21, 9}, { 11, 8}, \ + { 23, 9}, { 19, 8}, { 43, 9}, { 23,10}, \ + { 15, 9}, { 43,10}, { 23,11}, { 15,10}, \ + { 31, 9}, { 63,10}, { 47, 8}, { 191,11}, \ + { 31,10}, { 63, 8}, { 255, 7}, { 511, 9}, \ + { 135, 8}, { 271,10}, { 71, 9}, { 143, 8}, \ + { 287, 7}, { 575,11}, { 47, 9}, { 191, 8}, \ + { 383,12}, { 4096,13}, { 8192,14}, { 16384,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, 
{4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 49 +#define SQR_FFT_THRESHOLD 1120 + +#define MULLO_BASECASE_THRESHOLD 16 +#define MULLO_DC_THRESHOLD 41 +#define MULLO_MUL_N_THRESHOLD 3791 + +#define DC_DIV_QR_THRESHOLD 27 +#define DC_DIVAPPR_Q_THRESHOLD 100 +#define DC_BDIV_QR_THRESHOLD 47 +#define DC_BDIV_Q_THRESHOLD 174 + +#define INV_MULMOD_BNM1_THRESHOLD 58 +#define INV_NEWTON_THRESHOLD 13 +#define INV_APPR_THRESHOLD 9 + +#define BINV_NEWTON_THRESHOLD 187 +#define REDC_1_TO_REDC_2_THRESHOLD 10 +#define REDC_2_TO_REDC_N_THRESHOLD 115 + +#define MU_DIV_QR_THRESHOLD 680 +#define MU_DIVAPPR_Q_THRESHOLD 618 +#define MUPI_DIV_QR_THRESHOLD 0 /* always */ +#define MU_BDIV_QR_THRESHOLD 748 +#define MU_BDIV_Q_THRESHOLD 889 + +#define MATRIX22_STRASSEN_THRESHOLD 13 +#define HGCD_THRESHOLD 53 +#define GCD_DC_THRESHOLD 283 +#define GCDEXT_DC_THRESHOLD 186 +#define JACOBI_BASE_METHOD 2 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 16 +#define SET_STR_DC_THRESHOLD 390 +#define SET_STR_PRECOMPUTE_THRESHOLD 1665 diff --git a/gmp-6.3.0/mpn/sparc64/lshift.asm b/gmp-6.3.0/mpn/sparc64/lshift.asm new file mode 100644 index 0000000..90bbb45 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/lshift.asm @@ -0,0 +1,140 @@ +dnl SPARC v9 mpn_lshift + +dnl Contributed to the GNU project by David Miller. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
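+
+dnl (Referring back to the gmp-mparam.h parameters above: a hedged sketch of
+dnl how such tuned thresholds gate algorithm selection; the dispatch shown is
+dnl illustrative, not the actual mpn_mul source.)
+dnl
+dnl   if (n < 30)          /* MUL_TOOM22_THRESHOLD: schoolbook basecase */
+dnl     mpn_mul_basecase (...);
+dnl   else if (n < 187)    /* MUL_TOOM33_THRESHOLD: Karatsuba */
+dnl     mpn_toom22_mul (...);
+dnl   else if (n < 1984)   /* MUL_FFT_THRESHOLD: higher Toom variants */
+dnl     ...
+dnl   else
+dnl     ...                /* FFT multiplication */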
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 2 +C UltraSPARC 3: 2.5 +C UltraSPARC T1: 17.5 +C UltraSPARC T3: 8 +C UltraSPARC T4: 3 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`cnt', `%i3') + +define(`tcnt', `%i4') +define(`retval', `%i5') +define(`u0', `%l0') +define(`u1', `%l1') +define(`r0', `%l6') +define(`r1', `%l7') +define(`u0_off', `%o0') +define(`u1_off', `%o1') +define(`r0_off', `%o2') +define(`r1_off', `%o3') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_lshift) + save %sp, -176, %sp + + sllx n, 3, n + sub %g0, cnt, tcnt + + sub up, 8, u1_off + add rp, (5 * 8), r1_off + + ldx [n + u1_off], u1 C WAS: up - 8 + add u1_off, (3 * 8), u1_off + + sub r1_off, 8, r0_off + sub u1_off, 8, u0_off + + subcc n, (3 * 8), n + srlx u1, tcnt, retval + + bl,pn %xcc, L(end12) + sllx u1, cnt, %l3 + + ldx [n + u0_off], u0 C WAS: up - 16 + subcc n, (2 * 8), n + + ldx [n + u1_off], u1 C WAS: up - 24 + + bl,pn %xcc, L(end34) + srlx u0, tcnt, %l4 + + b,a L(top) + ALIGN(16) +L(top): + sllx u0, cnt, %l2 + or %l4, %l3, r0 + + ldx [n + u0_off], u0 C WAS: up - 16 + srlx u1, tcnt, %l5 + + stx r0, [n + r0_off] C WAS: rp - 8 + subcc n, (2 * 8), n + + sllx u1, cnt, %l3 + or %l2, %l5, r1 + + ldx [n + u1_off], u1 C WAS: up - 24 + srlx u0, tcnt, %l4 + + bge,pt %xcc, L(top) + stx r1, [n + r1_off] C WAS: rp - 16 + +L(end34): + sllx u0, cnt, %l2 + or %l4, %l3, r0 + + srlx u1, tcnt, %l5 + stx r0, [n + r0_off] C WAS: rp - 8 + + or %l2, %l5, r1 + sub n, (2 * 8), %o5 + + sllx u1, cnt, %l3 + stx r1, [%o5 + r1_off] C WAS: rp - 16 + +L(end12): + andcc n, 8, %g0 + bz,pn %xcc, L(done) + nop + + ldx [n + u0_off], u1 + srlx u1, tcnt, %l4 + or %l4, %l3, r0 + stx r0, [r0_off - 24] + sllx u1, cnt, %l3 +L(done): + stx %l3, [r0_off - 32] + + ret + restore retval, 0, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/lshiftc.asm b/gmp-6.3.0/mpn/sparc64/lshiftc.asm new file mode 100644 index 0000000..4a0f0a3 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/lshiftc.asm @@ -0,0 +1,147 @@ +dnl SPARC v9 mpn_lshiftc + +dnl Contributed to the GNU project by David Miller. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
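+
+dnl A hedged C model of the shift semantics (our own names, assuming the
+dnl <stdint.h> types): mpn_lshift above returns the cnt bits shifted out of
+dnl the high end and works from the high limb down, so rp may overlap up at
+dnl higher addresses; mpn_lshiftc below does the same but stores the one's
+dnl complement of each result limb.
+dnl
+dnl   uint64_t lshift_model (uint64_t *rp, const uint64_t *up,
+dnl                          long n, unsigned cnt)   /* 1 <= cnt <= 63 */
+dnl   {
+dnl     unsigned tcnt = 64 - cnt;
+dnl     uint64_t retval = up[n-1] >> tcnt;           /* bits shifted out */
+dnl     for (long i = n - 1; i > 0; i--)
+dnl       rp[i] = (up[i] << cnt) | (up[i-1] >> tcnt);
+dnl     rp[0] = up[0] << cnt;
+dnl     return retval;
+dnl   }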
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 3 +C UltraSPARC 3: 3 +C UltraSPARC T1: 17 +C UltraSPARC T3: 10 +C UltraSPARC T4: 3.5 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`cnt', `%i3') + +define(`tcnt', `%i4') +define(`retval', `%i5') +define(`u0', `%l0') +define(`u1', `%l1') +define(`r0', `%l6') +define(`r1', `%l7') +define(`u0_off', `%o0') +define(`u1_off', `%o1') +define(`r0_off', `%o2') +define(`r1_off', `%o3') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_lshiftc) + save %sp, -176, %sp + + sllx n, 3, n + sub %g0, cnt, tcnt + + sub up, 8, u1_off + add rp, (5 * 8), r1_off + + ldx [n + u1_off], u1 C WAS: up - 8 + add u1_off, (3 * 8), u1_off + + sub r1_off, 8, r0_off + sub u1_off, 8, u0_off + + subcc n, (3 * 8), n + srlx u1, tcnt, retval + + bl,pn %xcc, L(end12) + sllx u1, cnt, %l3 + + ldx [n + u0_off], u0 C WAS: up - 16 + subcc n, (2 * 8), n + + ldx [n + u1_off], u1 C WAS: up - 24 + + bl,pn %xcc, L(end34) + srlx u0, tcnt, %l4 + + b,a L(top) + ALIGN(16) +L(top): + not %l3, %l3 + sllx u0, cnt, %l2 + + andn %l3, %l4, r0 + ldx [n + u0_off], u0 C WAS: up - 16 + + srlx u1, tcnt, %l5 + stx r0, [n + r0_off] C WAS: rp - 8 + + subcc n, (2 * 8), n + not %l2, %l2 + + sllx u1, cnt, %l3 + andn %l2, %l5, r1 + + ldx [n + u1_off], u1 C WAS: up - 24 + srlx u0, tcnt, %l4 + + bge,pt %xcc, L(top) + stx r1, [n + r1_off] C WAS: rp - 16 + +L(end34): + not %l3, %l3 + sllx u0, cnt, %l2 + + andn %l3, %l4, r0 + srlx u1, tcnt, %l5 + + stx r0, [n + r0_off] C WAS: rp - 8 + not %l2, %l2 + + andn %l2, %l5, r1 + sub n, (2 * 8), %o5 + + sllx u1, cnt, %l3 + stx r1, [%o5 + r1_off] C WAS: rp - 16 + +L(end12): + andcc n, 8, %g0 + bz %xcc, L(done)+4 + not %l3, %l3 + + ldx [n + u0_off], u1 + srlx u1, tcnt, %l4 + andn %l3, %l4, r0 + stx r0, [r0_off - 24] + sllx u1, cnt, %l3 +L(done): + not %l3, %l3 + stx %l3, [r0_off - 32] + + ret + restore retval, 0, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/mod_1.c b/gmp-6.3.0/mpn/sparc64/mod_1.c new file mode 100644 index 0000000..ab53f9d --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/mod_1.c @@ -0,0 +1,238 @@ +/* UltraSPARC 64 mpn_mod_1 -- mpn by limb remainder. + +Copyright 1991, 1993, 1994, 1999-2001, 2003, 2010 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#include "mpn/sparc64/sparc64.h" + + +/* 64-bit divisor 32-bit divisor + cycles/limb cycles/limb + (approx) (approx) + Ultrasparc 2i: 160 120 +*/ + + +/* 32-bit divisors are treated in special case code. 
This requires 4 mulx
+   per limb instead of 8 in the general case.
+
+   For big endian systems we need HALF_ENDIAN_ADJ included in the src[i]
+   addressing, to get the two halves of each limb read in the correct order.
+   This is kept in an adj variable.  Doing that measures about 6 c/l faster
+   than just writing HALF_ENDIAN_ADJ(i) in the loop.  The latter shouldn't
+   be 6 cycles worth of work, but perhaps it doesn't schedule well (on gcc
+   3.2.1 at least).
+
+   A simple udivx/umulx loop for the 32-bit case was attempted for small
+   sizes, but at size==2 it was only about the same speed and at size==3 was
+   slower.  */
+
+static mp_limb_t
+mpn_mod_1_anynorm (mp_srcptr src_limbptr, mp_size_t size_limbs, mp_limb_t d_limb)
+{
+  int        norm, norm_rshift;
+  mp_limb_t  src_high_limb;
+  mp_size_t  i;
+
+  ASSERT (size_limbs >= 0);
+  ASSERT (d_limb != 0);
+
+  if (UNLIKELY (size_limbs == 0))
+    return 0;
+
+  src_high_limb = src_limbptr[size_limbs-1];
+
+  /* udivx is good for size==1, and no need to bother checking limb<divisor,
+     since if that's likely the caller should check it first */
+  if (UNLIKELY (size_limbs == 1))
+    return src_high_limb % d_limb;
+
+  if (d_limb <= CNST_LIMB(0xFFFFFFFF))
+    {
+      unsigned   *src, n1, n0, r, dummy_q, nshift, norm_rmask;
+      mp_size_t  size;
+      mp_limb_t  dinv_limb;
+      int        adj;
+
+      src = (unsigned *) src_limbptr;
+      size = size_limbs * 2;   /* size in halfwords */
+      r = src_high_limb >> 32;
+
+      /* If the length of the source is uniformly distributed, then there's
+         a 50% chance of the high 32-bits being zero, which we can skip.  */
+      if (r == 0)
+	{
+	  r = (unsigned) src_high_limb;
+	  size--;
+	  ASSERT (size > 0);	/* because always even */
+	}
+
+      /* Skip a division if high < divisor.  Having the test here before
+	 normalizing will still skip as often as possible.  */
+      if (r < d_limb)
+	{
+	  size--;
+	  ASSERT (size > 0);	/* because size==1 handled above */
+	}
+      else
+	r = 0;
+
+      count_leading_zeros_32 (norm, d_limb);
+      norm -= 32;
+      d_limb <<= norm;
+
+      norm_rshift = 32 - norm;
+      norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);
+      i = size-1;
+      adj = HALF_ENDIAN_ADJ (i);
+      n1 = src [i + adj];
+      r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask);
+
+      invert_half_limb (dinv_limb, d_limb);
+      adj = -adj;
+
+      for (i--; i >= 0; i--)
+	{
+	  n0 = src [i + adj];
+	  adj = -adj;
+	  nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
+	  udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb);
+	  n1 = n0;
+	}
+
+      /* same as loop, but without n0 */
+      nshift = n1 << norm;
+      udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb);
+
+      ASSERT ((r & ((1 << norm) - 1)) == 0);
+      return r >> norm;
+    }
+  else
+    {
+      mp_srcptr  src;
+      mp_size_t  size;
+      mp_limb_t  n1, n0, r, dinv, dummy_q, nshift, norm_rmask;
+
+      src = src_limbptr;
+      size = size_limbs;
+      r = src_high_limb;  /* initial remainder */
+
+      /* Skip a division if high < divisor.  Having the test here before
+	 normalizing will still skip as often as possible.  */
+      if (r < d_limb)
+	{
+	  size--;
+	  ASSERT (size > 0);	/* because size==1 handled above */
+	}
+      else
+	r = 0;
+
+      count_leading_zeros (norm, d_limb);
+      d_limb <<= norm;
+
+      norm_rshift = GMP_LIMB_BITS - norm;
+      norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);
+
+      src += size;
+      n1 = *--src;
+      r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask);
+
+      invert_limb (dinv, d_limb);
+
+      for (i = size-2; i >= 0; i--)
+	{
+	  n0 = *--src;
+	  nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
+	  udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv);
+	  n1 = n0;
+	}
+
+      /* same as loop, but without n0 */
+      nshift = n1 << norm;
+      udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv);
+
+      ASSERT ((r & ((CNST_LIMB(1) << norm) - 1)) == 0);
+      return r >> norm;
+    }
+}
+
+mp_limb_t
+mpn_mod_1 (mp_srcptr ap, mp_size_t n, mp_limb_t b)
+{
+  ASSERT (n >= 0);
+  ASSERT (b != 0);
+
+  /* Should this be handled at all?  Rely on callers?
Note un==0 is currently + required by mpz/fdiv_r_ui.c and possibly other places. */ + if (n == 0) + return 0; + + if (UNLIKELY ((b & GMP_NUMB_HIGHBIT) != 0)) + { + if (BELOW_THRESHOLD (n, MOD_1N_TO_MOD_1_1_THRESHOLD)) + { + return mpn_mod_1_anynorm (ap, n, b); + } + else + { + mp_limb_t pre[4]; + mpn_mod_1_1p_cps (pre, b); + return mpn_mod_1_1p (ap, n, b, pre); + } + } + else + { + if (BELOW_THRESHOLD (n, MOD_1U_TO_MOD_1_1_THRESHOLD)) + { + return mpn_mod_1_anynorm (ap, n, b); + } + else if (BELOW_THRESHOLD (n, MOD_1_1_TO_MOD_1_2_THRESHOLD)) + { + mp_limb_t pre[4]; + mpn_mod_1_1p_cps (pre, b); + return mpn_mod_1_1p (ap, n, b << pre[1], pre); + } + else if (BELOW_THRESHOLD (n, MOD_1_2_TO_MOD_1_4_THRESHOLD) || UNLIKELY (b > GMP_NUMB_MASK / 4)) + { + mp_limb_t pre[5]; + mpn_mod_1s_2p_cps (pre, b); + return mpn_mod_1s_2p (ap, n, b << pre[1], pre); + } + else + { + mp_limb_t pre[7]; + mpn_mod_1s_4p_cps (pre, b); + return mpn_mod_1s_4p (ap, n, b << pre[1], pre); + } + } +} diff --git a/gmp-6.3.0/mpn/sparc64/mod_1_4.c b/gmp-6.3.0/mpn/sparc64/mod_1_4.c new file mode 100644 index 0000000..735a402 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/mod_1_4.c @@ -0,0 +1,235 @@ +/* mpn_mod_1s_4p (ap, n, b, cps) + Divide (ap,,n) by b. Return the single-limb remainder. + Requires that d < B / 4. + + Contributed to the GNU project by Torbjorn Granlund. + Based on a suggestion by Peter L. Montgomery. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2008-2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" +#include "longlong.h" + +#include "mpn/sparc64/sparc64.h" + +void +mpn_mod_1s_4p_cps (mp_limb_t cps[7], mp_limb_t b) +{ + mp_limb_t bi; + mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb; + int cnt; + + ASSERT (b <= (~(mp_limb_t) 0) / 4); + + count_leading_zeros (cnt, b); + + b <<= cnt; + invert_limb (bi, b); + + cps[0] = bi; + cps[1] = cnt; + + B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt)); + ASSERT (B1modb <= b); /* NB: not fully reduced mod b */ + cps[2] = B1modb >> cnt; + + udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi); + cps[3] = B2modb >> cnt; + + udiv_rnnd_preinv (B3modb, B2modb, CNST_LIMB(0), b, bi); + cps[4] = B3modb >> cnt; + + udiv_rnnd_preinv (B4modb, B3modb, CNST_LIMB(0), b, bi); + cps[5] = B4modb >> cnt; + + udiv_rnnd_preinv (B5modb, B4modb, CNST_LIMB(0), b, bi); + cps[6] = B5modb >> cnt; + +#if WANT_ASSERT + { + int i; + b = cps[2]; + for (i = 3; i <= 6; i++) + { + b += cps[i]; + ASSERT (b >= cps[i]); + } + } +#endif +} + +mp_limb_t +mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[7]) +{ + mp_limb_t rh, rl, bi, ph, pl, ch, cl, r; + mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb; + mp_size_t i; + int cnt; + + ASSERT (n >= 1); + + B1modb = cps[2]; + B2modb = cps[3]; + B3modb = cps[4]; + B4modb = cps[5]; + B5modb = cps[6]; + + if ((b >> 32) == 0) + { + switch (n & 3) + { + case 0: + umul_ppmm_s (ph, pl, ap[n - 3], B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 4]); + umul_ppmm_s (ch, cl, ap[n - 2], B2modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + umul_ppmm_s (rh, rl, ap[n - 1], B3modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + n -= 4; + break; + case 1: + rh = 0; + rl = ap[n - 1]; + n -= 1; + break; + case 2: + rh = ap[n - 1]; + rl = ap[n - 2]; + n -= 2; + break; + case 3: + umul_ppmm_s (ph, pl, ap[n - 2], B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 3]); + umul_ppmm_s (rh, rl, ap[n - 1], B2modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + n -= 3; + break; + } + + for (i = n - 4; i >= 0; i -= 4) + { + /* rr = ap[i] < B + + ap[i+1] * (B mod b) <= (B-1)(b-1) + + ap[i+2] * (B^2 mod b) <= (B-1)(b-1) + + ap[i+3] * (B^3 mod b) <= (B-1)(b-1) + + LO(rr) * (B^4 mod b) <= (B-1)(b-1) + + HI(rr) * (B^5 mod b) <= (B-1)(b-1) + */ + umul_ppmm_s (ph, pl, ap[i + 1], B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i + 0]); + + umul_ppmm_s (ch, cl, ap[i + 2], B2modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm_s (ch, cl, ap[i + 3], B3modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm_s (ch, cl, rl, B4modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm_s (rh, rl, rh, B5modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + } + + umul_ppmm_s (rh, cl, rh, B1modb); + add_ssaaaa (rh, rl, rh, rl, CNST_LIMB(0), cl); + } + else + { + switch (n & 3) + { + case 0: + umul_ppmm (ph, pl, ap[n - 3], B1modb); + add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 4]); + umul_ppmm (ch, cl, ap[n - 2], B2modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + umul_ppmm (rh, rl, ap[n - 1], B3modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + n -= 4; + break; + case 1: + rh = 0; + rl = ap[n - 1]; + n -= 1; + break; + case 2: + rh = ap[n - 1]; + rl = ap[n - 2]; + n -= 2; + break; + case 3: + umul_ppmm (ph, pl, ap[n - 2], B1modb); + add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 3]); + umul_ppmm (rh, rl, ap[n - 1], B2modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + n -= 3; + break; + } + + for (i = n - 4; i >= 0; i -= 4) + { + /* rr = ap[i] < B + + ap[i+1] * (B mod b) <= (B-1)(b-1) + + ap[i+2] * (B^2 mod b) <= 
(B-1)(b-1) + + ap[i+3] * (B^3 mod b) <= (B-1)(b-1) + + LO(rr) * (B^4 mod b) <= (B-1)(b-1) + + HI(rr) * (B^5 mod b) <= (B-1)(b-1) + */ + umul_ppmm (ph, pl, ap[i + 1], B1modb); + add_ssaaaa (ph, pl, ph, pl, 0, ap[i + 0]); + + umul_ppmm (ch, cl, ap[i + 2], B2modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm (ch, cl, ap[i + 3], B3modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm (ch, cl, rl, B4modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm (rh, rl, rh, B5modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + } + + umul_ppmm (rh, cl, rh, B1modb); + add_ssaaaa (rh, rl, rh, rl, 0, cl); + } + + bi = cps[0]; + cnt = cps[1]; + + r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt)); + udiv_rnnd_preinv (r, r, rl << cnt, b, bi); + + return r >> cnt; +} diff --git a/gmp-6.3.0/mpn/sparc64/mode1o.c b/gmp-6.3.0/mpn/sparc64/mode1o.c new file mode 100644 index 0000000..771c999 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/mode1o.c @@ -0,0 +1,196 @@ +/* UltraSPARC 64 mpn_modexact_1c_odd -- mpn by limb exact style remainder. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2000-2003 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#include "mpn/sparc64/sparc64.h" + + +/* 64-bit divisor 32-bit divisor + cycles/limb cycles/limb + (approx) (approx) + Ultrasparc 2i: ? ? +*/ + + +/* This implementation reduces the number of multiplies done, knowing that + on ultrasparc 1 and 2 the mulx instruction stalls the whole chip. + + The key idea is to use the fact that the low limb of q*d equals l, this + being the whole purpose of the q calculated. It means there's no need to + calculate the lowest 32x32->64 part of the q*d, instead it can be + inferred from l and the other three 32x32->64 parts. See sparc64.h for + details. + + When d is 32-bits, the same applies, but in this case there's only one + other 32x32->64 part (ie. HIGH(q)*d). + + The net effect is that for 64-bit divisor each limb is 4 mulx, or for + 32-bit divisor each is 2 mulx. + + Enhancements: + + No doubt this could be done in assembler, if that helped the scheduling, + or perhaps guaranteed good code irrespective of the compiler. + + Alternatives: + + It might be possibly to use floating point. The loop is dominated by + multiply latency, so not sure if floats would improve that. 
One + possibility would be to take two limbs at a time, with a 128 bit inverse, + if there's enough registers, which could effectively use float throughput + to reduce total latency across two limbs. */ + +#define ASSERT_RETVAL(r) \ + ASSERT (orig_c < d ? r < d : r <= d) + +mp_limb_t +mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t orig_c) +{ + mp_limb_t c = orig_c; + mp_limb_t s, l, q, h, inverse; + + ASSERT (size >= 1); + ASSERT (d & 1); + ASSERT_MPN (src, size); + ASSERT_LIMB (d); + ASSERT_LIMB (c); + + /* udivx is faster than 10 or 12 mulx's for one limb via an inverse */ + if (size == 1) + { + s = src[0]; + if (s > c) + { + l = s-c; + h = l % d; + if (h != 0) + h = d - h; + } + else + { + l = c-s; + h = l % d; + } + return h; + } + + binvert_limb (inverse, d); + + if (d <= 0xFFFFFFFF) + { + s = *src++; + size--; + do + { + SUBC_LIMB (c, l, s, c); + s = *src++; + q = l * inverse; + umul_ppmm_half_lowequal (h, q, d, l); + c += h; + size--; + } + while (size != 0); + + if (s <= d) + { + /* With high s <= d the final step can be a subtract and addback. + If c==0 then the addback will restore to l>=0. If c==d then + will get l==d if s==0, but that's ok per the function + definition. */ + + l = c - s; + l += (l > c ? d : 0); + + ASSERT_RETVAL (l); + return l; + } + else + { + /* Can't skip a divide, just do the loop code once more. */ + SUBC_LIMB (c, l, s, c); + q = l * inverse; + umul_ppmm_half_lowequal (h, q, d, l); + c += h; + + ASSERT_RETVAL (c); + return c; + } + } + else + { + mp_limb_t dl = LOW32 (d); + mp_limb_t dh = HIGH32 (d); + long i; + + s = *src++; + size--; + do + { + SUBC_LIMB (c, l, s, c); + s = *src++; + q = l * inverse; + umul_ppmm_lowequal (h, q, d, dh, dl, l); + c += h; + size--; + } + while (size != 0); + + if (s <= d) + { + /* With high s <= d the final step can be a subtract and addback. + If c==0 then the addback will restore to l>=0. If c==d then + will get l==d if s==0, but that's ok per the function + definition. */ + + l = c - s; + l += (l > c ? d : 0); + + ASSERT_RETVAL (l); + return l; + } + else + { + /* Can't skip a divide, just do the loop code once more. */ + SUBC_LIMB (c, l, s, c); + q = l * inverse; + umul_ppmm_lowequal (h, q, d, dh, dl, l); + c += h; + + ASSERT_RETVAL (c); + return c; + } + } +} diff --git a/gmp-6.3.0/mpn/sparc64/rshift.asm b/gmp-6.3.0/mpn/sparc64/rshift.asm new file mode 100644 index 0000000..3f8e11f --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/rshift.asm @@ -0,0 +1,142 @@ +dnl SPARC v9 mpn_rshift + +dnl Contributed to the GNU project by David Miller. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 2 +C UltraSPARC 3: 2.5 +C UltraSPARC T1: 17.5 +C UltraSPARC T3: 8 +C UltraSPARC T4: 3 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`cnt', `%i3') + +define(`tcnt', `%i4') +define(`retval', `%i5') +define(`u0', `%l0') +define(`u1', `%l1') +define(`r0', `%l6') +define(`r1', `%l7') +define(`u0_off', `%o0') +define(`u1_off', `%o1') +define(`r0_off', `%o2') +define(`r1_off', `%o3') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_rshift) + save %sp, -176, %sp + + sllx n, 3, n + sub %g0, cnt, tcnt + + add up, n, up + add rp, n, rp + + neg n, n + sub up, (2 * 8), u0_off + sub rp, (5 * 8), r0_off + + ldx [n + up], u1 C WAS: up + 0 + sub u0_off, (1 * 8), u1_off + sub r0_off, (1 * 8), r1_off + + subcc n, -(3 * 8), n + sllx u1, tcnt, retval + + bg,pn %xcc, L(end12) + srlx u1, cnt, %l3 + + ldx [n + u0_off], u0 C WAS: up + 0 + subcc n, -(2 * 8), n + + ldx [n + u1_off], u1 C WAS: up + 8 + + bg,pn %xcc, L(end34) + sllx u0, tcnt, %l4 + + b,a L(top) + ALIGN(16) +L(top): + srlx u0, cnt, %l2 + or %l3, %l4, r0 + + ldx [n + u0_off], u0 C WAS: up + 0 + sllx u1, tcnt, %l5 + + stx r0, [n + r0_off] C WAS: rp + 0 + subcc n, -(2 * 8), n + + srlx u1, cnt, %l3 + or %l2, %l5, r1 + + ldx [n + u1_off], u1 C WAS: up + 8 + sllx u0, tcnt, %l4 + + ble,pt %xcc, L(top) + stx r1, [n + r1_off] C WAS: rp + 8 + +L(end34): + srlx u0, cnt, %l2 + or %l3, %l4, r0 + + sllx u1, tcnt, %l5 + stx r0, [n + r0_off] C WAS: rp + 0 + + or %l2, %l5, r1 + sub n, -(2 * 8), %o5 + + srlx u1, cnt, %l3 + stx r1, [%o5 + r1_off] C WAS: rp + 8 + +L(end12): + andcc n, 8, %g0 + bz,pn %xcc, L(done) + nop + + ldx [n + u0_off], u1 + sllx u1, tcnt, %l4 + or %l3, %l4, r0 + stx r0, [r0_off + 24] + srlx u1, cnt, %l3 +L(done): + stx %l3, [r0_off + 32] + + ret + restore retval, 0, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/sec_tabselect.asm b/gmp-6.3.0/mpn/sparc64/sec_tabselect.asm new file mode 100644 index 0000000..22e0dc5 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/sec_tabselect.asm @@ -0,0 +1,162 @@ +dnl SPARC v9 mpn_sec_tabselect. + +dnl Contributed to the GNU project by Torbjörn Granlund and David Miller. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
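+
+dnl  The point of this routine is side-channel hardening: all nents table
+dnl  entries are read and masked, so the memory access pattern is
+dnl  independent of the secret index `which'.  A minimal C model of the
+dnl  technique (illustrative only; the code below unrolls the inner work
+dnl  four limbs at a time):
+dnl
+dnl	void
+dnl	sec_tabselect_model (mp_limb_t *rp, const mp_limb_t *tp,
+dnl	                     mp_size_t n, mp_size_t nents, mp_size_t which)
+dnl	{
+dnl	  mp_size_t i, k;
+dnl	  for (i = 0; i < n; i++)
+dnl	    rp[i] = 0;
+dnl	  for (k = 0; k < nents; k++)
+dnl	    {
+dnl	      mp_limb_t mask = -(mp_limb_t) (k == which);  /* all-ones or 0 */
+dnl	      for (i = 0; i < n; i++)  /* touch every entry, keep just one */
+dnl	        rp[i] |= tp[k * n + i] & mask;
+dnl	    }
+dnl	}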
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 2 hopefully +C UltraSPARC 3: 3 +C UltraSPARC T1: 17 +C UltraSPARC T3: ? +C UltraSPARC T4/T5: 2.25 hopefully + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`tp', `%i1') +define(`n', `%i2') +define(`nents', `%i3') +define(`which', `%i4') + +define(`i', `%g1') +define(`j', `%g3') +define(`stride', `%g4') +define(`tporig', `%g5') +define(`mask', `%o0') + +define(`data0', `%l0') +define(`data1', `%l1') +define(`data2', `%l2') +define(`data3', `%l3') +define(`t0', `%l4') +define(`t1', `%l5') +define(`t2', `%l6') +define(`t3', `%l7') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_sec_tabselect) + save %sp, -176, %sp + + sllx n, 3, stride + sub n, 4, j + brlz j, L(outer_end) + mov tp, tporig + +L(outer_loop): + clr data0 + clr data1 + clr data2 + clr data3 + mov tporig, tp + mov nents, i + mov which, %o1 + +L(top): subcc %o1, 1, %o1 C set carry iff o1 = 0 + ldx [tp + 0], t0 + subc %g0, %g0, mask + ldx [tp + 8], t1 + sub i, 1, i + ldx [tp + 16], t2 + ldx [tp + 24], t3 + add tp, stride, tp + and t0, mask, t0 + and t1, mask, t1 + or t0, data0, data0 + and t2, mask, t2 + or t1, data1, data1 + and t3, mask, t3 + or t2, data2, data2 + brnz i, L(top) + or t3, data3, data3 + + stx data0, [rp + 0] + subcc j, 4, j + stx data1, [rp + 8] + stx data2, [rp + 16] + stx data3, [rp + 24] + add tporig, (4 * 8), tporig + + brgez j, L(outer_loop) + add rp, (4 * 8), rp +L(outer_end): + + + andcc n, 2, %g0 + be L(b0x) + nop +L(b1x): clr data0 + clr data1 + mov tporig, tp + mov nents, i + mov which, %o1 + +L(tp2): subcc %o1, 1, %o1 + ldx [tp + 0], t0 + subc %g0, %g0, mask + ldx [tp + 8], t1 + sub i, 1, i + add tp, stride, tp + and t0, mask, t0 + and t1, mask, t1 + or t0, data0, data0 + brnz i, L(tp2) + or t1, data1, data1 + + stx data0, [rp + 0] + stx data1, [rp + 8] + add tporig, (2 * 8), tporig + add rp, (2 * 8), rp + + +L(b0x): andcc n, 1, %g0 + be L(b00) + nop +L(b01): clr data0 + mov tporig, tp + mov nents, i + mov which, %o1 + +L(tp1): subcc %o1, 1, %o1 + ldx [tp + 0], t0 + subc %g0, %g0, mask + sub i, 1, i + add tp, stride, tp + and t0, mask, t0 + brnz i, L(tp1) + or t0, data0, data0 + + stx data0, [rp + 0] + +L(b00): ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/sparc64.h b/gmp-6.3.0/mpn/sparc64/sparc64.h new file mode 100644 index 0000000..8698a82 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/sparc64.h @@ -0,0 +1,217 @@ +/* UltraSPARC 64 support macros. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2003 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#define LOW32(x) ((x) & 0xFFFFFFFF) +#define HIGH32(x) ((x) >> 32) + + +/* Halfword number i in src is accessed as src[i+HALF_ENDIAN_ADJ(i)]. + Plain src[i] would be incorrect in big endian, HALF_ENDIAN_ADJ has the + effect of swapping the two halves in this case. */ +#if HAVE_LIMB_BIG_ENDIAN +#define HALF_ENDIAN_ADJ(i) (1 - (((i) & 1) << 1)) /* +1 even, -1 odd */ +#endif +#if HAVE_LIMB_LITTLE_ENDIAN +#define HALF_ENDIAN_ADJ(i) 0 /* no adjust */ +#endif +#ifndef HALF_ENDIAN_ADJ +Error, error, unknown limb endianness; +#endif + + +/* umul_ppmm_lowequal sets h to the high limb of q*d, assuming the low limb + of that product is equal to l. dh and dl are the 32-bit halves of d. + + |-----high----||----low-----| + +------+------+ + | | ph = qh * dh + +------+------+ + +------+------+ + | | pm1 = ql * dh + +------+------+ + +------+------+ + | | pm2 = qh * dl + +------+------+ + +------+------+ + | | pl = ql * dl (not calculated) + +------+------+ + + Knowing that the low 64 bits is equal to l means that LOW(pm1) + LOW(pm2) + + HIGH(pl) == HIGH(l). The only thing we need from those product parts + is whether they produce a carry into the high. + + pm_l = LOW(pm1)+LOW(pm2) is done to contribute its carry, then the only + time there's a further carry from LOW(pm_l)+HIGH(pl) is if LOW(pm_l) > + HIGH(l). pl is never actually calculated. */ + +#define umul_ppmm_lowequal(h, q, d, dh, dl, l) \ + do { \ + mp_limb_t ql, qh, ph, pm1, pm2, pm_l; \ + ASSERT (dh == HIGH32(d)); \ + ASSERT (dl == LOW32(d)); \ + ASSERT (q*d == l); \ + \ + ql = LOW32 (q); \ + qh = HIGH32 (q); \ + \ + pm1 = ql * dh; \ + pm2 = qh * dl; \ + ph = qh * dh; \ + \ + pm_l = LOW32 (pm1) + LOW32 (pm2); \ + \ + (h) = ph + HIGH32 (pm1) + HIGH32 (pm2) \ + + HIGH32 (pm_l) + ((pm_l << 32) > l); \ + \ + ASSERT_HIGH_PRODUCT (h, q, d); \ + } while (0) + + +/* Set h to the high of q*d, assuming the low limb of that product is equal + to l, and that d fits in 32-bits. + + |-----high----||----low-----| + +------+------+ + | | pm = qh * dl + +------+------+ + +------+------+ + | | pl = ql * dl (not calculated) + +------+------+ + + Knowing that LOW(pm) + HIGH(pl) == HIGH(l) (mod 2^32) means that the only + time there's a carry from that sum is when LOW(pm) > HIGH(l). There's no + need to calculate pl to determine this. */ + +#define umul_ppmm_half_lowequal(h, q, d, l) \ + do { \ + mp_limb_t pm; \ + ASSERT (q*d == l); \ + ASSERT (HIGH32(d) == 0); \ + \ + pm = HIGH32(q) * d; \ + (h) = HIGH32(pm) + ((pm << 32) > l); \ + ASSERT_HIGH_PRODUCT (h, q, d); \ + } while (0) + + +/* check that h is the high limb of x*y */ +#if WANT_ASSERT +#define ASSERT_HIGH_PRODUCT(h, x, y) \ + do { \ + mp_limb_t want_h, dummy; \ + umul_ppmm (want_h, dummy, x, y); \ + ASSERT (h == want_h); \ + } while (0) +#else +#define ASSERT_HIGH_PRODUCT(h, q, d) \ + do { } while (0) +#endif + + +/* Multiply u anv v, where v < 2^32. 
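+
+   Since the high 32 bits of v are zero, only two of the four 32x32->64
+   partial products exist:
+
+	w1:w0 = u*v = (uh*v)*2^32 + ul*v
+
+   where uh,ul are the 32-bit halves of u.  A worked instance of the
+   recombination done below (illustrative numbers): for u = 2^63+1 and
+   v = 5 we get __x0 = ul*v = 5 and __x2 = uh*v = 5*2^31, giving
+   w1 = (5*2^31 + (5 >> 32)) >> 32 = 2 and w0 = (5*2^31 << 32) + 5 =
+   2^63+5, matching 5*(2^63+1) = 2*2^64 + 2^63 + 5.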
*/ +#define umul_ppmm_s(w1, w0, u, v) \ + do { \ + UWtype __x0, __x2; \ + UWtype __ul, __vl, __uh; \ + UWtype __u = (u), __v = (v); \ + \ + __ul = __ll_lowpart (__u); \ + __uh = __ll_highpart (__u); \ + __vl = __ll_lowpart (__v); \ + \ + __x0 = (UWtype) __ul * __vl; \ + __x2 = (UWtype) __uh * __vl; \ + \ + (w1) = (__x2 + (__x0 >> W_TYPE_SIZE/2)) >> W_TYPE_SIZE/2; \ + (w0) = (__x2 << W_TYPE_SIZE/2) + __x0; \ + } while (0) + +/* Count the leading zeros on a limb, but assuming it fits in 32 bits. + The count returned will be in the range 32 to 63. + This is the 32-bit generic C count_leading_zeros from longlong.h. */ +#define count_leading_zeros_32(count, x) \ + do { \ + mp_limb_t __xr = (x); \ + unsigned __a; \ + ASSERT ((x) != 0); \ + ASSERT ((x) <= CNST_LIMB(0xFFFFFFFF)); \ + __a = __xr < ((UWtype) 1 << 16) ? (__xr < ((UWtype) 1 << 8) ? 1 : 8 + 1) \ + : (__xr < ((UWtype) 1 << 24) ? 16 + 1 : 24 + 1); \ + \ + (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \ + } while (0) + + +/* Set inv to a 32-bit inverse floor((b*(b-d)-1) / d), knowing that d fits + 32 bits and is normalized (high bit set). */ +#define invert_half_limb(inv, d) \ + do { \ + mp_limb_t _n; \ + ASSERT ((d) <= 0xFFFFFFFF); \ + ASSERT ((d) & 0x80000000); \ + _n = (((mp_limb_t) -(d)) << 32) - 1; \ + (inv) = (mp_limb_t) (unsigned) (_n / (d)); \ + } while (0) + + +/* Divide nh:nl by d, setting q to the quotient and r to the remainder. + q, r, nh and nl are 32-bits each, d_limb is 32-bits but in an mp_limb_t, + dinv_limb is similarly a 32-bit inverse but in an mp_limb_t. */ + +#define udiv_qrnnd_half_preinv(q, r, nh, nl, d_limb, dinv_limb) \ + do { \ + unsigned _n2, _n10, _n1, _nadj, _q11n, _xh, _r, _q; \ + mp_limb_t _n, _x; \ + ASSERT (d_limb <= 0xFFFFFFFF); \ + ASSERT (dinv_limb <= 0xFFFFFFFF); \ + ASSERT (d_limb & 0x80000000); \ + ASSERT (nh < d_limb); \ + _n10 = (nl); \ + _n2 = (nh); \ + _n1 = (int) _n10 >> 31; \ + _nadj = _n10 + (_n1 & d_limb); \ + _x = dinv_limb * (_n2 - _n1) + _nadj; \ + _q11n = ~(_n2 + HIGH32 (_x)); /* -q1-1 */ \ + _n = ((mp_limb_t) _n2 << 32) + _n10; \ + _x = _n + d_limb * _q11n; /* n-q1*d-d */ \ + _xh = HIGH32 (_x) - d_limb; /* high(n-q1*d-d) */ \ + ASSERT (_xh == 0 || _xh == ~0); \ + _r = _x + (d_limb & _xh); /* addback */ \ + _q = _xh - _q11n; /* q1+1-addback */ \ + ASSERT (_r < d_limb); \ + ASSERT (d_limb * _q + _r == _n); \ + (r) = _r; \ + (q) = _q; \ + } while (0) diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm new file mode 100644 index 0000000..92374d2 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/add_n.asm @@ -0,0 +1,241 @@ +dnl SPARC v9 mpn_add_n -- Add two limb vectors of the same length > 0 and +dnl store sum in a third limb vector. + +dnl Copyright 2001-2003, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 4 +C UltraSPARC 3: 4.5 + +C Compute carry-out from the most significant bits of u,v, and r, where +C r=u+v+carry_in, using logic operations. + +C This code runs at 4 cycles/limb on UltraSPARC 1 and 2. It has a 4 insn +C recurrency, and the UltraSPARC 1 and 2 the IE units are 100% saturated. +C Therefore, it seems futile to try to optimize this any further... + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`vp', `%i2') +define(`n', `%i3') + +define(`u0', `%l0') +define(`u1', `%l2') +define(`u2', `%l4') +define(`u3', `%l6') +define(`v0', `%l1') +define(`v1', `%l3') +define(`v2', `%l5') +define(`v3', `%l7') + +define(`cy',`%i4') + +define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe +define(`fmnop',`fmuld %f0,%f0,%f4') dnl A quasi nop running in the FM pipe + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_add_nc) + save %sp,-160,%sp + + fitod %f0,%f0 C make sure f0 contains small, quiet number + subcc n,4,%g0 + bl,pn %xcc,.Loop0 + nop + b,a L(com) +EPILOGUE() + +PROLOGUE(mpn_add_n) + save %sp,-160,%sp + + fitod %f0,%f0 C make sure f0 contains small, quiet number + subcc n,4,%g0 + bl,pn %xcc,.Loop0 + mov 0,cy +L(com): + ldx [up+0],u0 + ldx [vp+0],v0 + add up,32,up + ldx [up-24],u1 + ldx [vp+8],v1 + add vp,32,vp + ldx [up-16],u2 + ldx [vp-16],v2 + ldx [up-8],u3 + ldx [vp-8],v3 + subcc n,8,n + add u0,v0,%g1 C main add + add %g1,cy,%g5 C carry add + or u0,v0,%g2 + bl,pn %xcc,.Lend4567 + fanop + b,a .Loop + + .align 16 +C START MAIN LOOP +.Loop: andn %g2,%g5,%g2 + and u0,v0,%g3 + ldx [up+0],u0 + fanop +C -- + or %g3,%g2,%g2 + ldx [vp+0],v0 + add up,32,up + fanop +C -- + srlx %g2,63,cy + add u1,v1,%g1 + stx %g5,[rp+0] + fanop +C -- + add %g1,cy,%g5 + or u1,v1,%g2 + fmnop + fanop +C -- + andn %g2,%g5,%g2 + and u1,v1,%g3 + ldx [up-24],u1 + fanop +C -- + or %g3,%g2,%g2 + ldx [vp+8],v1 + add vp,32,vp + fanop +C -- + srlx %g2,63,cy + add u2,v2,%g1 + stx %g5,[rp+8] + fanop +C -- + add %g1,cy,%g5 + or u2,v2,%g2 + fmnop + fanop +C -- + andn %g2,%g5,%g2 + and u2,v2,%g3 + ldx [up-16],u2 + fanop +C -- + or %g3,%g2,%g2 + ldx [vp-16],v2 + add rp,32,rp + fanop +C -- + srlx %g2,63,cy + add u3,v3,%g1 + stx %g5,[rp-16] + fanop +C -- + add %g1,cy,%g5 + or u3,v3,%g2 + fmnop + fanop +C -- + andn %g2,%g5,%g2 + and u3,v3,%g3 + ldx [up-8],u3 + fanop +C -- + or %g3,%g2,%g2 + subcc n,4,n + ldx [vp-8],v3 + fanop +C -- + srlx %g2,63,cy + add u0,v0,%g1 + stx %g5,[rp-8] + fanop +C -- + add %g1,cy,%g5 + or u0,v0,%g2 + bge,pt %xcc,.Loop + fanop +C END MAIN LOOP +.Lend4567: + andn %g2,%g5,%g2 + and u0,v0,%g3 + or %g3,%g2,%g2 + srlx %g2,63,cy + add u1,v1,%g1 + stx %g5,[rp+0] + add %g1,cy,%g5 + or u1,v1,%g2 + andn %g2,%g5,%g2 + and u1,v1,%g3 + or %g3,%g2,%g2 + srlx %g2,63,cy + add u2,v2,%g1 + stx %g5,[rp+8] + add %g1,cy,%g5 + or u2,v2,%g2 + andn %g2,%g5,%g2 + and u2,v2,%g3 + or %g3,%g2,%g2 + add rp,32,rp + srlx %g2,63,cy + add u3,v3,%g1 + stx %g5,[rp-16] + add %g1,cy,%g5 + or u3,v3,%g2 + andn %g2,%g5,%g2 + and u3,v3,%g3 + or %g3,%g2,%g2 + 
srlx %g2,63,cy + stx %g5,[rp-8] + + addcc n,4,n + bz,pn %xcc,.Lret + fanop + +.Loop0: ldx [up],u0 + add up,8,up + ldx [vp],v0 + add vp,8,vp + add rp,8,rp + subcc n,1,n + add u0,v0,%g1 + or u0,v0,%g2 + add %g1,cy,%g5 + and u0,v0,%g3 + andn %g2,%g5,%g2 + stx %g5,[rp-8] + or %g3,%g2,%g2 + bnz,pt %xcc,.Loop0 + srlx %g2,63,cy + +.Lret: mov cy,%i0 + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm new file mode 100644 index 0000000..48a9414 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_1.asm @@ -0,0 +1,606 @@ +dnl SPARC v9 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add +dnl the result to a second limb vector. + +dnl Copyright 1998, 2000-2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 14 +C UltraSPARC 3: 17.5 + +C Algorithm: We use eight floating-point multiplies per limb product, with the +C invariant v operand split into four 16-bit pieces, and the up operand split +C into 32-bit pieces. We sum pairs of 48-bit partial products using +C floating-point add, then convert the four 49-bit product-sums and transfer +C them to the integer unit. + +C Possible optimizations: +C 0. Rewrite to use algorithm of mpn_addmul_2. +C 1. Align the stack area where we transfer the four 49-bit product-sums +C to a 32-byte boundary. That would minimize the cache collision. +C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would +C be to align the area to map to the area immediately before up?) +C 2. Sum the 4 49-bit quantities using 32-bit operations, as in the +C develop mpn_addmul_2. This would save many integer instructions. +C 3. Unrolling. Questionable if it is worth the code expansion, given that +C it could only save 1 cycle/limb. +C 4. Specialize for particular v values. If its upper 32 bits are zero, we +C could save many operations, in the FPU (fmuld), but more so in the IEU +C since we'll be summing 48-bit quantities, which might be simpler. +C 5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and +C the i00,i16,i32,i48 RAW less apart. The latter apart-scheduling should +C not be greater than needed for L2 cache latency, and also not so great +C that i16 needs to be copied. +C 6. Avoid performing mem+fa+fm in the same cycle, at least not when we want +C to get high IEU bandwidth. (12 of the 14 cycles will be free for 2 IEU +C ops.) 
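+
+C For reference, the splitting described above amounts to the following
+C plain-C identity per limb product (an arithmetic sketch only; the real
+C code performs these multiplies in floating point and pipelines them):
+C
+C	mp_limb_t v00 = v & 0xffff,         v16 = (v >> 16) & 0xffff;
+C	mp_limb_t v32 = (v >> 32) & 0xffff, v48 = v >> 48;
+C	mp_limb_t u00 = u & 0xffffffff,     u32 = u >> 32;
+C
+C	mp_limb_t i00 = u00 * v00;               /* < 2^48 */
+C	mp_limb_t i16 = u00 * v16;               /* < 2^48 */
+C	mp_limb_t i32 = u00 * v32 + u32 * v00;   /* < 2^49 */
+C	mp_limb_t i48 = u00 * v48 + u32 * v16;   /* < 2^49 */
+C
+C Then u*v = i00 + (i16 << 16) + (i32 << 32) + (i48 << 48) plus
+C (u32*v32 << 64) + (u32*v48 << 80) as a 128-bit identity; the loop
+C accounts for the last two products by adding them into the i00 and i16
+C sums of the next limb.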
+ +C Instruction classification (as per UltraSPARC-1/2 functional units): +C 8 FM +C 10 FA +C 12 MEM +C 10 ISHIFT + 14 IADDLOG +C 1 BRANCH +C 55 insns totally (plus one mov insn that should be optimized out) + +C The loop executes 56 instructions in 14 cycles on UltraSPARC-1/2, i.e we +C sustain the peak execution rate of 4 instructions/cycle. + +C INPUT PARAMETERS +C rp i0 +C up i1 +C n i2 +C v i3 + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) + +define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14') +define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22') +define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30') +define(`u00',`%f32') define(`u32', `%f34') +define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42') +define(`cy',`%g1') +define(`rlimb',`%g3') +define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3') +define(`xffffffff',`%l7') +define(`xffff',`%o0') + +PROLOGUE(mpn_addmul_1) + +C Initialization. (1) Split v operand into four 16-bit chunks and store them +C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs +C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'. + + save %sp, -256, %sp + mov -1, %g4 + srlx %g4, 48, xffff C store mask in register `xffff' + and %i3, xffff, %g2 + stx %g2, [%sp+2223+0] + srlx %i3, 16, %g3 + and %g3, xffff, %g3 + stx %g3, [%sp+2223+8] + srlx %i3, 32, %g2 + and %g2, xffff, %g2 + stx %g2, [%sp+2223+16] + srlx %i3, 48, %g3 + stx %g3, [%sp+2223+24] + srlx %g4, 32, xffffffff C store mask in register `xffffffff' + + sllx %i2, 3, %i2 + mov 0, cy C clear cy + add %i0, %i2, %i0 + add %i1, %i2, %i1 + neg %i2 + add %i1, 4, %i5 + add %i0, -32, %i4 + add %i0, -16, %i0 + + ldd [%sp+2223+0], v00 + ldd [%sp+2223+8], v16 + ldd [%sp+2223+16], v32 + ldd [%sp+2223+24], v48 + ld [%sp+2223+0],%f2 C zero f2 + ld [%sp+2223+0],%f4 C zero f4 + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fxtod v00, v00 + fxtod v16, v16 + fxtod v32, v32 + fxtod v48, v48 + +C Start real work. (We sneakingly read f3 and f5 above...) +C The software pipeline is very deep, requiring 4 feed-in stages. 
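+C Each up[i] passes through the same stations: fxtod conversion, eight
+C fmuld, four faddd pair-sums, fdtox, a round trip through the stack
+C area at %sp+2223, then integer recombination of the four product-sums
+C at bit offsets 0, 16, 32 and 48.  The .L_one .. .L_four blocks peel
+C off up to four front iterations to fill that pipeline, and the
+C matching .L_out_1 .. .L_out_4 blocks drain it.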
+ + fxtod %f2, u00 + fxtod %f4, u32 + fmuld u00, v00, a00 + fmuld u00, v16, a16 + fmuld u00, v32, p32 + fmuld u32, v00, r32 + fmuld u00, v48, p48 + addcc %i2, 8, %i2 + bnz,pt %xcc, .L_two_or_more + fmuld u32, v16, r48 + +.L_one: + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + fdtox a00, a00 + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + fdtox a32, a32 + fdtox a48, a48 + std a00, [%sp+2223+0] + std a16, [%sp+2223+8] + std a32, [%sp+2223+16] + std a48, [%sp+2223+24] + add %i2, 8, %i2 + + fdtox r64, a00 + ldx [%i0+%i2], rlimb C read rp[i] + fdtox r80, a16 + ldx [%sp+2223+0], i00 + ldx [%sp+2223+8], i16 + ldx [%sp+2223+16], i32 + ldx [%sp+2223+24], i48 + std a00, [%sp+2223+0] + std a16, [%sp+2223+8] + add %i2, 8, %i2 + + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + add i00, %g5, %g5 C i00+ now in g5 + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + sllx i48, 32, %l6 C (i48 << 32) + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + add cy, %g5, %o4 C x = prev(i00) + cy + b .L_out_1 + add %i2, 8, %i2 + +.L_two_or_more: + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + fdtox a32, a32 + fxtod %f2, u00 + fxtod %f4, u32 + fdtox a48, a48 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 + std a16, [%sp+2223+8] + fmuld u00, v16, p16 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 + faddd p16, r80, a16 + fmuld u00, v48, p48 + addcc %i2, 8, %i2 + bnz,pt %xcc, .L_three_or_more + fmuld u32, v16, r48 + +.L_two: + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + fdtox a00, a00 + ldx [%i0+%i2], rlimb C read rp[i] + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + ldx [%sp+2223+8], i16 + ldx [%sp+2223+16], i32 + ldx [%sp+2223+24], i48 + fdtox a48, a48 + std a00, [%sp+2223+0] + std a16, [%sp+2223+8] + std a32, [%sp+2223+16] + std a48, [%sp+2223+24] + add %i2, 8, %i2 + + fdtox r64, a00 + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + ldx [%i0+%i2], rlimb C read rp[i] + add i00, %g5, %g5 C i00+ now in g5 + fdtox r80, a16 + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + ldx [%sp+2223+24], i48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + add cy, %g5, %o4 C x = prev(i00) + cy + b .L_out_2 + add %i2, 8, %i2 + +.L_three_or_more: + ld [%i5+%i2], %f3 C read low 32 
bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 + ldx [%i0+%i2], rlimb C read rp[i] + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + ldx [%sp+2223+8], i16 + fxtod %f2, u00 + ldx [%sp+2223+16], i32 + fxtod %f4, u32 + ldx [%sp+2223+24], i48 + fdtox a48, a48 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 + std a16, [%sp+2223+8] + fmuld u00, v16, p16 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 + faddd p16, r80, a16 + fmuld u00, v48, p48 + addcc %i2, 8, %i2 + bnz,pt %xcc, .L_four_or_more + fmuld u32, v16, r48 + +.L_three: + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + fdtox a00, a00 + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + ldx [%i0+%i2], rlimb C read rp[i] + faddd p48, r48, a48 + add i00, %g5, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + ldx [%sp+2223+24], i48 + fdtox a48, a48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + add cy, %g5, %o4 C x = prev(i00) + cy + b .L_out_3 + add %i2, 8, %i2 + +.L_four_or_more: + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + ldx [%i0+%i2], rlimb C read rp[i] + faddd p48, r48, a48 + add i00, %g5, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + fxtod %f2, u00 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + fxtod %f4, u32 + sllx i48, 32, %l6 C (i48 << 32) + ldx [%sp+2223+24], i48 + fdtox a48, a48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + fmuld u00, v16, p16 + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + faddd p16, r80, a16 + fmuld u00, v48, p48 + add cy, %g5, %o4 C x = prev(i00) + cy + addcc %i2, 8, %i2 + bnz,pt %xcc, .Loop + fmuld u32, v16, r48 + +.L_four: + b,a .L_out_4 + +C BEGIN MAIN LOOP + .align 16 +.Loop: +C 00 + srlx %o4, 16, %o5 C (x >> 16) + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 +C 01 + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + ld 
[%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 +C 02 + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + ldx [%i0+%i2], rlimb C read rp[i] + faddd p48, r48, a48 +C 03 + srlx %o2, 48, %o7 C (mi64 >> 48) + add i00, %g5, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 +C 04 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + fdtox a32, a32 +C 05 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + fxtod %f2, u00 +C 06 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + fxtod %f4, u32 +C 07 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + ldx [%sp+2223+24], i48 + fdtox a48, a48 +C 08 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 +C 09 + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + fmuld u00, v16, p16 +C 10 + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 +C 11 + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 +C 12 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] + faddd p16, r80, a16 + fmuld u00, v48, p48 +C 13 + add cy, %g5, %o4 C x = prev(i00) + cy + addcc %i2, 8, %i2 + bnz,pt %xcc, .Loop + fmuld u32, v16, r48 +C END MAIN LOOP + +.L_out_4: + srlx %o4, 16, %o5 C (x >> 16) + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + fdtox a00, a00 + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + ldx [%i0+%i2], rlimb C read rp[i] + faddd p48, r48, a48 + srlx %o2, 48, %o7 C (mi64 >> 48) + add i00, %g5, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + fdtox a32, a32 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + ldx [%sp+2223+24], i48 + fdtox a48, a48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] + add cy, %g5, %o4 C x = prev(i00) + cy + add %i2, 8, %i2 +.L_out_3: + srlx %o4, 16, %o5 C (x >> 16) + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + fdtox r64, a00 + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + ldx [%i0+%i2], rlimb C read rp[i] + srlx %o2, 48, %o7 C (mi64 >> 48) + add i00, %g5, %g5 C i00+ now in g5 + fdtox r80, a16 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + ldx [%sp+2223+24], i48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C 
hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] + add cy, %g5, %o4 C x = prev(i00) + cy + add %i2, 8, %i2 +.L_out_2: + srlx %o4, 16, %o5 C (x >> 16) + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + srlx %o2, 48, %o7 C (mi64 >> 48) + add i00, %g5, %g5 C i00+ now in g5 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] + add cy, %g5, %o4 C x = prev(i00) + cy + add %i2, 8, %i2 +.L_out_1: + srlx %o4, 16, %o5 C (x >> 16) + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + srlx %o2, 48, %o7 C (mi64 >> 48) + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + or %i3, %o5, %o5 + stx %o5, [%i4+%i2] + + sllx i00, 0, %g2 + add %g2, cy, cy + sllx i16, 16, %g3 + add %g3, cy, cy + + return %i7+8 + mov cy, %o0 +EPILOGUE(mpn_addmul_1) diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm new file mode 100644 index 0000000..37674d7 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/addmul_2.asm @@ -0,0 +1,551 @@ +dnl SPARC v9 64-bit mpn_addmul_2 -- Multiply an n limb number with 2-limb +dnl number and add the result to a n limb vector. + +dnl Copyright 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 9 +C UltraSPARC 3: 10 + +C Algorithm: We use 16 floating-point multiplies per limb product, with the +C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand +C split into 32-bit pieces. 
We sum four 48-bit partial products using +C floating-point add, then convert the resulting four 50-bit quantities and +C transfer them to the integer unit. + +C Possible optimizations: +C 1. Align the stack area where we transfer the four 50-bit product-sums +C to a 32-byte boundary. That would minimize the cache collision. +C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would +C be to align the area to map to the area immediately before up?) +C 2. Perform two of the fp->int conversions with integer instructions. We +C can get almost ten free IEU slots, if we clean up bookkeeping and the +C silly carry-limb code. +C 3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb +C code. + +C OSP (Overlapping software pipeline) version of mpn_mul_basecase: +C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles. +C FI = 20 +C L = 9 x un * vn +C WDFI = 10 x vn / 2 +C WD = 4 + +C Instruction classification (as per UltraSPARC functional units). +C Assuming silly carry code is fixed. Includes bookkeeping. +C +C mpn_addmul_X mpn_mul_X +C 1 2 1 2 +C ========== ========== +C FM 8 16 8 16 +C FA 10 18 10 18 +C MEM 12 12 10 10 +C ISHIFT 6 6 6 6 +C IADDLOG 11 11 10 10 +C BRANCH 1 1 1 1 +C +C TOTAL IEU 17 17 16 16 +C TOTAL 48 64 45 61 +C +C IEU cycles 8.5 8.5 8 8 +C MEM cycles 12 12 10 10 +C ISSUE cycles 12 16 11.25 15.25 +C FPU cycles 10 18 10 18 +C cycles/loop 12 18 12 18 +C cycles/limb 12 9 12 9 + + +C INPUT PARAMETERS +C rp[n + 1] i0 +C up[n] i1 +C n i2 +C vp[2] i3 + + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) + +C Combine registers: +C u00_hi= u32_hi +C u00_lo= u32_lo +C a000 = out000 +C a016 = out016 +C Free: f52 f54 + + +define(`p000', `%f8') define(`p016',`%f10') +define(`p032',`%f12') define(`p048',`%f14') +define(`p064',`%f16') define(`p080',`%f18') +define(`p096a',`%f20') define(`p112a',`%f22') +define(`p096b',`%f56') define(`p112b',`%f58') + +define(`out000',`%f0') define(`out016',`%f6') + +define(`v000',`%f24') define(`v016',`%f26') +define(`v032',`%f28') define(`v048',`%f30') +define(`v064',`%f44') define(`v080',`%f46') +define(`v096',`%f48') define(`v112',`%f50') + +define(`u00',`%f32') define(`u32', `%f34') + +define(`a000',`%f36') define(`a016',`%f38') +define(`a032',`%f40') define(`a048',`%f42') +define(`a064',`%f60') define(`a080',`%f62') + +define(`u00_hi',`%f2') define(`u32_hi',`%f4') +define(`u00_lo',`%f3') define(`u32_lo',`%f5') + +define(`cy',`%g1') +define(`rlimb',`%g3') +define(`i00',`%l0') define(`i16',`%l1') +define(`r00',`%l2') define(`r32',`%l3') +define(`xffffffff',`%l7') +define(`xffff',`%o0') + + +PROLOGUE(mpn_addmul_2) + +C Initialization. (1) Split v operand into eight 16-bit chunks and store them +C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs +C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'. +C This code could be better scheduled. 
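+
+C In terms of public mpn entry points, the net effect of the routine can
+C be modelled in C as below (a behavioural sketch for reference only, not
+C how the code works internally):
+C
+C	mp_limb_t
+C	addmul_2_model (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
+C	{
+C	  /* {rp,n+1} += {up,n} * {vp,2}; return the carry limb */
+C	  mp_limb_t c0 = mpn_addmul_1 (rp, up, n, vp[0]);
+C	  mp_limb_t c1 = mpn_addmul_1 (rp + 1, up, n, vp[1]);
+C	  return c1 + mpn_add_1 (rp + n, rp + n, 1, c0);
+C	}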
+ + save %sp, -256, %sp + +ifdef(`HAVE_VIS', +` mov -1, %g4 + wr %g0, 0xD2, %asi + srlx %g4, 32, xffffffff C store mask in register `xffffffff' + ldda [%i3+6] %asi, v000 + ldda [%i3+4] %asi, v016 + ldda [%i3+2] %asi, v032 + ldda [%i3+0] %asi, v048 + fxtod v000, v000 + ldda [%i3+14] %asi, v064 + fxtod v016, v016 + ldda [%i3+12] %asi, v080 + fxtod v032, v032 + ldda [%i3+10] %asi, v096 + fxtod v048, v048 + ldda [%i3+8] %asi, v112 + fxtod v064, v064 + fxtod v080, v080 + fxtod v096, v096 + fxtod v112, v112 + fzero u00_hi + fzero u32_hi +', +` mov -1, %g4 + ldx [%i3+0], %l0 C vp[0] + srlx %g4, 48, xffff C store mask in register `xffff' + ldx [%i3+8], %l1 C vp[1] + + and %l0, xffff, %g2 + stx %g2, [%sp+2223+0] + srlx %l0, 16, %g3 + and %g3, xffff, %g3 + stx %g3, [%sp+2223+8] + srlx %l0, 32, %g2 + and %g2, xffff, %g2 + stx %g2, [%sp+2223+16] + srlx %l0, 48, %g3 + stx %g3, [%sp+2223+24] + and %l1, xffff, %g2 + stx %g2, [%sp+2223+32] + srlx %l1, 16, %g3 + and %g3, xffff, %g3 + stx %g3, [%sp+2223+40] + srlx %l1, 32, %g2 + and %g2, xffff, %g2 + stx %g2, [%sp+2223+48] + srlx %l1, 48, %g3 + stx %g3, [%sp+2223+56] + + srlx %g4, 32, xffffffff C store mask in register `xffffffff' + + ldd [%sp+2223+0], v000 + ldd [%sp+2223+8], v016 + ldd [%sp+2223+16], v032 + ldd [%sp+2223+24], v048 + fxtod v000, v000 + ldd [%sp+2223+32], v064 + fxtod v016, v016 + ldd [%sp+2223+40], v080 + fxtod v032, v032 + ldd [%sp+2223+48], v096 + fxtod v048, v048 + ldd [%sp+2223+56], v112 + fxtod v064, v064 + ld [%sp+2223+0], u00_hi C zero u00_hi + fxtod v080, v080 + ld [%sp+2223+0], u32_hi C zero u32_hi + fxtod v096, v096 + fxtod v112, v112 +') +C Initialization done. + mov 0, %g2 + mov 0, rlimb + mov 0, %g4 + add %i0, -8, %i0 C BOOKKEEPING + +C Start software pipeline. + + ld [%i1+4], u00_lo C read low 32 bits of up[i] + fxtod u00_hi, u00 +C mid + ld [%i1+0], u32_lo C read high 32 bits of up[i] + fmuld u00, v000, a000 + fmuld u00, v016, a016 + fmuld u00, v032, a032 + fmuld u00, v048, a048 + add %i2, -1, %i2 C BOOKKEEPING + fmuld u00, v064, p064 + add %i1, 8, %i1 C BOOKKEEPING + fxtod u32_hi, u32 + fmuld u00, v080, p080 + fmuld u00, v096, p096a + brnz,pt %i2, .L_2_or_more + fmuld u00, v112, p112a + +.L1: fdtox a000, out000 + fmuld u32, v000, p000 + fdtox a016, out016 + fmuld u32, v016, p016 + fmovd p064, a064 + fmuld u32, v032, p032 + fmovd p080, a080 + fmuld u32, v048, p048 + std out000, [%sp+2223+16] + faddd p000, a032, a000 + fmuld u32, v064, p064 + std out016, [%sp+2223+24] + fxtod u00_hi, u00 + faddd p016, a048, a016 + fmuld u32, v080, p080 + faddd p032, a064, a032 + fmuld u32, v096, p096b + faddd p048, a080, a048 + fmuld u32, v112, p112b +C mid + fdtox a000, out000 + fdtox a016, out016 + faddd p064, p096a, a064 + faddd p080, p112a, a080 + std out000, [%sp+2223+0] + b .L_wd2 + std out016, [%sp+2223+8] + +.L_2_or_more: + ld [%i1+4], u00_lo C read low 32 bits of up[i] + fdtox a000, out000 + fmuld u32, v000, p000 + fdtox a016, out016 + fmuld u32, v016, p016 + fmovd p064, a064 + fmuld u32, v032, p032 + fmovd p080, a080 + fmuld u32, v048, p048 + std out000, [%sp+2223+16] + faddd p000, a032, a000 + fmuld u32, v064, p064 + std out016, [%sp+2223+24] + fxtod u00_hi, u00 + faddd p016, a048, a016 + fmuld u32, v080, p080 + faddd p032, a064, a032 + fmuld u32, v096, p096b + faddd p048, a080, a048 + fmuld u32, v112, p112b +C mid + ld [%i1+0], u32_lo C read high 32 bits of up[i] + fdtox a000, out000 + fmuld u00, v000, p000 + fdtox a016, out016 + fmuld u00, v016, p016 + faddd p064, p096a, a064 + fmuld u00, v032, p032 + faddd p080, p112a, a080 + 
fmuld u00, v048, p048 + add %i2, -1, %i2 C BOOKKEEPING + std out000, [%sp+2223+0] + faddd p000, a032, a000 + fmuld u00, v064, p064 + add %i1, 8, %i1 C BOOKKEEPING + std out016, [%sp+2223+8] + fxtod u32_hi, u32 + faddd p016, a048, a016 + fmuld u00, v080, p080 + faddd p032, a064, a032 + fmuld u00, v096, p096a + faddd p048, a080, a048 + brnz,pt %i2, .L_3_or_more + fmuld u00, v112, p112a + + b .Lend + nop + +C 64 32 0 +C . . . +C . |__rXXX_| 32 +C . |___cy___| 34 +C . |_______i00__| 50 +C |_______i16__| . 50 + + +C BEGIN MAIN LOOP + .align 16 +.L_3_or_more: +.Loop: ld [%i1+4], u00_lo C read low 32 bits of up[i] + and %g2, xffffffff, %g2 + fdtox a000, out000 + fmuld u32, v000, p000 +C + lduw [%i0+4+8], r00 C read low 32 bits of rp[i] + add %g2, rlimb, %l5 + fdtox a016, out016 + fmuld u32, v016, p016 +C + srlx %l5, 32, cy + ldx [%sp+2223+16], i00 + faddd p064, p096b, a064 + fmuld u32, v032, p032 +C + add %g4, cy, cy C new cy + ldx [%sp+2223+24], i16 + faddd p080, p112b, a080 + fmuld u32, v048, p048 +C + nop + std out000, [%sp+2223+16] + faddd p000, a032, a000 + fmuld u32, v064, p064 +C + add i00, r00, rlimb + add %i0, 8, %i0 C BOOKKEEPING + std out016, [%sp+2223+24] + fxtod u00_hi, u00 +C + sllx i16, 16, %g2 + add cy, rlimb, rlimb + faddd p016, a048, a016 + fmuld u32, v080, p080 +C + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + faddd p032, a064, a032 + fmuld u32, v096, p096b +C + stw %l5, [%i0+4] + nop + faddd p048, a080, a048 + fmuld u32, v112, p112b +C midloop + ld [%i1+0], u32_lo C read high 32 bits of up[i] + and %g2, xffffffff, %g2 + fdtox a000, out000 + fmuld u00, v000, p000 +C + lduw [%i0+0], r32 C read high 32 bits of rp[i] + add %g2, rlimb, %l5 + fdtox a016, out016 + fmuld u00, v016, p016 +C + srlx %l5, 32, cy + ldx [%sp+2223+0], i00 + faddd p064, p096a, a064 + fmuld u00, v032, p032 +C + add %g4, cy, cy C new cy + ldx [%sp+2223+8], i16 + faddd p080, p112a, a080 + fmuld u00, v048, p048 +C + add %i2, -1, %i2 C BOOKKEEPING + std out000, [%sp+2223+0] + faddd p000, a032, a000 + fmuld u00, v064, p064 +C + add i00, r32, rlimb + add %i1, 8, %i1 C BOOKKEEPING + std out016, [%sp+2223+8] + fxtod u32_hi, u32 +C + sllx i16, 16, %g2 + add cy, rlimb, rlimb + faddd p016, a048, a016 + fmuld u00, v080, p080 +C + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + faddd p032, a064, a032 + fmuld u00, v096, p096a +C + stw %l5, [%i0+0] + faddd p048, a080, a048 + brnz,pt %i2, .Loop + fmuld u00, v112, p112a +C END MAIN LOOP + +C WIND-DOWN PHASE 1 +.Lend: and %g2, xffffffff, %g2 + fdtox a000, out000 + fmuld u32, v000, p000 + lduw [%i0+4+8], r00 C read low 32 bits of rp[i] + add %g2, rlimb, %l5 + fdtox a016, out016 + fmuld u32, v016, p016 + srlx %l5, 32, cy + ldx [%sp+2223+16], i00 + faddd p064, p096b, a064 + fmuld u32, v032, p032 + add %g4, cy, cy C new cy + ldx [%sp+2223+24], i16 + faddd p080, p112b, a080 + fmuld u32, v048, p048 + std out000, [%sp+2223+16] + faddd p000, a032, a000 + fmuld u32, v064, p064 + add i00, r00, rlimb + add %i0, 8, %i0 C BOOKKEEPING + std out016, [%sp+2223+24] + sllx i16, 16, %g2 + add cy, rlimb, rlimb + faddd p016, a048, a016 + fmuld u32, v080, p080 + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + faddd p032, a064, a032 + fmuld u32, v096, p096b + stw %l5, [%i0+4] + faddd p048, a080, a048 + fmuld u32, v112, p112b +C mid + and %g2, xffffffff, %g2 + fdtox a000, out000 + lduw [%i0+0], r32 C read high 32 bits of rp[i] + add %g2, rlimb, %l5 + fdtox a016, out016 + srlx %l5, 32, cy + ldx [%sp+2223+0], i00 + faddd p064, p096a, a064 + add %g4, cy, cy C new cy + ldx [%sp+2223+8], i16 + faddd p080, p112a, a080 + 
std out000, [%sp+2223+0] + add i00, r32, rlimb + std out016, [%sp+2223+8] + sllx i16, 16, %g2 + add cy, rlimb, rlimb + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + stw %l5, [%i0+0] + +C WIND-DOWN PHASE 2 +.L_wd2: and %g2, xffffffff, %g2 + fdtox a032, out000 + lduw [%i0+4+8], r00 C read low 32 bits of rp[i] + add %g2, rlimb, %l5 + fdtox a048, out016 + srlx %l5, 32, cy + ldx [%sp+2223+16], i00 + add %g4, cy, cy C new cy + ldx [%sp+2223+24], i16 + std out000, [%sp+2223+16] + add i00, r00, rlimb + add %i0, 8, %i0 C BOOKKEEPING + std out016, [%sp+2223+24] + sllx i16, 16, %g2 + add cy, rlimb, rlimb + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + stw %l5, [%i0+4] +C mid + and %g2, xffffffff, %g2 + fdtox a064, out000 + lduw [%i0+0], r32 C read high 32 bits of rp[i] + add %g2, rlimb, %l5 + fdtox a080, out016 + srlx %l5, 32, cy + ldx [%sp+2223+0], i00 + add %g4, cy, cy C new cy + ldx [%sp+2223+8], i16 + std out000, [%sp+2223+0] + add i00, r32, rlimb + std out016, [%sp+2223+8] + sllx i16, 16, %g2 + add cy, rlimb, rlimb + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + stw %l5, [%i0+0] + +C WIND-DOWN PHASE 3 +.L_wd3: and %g2, xffffffff, %g2 + fdtox p096b, out000 + add %g2, rlimb, %l5 + fdtox p112b, out016 + srlx %l5, 32, cy + ldx [%sp+2223+16], rlimb + add %g4, cy, cy C new cy + ldx [%sp+2223+24], i16 + std out000, [%sp+2223+16] + add %i0, 8, %i0 C BOOKKEEPING + std out016, [%sp+2223+24] + sllx i16, 16, %g2 + add cy, rlimb, rlimb + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + stw %l5, [%i0+4] +C mid + and %g2, xffffffff, %g2 + add %g2, rlimb, %l5 + srlx %l5, 32, cy + ldx [%sp+2223+0], rlimb + add %g4, cy, cy C new cy + ldx [%sp+2223+8], i16 + sllx i16, 16, %g2 + add cy, rlimb, rlimb + srlx i16, 16, %g4 + add %g2, rlimb, %l5 + stw %l5, [%i0+0] + + and %g2, xffffffff, %g2 + add %g2, rlimb, %l5 + srlx %l5, 32, cy + ldx [%sp+2223+16], i00 + add %g4, cy, cy C new cy + ldx [%sp+2223+24], i16 + + sllx i16, 16, %g2 + add i00, cy, cy + return %i7+8 + add %g2, cy, %o0 +EPILOGUE(mpn_addmul_2) diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm new file mode 100644 index 0000000..47286d5 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/lshiftc.asm @@ -0,0 +1,165 @@ +dnl SPARC v9 mpn_lshiftc + +dnl Copyright 1996, 2000-2003, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
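Before the implementation that follows, a hedged C model of the contract of mpn_lshiftc (ref_lshiftc is an illustrative name, not GMP code): the result is the one's complement of {up,n} shifted left by cnt, and the return value is the bits shifted out of the top limb, uncomplemented.

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Illustrative reference, assuming n >= 1 and 1 <= cnt <= 63.  Walking
   from the most significant limb down mirrors the asm and keeps the
   overlapping rp == up case safe. */
mp_limb_t ref_lshiftc(mp_limb_t *rp, const mp_limb_t *up, long n, unsigned cnt)
{
    unsigned tnc = 64 - cnt;             /* the negated shift count */
    mp_limb_t retval = up[n - 1] >> tnc; /* function result */
    for (long i = n - 1; i > 0; i--)
        rp[i] = ~((up[i] << cnt) | (up[i - 1] >> tnc));
    rp[0] = ~(up[0] << cnt);
    return retval;
}

In the loop below, each sllx/not pair followed by an andn against the next limb's srlx evaluates exactly this ~((u << cnt) | (next >> tnc)) term.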
+ + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 3 +C UltraSPARC 3: 2.67 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`cnt',`%i3') + +define(`u0', `%l0') +define(`u1', `%l2') +define(`u2', `%l4') +define(`u3', `%l6') + +define(`tnc',`%i4') + +define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_lshiftc) + save %sp,-160,%sp + + sllx n,3,%g1 + sub %g0,cnt,tnc C negate shift count + add up,%g1,up C make %o1 point at end of src + add rp,%g1,rp C make %o0 point at end of res + ldx [up-8],u3 C load first limb + subcc n,5,n + srlx u3,tnc,%i5 C compute function result + bl,pn %xcc,.Lend1234 + sllx u3,cnt,%g3 + + subcc n,4,n + ldx [up-16],u0 + ldx [up-24],u1 + add up,-32,up + ldx [up-0],u2 + ldx [up-8],u3 + srlx u0,tnc,%g2 + bl,pn %xcc,.Lend5678 + not %g3, %g3 + + b,a .Loop + ALIGN(16) +.Loop: + sllx u0,cnt,%g1 + andn %g3,%g2,%g3 + ldx [up-16],u0 + fanop +C -- + srlx u1,tnc,%g2 + subcc n,4,n + stx %g3,[rp-8] + not %g1, %g1 +C -- + sllx u1,cnt,%g3 + andn %g1,%g2,%g1 + ldx [up-24],u1 + fanop +C -- + srlx u2,tnc,%g2 + stx %g1,[rp-16] + add up,-32,up + not %g3, %g3 +C -- + sllx u2,cnt,%g1 + andn %g3,%g2,%g3 + ldx [up-0],u2 + fanop +C -- + srlx u3,tnc,%g2 + stx %g3,[rp-24] + add rp,-32,rp + not %g1, %g1 +C -- + sllx u3,cnt,%g3 + andn %g1,%g2,%g1 + ldx [up-8],u3 + fanop +C -- + srlx u0,tnc,%g2 + stx %g1,[rp-0] + bge,pt %xcc,.Loop + not %g3, %g3 +C -- +.Lend5678: + sllx u0,cnt,%g1 + andn %g3,%g2,%g3 + srlx u1,tnc,%g2 + stx %g3,[rp-8] + not %g1, %g1 + sllx u1,cnt,%g3 + andn %g1,%g2,%g1 + srlx u2,tnc,%g2 + stx %g1,[rp-16] + not %g3, %g3 + sllx u2,cnt,%g1 + andn %g3,%g2,%g3 + srlx u3,tnc,%g2 + stx %g3,[rp-24] + add rp,-32,rp + not %g1, %g1 + sllx u3,cnt,%g3 C carry... + andn %g1,%g2,%g1 + stx %g1,[rp-0] + +.Lend1234: + addcc n,4,n + bz,pn %xcc,.Lret + fanop +.Loop0: + add rp,-8,rp + subcc n,1,n + ldx [up-16],u3 + add up,-8,up + srlx u3,tnc,%g2 + not %g3, %g3 + andn %g3,%g2,%g3 + stx %g3,[rp] + sllx u3,cnt,%g3 + bnz,pt %xcc,.Loop0 + fanop +.Lret: + not %g3, %g3 + stx %g3,[rp-8] + mov %i5,%i0 + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm new file mode 100644 index 0000000..871d562 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/mul_1.asm @@ -0,0 +1,580 @@ +dnl SPARC v9 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright 1998, 2000-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 14 +C UltraSPARC 3: 18.5 + +C Algorithm: We use eight floating-point multiplies per limb product, with the +C invariant v operand split into four 16-bit pieces, and the s1 operand split +C into 32-bit pieces. We sum pairs of 48-bit partial products using +C floating-point add, then convert the four 49-bit product-sums and transfer +C them to the integer unit. + +C Possible optimizations: +C 1. Align the stack area where we transfer the four 49-bit product-sums +C to a 32-byte boundary. That would minimize the cache collision. +C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would +C be to align the area to map to the area immediately before s1?) +C 2. Sum the 4 49-bit quantities using 32-bit operations, as in the +C develop mpn_addmul_2. This would save many integer instructions. +C 3. Unrolling. Questionable if it is worth the code expansion, given that +C it could only save 1 cycle/limb. +C 4. Specialize for particular v values. If its upper 32 bits are zero, we +C could save many operations, in the FPU (fmuld), but more so in the IEU +C since we'll be summing 48-bit quantities, which might be simpler. +C 5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and +C the i00,i16,i32,i48 RAW less apart. The latter apart-scheduling should +C not be greater than needed for L2 cache latency, and also not so great +C that i16 needs to be copied. +C 6. Avoid performing mem+fa+fm in the same cycle, at least not when we want +C to get high IEU bandwidth. (12 of the 14 cycles will be free for 2 IEU +C ops.) + +C Instruction classification (as per UltraSPARC-1/2 functional units): +C 8 FM +C 10 FA +C 11 MEM +C 9 ISHIFT + 10? IADDLOG +C 1 BRANCH +C 49 insns totally (plus three mov insns that should be optimized out) + +C The loop executes 53 instructions in 14 cycles on UltraSPARC-1/2, i.e we +C sustain 3.79 instructions/cycle. + +C INPUT PARAMETERS +C rp i0 +C up i1 +C n i2 +C v i3 + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) + +define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14') +define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22') +define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30') +define(`u00',`%f32') define(`u32', `%f34') +define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42') +define(`cy',`%g1') +define(`rlimb',`%g3') +define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3') +define(`xffffffff',`%l7') +define(`xffff',`%o0') + +PROLOGUE(mpn_mul_1) + +C Initialization. (1) Split v operand into four 16-bit chunks and store them +C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs +C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'. 
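To make the recombination concrete: the four column sums land at bit offsets 0, 16, 32 and 48, so one result limb and the carry into the next fall out of a single wide accumulation. A hedged sketch using GCC's unsigned __int128 (fold_columns is an illustrative name; the real loop interleaves these shifts and adds across iterations, folds the previous iteration's r64/r80 products into the next i00/i16, and splits the work at bit 48 into the hi64/mi64 halves its comments track):

#include <stdint.h>

/* Fold four <=49-bit column sums, spaced 16 bits apart, into one
   64-bit result limb; *carry holds the incoming carry limb on entry
   and the outgoing one on return. */
static uint64_t fold_columns(uint64_t i00, uint64_t i16,
                             uint64_t i32, uint64_t i48,
                             uint64_t *carry)
{
    unsigned __int128 acc = (unsigned __int128)i00 + *carry;
    acc += (unsigned __int128)i16 << 16;
    acc += (unsigned __int128)i32 << 32;
    acc += (unsigned __int128)i48 << 48;
    *carry = (uint64_t)(acc >> 64);
    return (uint64_t)acc;
}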
+ + save %sp, -256, %sp + mov -1, %g4 + srlx %g4, 48, xffff C store mask in register `xffff' + and %i3, xffff, %g2 + stx %g2, [%sp+2223+0] + srlx %i3, 16, %g3 + and %g3, xffff, %g3 + stx %g3, [%sp+2223+8] + srlx %i3, 32, %g2 + and %g2, xffff, %g2 + stx %g2, [%sp+2223+16] + srlx %i3, 48, %g3 + stx %g3, [%sp+2223+24] + srlx %g4, 32, xffffffff C store mask in register `xffffffff' + + sllx %i2, 3, %i2 + mov 0, cy C clear cy + add %i0, %i2, %i0 + add %i1, %i2, %i1 + neg %i2 + add %i1, 4, %i5 + add %i0, -32, %i4 + add %i0, -16, %i0 + + ldd [%sp+2223+0], v00 + ldd [%sp+2223+8], v16 + ldd [%sp+2223+16], v32 + ldd [%sp+2223+24], v48 + ld [%sp+2223+0],%f2 C zero f2 + ld [%sp+2223+0],%f4 C zero f4 + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fxtod v00, v00 + fxtod v16, v16 + fxtod v32, v32 + fxtod v48, v48 + +C Start real work. (We sneakingly read f3 and f5 above...) +C The software pipeline is very deep, requiring 4 feed-in stages. + + fxtod %f2, u00 + fxtod %f4, u32 + fmuld u00, v00, a00 + fmuld u00, v16, a16 + fmuld u00, v32, p32 + fmuld u32, v00, r32 + fmuld u00, v48, p48 + addcc %i2, 8, %i2 + bnz,pt %xcc, .L_two_or_more + fmuld u32, v16, r48 + +.L_one: + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + fdtox a00, a00 + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + fdtox a32, a32 + fdtox a48, a48 + std a00, [%sp+2223+0] + std a16, [%sp+2223+8] + std a32, [%sp+2223+16] + std a48, [%sp+2223+24] + add %i2, 8, %i2 + + fdtox r64, a00 + fdtox r80, a16 + ldx [%sp+2223+0], i00 + ldx [%sp+2223+8], i16 + ldx [%sp+2223+16], i32 + ldx [%sp+2223+24], i48 + std a00, [%sp+2223+0] + std a16, [%sp+2223+8] + add %i2, 8, %i2 + + mov i00, %g5 C i00+ now in g5 + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + sllx i48, 32, %l6 C (i48 << 32) + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + add cy, %g5, %o4 C x = prev(i00) + cy + b .L_out_1 + add %i2, 8, %i2 + +.L_two_or_more: + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + fdtox a32, a32 + fxtod %f2, u00 + fxtod %f4, u32 + fdtox a48, a48 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 + std a16, [%sp+2223+8] + fmuld u00, v16, p16 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 + faddd p16, r80, a16 + fmuld u00, v48, p48 + addcc %i2, 8, %i2 + bnz,pt %xcc, .L_three_or_more + fmuld u32, v16, r48 + +.L_two: + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + fdtox a00, a00 + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + ldx [%sp+2223+8], i16 + ldx [%sp+2223+16], i32 + ldx [%sp+2223+24], i48 + fdtox a48, a48 + std a00, [%sp+2223+0] + std a16, [%sp+2223+8] + std a32, [%sp+2223+16] + std a48, [%sp+2223+24] + add %i2, 8, %i2 + + fdtox r64, a00 + mov i00, %g5 C i00+ now in g5 + fdtox r80, a16 + ldx 
[%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + ldx [%sp+2223+24], i48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + add cy, %g5, %o4 C x = prev(i00) + cy + b .L_out_2 + add %i2, 8, %i2 + +.L_three_or_more: + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + ldx [%sp+2223+8], i16 + fxtod %f2, u00 + ldx [%sp+2223+16], i32 + fxtod %f4, u32 + ldx [%sp+2223+24], i48 + fdtox a48, a48 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 + std a16, [%sp+2223+8] + fmuld u00, v16, p16 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 + faddd p16, r80, a16 + fmuld u00, v48, p48 + addcc %i2, 8, %i2 + bnz,pt %xcc, .L_four_or_more + fmuld u32, v16, r48 + +.L_three: + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + fdtox a00, a00 + faddd p48, r48, a48 + mov i00, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + ldx [%sp+2223+24], i48 + fdtox a48, a48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + add cy, %g5, %o4 C x = prev(i00) + cy + b .L_out_3 + add %i2, 8, %i2 + +.L_four_or_more: + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 + faddd p48, r48, a48 + mov i00, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + fxtod %f2, u00 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + fxtod %f4, u32 + sllx i48, 32, %l6 C (i48 << 32) + ldx [%sp+2223+24], i48 + fdtox a48, a48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + fmuld u00, v16, p16 + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + faddd p16, r80, 
a16 + fmuld u00, v48, p48 + add cy, %g5, %o4 C x = prev(i00) + cy + addcc %i2, 8, %i2 + bnz,pt %xcc, .Loop + fmuld u32, v16, r48 + +.L_four: + b,a .L_out_4 + +C BEGIN MAIN LOOP + .align 16 +.Loop: +C 00 + srlx %o4, 16, %o5 C (x >> 16) + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 +C 01 + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 +C 02 + faddd p48, r48, a48 +C 03 + srlx %o2, 48, %o7 C (mi64 >> 48) + mov i00, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 +C 04 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + fdtox a32, a32 +C 05 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + fxtod %f2, u00 +C 06 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + fxtod %f4, u32 +C 07 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + ldx [%sp+2223+24], i48 + fdtox a48, a48 +C 08 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 +C 09 + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + fmuld u00, v16, p16 +C 10 + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 +C 11 + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 +C 12 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] + faddd p16, r80, a16 + fmuld u00, v48, p48 +C 13 + add cy, %g5, %o4 C x = prev(i00) + cy + addcc %i2, 8, %i2 + bnz,pt %xcc, .Loop + fmuld u32, v16, r48 +C END MAIN LOOP + +.L_out_4: + srlx %o4, 16, %o5 C (x >> 16) + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + fdtox a00, a00 + faddd p48, r48, a48 + srlx %o2, 48, %o7 C (mi64 >> 48) + mov i00, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + fdtox a32, a32 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + ldx [%sp+2223+24], i48 + fdtox a48, a48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] + add cy, %g5, %o4 C x = prev(i00) + cy + add %i2, 8, %i2 +.L_out_3: + srlx %o4, 16, %o5 C (x >> 16) + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + fdtox r64, a00 + srlx %o2, 48, %o7 C (mi64 >> 48) + mov i00, %g5 C i00+ now in g5 + fdtox r80, a16 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, 
%o5 + ldx [%sp+2223+24], i48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] + add cy, %g5, %o4 C x = prev(i00) + cy + add %i2, 8, %i2 +.L_out_2: + srlx %o4, 16, %o5 C (x >> 16) + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + srlx %o2, 48, %o7 C (mi64 >> 48) + mov i00, %g5 C i00+ now in g5 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + mov i32, %g4 C i32+ now in g4 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] + add cy, %g5, %o4 C x = prev(i00) + cy + add %i2, 8, %i2 +.L_out_1: + srlx %o4, 16, %o5 C (x >> 16) + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + srlx %o2, 48, %o7 C (mi64 >> 48) + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + or %i3, %o5, %o5 + stx %o5, [%i4+%i2] + + sllx i00, 0, %g2 + add %g2, cy, cy + sllx i16, 16, %g3 + add %g3, cy, cy + + return %i7+8 + mov cy, %o0 +EPILOGUE(mpn_mul_1) diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm new file mode 100644 index 0000000..43c69d3 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm @@ -0,0 +1,342 @@ +dnl SPARC v9 64-bit mpn_sqr_diagonal. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 22 +C UltraSPARC 3: 36 + +C This was generated by the Sun C compiler. It runs at 22 cycles/limb on the +C UltraSPARC-1/2, three cycles slower than theoretically possible for optimal +C code using the same algorithm. For 1-3 limbs, a special loop was generated, +C which causes performance problems in particular for 2 and 3 limbs. 
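What the routine computes is easy to state, whatever the schedule; a hedged C model (ref_sqr_diagonal is an illustrative name, and GCC's unsigned __int128 stands in for the exact double multiplies on 21/22-bit pieces that the generated code below uses):

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Each source limb is squared into two result limbs, low limb first,
   so {rp,2n} holds the concatenated squares of up[0..n-1]. */
void ref_sqr_diagonal(mp_limb_t *rp, const mp_limb_t *up, long n)
{
    for (long i = 0; i < n; i++) {
        unsigned __int128 sq = (unsigned __int128)up[i] * up[i];
        rp[2 * i]     = (mp_limb_t)sq;         /* low limb */
        rp[2 * i + 1] = (mp_limb_t)(sq >> 64); /* high limb */
    }
}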
+C Ultimately, this should be replaced by hand-written code in the same software +C pipeline style as e.g., addmul_1.asm. + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_sqr_diagonal) + save %sp, -240, %sp + + sethi %hi(0x1ffc00), %o0 + sethi %hi(0x3ffc00), %o1 + add %o0, 1023, %o7 + cmp %i2, 4 + add %o1, 1023, %o4 + or %g0, %i1, %g1 + or %g0, %i0, %o0 + bl,pn %xcc, .Lsmall + or %g0, 0, %g2 + + ldx [%i1], %o1 + add %i1, 24, %g1 + or %g0, 3, %g2 + srlx %o1, 42, %g3 + stx %g3, [%sp+2279] + and %o1, %o7, %o2 + stx %o2, [%sp+2263] + srlx %o1, 21, %o1 + ldd [%sp+2279], %f0 + and %o1, %o7, %o1 + stx %o1, [%sp+2271] + ldx [%i1+8], %o2 + fxtod %f0, %f12 + srlx %o2, 21, %o1 + and %o2, %o7, %g3 + ldd [%sp+2263], %f2 + fmuld %f12, %f12, %f10 + srlx %o2, 42, %o2 + ldd [%sp+2271], %f0 + and %o1, %o7, %o1 + fxtod %f2, %f8 + stx %o2, [%sp+2279] + stx %o1, [%sp+2271] + fxtod %f0, %f0 + stx %g3, [%sp+2263] + fdtox %f10, %f14 + fmuld %f12, %f8, %f6 + ldx [%i1+16], %o2 + std %f14, [%sp+2255] + fmuld %f0, %f0, %f2 + fmuld %f8, %f8, %f10 + srlx %o2, 42, %o1 + faddd %f6, %f6, %f6 + fmuld %f12, %f0, %f12 + fmuld %f0, %f8, %f8 + ldd [%sp+2279], %f0 + ldd [%sp+2263], %f4 + fdtox %f10, %f10 + std %f10, [%sp+2239] + faddd %f2, %f6, %f6 + ldd [%sp+2271], %f2 + fdtox %f12, %f12 + std %f12, [%sp+2247] + fdtox %f8, %f8 + std %f8, [%sp+2231] + fdtox %f6, %f6 + std %f6, [%sp+2223] + +.Loop: srlx %o2, 21, %g3 + stx %o1, [%sp+2279] + add %g2, 1, %g2 + and %g3, %o7, %o1 + ldx [%sp+2255], %g4 + cmp %g2, %i2 + stx %o1, [%sp+2271] + add %g1, 8, %g1 + add %o0, 16, %o0 + ldx [%sp+2239], %o1 + fxtod %f0, %f10 + fxtod %f4, %f14 + ldx [%sp+2231], %i0 + ldx [%sp+2223], %g5 + ldx [%sp+2247], %g3 + and %o2, %o7, %o2 + fxtod %f2, %f8 + fmuld %f10, %f10, %f0 + stx %o2, [%sp+2263] + fmuld %f10, %f14, %f6 + ldx [%g1-8], %o2 + fmuld %f10, %f8, %f12 + fdtox %f0, %f2 + ldd [%sp+2279], %f0 + fmuld %f8, %f8, %f4 + faddd %f6, %f6, %f6 + fmuld %f14, %f14, %f10 + std %f2, [%sp+2255] + sllx %g4, 20, %g4 + ldd [%sp+2271], %f2 + fmuld %f8, %f14, %f8 + sllx %i0, 22, %i1 + fdtox %f12, %f12 + std %f12, [%sp+2247] + sllx %g5, 42, %i0 + add %o1, %i1, %o1 + faddd %f4, %f6, %f6 + ldd [%sp+2263], %f4 + add %o1, %i0, %o1 + add %g3, %g4, %g3 + fdtox %f10, %f10 + std %f10, [%sp+2239] + srlx %o1, 42, %g4 + and %g5, %o4, %i0 + fdtox %f8, %f8 + std %f8, [%sp+2231] + srlx %g5, 22, %g5 + sub %g4, %i0, %g4 + fdtox %f6, %f6 + std %f6, [%sp+2223] + srlx %g4, 63, %g4 + add %g3, %g5, %g3 + add %g3, %g4, %g3 + stx %o1, [%o0-16] + srlx %o2, 42, %o1 + bl,pt %xcc, .Loop + stx %g3, [%o0-8] + + stx %o1, [%sp+2279] + srlx %o2, 21, %o1 + fxtod %f0, %f16 + ldx [%sp+2223], %g3 + fxtod %f4, %f6 + and %o2, %o7, %o3 + stx %o3, [%sp+2263] + fxtod %f2, %f4 + and %o1, %o7, %o1 + ldx [%sp+2231], %o2 + sllx %g3, 42, %g4 + fmuld %f16, %f16, %f14 + stx %o1, [%sp+2271] + fmuld %f16, %f6, %f8 + add %o0, 48, %o0 + ldx [%sp+2239], %o1 + sllx %o2, 22, %o2 + fmuld %f4, %f4, %f10 + ldx [%sp+2255], %o3 + fdtox %f14, %f14 + fmuld %f4, %f6, %f2 + std %f14, [%sp+2255] + faddd %f8, %f8, %f12 + add %o1, %o2, %o2 + fmuld %f16, %f4, %f4 + ldd [%sp+2279], %f0 + sllx %o3, 20, %g5 + add %o2, %g4, %o2 + fmuld %f6, %f6, %f6 + srlx %o2, 42, %o3 + and %g3, %o4, %g4 + srlx %g3, 22, %g3 + faddd %f10, %f12, %f16 + ldd [%sp+2271], %f12 + ldd [%sp+2263], %f8 + fxtod %f0, %f0 + sub %o3, %g4, %o3 + ldx [%sp+2247], %o1 + srlx %o3, 63, %o3 + fdtox %f2, %f10 + fxtod %f8, %f8 + std %f10, [%sp+2231] + fdtox %f6, %f6 + std %f6, [%sp+2239] + add %o1, %g5, %o1 + fmuld %f0, %f0, %f2 + fdtox %f16, 
%f16 + std %f16, [%sp+2223] + add %o1, %g3, %o1 + fdtox %f4, %f4 + std %f4, [%sp+2247] + fmuld %f0, %f8, %f10 + fxtod %f12, %f12 + add %o1, %o3, %o1 + stx %o2, [%o0-48] + fmuld %f8, %f8, %f6 + stx %o1, [%o0-40] + fdtox %f2, %f2 + ldx [%sp+2231], %o2 + faddd %f10, %f10, %f10 + ldx [%sp+2223], %g3 + fmuld %f12, %f12, %f4 + fdtox %f6, %f6 + ldx [%sp+2239], %o1 + sllx %o2, 22, %o2 + fmuld %f12, %f8, %f8 + sllx %g3, 42, %g5 + ldx [%sp+2255], %o3 + fmuld %f0, %f12, %f0 + add %o1, %o2, %o2 + faddd %f4, %f10, %f4 + ldx [%sp+2247], %o1 + add %o2, %g5, %o2 + and %g3, %o4, %g4 + fdtox %f8, %f8 + sllx %o3, 20, %g5 + std %f8, [%sp+2231] + fdtox %f0, %f0 + srlx %o2, 42, %o3 + add %o1, %g5, %o1 + fdtox %f4, %f4 + srlx %g3, 22, %g3 + sub %o3, %g4, %o3 + std %f6, [%sp+2239] + std %f4, [%sp+2223] + srlx %o3, 63, %o3 + add %o1, %g3, %o1 + std %f2, [%sp+2255] + add %o1, %o3, %o1 + std %f0, [%sp+2247] + stx %o2, [%o0-32] + stx %o1, [%o0-24] + ldx [%sp+2231], %o2 + ldx [%sp+2223], %o3 + ldx [%sp+2239], %o1 + sllx %o2, 22, %o2 + sllx %o3, 42, %g5 + ldx [%sp+2255], %g4 + and %o3, %o4, %g3 + add %o1, %o2, %o2 + ldx [%sp+2247], %o1 + add %o2, %g5, %o2 + stx %o2, [%o0-16] + sllx %g4, 20, %g4 + srlx %o2, 42, %o2 + add %o1, %g4, %o1 + srlx %o3, 22, %o3 + sub %o2, %g3, %o2 + srlx %o2, 63, %o2 + add %o1, %o3, %o1 + add %o1, %o2, %o1 + stx %o1, [%o0-8] + ret + restore %g0, %g0, %g0 +.Lsmall: + ldx [%g1], %o2 +.Loop0: + and %o2, %o7, %o1 + stx %o1, [%sp+2263] + add %g2, 1, %g2 + srlx %o2, 21, %o1 + add %g1, 8, %g1 + srlx %o2, 42, %o2 + stx %o2, [%sp+2279] + and %o1, %o7, %o1 + ldd [%sp+2263], %f0 + cmp %g2, %i2 + stx %o1, [%sp+2271] + fxtod %f0, %f6 + ldd [%sp+2279], %f0 + ldd [%sp+2271], %f4 + fxtod %f0, %f2 + fmuld %f6, %f6, %f0 + fxtod %f4, %f10 + fmuld %f2, %f6, %f4 + fdtox %f0, %f0 + std %f0, [%sp+2239] + fmuld %f10, %f6, %f8 + fmuld %f10, %f10, %f0 + faddd %f4, %f4, %f6 + fmuld %f2, %f2, %f4 + fdtox %f8, %f8 + std %f8, [%sp+2231] + fmuld %f2, %f10, %f2 + faddd %f0, %f6, %f0 + fdtox %f4, %f4 + std %f4, [%sp+2255] + fdtox %f2, %f2 + std %f2, [%sp+2247] + fdtox %f0, %f0 + std %f0, [%sp+2223] + ldx [%sp+2239], %o1 + ldx [%sp+2255], %g4 + ldx [%sp+2231], %o2 + sllx %g4, 20, %g4 + ldx [%sp+2223], %o3 + sllx %o2, 22, %o2 + sllx %o3, 42, %g5 + add %o1, %o2, %o2 + ldx [%sp+2247], %o1 + add %o2, %g5, %o2 + stx %o2, [%o0] + and %o3, %o4, %g3 + srlx %o2, 42, %o2 + add %o1, %g4, %o1 + srlx %o3, 22, %o3 + sub %o2, %g3, %o2 + srlx %o2, 63, %o2 + add %o1, %o3, %o1 + add %o1, %o2, %o1 + stx %o1, [%o0+8] + add %o0, 16, %o0 + bl,a,pt %xcc, .Loop0 + ldx [%g1], %o2 + ret + restore %g0, %g0, %g0 +EPILOGUE(mpn_sqr_diagonal) diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm new file mode 100644 index 0000000..9fb7f70 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/sub_n.asm @@ -0,0 +1,241 @@ +dnl SPARC v9 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +dnl store difference in a third limb vector. + +dnl Copyright 2001-2003, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 4 +C UltraSPARC 3: 4.5 + +C Compute carry-out from the most significant bits of u,v, and r, where +C r=u-v-carry_in, using logic operations: the borrow is the top bit of +C (~u & v) | ((~u | v) & r), which the orn/andn pairs below evaluate. + +C This code runs at 4 cycles/limb on UltraSPARC 1 and 2. It has a 4-insn +C recurrence, and on UltraSPARC 1 and 2 the IEUs are 100% saturated. +C Therefore, it seems futile to try to optimize this any further... + +C INPUT PARAMETERS +define(`rp',`%i0') +define(`up',`%i1') +define(`vp',`%i2') +define(`n',`%i3') + +define(`u0',`%l0') +define(`u1',`%l2') +define(`u2',`%l4') +define(`u3',`%l6') +define(`v0',`%l1') +define(`v1',`%l3') +define(`v2',`%l5') +define(`v3',`%l7') + +define(`cy',`%i4') + +define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe +define(`fmnop',`fmuld %f0,%f0,%f4') dnl A quasi nop running in the FM pipe + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_sub_nc) + save %sp,-160,%sp + + fitod %f0,%f0 C make sure f0 contains small, quiet number + subcc n,4,%g0 + bl,pn %xcc,.Loop0 + nop + b,a L(com) +EPILOGUE() + +PROLOGUE(mpn_sub_n) + save %sp,-160,%sp + + fitod %f0,%f0 C make sure f0 contains small, quiet number + subcc n,4,%g0 + bl,pn %xcc,.Loop0 + mov 0,cy +L(com): + ldx [up+0],u0 + ldx [vp+0],v0 + add up,32,up + ldx [up-24],u1 + ldx [vp+8],v1 + add vp,32,vp + ldx [up-16],u2 + ldx [vp-16],v2 + ldx [up-8],u3 + ldx [vp-8],v3 + subcc n,8,n + sub u0,v0,%g1 C main sub + sub %g1,cy,%g5 C carry sub + orn u0,v0,%g2 + bl,pn %xcc,.Lend4567 + fanop + b,a .Loop + + .align 16 +C START MAIN LOOP +.Loop: orn %g5,%g2,%g2 + andn u0,v0,%g3 + ldx [up+0],u0 + fanop +C -- + andn %g2,%g3,%g2 + ldx [vp+0],v0 + add up,32,up + fanop +C -- + srlx %g2,63,cy + sub u1,v1,%g1 + stx %g5,[rp+0] + fanop +C -- + sub %g1,cy,%g5 + orn u1,v1,%g2 + fmnop + fanop +C -- + orn %g5,%g2,%g2 + andn u1,v1,%g3 + ldx [up-24],u1 + fanop +C -- + andn %g2,%g3,%g2 + ldx [vp+8],v1 + add vp,32,vp + fanop +C -- + srlx %g2,63,cy + sub u2,v2,%g1 + stx %g5,[rp+8] + fanop +C -- + sub %g1,cy,%g5 + orn u2,v2,%g2 + fmnop + fanop +C -- + orn %g5,%g2,%g2 + andn u2,v2,%g3 + ldx [up-16],u2 + fanop +C -- + andn %g2,%g3,%g2 + ldx [vp-16],v2 + add rp,32,rp + fanop +C -- + srlx %g2,63,cy + sub u3,v3,%g1 + stx %g5,[rp-16] + fanop +C -- + sub %g1,cy,%g5 + orn u3,v3,%g2 + fmnop + fanop +C -- + orn %g5,%g2,%g2 + andn u3,v3,%g3 + ldx [up-8],u3 + fanop +C -- + andn %g2,%g3,%g2 + subcc n,4,n + ldx [vp-8],v3 + fanop +C -- + srlx %g2,63,cy + sub u0,v0,%g1 + stx %g5,[rp-8] + fanop +C -- + sub %g1,cy,%g5 + orn u0,v0,%g2 + bge,pt %xcc,.Loop + fanop +C END MAIN LOOP +.Lend4567: + orn %g5,%g2,%g2 + andn u0,v0,%g3 + andn %g2,%g3,%g2 + srlx %g2,63,cy + sub u1,v1,%g1 + stx %g5,[rp+0] + sub %g1,cy,%g5 + orn u1,v1,%g2 + orn %g5,%g2,%g2 + andn u1,v1,%g3 + andn %g2,%g3,%g2 + srlx %g2,63,cy + sub u2,v2,%g1 + stx %g5,[rp+8] + sub
%g1,cy,%g5 + orn u2,v2,%g2 + orn %g5,%g2,%g2 + andn u2,v2,%g3 + andn %g2,%g3,%g2 + add rp,32,rp + srlx %g2,63,cy + sub u3,v3,%g1 + stx %g5,[rp-16] + sub %g1,cy,%g5 + orn u3,v3,%g2 + orn %g5,%g2,%g2 + andn u3,v3,%g3 + andn %g2,%g3,%g2 + srlx %g2,63,cy + stx %g5,[rp-8] + + addcc n,4,n + bz,pn %xcc,.Lret + fanop + +.Loop0: ldx [up],u0 + add up,8,up + ldx [vp],v0 + add vp,8,vp + add rp,8,rp + subcc n,1,n + sub u0,v0,%g1 + orn u0,v0,%g2 + sub %g1,cy,%g5 + andn u0,v0,%g3 + orn %g5,%g2,%g2 + stx %g5,[rp-8] + andn %g2,%g3,%g2 + bnz,pt %xcc,.Loop0 + srlx %g2,63,cy + +.Lret: mov cy,%i0 + ret + restore +EPILOGUE(mpn_sub_n) diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm new file mode 100644 index 0000000..0bdb566 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparc1234/submul_1.asm @@ -0,0 +1,68 @@ +dnl SPARC v9 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright 2001-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 18 +C UltraSPARC 3: 23 + +C INPUT PARAMETERS +C rp i0 +C up i1 +C n i2 +C v i3 + +ASM_START() + REGISTER(%g2,#scratch) + +PROLOGUE(mpn_submul_1) + save %sp,-176,%sp + + sllx %i2, 3, %g2 + or %g0, %i1, %o1 + add %g2, 15, %o0 + or %g0, %i2, %o2 + and %o0, -16, %o0 + sub %sp, %o0, %sp + add %sp, 2223, %o0 + or %g0, %o0, %l0 + call mpn_mul_1 + or %g0, %i3, %o3 + or %g0, %o0, %l1 C preserve carry value from mpn_mul_1 + or %g0, %i0, %o0 + or %g0, %i0, %o1 + or %g0, %l0, %o2 + call mpn_sub_n + or %g0, %i2, %o3 + ret + restore %l1, %o0, %o0 C sum carry values +EPILOGUE(mpn_submul_1) diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparc34/gmp-mparam.h b/gmp-6.3.0/mpn/sparc64/ultrasparc34/gmp-mparam.h new file mode 100644 index 0000000..c88e680 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparc34/gmp-mparam.h @@ -0,0 +1,222 @@ +/* ultrasparc3/4 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2004, 2006, 2008-2010, 2014, 2015 Free +Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 1593 MHz ultrasparc3 running Solaris 10 (swift.nada.kth.se) */ +/* FFT tuning limit = 100 M */ +/* Generated by tuneup.c, 2015-10-09, gcc 3.4 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 22 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 29 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD 2 +#define DIV_QR_1_UNNORM_THRESHOLD 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 28 +#define MUL_TOOM33_THRESHOLD 93 +#define MUL_TOOM44_THRESHOLD 142 +#define MUL_TOOM6H_THRESHOLD 165 +#define MUL_TOOM8H_THRESHOLD 278 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 93 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 88 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 50 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 67 + +#define SQR_BASECASE_THRESHOLD 7 +#define SQR_TOOM2_THRESHOLD 70 +#define SQR_TOOM3_THRESHOLD 101 +#define SQR_TOOM4_THRESHOLD 184 +#define SQR_TOOM6_THRESHOLD 0 /* always */ +#define SQR_TOOM8_THRESHOLD 339 + +#define MULMID_TOOM42_THRESHOLD 40 + +#define MULMOD_BNM1_THRESHOLD 14 +#define SQRMOD_BNM1_THRESHOLD 9 + +#define MUL_FFT_MODF_THRESHOLD 212 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 212, 5}, { 13, 6}, { 17, 7}, { 9, 6}, \ + { 19, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \ + { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ + { 19, 9}, { 11, 8}, { 25,10}, { 7, 9}, \ + { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \ + { 23, 8}, { 47, 9}, { 27,10}, { 15, 9}, \ + { 39,10}, { 23, 9}, { 47,11}, { 15,10}, \ + { 31, 9}, { 63, 8}, { 127, 7}, { 255, 9}, \ + { 67,10}, { 39, 9}, { 79, 8}, { 159, 7}, \ + { 319, 9}, { 83,10}, { 47, 9}, { 95, 8}, \ + { 191, 7}, { 383,10}, { 55,11}, { 31,10}, \ + { 63, 9}, { 127, 8}, { 255, 7}, { 511,10}, \ + { 71, 9}, { 143, 8}, { 287,10}, { 79, 9}, \ + { 159, 8}, { 319, 9}, { 175, 8}, { 351,11}, \ + { 47,10}, { 95, 9}, { 191, 8}, { 383, 7}, \ + { 767,10}, { 103,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 287,11}, { 79,10}, { 159, 9}, \ + { 319, 8}, { 639,10}, { 175, 9}, { 351, 8}, \ + { 703,11}, { 95,10}, { 207, 9}, { 415,11}, \ + { 111,10}, { 223, 9}, { 479,12}, { 63,11}, \ + { 127,10}, { 255,11}, { 143,10}, { 287, 9}, \ + { 575,10}, { 319, 9}, { 639,11}, { 175,10}, \ + { 351,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415,11}, { 223,10}, { 447,13}, { 63,12}, \ + { 127,11}, { 287,10}, { 575,11}, { 319,10}, \ + { 703,12}, { 191,11}, { 383,12}, { 
223,11}, \ + { 447,13}, { 127,12}, { 287,11}, { 575,12}, \ + { 351,13}, { 191,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 575,13}, { 319,12}, { 703,13}, \ + { 383,12}, { 767,13}, { 447,12}, { 895,14}, \ + { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \ + { 1151,13}, { 703,14}, { 383,13}, { 831,12}, \ + { 1663,13}, { 895,15}, { 255,14}, { 511,13}, \ + { 1151,14}, { 639,13}, { 1407,12}, { 2815,14}, \ + { 767,13}, { 1663,14}, { 895,13}, { 1791,15}, \ + { 511,14}, { 1023,13}, { 2047,14}, { 1151,13}, \ + { 2303,14}, { 1407,13}, { 2815,15}, { 767,14}, \ + { 1791,16}, { 511,15}, { 1023,14}, { 2303,15}, \ + { 1279,14}, { 2815,15}, { 1535,14}, { 3199,15}, \ + { 1791,16}, { 1023,15}, { 2047,14}, { 4223,15}, \ + { 2303,14}, { 4863,15}, { 2815,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 171 +#define MUL_FFT_THRESHOLD 2240 + +#define SQR_FFT_MODF_THRESHOLD 244 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 244, 5}, { 8, 4}, { 17, 5}, { 15, 6}, \ + { 8, 5}, { 17, 6}, { 17, 7}, { 9, 6}, \ + { 19, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \ + { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ + { 19, 9}, { 11, 8}, { 25,10}, { 7, 9}, \ + { 15, 8}, { 31, 9}, { 19, 8}, { 39, 9}, \ + { 27,10}, { 15, 9}, { 39,10}, { 23, 9}, \ + { 47,11}, { 15,10}, { 31, 9}, { 67,10}, \ + { 39, 9}, { 79, 8}, { 159,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 63, 9}, \ + { 127, 8}, { 255,10}, { 71, 9}, { 143, 8}, \ + { 287, 7}, { 575,10}, { 79, 9}, { 159,11}, \ + { 47, 9}, { 191, 8}, { 383, 7}, { 767, 9}, \ + { 207,12}, { 31,11}, { 63,10}, { 127, 9}, \ + { 255, 8}, { 511,10}, { 135, 9}, { 271,10}, \ + { 143, 9}, { 287,11}, { 79,10}, { 159, 9}, \ + { 319, 8}, { 639,10}, { 175, 9}, { 351, 8}, \ + { 703, 7}, { 1407,11}, { 95,10}, { 191, 9}, \ + { 383, 8}, { 767,10}, { 207, 9}, { 415,10}, \ + { 223, 9}, { 447,12}, { 63,11}, { 127,10}, \ + { 271, 9}, { 543,10}, { 287, 9}, { 575, 8}, \ + { 1151,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 351, 9}, { 703, 8}, { 1407, 7}, { 2815,11}, \ + { 207,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447, 9}, { 895,13}, { 63,11}, { 271,10}, \ + { 543,11}, { 287,12}, { 159,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 415,10}, { 831,12}, \ + { 223,13}, { 127,12}, { 255,11}, { 511,10}, \ + { 1023,11}, { 543,12}, { 287,11}, { 607,12}, \ + { 319,11}, { 639,12}, { 415,11}, { 895,12}, \ + { 479,14}, { 127,13}, { 255,12}, { 543,11}, \ + { 1087,12}, { 575,11}, { 1151,13}, { 319,12}, \ + { 639,11}, { 1279,12}, { 703,10}, { 2815,12}, \ + { 831,11}, { 1663,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 895,15}, { 255,14}, { 511,13}, { 1215,14}, \ + { 639,13}, { 1279,14}, { 767,13}, { 1663,14}, \ + { 895,13}, { 1919,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,14}, { 1407,15}, \ + { 767,14}, { 1791,16}, { 511,15}, { 1023,14}, \ + { 2303,15}, { 1279,14}, { 2815,15}, { 1535,14}, \ + { 3199,15}, { 1791,16}, { 1023,15}, { 2047,14}, \ + { 4351,15}, { 2303,14}, { 4863,15}, { 2815,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 184 +#define SQR_FFT_THRESHOLD 1728 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 29 +#define MULLO_MUL_N_THRESHOLD 4392 +#define SQRLO_BASECASE_THRESHOLD 2 +#define SQRLO_DC_THRESHOLD 63 +#define SQRLO_SQR_THRESHOLD 3176 + +#define DC_DIV_QR_THRESHOLD 16 +#define 
DC_DIVAPPR_Q_THRESHOLD 64 +#define DC_BDIV_QR_THRESHOLD 30 +#define DC_BDIV_Q_THRESHOLD 86 + +#define INV_MULMOD_BNM1_THRESHOLD 58 +#define INV_NEWTON_THRESHOLD 17 +#define INV_APPR_THRESHOLD 15 + +#define BINV_NEWTON_THRESHOLD 109 +#define REDC_1_TO_REDC_2_THRESHOLD 0 /* always */ +#define REDC_2_TO_REDC_N_THRESHOLD 117 + +#define MU_DIV_QR_THRESHOLD 618 +#define MU_DIVAPPR_Q_THRESHOLD 618 +#define MUPI_DIV_QR_THRESHOLD 0 /* always */ +#define MU_BDIV_QR_THRESHOLD 680 +#define MU_BDIV_Q_THRESHOLD 807 + +#define POWM_SEC_TABLE 3,22,102,579,1555 + +#define GET_STR_DC_THRESHOLD 20 +#define GET_STR_PRECOMPUTE_THRESHOLD 28 +#define SET_STR_DC_THRESHOLD 381 +#define SET_STR_PRECOMPUTE_THRESHOLD 1042 + +#define FAC_DSC_THRESHOLD 462 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 12 +#define HGCD_THRESHOLD 45 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 1094 +#define GCD_DC_THRESHOLD 126 +#define GCDEXT_DC_THRESHOLD 132 +#define JACOBI_BASE_METHOD 4 diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/add_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/add_n.asm new file mode 100644 index 0000000..954c7f6 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/add_n.asm @@ -0,0 +1,68 @@ +dnl SPARC v9 mpn_add_n for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T1: ? +C UltraSPARC T2: ? + +C INPUT PARAMETERS +define(`rp', `%o0') +define(`up', `%o1') +define(`vp', `%o2') +define(`n', `%o3') +define(`cy', `%o4') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_add_nc) + b,a L(ent) +EPILOGUE() +PROLOGUE(mpn_add_n) + mov 0, cy +L(ent): cmp %g0, cy +L(top): ldx [up+0], %o4 + add up, 8, up + ldx [vp+0], %o5 + add vp, 8, vp + add rp, 8, rp + add n, -1, n + srlx %o4, 32, %g1 + srlx %o5, 32, %g2 + addccc %o4, %o5, %g3 + addccc %g1, %g2, %g0 + brgz n, L(top) + stx %g3, [rp-8] + + retl + addc %g0, %g0, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh1_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh1_n.asm new file mode 100644 index 0000000..3134797 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh1_n.asm @@ -0,0 +1,41 @@ +dnl SPARC v9 mpn_addlsh1_n for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
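This file and the next two form a small template family: addlsh1_n and addlsh2_n merely pin LSH/RSH to 1/63 and 2/62 and then include the shared addlshC_n body. A hedged C model of that shared contract (ref_addlshC is an illustrative name, not GMP code):

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* {rp,n} = {up,n} + ({vp,n} << lsh), returning the carry limb, which
   combines the final add carry with the bits shifted out of the top
   limb -- cf. the closing addc %g5, %g0, %o0 in addlshC_n.asm below.
   Assumes n >= 1 and 1 <= lsh <= 63. */
static mp_limb_t ref_addlshC(mp_limb_t *rp, const mp_limb_t *up,
                             const mp_limb_t *vp, long n, unsigned lsh)
{
    unsigned rsh = 64 - lsh;
    mp_limb_t shift_in = 0, cy = 0;

    for (long i = 0; i < n; i++) {
        mp_limb_t shifted = (vp[i] << lsh) | shift_in; /* like sllx/or */
        shift_in = vp[i] >> rsh;                       /* like srlx    */
        mp_limb_t sum = up[i] + shifted;
        mp_limb_t c1 = sum < shifted;  /* carry out of the add         */
        mp_limb_t r = sum + cy;
        cy = c1 + (r < sum);           /* fold in the incoming carry   */
        rp[i] = r;
    }
    return cy + shift_in;
}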
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH, 1) +define(RSH, 63) + +define(func, mpn_addlsh1_n) + +MULFUNC_PROLOGUE(mpn_addlsh1_n) + +include_mpn(`sparc64/ultrasparct1/addlshC_n.asm') diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh2_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh2_n.asm new file mode 100644 index 0000000..ee1afd0 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlsh2_n.asm @@ -0,0 +1,41 @@ +dnl SPARC v9 mpn_addlsh2_n for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH, 2) +define(RSH, 62) + +define(func, mpn_addlsh2_n) + +MULFUNC_PROLOGUE(mpn_addlsh2_n) + +include_mpn(`sparc64/ultrasparct1/addlshC_n.asm') diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlshC_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlshC_n.asm new file mode 100644 index 0000000..5be9a0d --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/addlshC_n.asm @@ -0,0 +1,69 @@ +dnl SPARC v9 mpn_addlshC_n for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +C cycles/limb +C UltraSPARC T1: 21 +C UltraSPARC T2: ? + +C INPUT PARAMETERS +define(`rp', `%o0') +define(`up', `%o1') +define(`vp', `%o2') +define(`n', `%o3') +define(`cy', `%o4') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(func) + mov 0, cy + mov 0, %g5 + cmp %g0, cy +L(top): ldx [up+0], %o4 + add up, 8, up + ldx [vp+0], %o5 + add vp, 8, vp + add rp, 8, rp + + sllx %o5, LSH, %g4 + add n, -1, n + or %g5, %g4, %g4 + srlx %o5, RSH, %g5 + + srlx %o4, 32, %g1 + srlx %g4, 32, %g2 + addccc %o4, %g4, %g3 + addccc %g1, %g2, %g0 + brgz n, L(top) + stx %g3, [rp-8] + + retl + addc %g5, %g0, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/addmul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/addmul_1.asm new file mode 100644 index 0000000..29dba96 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/addmul_1.asm @@ -0,0 +1,86 @@ +dnl SPARC v9 mpn_addmul_1 for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T1: 74 +C UltraSPARC T2: ? 
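The per-limb arithmetic that the code below schedules around mulx and movlu can be modeled compactly. T1-era mulx yields only the low 64 bits of a 64x64 product (umulxhi arrived later, with VIS3), hence the four 32x32->64 partial products. A hedged sketch (addmul_1_step is an illustrative name; GCC's unsigned __int128 stands in for the explicit carry-select sequence):

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* One step of rp[] += up[] * v0: build the 128-bit product from four
   32x32 pieces, add the destination limb and the incoming carry, and
   return the carry into the next limb. */
static mp_limb_t addmul_1_step(mp_limb_t *r, mp_limb_t u, mp_limb_t v0,
                               mp_limb_t cy)
{
    uint64_t ul = (uint32_t)u,  uh = u >> 32;
    uint64_t vl = (uint32_t)v0, vh = v0 >> 32;

    unsigned __int128 prod = ((unsigned __int128)(uh * vh) << 64)
                           + ((unsigned __int128)(ul * vh) << 32)
                           + ((unsigned __int128)(uh * vl) << 32)
                           + ul * vl;
    unsigned __int128 sum = prod + *r + cy;
    *r = (mp_limb_t)sum;
    return (mp_limb_t)(sum >> 64);
}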
+ +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`v0', `%i3') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_addmul_1) + save %sp, -176, %sp + mov 1, %o2 + mov %i0, %g2 + srlx %i3, 32, %o4 + sllx %o2, 32, %o2 + srl %i3, 0, %i3 + mov 0, %g3 + mov 0, %i0 + +L(top): ldx [%i1+%g3], %g1 + srl %g1, 0, %g4 + mulx %g4, %i3, %o5 + srlx %g1, 32, %g1 + mulx %g1, %i3, %g5 + mulx %g4, %o4, %g4 + mulx %g1, %o4, %g1 + srlx %o5, 32, %o1 + add %g5, %o1, %o1 + addcc %o1, %g4, %g4 + srl %o5, 0, %o0 + ldx [%g2+%g3], %o5 + sllx %g4, 32, %o1 + add %g1, %o2, %l1 + movlu %xcc, %l1, %g1 + add %o1, %o0, %l0 + addcc %l0, %i0, %g5 + srlx %g4, 32, %i0 + add %i0, 1, %g4 + movlu %xcc, %g4, %i0 + addcc %o5, %g5, %g5 + stx %g5, [%g2+%g3] + add %i0, 1, %g4 + movlu %xcc, %g4, %i0 + add %i2, -1, %i2 + add %i0, %g1, %i0 + brnz,pt %i2, L(top) + add %g3, 8, %g3 + return %i7+8 + nop +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/gmp-mparam.h b/gmp-6.3.0/mpn/sparc64/ultrasparct1/gmp-mparam.h new file mode 100644 index 0000000..99db78a --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/gmp-mparam.h @@ -0,0 +1,154 @@ +/* Sparc64 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2004, 2006, 2008-2010 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 1000 MHz ultrasparc t1 running GNU/Linux */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 13 +#define MOD_1U_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 34 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define MUL_TOOM22_THRESHOLD 8 +#define MUL_TOOM33_THRESHOLD 50 +#define MUL_TOOM44_THRESHOLD 99 +#define MUL_TOOM6H_THRESHOLD 125 +#define MUL_TOOM8H_THRESHOLD 187 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 65 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 77 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 65 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 50 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 34 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 14 +#define SQR_TOOM3_THRESHOLD 57 +#define SQR_TOOM4_THRESHOLD 133 +#define SQR_TOOM6_THRESHOLD 156 +#define SQR_TOOM8_THRESHOLD 260 + +#define MULMID_TOOM42_THRESHOLD 12 + +#define MULMOD_BNM1_THRESHOLD 7 +#define SQRMOD_BNM1_THRESHOLD 7 + +#define MUL_FFT_MODF_THRESHOLD 176 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 176, 5}, { 7, 6}, { 4, 5}, { 9, 6}, \ + { 5, 5}, { 11, 6}, { 11, 7}, { 6, 6}, \ + { 13, 7}, { 7, 6}, { 15, 7}, { 9, 8}, \ + { 5, 7}, { 13, 8}, { 7, 7}, { 15, 6}, \ + { 32, 7}, { 24, 8}, { 21, 9}, { 11, 8}, \ + { 23,10}, { 7, 9}, { 15, 8}, { 33, 9}, \ + { 19, 8}, { 39, 9}, { 23,10}, { 15, 9}, \ + { 43,10}, { 23,11}, { 15,10}, { 31, 9}, \ + { 63, 8}, { 127, 9}, { 67,10}, { 39, 9}, \ + { 79, 8}, { 159,10}, { 47, 9}, { 95,11}, \ + { 2048,12}, { 4096,13}, { 8192,14}, { 16384,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 53 +#define MUL_FFT_THRESHOLD 1728 + + +#define SQR_FFT_MODF_THRESHOLD 148 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 148, 5}, { 7, 6}, { 4, 5}, { 9, 6}, \ + { 5, 5}, { 11, 6}, { 11, 7}, { 6, 6}, \ + { 13, 7}, { 7, 6}, { 15, 7}, { 13, 8}, \ + { 7, 7}, { 16, 8}, { 9, 6}, { 38, 7}, \ + { 20, 8}, { 11, 7}, { 24, 8}, { 13, 9}, \ + { 7, 7}, { 30, 8}, { 19, 9}, { 11, 8}, \ + { 25,10}, { 7, 9}, { 15, 8}, { 31, 9}, \ + { 19, 8}, { 39, 9}, { 27,10}, { 15, 9}, \ + { 39,10}, { 23, 9}, { 47, 8}, { 95, 9}, \ + { 51,11}, { 15,10}, { 31, 8}, { 127,10}, \ + { 39, 9}, { 79, 8}, { 159,10}, { 47, 9}, \ + { 95,11}, { 2048,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 58 +#define SQR_FFT_THRESHOLD 1344 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 28 +#define MULLO_MUL_N_THRESHOLD 3176 + +#define DC_DIV_QR_THRESHOLD 27 +#define DC_DIVAPPR_Q_THRESHOLD 106 +#define DC_BDIV_QR_THRESHOLD 27 +#define DC_BDIV_Q_THRESHOLD 62 + +#define INV_MULMOD_BNM1_THRESHOLD 14 +#define INV_NEWTON_THRESHOLD 163 +#define INV_APPR_THRESHOLD 117 + +#define BINV_NEWTON_THRESHOLD 166 +#define REDC_1_TO_REDC_N_THRESHOLD 31 + +#define MU_DIV_QR_THRESHOLD 734 +#define 
MU_DIVAPPR_Q_THRESHOLD 748 +#define MUPI_DIV_QR_THRESHOLD 67 +#define MU_BDIV_QR_THRESHOLD 562 +#define MU_BDIV_Q_THRESHOLD 734 + +#define POWM_SEC_TABLE 4,29,188,643,2741 + +#define MATRIX22_STRASSEN_THRESHOLD 11 +#define HGCD_THRESHOLD 58 +#define HGCD_APPR_THRESHOLD 55 +#define HGCD_REDUCE_THRESHOLD 637 +#define GCD_DC_THRESHOLD 186 +#define GCDEXT_DC_THRESHOLD 140 +#define JACOBI_BASE_METHOD 3 + +#define GET_STR_DC_THRESHOLD 20 +#define GET_STR_PRECOMPUTE_THRESHOLD 33 +#define SET_STR_DC_THRESHOLD 268 +#define SET_STR_PRECOMPUTE_THRESHOLD 960 + +#define FAC_DSC_THRESHOLD 268 +#define FAC_ODD_THRESHOLD 0 /* always */ diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/mul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/mul_1.asm new file mode 100644 index 0000000..1fea2a1 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/mul_1.asm @@ -0,0 +1,82 @@ +dnl SPARC v9 mpn_mul_1 for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T1: 68 +C UltraSPARC T2: ? + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`v0', `%i3') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_mul_1) + save %sp, -176, %sp + mov 1, %o2 + mov %i0, %g2 + srlx %i3, 32, %o4 + sllx %o2, 32, %o2 + srl %i3, 0, %i3 + mov 0, %g3 + mov 0, %i0 + +L(top): ldx [%i1+%g3], %g1 + srl %g1, 0, %g4 + mulx %g4, %i3, %o5 + srlx %g1, 32, %g1 + mulx %g1, %i3, %g5 + mulx %g4, %o4, %g4 + mulx %g1, %o4, %g1 + srlx %o5, 32, %o1 + add %g5, %o1, %o1 + addcc %o1, %g4, %g4 + srl %o5, 0, %o0 + sllx %g4, 32, %o1 + add %g1, %o2, %l1 + movlu %xcc, %l1, %g1 + add %o1, %o0, %l0 + addcc %l0, %i0, %g5 + srlx %g4, 32, %i0 + add %i0, 1, %g4 + movlu %xcc, %g4, %i0 + stx %g5, [%g2+%g3] + add %i2, -1, %i2 + add %i0, %g1, %i0 + brnz,pt %i2, L(top) + add %g3, 8, %g3 + return %i7+8 + nop +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh1_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh1_n.asm new file mode 100644 index 0000000..51bd4ab --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh1_n.asm @@ -0,0 +1,41 @@ +dnl SPARC v9 mpn_rsblsh1_n for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH, 1) +define(RSH, 63) + +define(func, mpn_rsblsh1_n) + +MULFUNC_PROLOGUE(mpn_rsblsh1_n) + +include_mpn(`sparc64/ultrasparct1/rsblshC_n.asm') diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh2_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh2_n.asm new file mode 100644 index 0000000..f0d208e --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblsh2_n.asm @@ -0,0 +1,41 @@ +dnl SPARC v9 mpn_rsblsh2_n for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH, 2) +define(RSH, 62) + +define(func, mpn_rsblsh2_n) + +MULFUNC_PROLOGUE(mpn_rsblsh2_n) + +include_mpn(`sparc64/ultrasparct1/rsblshC_n.asm') diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblshC_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblshC_n.asm new file mode 100644 index 0000000..7c03e9f --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/rsblshC_n.asm @@ -0,0 +1,69 @@ +dnl SPARC v9 mpn_rsblshC_n for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +C cycles/limb +C UltraSPARC T1: 21 +C UltraSPARC T2: ? + +C INPUT PARAMETERS +define(`rp', `%o0') +define(`up', `%o1') +define(`vp', `%o2') +define(`n', `%o3') +define(`cy', `%o4') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(func) + mov 0, cy + mov 0, %g5 + cmp %g0, cy +L(top): ldx [up+0], %o4 + add up, 8, up + ldx [vp+0], %o5 + add vp, 8, vp + add rp, 8, rp + + sllx %o5, LSH, %g4 + add n, -1, n + or %g5, %g4, %g4 + srlx %o5, RSH, %g5 + + srlx %o4, 32, %g1 + srlx %g4, 32, %g2 + subccc %g4, %o4, %g3 + subccc %g2, %g1, %g0 + brgz n, L(top) + stx %g3, [rp-8] + + retl + subc %g5, %g0, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/sub_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/sub_n.asm new file mode 100644 index 0000000..c2af89f --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/sub_n.asm @@ -0,0 +1,68 @@ +dnl SPARC v9 mpn_sub_n for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T1: ? +C UltraSPARC T2: ? 
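Note the paired subccc in the loop below: the first does the full-width limb subtract, while the second subtracts the high 32-bit halves and discards the result. The carry-chained 64-bit ops (addxc/addxccc) only arrive with VIS3 on T3, and the plain v9 carry instructions consume the 32-bit condition-code carry, so the discarded high-half subtract exists only to leave the correct 64-bit borrow in the flags for the next iteration -- at least, that is the natural reading of the idiom. What the routine computes is simple; a reference model in C, illustrative only:

    #include <stdint.h>
    #include <stddef.h>

    /* {rp,n} = {up,n} - {vp,n} - cy, returning the final borrow (0 or 1). */
    static uint64_t ref_sub_nc (uint64_t *rp, const uint64_t *up,
                                const uint64_t *vp, size_t n, uint64_t cy)
    {
      for (size_t i = 0; i < n; i++)
        {
          uint64_t u = up[i], v = vp[i];
          rp[i] = u - v - cy;
          cy = (u < v) | ((u == v) & cy);   /* borrow without overflow */
        }
      return cy;
    }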
+ +C INPUT PARAMETERS +define(`rp', `%o0') +define(`up', `%o1') +define(`vp', `%o2') +define(`n', `%o3') +define(`cy', `%o4') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_sub_nc) + b,a L(ent) +EPILOGUE() +PROLOGUE(mpn_sub_n) + mov 0, cy +L(ent): cmp %g0, cy +L(top): ldx [up+0], %o4 + add up, 8, up + ldx [vp+0], %o5 + add vp, 8, vp + add rp, 8, rp + add n, -1, n + srlx %o4, 32, %g1 + srlx %o5, 32, %g2 + subccc %o4, %o5, %g3 + subccc %g1, %g2, %g0 + brgz n, L(top) + stx %g3, [rp-8] + + retl + addc %g0, %g0, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh1_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh1_n.asm new file mode 100644 index 0000000..8c8fa80 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh1_n.asm @@ -0,0 +1,41 @@ +dnl SPARC v9 mpn_sublsh1_n for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH, 1) +define(RSH, 63) + +define(func, mpn_sublsh1_n) + +MULFUNC_PROLOGUE(mpn_sublsh1_n) + +include_mpn(`sparc64/ultrasparct1/sublshC_n.asm') diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh2_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh2_n.asm new file mode 100644 index 0000000..2fd5eee --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublsh2_n.asm @@ -0,0 +1,41 @@ +dnl SPARC v9 mpn_sublsh2_n for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
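These thin wrappers only pin down LSH/RSH and the function name; the shared loop lives in sublshC_n.asm below. For orientation, a C model of the operation under GMP's usual convention for the lsh family (an assumption for this sketch, not something the patch states): subtract the shifted operand and return the shifted-out bits plus the final borrow in one limb:

    #include <stdint.h>
    #include <stddef.h>

    /* {rp,n} = {up,n} - ({vp,n} << lsh), with 0 < lsh < 64.  Sketch only. */
    static uint64_t ref_sublsh_n (uint64_t *rp, const uint64_t *up,
                                  const uint64_t *vp, size_t n, unsigned lsh)
    {
      uint64_t out = 0, bw = 0;        /* shifted-out bits; running borrow */
      for (size_t i = 0; i < n; i++)
        {
          uint64_t s = (vp[i] << lsh) | out;
          out = vp[i] >> (64 - lsh);   /* RSH = 64 - LSH, as in the defines */
          uint64_t u = up[i];
          rp[i] = u - s - bw;
          bw = (u < s) | ((u == s) & bw);
        }
      return out + bw;                 /* cannot wrap: out <= 2^lsh - 1 */
    }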
+ +include(`../config.m4') + + +define(LSH, 2) +define(RSH, 62) + +define(func, mpn_sublsh2_n) + +MULFUNC_PROLOGUE(mpn_sublsh2_n) + +include_mpn(`sparc64/ultrasparct1/sublshC_n.asm') diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublshC_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublshC_n.asm new file mode 100644 index 0000000..01eafef --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/sublshC_n.asm @@ -0,0 +1,69 @@ +dnl SPARC v9 mpn_sublshC_n for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +C cycles/limb +C UltraSPARC T1: 21 +C UltraSPARC T2: ? + +C INPUT PARAMETERS +define(`rp', `%o0') +define(`up', `%o1') +define(`vp', `%o2') +define(`n', `%o3') +define(`cy', `%o4') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(func) + mov 0, cy + mov 0, %g5 + cmp %g0, cy +L(top): ldx [up+0], %o4 + add up, 8, up + ldx [vp+0], %o5 + add vp, 8, vp + add rp, 8, rp + + sllx %o5, LSH, %g4 + add n, -1, n + or %g5, %g4, %g4 + srlx %o5, RSH, %g5 + + srlx %o4, 32, %g1 + srlx %g4, 32, %g2 + subccc %o4, %g4, %g3 + subccc %g1, %g2, %g0 + brgz n, L(top) + stx %g3, [rp-8] + + retl + addc %g5, %g0, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct1/submul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct1/submul_1.asm new file mode 100644 index 0000000..4f553a8 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct1/submul_1.asm @@ -0,0 +1,86 @@ +dnl SPARC v9 mpn_submul_1 for T1/T2. + +dnl Copyright 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T1: 74 +C UltraSPARC T2: ? + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`v0', `%i3') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_submul_1) + save %sp, -176, %sp + mov 1, %o2 + mov %i0, %g2 + srlx %i3, 32, %o4 + sllx %o2, 32, %o2 + srl %i3, 0, %i3 + mov 0, %g3 + mov 0, %i0 + +L(top): ldx [%i1+%g3], %g1 + srl %g1, 0, %g4 + mulx %g4, %i3, %o5 + srlx %g1, 32, %g1 + mulx %g1, %i3, %g5 + mulx %g4, %o4, %g4 + mulx %g1, %o4, %g1 + srlx %o5, 32, %o1 + add %g5, %o1, %o1 + addcc %o1, %g4, %g4 + srl %o5, 0, %o0 + ldx [%g2+%g3], %o5 + sllx %g4, 32, %o1 + add %g1, %o2, %l1 + movlu %xcc, %l1, %g1 + add %o1, %o0, %l0 + addcc %l0, %i0, %g5 + srlx %g4, 32, %i0 + add %i0, 1, %g4 + movlu %xcc, %g4, %i0 + subcc %o5, %g5, %g5 + stx %g5, [%g2+%g3] + add %i0, 1, %g4 + movlu %xcc, %g4, %i0 + add %i2, -1, %i2 + add %i0, %g1, %i0 + brnz,pt %i2, L(top) + add %g3, 8, %g3 + return %i7+8 + nop +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/add_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/add_n.asm new file mode 100644 index 0000000..0170746 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/add_n.asm @@ -0,0 +1,126 @@ +dnl SPARC v9 mpn_add_n for T3/T4. + +dnl Contributed to the GNU project by David Miller. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
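One structural point before the code: the T3/T4 loops in this patch bias their pointers to the end of the operands and run a negated byte offset up toward zero (u0_off and loop_n below), so a single register serves as both array index and loop test. A C analog of the idiom, purely illustrative:

    #include <stdint.h>
    #include <stddef.h>

    /* Negative-index loop: the counter i is also the termination test.
       Assumes n >= 1.  */
    static void copy_neg_index (uint64_t *rp, const uint64_t *up, size_t n)
    {
      const uint64_t *ue = up + n;     /* like  add up, tmp, u0_off */
      uint64_t *re = rp + n;
      ptrdiff_t i = -(ptrdiff_t) n;    /* like  neg tmp, loop_n     */
      do
        re[i] = ue[i];                 /* one indexed access per op */
      while (++i != 0);                /* counter doubles as test   */
    }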
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 8 +C UltraSPARC T4: 3 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`vp', `%i2') +define(`n', `%i3') +define(`cy', `%i4') + +define(`u0_off', `%l2') +define(`u1_off', `%l3') +define(`loop_n', `%l6') +define(`tmp', `%l7') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_add_nc) + save %sp, -176, %sp + b,a L(ent) +EPILOGUE() +PROLOGUE(mpn_add_n) + save %sp, -176, %sp + + mov 0, cy +L(ent): + subcc n, 1, n + be L(final_one) + cmp %g0, cy + + ldx [up + 0], %o4 + sllx n, 3, tmp + + ldx [vp + 0], %o5 + add up, tmp, u0_off + + ldx [up + 8], %g5 + neg tmp, loop_n + + ldx [vp + 8], %g1 + add u0_off, 8, u1_off + + sub loop_n, -(2 * 8), loop_n + + brgez,pn loop_n, L(loop_tail) + add vp, (2 * 8), vp + + b,a L(top) + ALIGN(16) +L(top): + addxccc(%o4, %o5, tmp) + ldx [vp + 0], %o5 + + add rp, (2 * 8), rp + ldx [loop_n + u0_off], %o4 + + add vp, (2 * 8), vp + stx tmp, [rp - 16] + + addxccc(%g1, %g5, tmp) + ldx [vp - 8], %g1 + + ldx [loop_n + u1_off], %g5 + sub loop_n, -(2 * 8), loop_n + + brlz loop_n, L(top) + stx tmp, [rp - 8] + +L(loop_tail): + addxccc(%o4, %o5, %g3) + add loop_n, u0_off, up + + addxccc(%g1, %g5, %g5) + stx %g3, [rp + 0] + + brgz,pt loop_n, L(done) + stx %g5, [rp + 8] + + add rp, (2 * 8), rp +L(final_one): + ldx [up+0], %o4 + ldx [vp+0], %o5 + addxccc(%o4, %o5, %g3) + stx %g3, [rp+0] + +L(done): + addxc(%g0, %g0, %i0) + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/addmul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/addmul_1.asm new file mode 100644 index 0000000..939811e --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/addmul_1.asm @@ -0,0 +1,182 @@ +dnl SPARC v9 mpn_addmul_1 for T3/T4/T5. + +dnl Contributed to the GNU project by David Miller and Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
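Before the unrolled code, the contract it implements: mpn_addmul_1 adds {up,n} * v0 into {rp,n} and returns the carry limb. A direct reference version in C, using the compiler's 128-bit extension; the asm below computes the same thing four limbs per iteration, dispatching on n mod 4 and using umulxhi for the high products:

    #include <stdint.h>
    #include <stddef.h>

    static uint64_t ref_addmul_1 (uint64_t *rp, const uint64_t *up,
                                  size_t n, uint64_t v0)
    {
      uint64_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * v0 + rp[i] + cy;
          rp[i] = (uint64_t) p;       /* low limb back to rp  */
          cy = (uint64_t) (p >> 64);  /* high limb carries on */
        }
      return cy;
    }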
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 26 +C UltraSPARC T4: 4.5 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`v0', `%i3') + +define(`u0', `%l0') +define(`u1', `%l1') +define(`u2', `%l2') +define(`u3', `%l3') +define(`r0', `%l4') +define(`r1', `%l5') +define(`r2', `%l6') +define(`r3', `%l7') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_addmul_1) + save %sp, -176, %sp + ldx [up+0], %g1 + + and n, 3, %g3 + brz %g3, L(b0) + addcc %g0, %g0, %g5 C clear carry limb, flag + cmp %g3, 2 + bcs %xcc, L(b01) + nop + be %xcc, L(b10) + ldx [up+8], %g5 + +L(b11): ldx [up+16], u3 + mulx %g1, v0, %o2 + umulxhi(%g1, v0, %o3) + ldx [rp+0], r1 + mulx %g5, v0, %o4 + ldx [rp+8], r2 + umulxhi(%g5, v0, %o5) + ldx [rp+16], r3 + mulx u3, v0, %g4 + umulxhi(u3, v0, %g5) + addcc %o3, %o4, %o4 + addxccc(%o5, %g4, %g4) + addxc( %g0, %g5, %g5) + addcc r1, %o2, r1 + stx r1, [rp+0] + addxccc(r2, %o4, r2) + stx r2, [rp+8] + addxccc(r3, %g4, r3) + stx r3, [rp+16] + add n, -3, n + add up, 24, up + brz n, L(xit) + add rp, 24, rp + b L(com) + nop + +L(b10): mulx %g1, v0, %o4 + ldx [rp+0], r2 + umulxhi(%g1, v0, %o5) + ldx [rp+8], r3 + mulx %g5, v0, %g4 + umulxhi(%g5, v0, %g5) + addcc %o5, %g4, %g4 + addxc( %g0, %g5, %g5) + addcc r2, %o4, r2 + stx r2, [rp+0] + addxccc(r3, %g4, r3) + stx r3, [rp+8] + add n, -2, n + add up, 16, up + brz n, L(xit) + add rp, 16, rp + b L(com) + nop + +L(b01): ldx [rp+0], r3 + mulx %g1, v0, %g4 + umulxhi(%g1, v0, %g5) + addcc r3, %g4, r3 + stx r3, [rp+0] + add n, -1, n + add up, 8, up + brz n, L(xit) + add rp, 8, rp + +L(com): ldx [up+0], %g1 +L(b0): ldx [up+8], u1 + ldx [up+16], u2 + ldx [up+24], u3 + mulx %g1, v0, %o0 + umulxhi(%g1, v0, %o1) + b L(lo0) + nop + + ALIGN(16) +L(top): ldx [up+0], u0 + addxc( %g0, %g5, %g5) C propagate carry into carry limb + ldx [up+8], u1 + addcc r0, %o0, r0 + ldx [up+16], u2 + addxccc(r1, %o2, r1) + ldx [up+24], u3 + addxccc(r2, %o4, r2) + stx r0, [rp-32] + addxccc(r3, %g4, r3) + stx r1, [rp-24] + mulx u0, v0, %o0 + stx r2, [rp-16] + umulxhi(u0, v0, %o1) + stx r3, [rp-8] +L(lo0): mulx u1, v0, %o2 + ldx [rp+0], r0 + umulxhi(u1, v0, %o3) + ldx [rp+8], r1 + mulx u2, v0, %o4 + ldx [rp+16], r2 + umulxhi(u2, v0, %o5) + ldx [rp+24], r3 + mulx u3, v0, %g4 + addxccc(%g5, %o0, %o0) + umulxhi(u3, v0, %g5) + add up, 32, up + addxccc(%o1, %o2, %o2) + add rp, 32, rp + addxccc(%o3, %o4, %o4) + add n, -4, n + addxccc(%o5, %g4, %g4) + brgz n, L(top) + nop + + addxc( %g0, %g5, %g5) + addcc r0, %o0, r0 + stx r0, [rp-32] + addxccc(r1, %o2, r1) + stx r1, [rp-24] + addxccc(r2, %o4, r2) + stx r2, [rp-16] + addxccc(r3, %g4, r3) + stx r3, [rp-8] +L(xit): addxc( %g0, %g5, %i0) + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_2.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_2.asm new file mode 100644 index 0000000..ccc6a44 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_2.asm @@ -0,0 +1,228 @@ +dnl SPARC v9 mpn_mul_2 and mpn_addmul_2 for T3/T4/T5. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb +C mul_2 addmul_2 +C UltraSPARC T3: 22.5 23.5 +C UltraSPARC T4: 3.25 3.75 + + +C The code is reasonably scheduled but also relies on OoO. There was hope that +C this could run at around 3.0 and 3.5 c/l respectively, on T4. Two cycles per +C iteration needs to be removed. +C +C We could almost use 2-way unrolling, but currently the wN registers live too +C long. By changing add x,w1,w1 to add x,w1,w0, i.e. migrate the values down- +C wards, 2-way unrolling should become possible. With n-indexed addressing it +C should run no slower. +C +C The rp loads to g1/g3 are very much over-scheduled. Presumably, they could +C be postponed a full way, and then just one register could be used. + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`vp', `%i3') + +define(`v0', `%o0') +define(`v1', `%o1') + +define(`w0', `%o2') +define(`w1', `%o3') +define(`w2', `%o4') +define(`w3', `%o5') + +ifdef(`OPERATION_mul_2',` + define(`AM2', `') + define(`ADDX', `addcc`'$1') + define(`func', `mpn_mul_2') +') +ifdef(`OPERATION_addmul_2',` + define(`AM2', `$1') + define(`ADDX', `addxccc($1,$2,$3)') + define(`func', `mpn_addmul_2') +') + + +MULFUNC_PROLOGUE(mpn_mul_2 mpn_addmul_2) + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(func) + save %sp, -176, %sp + + ldx [vp+0], v0 C load v0 + and n, 3, %g5 + ldx [vp+8], v1 C load v1 + add n, -6, n + ldx [up+0], %g4 + brz %g5, L(b0) + cmp %g5, 2 + bcs L(b1) + nop + be L(b2) + nop + +L(b3): +AM2(` ldx [rp+0], %g1') + mulx %g4, v0, w2 + umulxhi(%g4, v0, w3) + ldx [up+8], %i5 + mulx %g4, v1, %l3 + umulxhi(%g4, v1, %l7) +AM2(` ldx [rp+8], %g3') + add up, -8, up + add rp, -8, rp + b L(lo3) + mov 0, w0 + +L(b2): +AM2(` ldx [rp+0], %g3') + mulx %g4, v0, w3 + umulxhi(%g4, v0, w0) + ldx [up+8], %i4 + mulx %g4, v1, %l1 + umulxhi(%g4, v1, %l5) +AM2(` ldx [rp+8], %g1') + add rp, 16, rp + brlz n, L(end) + mov 0, w1 + ba L(top) + add up, 16, up + +L(b1): +AM2(` ldx [rp+0], %g1') + mulx %g4, v0, w0 + umulxhi(%g4, v0, w1) + ldx [up+8], %i5 + mulx %g4, v1, %l3 + umulxhi(%g4, v1, %l7) +AM2(` ldx [rp+8], %g3') + add up, 8, up + add rp, 8, rp + b L(lo1) + mov 0, w2 + +L(b0): +AM2(` ldx [rp+0], %g3') + mulx %g4, v0, w1 + umulxhi(%g4, v0, w2) + ldx [up+8], %i4 + mulx %g4, v1, %l1 + umulxhi(%g4, v1, %l5) +AM2(` ldx [rp+8], %g1') + b L(lo0) + mov 0, w3 + + ALIGN(16) C cycle +L(top): mulx %i4, v0, %l2 C 0->5 + umulxhi(%i4, v0, %l6) C 0->5 + ldx [up+0], %i5 C 1->6 +AM2(` addcc w3, %g3, w3') C 1 + stx w3, [rp-16] C 2 + ADDX(` %l1, w0, w0') C 2 + addxccc(%l5, w1, w1) C 3 + mulx %i4, v1, %l3 C 3->9 + umulxhi(%i4, v1, %l7) C 4->9 +AM2(` ldx [rp+0], %g3') C 4 + addcc %l2, w0, w0 C 5 + addxccc(%l6, w1, w1) C 5 + addxc( %g0, %g0, w2) C 6 +L(lo1): mulx %i5, v0, %l0 C 6 + umulxhi(%i5, v0, %l4) C 7 + ldx [up+8], 
%i4 C 7 +AM2(` addcc w0, %g1, w0') C 8 + stx w0, [rp-8] C 8 + ADDX(` %l3, w1, w1') C 9 + addxccc(%l7, w2, w2) C 9 + mulx %i5, v1, %l1 C 10 + umulxhi(%i5, v1, %l5) C 10 +AM2(` ldx [rp+8], %g1') C 11 + addcc %l0, w1, w1 C 11 + addxccc(%l4, w2, w2) C 12 + addxc( %g0, %g0, w3) C 12 +L(lo0): mulx %i4, v0, %l2 C 13 + umulxhi(%i4, v0, %l6) C 13 + ldx [up+16], %i5 C 14 +AM2(` addcc w1, %g3, w1') C 14 + stx w1, [rp+0] C 15 + ADDX(` %l1, w2, w2') C 15 + addxccc(%l5, w3, w3) C 16 + mulx %i4, v1, %l3 C 16 + umulxhi(%i4, v1, %l7) C 17 +AM2(` ldx [rp+16], %g3') C 17 + addcc %l2, w2, w2 C 18 + addxccc(%l6, w3, w3) C 18 + addxc( %g0, %g0, w0) C 19 +L(lo3): mulx %i5, v0, %l0 C 19 + umulxhi(%i5, v0, %l4) C 20 + ldx [up+24], %i4 C 20 +AM2(` addcc w2, %g1, w2') C 21 + stx w2, [rp+8] C 21 + ADDX(` %l3, w3, w3') C 22 + addxccc(%l7, w0, w0) C 22 + mulx %i5, v1, %l1 C 23 + umulxhi(%i5, v1, %l5) C 23 +AM2(` ldx [rp+24], %g1') C 24 + addcc %l0, w3, w3 C 24 + addxccc(%l4, w0, w0) C 25 + addxc( %g0, %g0, w1) C 25 + add up, 32, up + add rp, 32, rp + brgz n, L(top) + add n, -4, n + +L(end): mulx %i4, v0, %l2 + umulxhi(%i4, v0, %l6) +AM2(` addcc w3, %g3, w3') + stx w3, [rp-16] + ADDX(` %l1, w0, w0') + addxccc(%l5, w1, w1) + mulx %i4, v1, %l3 + umulxhi(%i4, v1, %l7) + addcc %l2, w0, w0 + addxccc(%l6, w1, w1) + addxc( %g0, %g0, w2) +AM2(` addcc w0, %g1, w0') + stx w0, [rp-8] + ADDX(` %l3, w1, w1') + stx w1, [rp+0] + addxc(%l7, w2, %i0) + + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_4.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_4.asm new file mode 100644 index 0000000..845f6d6 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/aormul_4.asm @@ -0,0 +1,219 @@ +dnl SPARC v9 mpn_mul_4 and mpn_addmul_4 for T3/T4/T5. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb +C mul_4 addmul_4 +C UltraSPARC T3: 21.5 22.0 +C UltraSPARC T4: 2.625 2.75 + + +C The code is well-scheduled and relies on OoO very little. There is hope that +C this will run at around 2.5 and 2.75 c/l respectively, on T4. 
+ +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`vp', `%i3') + +define(`v0', `%g1') +define(`v1', `%o7') +define(`v2', `%g2') +define(`v3', `%i3') + +define(`w0', `%o0') +define(`w1', `%o1') +define(`w2', `%o2') +define(`w3', `%o3') +define(`w4', `%o4') + +define(`r0', `%o5') + +define(`u0', `%i4') +define(`u1', `%i5') + +define(`rp0', `rp') +define(`rp1', `%g3') +define(`rp2', `%g4') +define(`up0', `up') +define(`up1', `%g5') + +ifdef(`OPERATION_mul_4',` + define(`AM4', `') + define(`ADDX', `addcc`'$1') + define(`func', `mpn_mul_4') +') +ifdef(`OPERATION_addmul_4',` + define(`AM4', `$1') + define(`ADDX', `addxccc($1,$2,$3)') + define(`func', `mpn_addmul_4') +') + + +MULFUNC_PROLOGUE(mpn_mul_4 mpn_addmul_4) + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(func) + save %sp, -176, %sp + + ldx [up + 0], u1 C load up[0] early + andcc n, 1, %g0 C is n odd? + ldx [vp + 0], v0 + sllx n, 3, n + ldx [vp + 8], v1 + add n, -28, n + ldx [vp + 16], v2 + add rp, -16, rp + ldx [vp + 24], v3 + add up, n, up0 + add rp, n, rp0 + add up0, 8, up1 + add rp0, 8, rp1 + add rp0, 16, rp2 + mulx u1, v0, %l0 + mov 0, w0 + mulx u1, v1, %l1 + mov 0, w1 + mulx u1, v2, %l2 + mov 0, w2 + mulx u1, v3, %l3 + mov 0, w3 + + be L(evn) + neg n, n + +L(odd): mov u1, u0 + ldx [up1 + n], u1 +AM4(` ldx [rp2 + n], r0') + umulxhi(u0, v0, %l4) + umulxhi(u0, v1, %l5) + umulxhi(u0, v2, %l6) + umulxhi(u0, v3, %l7) + b L(mid) + add n, 8, n + +L(evn): ldx [up1 + n], u0 +AM4(` ldx [rp2 + n], r0') + umulxhi(u1, v0, %l4) + umulxhi(u1, v1, %l5) + umulxhi(u1, v2, %l6) + umulxhi(u1, v3, %l7) + add n, 16, n + + ALIGN(16) +L(top): addcc %l0, w0, w0 + mulx u0, v0, %l0 C w 0 + addxccc(%l1, w1, w1) + mulx u0, v1, %l1 C w 1 + addxccc(%l2, w2, w2) + mulx u0, v2, %l2 C w 2 + addxccc(%l3, w3, w3) + mulx u0, v3, %l3 C w 3 + ldx [up0 + n], u1 + addxc( %g0, %g0, w4) +AM4(` addcc r0, w0, w0') + stx w0, [rp0 + n] + ADDX(` %l4, w1, w0') + umulxhi(u0, v0, %l4) C w 1 +AM4(` ldx [rp1 + n], r0') + addxccc(%l5, w2, w1) + umulxhi(u0, v1, %l5) C w 2 + addxccc(%l6, w3, w2) + umulxhi(u0, v2, %l6) C w 3 + addxc( %l7, w4, w3) + umulxhi(u0, v3, %l7) C w 4 +L(mid): addcc %l0, w0, w0 + mulx u1, v0, %l0 C w 1 + addxccc(%l1, w1, w1) + mulx u1, v1, %l1 C w 2 + addxccc(%l2, w2, w2) + mulx u1, v2, %l2 C w 3 + addxccc(%l3, w3, w3) + mulx u1, v3, %l3 C w 4 + ldx [up1 + n], u0 + addxc( %g0, %g0, w4) +AM4(` addcc r0, w0, w0') + stx w0, [rp1 + n] + ADDX(` %l4, w1, w0') + umulxhi(u1, v0, %l4) C w 2 +AM4(` ldx [rp2 + n], r0') + addxccc(%l5, w2, w1) + umulxhi(u1, v1, %l5) C w 3 + addxccc(%l6, w3, w2) + umulxhi(u1, v2, %l6) C w 4 + addxc( %l7, w4, w3) + umulxhi(u1, v3, %l7) C w 5 + brlz n, L(top) + add n, 16, n + +L(end): addcc %l0, w0, w0 + mulx u0, v0, %l0 + addxccc(%l1, w1, w1) + mulx u0, v1, %l1 + addxccc(%l2, w2, w2) + mulx u0, v2, %l2 + addxccc(%l3, w3, w3) + mulx u0, v3, %l3 + addxc( %g0, %g0, w4) +AM4(` addcc r0, w0, w0') + stx w0, [rp0 + n] + ADDX(` %l4, w1, w0') + umulxhi(u0, v0, %l4) +AM4(` ldx [rp1 + n], r0') + addxccc(%l5, w2, w1) + umulxhi(u0, v1, %l5) + addxccc(%l6, w3, w2) + umulxhi(u0, v2, %l6) + addxc( %l7, w4, w3) + umulxhi(u0, v3, %l7) + addcc %l0, w0, w0 + addxccc(%l1, w1, w1) + addxccc(%l2, w2, w2) + addxccc(%l3, w3, w3) + addxc( %g0, %g0, w4) +AM4(` addcc r0, w0, w0') + stx w0, [rp1 + n] + ADDX(` %l4, w1, w0') + addxccc(%l5, w2, w1) + addxccc(%l6, w3, w2) + stx w0, [rp2 + n] + add n, 16, n + stx w1, [rp1 + n] + stx w2, [rp2 + n] + addxc( %l7, w4, %i0) + ret + restore +EPILOGUE() diff --git 
a/gmp-6.3.0/mpn/sparc64/ultrasparct3/aorslsh_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/aorslsh_n.asm new file mode 100644 index 0000000..1014b1b --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/aorslsh_n.asm @@ -0,0 +1,147 @@ +dnl SPARC v9 mpn_addlsh_n and mpn_sublsh_n for T3/T4/T5. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 11 +C UltraSPARC T4: 4 + +C For sublsh_n we combine the two shifted limbs using xnor, using the identity +C (a xor not b) = (not (a xor b)) which equals (not (a or b)) when (a and b) = +C 0 as it is in our usage. This gives us the ones complement for free. +C Unfortunately, the same trick will not work for rsblsh_n, which will instead +C require a separate negation. +C +C FIXME: Add rsblsh_n to this file. 
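The identity is easy to convince oneself of in C: the two shifted pieces never overlap, so xnor of the pieces equals the complement of their union, and the complemented limb is exactly what a borrow-style chain wants, since x - y = x + ~y + 1 with the +1 supplied by the initial carry setting. Illustration only:

    #include <assert.h>
    #include <stdint.h>

    /* For disjoint a and b, ~(a ^ b) == ~(a | b).  */
    static void xnor_merge_check (uint64_t a, uint64_t b)
    {
      assert ((a & b) == 0);          /* holds for the two shifted halves */
      assert (~(a ^ b) == ~(a | b));  /* merged limb, ones' complemented  */
    }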
+ +define(`rp', `%i0') +define(`up', `%i1') +define(`vp', `%i2') +define(`n', `%i3') +define(`cnt',`%i4') + +define(`tnc',`%o5') + +ifdef(`OPERATION_addlsh_n',` + define(`INITCY', `subcc %g0, 0, %g0') + define(`MERGE', `or') + define(`func', `mpn_addlsh_n') +') +ifdef(`OPERATION_sublsh_n',` + define(`INITCY', `subcc %g0, 1, %g0') + define(`MERGE', `xnor') + define(`func', `mpn_sublsh_n') +') + +define(`rp0', `rp') +define(`rp1', `%o2') +define(`up0', `up') +define(`up1', `%o3') +define(`vp0', `vp') +define(`vp1', `%o4') + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_sublsh_n) +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(func) + save %sp, -176, %sp + mov 64, tnc + sub tnc, cnt, tnc + + andcc n, 1, %g0 + sllx n, 3, n + add n, -16, n + add up, n, up0 + add vp, n, vp0 + add rp, n, rp0 + add up0, 8, up1 + add vp0, 8, vp1 + add rp0, -8, rp1 + add rp0, -16, rp0 + neg n, n + be L(evn) + INITCY + +L(odd): ldx [vp0 + n], %l1 + mov 0, %l2 + ldx [up0 + n], %l5 + sllx %l1, cnt, %g3 + brgez n, L(wd1) + add n, 8, n + ldx [vp0 + n], %l0 + b L(lo1) + sllx %l1, cnt, %g3 + +L(evn): ldx [vp0 + n], %l0 + mov 0, %l3 + ldx [up0 + n], %l4 + ldx [vp1 + n], %l1 + b L(lo0) + sllx %l0, cnt, %g1 + +L(top): addxccc(%l6, %l4, %o0) + ldx [vp0 + n], %l0 + sllx %l1, cnt, %g3 + stx %o0, [rp0 + n] +L(lo1): srlx %l1, tnc, %l3 + MERGE %l2, %g3, %l7 + ldx [up0 + n], %l4 + addxccc(%l7, %l5, %o1) + ldx [vp1 + n], %l1 + sllx %l0, cnt, %g1 + stx %o1, [rp1 + n] +L(lo0): srlx %l0, tnc, %l2 + MERGE %l3, %g1, %l6 + ldx [up1 + n], %l5 + brlz,pt n, L(top) + add n, 16, n + + addxccc(%l6, %l4, %o0) + sllx %l1, cnt, %g3 + stx %o0, [rp0 + n] +L(wd1): srlx %l1, tnc, %l3 + MERGE %l2, %g3, %l7 + addxccc(%l7, %l5, %o1) + stx %o1, [rp1 + n] + +ifdef(`OPERATION_addlsh_n', +` addxc( %l3, %g0, %i0)') +ifdef(`OPERATION_sublsh_n', +` addxc( %g0, %g0, %g1) + add %g1, -1, %g1 + sub %l3, %g1, %i0') + + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_dbm1c.asm new file mode 100644 index 0000000..550860d --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_dbm1c.asm @@ -0,0 +1,147 @@ +dnl SPARC T3/T4/T5 mpn_bdiv_dbm1c. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
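For orientation: each step of bdiv_dbm1c forms the two-limb product ap[i] * bd and subtracts it from a running limb h, storing the intermediate h as the quotient limb. This matches GMP's generic C version of the routine, of which the asm below appears to be a four-way unrolled transcription. A sketch, again via the 128-bit extension:

    #include <stdint.h>
    #include <stddef.h>

    static uint64_t ref_bdiv_dbm1c (uint64_t *qp, const uint64_t *ap,
                                    size_t n, uint64_t bd, uint64_t h)
    {
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) ap[i] * bd;
          uint64_t p0 = (uint64_t) p, p1 = (uint64_t) (p >> 64);
          uint64_t cy = h < p0;       /* borrow from the low subtract  */
          h -= p0;
          qp[i] = h;
          h -= p1 + cy;               /* fold in high product + borrow */
        }
      return h;
    }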
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 25 +C UltraSPARC T4/T5: 4 + +C INPUT PARAMETERS +define(`qp', `%i0') +define(`ap', `%i1') +define(`n', `%i2') +define(`bd', `%i3') +define(`h', `%i4') + +define(`plo0',`%g4') define(`plo1',`%g5') +define(`phi0',`%l0') define(`phi1',`%l1') +define(`a0', `%g1') define(`a1', `%g3') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_bdiv_dbm1c) + save %sp, -176, %sp + + and n, 3, %g5 + ldx [ap + 0], %g2 + add n, -5, n + brz %g5, L(b0) + cmp %g5, 2 + bcs %xcc, L(b1) + nop + be %xcc, L(b2) + nop + +L(b3): ldx [ap + 8], a0 + mulx bd, %g2, plo1 + umulxhi(bd, %g2, phi1) + ldx [ap + 16], a1 + add qp, -24, qp + b L(lo3) + add ap, -8, ap + +L(b2): ldx [ap + 8], a1 + mulx bd, %g2, plo0 + umulxhi(bd, %g2, phi0) + brlz,pt n, L(wd2) + nop +L(gt2): ldx [ap + 16], a0 + add ap, 16, ap + b L(lo2) + add n, -1, n + +L(b1): mulx bd, %g2, plo1 + umulxhi(bd, %g2, phi1) + brlz,pn n, L(wd1) + add qp, -8, qp +L(gt1): ldx [ap + 8], a0 + ldx [ap + 16], a1 + b L(lo1) + add ap, 8, ap + +L(b0): ldx [ap + 8], a1 + mulx bd, %g2, plo0 + umulxhi(bd, %g2, phi0) + ldx [ap + 16], a0 + b L(lo0) + add qp, -16, qp + +L(top): ldx [ap + 0], a0 + sub h, phi1, h +L(lo2): mulx bd, a1, plo1 + umulxhi(bd, a1, phi1) + subcc h, plo0, h + addxc( phi0, %g0, phi0) + stx h, [qp + 0] + ldx [ap + 8], a1 + sub h, phi0, h +L(lo1): mulx bd, a0, plo0 + umulxhi(bd, a0, phi0) + subcc h, plo1, h + addxc( phi1, %g0, phi1) + stx h, [qp + 8] + ldx [ap + 16], a0 + sub h, phi1, h +L(lo0): mulx bd, a1, plo1 + umulxhi(bd, a1, phi1) + subcc h, plo0, h + addxc( phi0, %g0, phi0) + stx h, [qp + 16] + ldx [ap + 24], a1 + sub h, phi0, h +L(lo3): mulx bd, a0, plo0 + umulxhi(bd, a0, phi0) + subcc h, plo1, h + addxc( phi1, %g0, phi1) + stx h, [qp + 24] + add ap, 32, ap + add qp, 32, qp + brgz,pt n, L(top) + add n, -4, n + +L(end): sub h, phi1, h +L(wd2): mulx bd, a1, plo1 + umulxhi(bd, a1, phi1) + subcc h, plo0, h + addxc( phi0, %g0, phi0) + stx h, [qp + 0] + sub h, phi0, h +L(wd1): subcc h, plo1, h + addxc( phi1, %g0, phi1) + stx h, [qp + 8] + sub h, phi1, %i0 + + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_q_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_q_1.asm new file mode 100644 index 0000000..9847047 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/bdiv_q_1.asm @@ -0,0 +1,137 @@ +dnl SPARC T3/T4/T5 mpn_bdiv_q_1. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 31 +C UltraSPARC T4/T5: 20-26 hits 20 early, then sharply drops + +C INPUT PARAMETERS +define(`qp', `%i0') +define(`ap', `%i1') +define(`n', `%i2') +define(`d', `%i3') +define(`dinv',`%i4') +define(`cnt', `%i5') + +define(`tnc', `%o2') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_bdiv_q_1) + save %sp, -176, %sp + ldx [ap], %o5 + add d, -1, %g1 + andn %g1, d, %g1 + popc %g1, cnt + + srlx d, cnt, d + srlx d, 1, %g1 + and %g1, 127, %g1 + LEA64(binvert_limb_table, g2, g4) + ldub [%g2+%g1], %g1 + add %g1, %g1, %g2 + mulx %g1, %g1, %g1 + mulx %g1, d, %g1 + sub %g2, %g1, %g2 + add %g2, %g2, %g1 + mulx %g2, %g2, %g2 + mulx %g2, d, %g2 + sub %g1, %g2, %g1 + add %g1, %g1, %o7 + mulx %g1, %g1, %g1 + mulx %g1, d, %g1 + add n, -2, n + brz,pt cnt, L(norm) + sub %o7, %g1, dinv + + brlz,pt n, L(edu) + srlx %o5, cnt, %o5 + b L(eee) + mov 0, %g4 +EPILOGUE() + +PROLOGUE(mpn_pi1_bdiv_q_1) + save %sp, -176, %sp + ldx [ap], %o5 + + brz,pt cnt, L(norm) + add n, -2, n + +L(unorm): + brlz,pt n, L(edu) + srlx %o5, cnt, %o5 + mov 0, %g4 +L(eee): sub %g0, cnt, tnc + +L(tpu): ldx [ap+8], %g3 + add ap, 8, ap + sllx %g3, tnc, %g5 + or %g5, %o5, %g5 + srlx %g3, cnt, %o5 + subcc %g5, %g4, %g4 + mulx %g4, dinv, %g1 + stx %g1, [qp] + add qp, 8, qp + umulxhi(d, %g1, %g1) + addxc( %g1, %g0, %g4) + brgz,pt n, L(tpu) + add n, -1, n + + sub %o5, %g4, %o5 +L(edu): mulx %o5, dinv, %g1 + return %i7+8 + stx %g1, [%o0] + +L(norm): + mulx dinv, %o5, %g1 + brlz,pt n, L(edn) + stx %g1, [qp] + add qp, 8, qp + addcc %g0, 0, %g4 + +L(tpn): umulxhi(d, %g1, %g1) + ldx [ap+8], %g5 + add ap, 8, ap + addxc( %g1, %g0, %g1) + subcc %g5, %g1, %g1 + mulx %g1, dinv, %g1 + stx %g1, [qp] + add qp, 8, qp + brgz,pt n, L(tpn) + add n, -1, n + +L(edn): return %i7+8 + nop +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/cnd_aors_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/cnd_aors_n.asm new file mode 100644 index 0000000..49ccaec --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/cnd_aors_n.asm @@ -0,0 +1,145 @@ +dnl SPARC v9 mpn_cnd_add_n and mpn_cnd_sub_n for T3/T4/T5. + +dnl Contributed to the GNU project by David Miller and Torbjörn Granlund. + +dnl Copyright 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 8.5 +C UltraSPARC T4: 3 + +C We use a double-pointer trick to allow indexed addressing. 
Its setup +C cost might be a problem in these functions, since we don't expect huge n +C arguments. +C +C For sub we need ~(a & mask) = (~a | ~mask) but by complementing mask we can +C instead do ~(a & ~mask) = (~a | mask), allowing us to use the orn insn. + +C INPUT PARAMETERS +define(`cnd', `%i0') +define(`rp', `%i1') +define(`up', `%i2') +define(`vp', `%i3') +define(`n', `%i4') + +define(`mask', `cnd') +define(`up0', `%l0') define(`up1', `%l1') +define(`vp0', `%l2') define(`vp1', `%l3') +define(`rp0', `%g4') define(`rp1', `%g5') +define(`u0', `%l4') define(`u1', `%l5') +define(`v0', `%l6') define(`v1', `%l7') +define(`x0', `%g1') define(`x1', `%g3') +define(`w0', `%g1') define(`w1', `%g3') + +ifdef(`OPERATION_cnd_add_n',` + define(`LOGOP', `and $1, $2, $3') + define(`MAKEMASK',`cmp %g0, $1 + addxc( %g0, %g0, $2) + neg $2, $2') + define(`INITCY', `addcc %g0, 0, %g0') + define(`RETVAL', `addxc( %g0, %g0, %i0)') + define(`func', `mpn_cnd_add_n') +') +ifdef(`OPERATION_cnd_sub_n',` + define(`LOGOP', `orn $2, $1, $3') + define(`MAKEMASK',`cmp $1, 1 + addxc( %g0, %g0, $2) + neg $2, $2') + define(`INITCY', `subcc %g0, 1, %g0') + define(`RETVAL', `addxc( %g0, %g0, %i0) + xor %i0, 1, %i0') + define(`func', `mpn_cnd_sub_n') +') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(func) + save %sp, -176, %sp + + MAKEMASK(cnd,mask) + + andcc n, 1, %g0 + sllx n, 3, n + add n, -16, n + add vp, n, vp0 + add up, n, up0 + add rp, n, rp0 + neg n, n + be L(evn) + INITCY + +L(odd): ldx [vp0 + n], v1 + ldx [up0 + n], u1 + LOGOP( v1, mask, x1) + addxccc(u1, x1, w1) + stx w1, [rp0 + n] + add n, 8, n + brgz n, L(rtn) + nop + +L(evn): add vp0, 8, vp1 + add up0, 8, up1 + add rp0, -24, rp1 + ldx [vp0 + n], v0 + ldx [vp1 + n], v1 + ldx [up0 + n], u0 + ldx [up1 + n], u1 + add n, 16, n + brgz n, L(end) + add rp0, -16, rp0 + +L(top): LOGOP( v0, mask, x0) + ldx [vp0 + n], v0 + LOGOP( v1, mask, x1) + ldx [vp1 + n], v1 + addxccc(u0, x0, w0) + ldx [up0 + n], u0 + addxccc(u1, x1, w1) + ldx [up1 + n], u1 + stx w0, [rp0 + n] + add n, 16, n + brlez n, L(top) + stx w1, [rp1 + n] + +L(end): LOGOP( v0, mask, x0) + LOGOP( v1, mask, x1) + addxccc(u0, x0, w0) + addxccc(u1, x1, w1) + stx w0, [rp0 + n] + stx w1, [rp1 + 32] + +L(rtn): RETVAL + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/dive_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/dive_1.asm new file mode 100644 index 0000000..d7dbdf9 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/dive_1.asm @@ -0,0 +1,129 @@ +dnl SPARC T3/T4/T5 mpn_divexact_1. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 31 +C UltraSPARC T4/T5: 20-26 hits 20 early, then sharply drops + +C INPUT PARAMETERS +define(`qp', `%i0') +define(`ap', `%i1') +define(`n', `%i2') +define(`d', `%i3') + +define(`dinv',`%o4') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_divexact_1) + save %sp, -176, %sp + cmp n, 1 + bne,pt %xcc, L(gt1) + ldx [ap], %o5 + udivx %o5, d, %g1 + stx %g1, [qp] + return %i7+8 + nop + +L(gt1): add d, -1, %g1 + andn %g1, d, %g1 + popc %g1, %i4 C i4 = count_trailing_zeros(d) + + srlx d, %i4, d + srlx d, 1, %g1 + and %g1, 127, %g1 + + LEA64(binvert_limb_table, g2, g4) + ldub [%g2+%g1], %g1 + add %g1, %g1, %g2 + mulx %g1, %g1, %g1 + mulx %g1, d, %g1 + sub %g2, %g1, %g2 + add %g2, %g2, %g1 + mulx %g2, %g2, %g2 + mulx %g2, d, %g2 + sub %g1, %g2, %g1 + add %g1, %g1, %o7 + mulx %g1, %g1, %g1 + mulx %g1, d, %g1 + add n, -2, n + brz,pt %i4, L(norm) + sub %o7, %g1, dinv + +L(unnorm): + mov 0, %g4 + sub %g0, %i4, %o2 + srlx %o5, %i4, %o5 +L(top_unnorm): + ldx [ap+8], %g3 + add ap, 8, ap + sllx %g3, %o2, %g5 + or %g5, %o5, %g5 + srlx %g3, %i4, %o5 + subcc %g5, %g4, %g4 + mulx %g4, dinv, %g1 + stx %g1, [qp] + add qp, 8, qp + umulxhi(d, %g1, %g1) + addxc( %g1, %g0, %g4) + brgz,pt n, L(top_unnorm) + add n, -1, n + + sub %o5, %g4, %g4 + mulx %g4, dinv, %g1 + stx %g1, [qp] + return %i7+8 + nop + +L(norm): + mulx dinv, %o5, %g1 + stx %g1, [qp] + add qp, 8, qp + addcc %g0, 0, %g4 +L(top_norm): + umulxhi(d, %g1, %g1) + ldx [ap+8], %g5 + add ap, 8, ap + addxc( %g1, %g0, %g1) + subcc %g5, %g1, %g1 + mulx %g1, dinv, %g1 + stx %g1, [qp] + add qp, 8, qp + brgz,pt n, L(top_norm) + add n, -1, n + + return %i7+8 + nop +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/hamdist.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/hamdist.asm new file mode 100644 index 0000000..20ed8bf --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/hamdist.asm @@ -0,0 +1,78 @@ +dnl SPARC v9 mpn_hamdist for T3/T4. + +dnl Contributed to the GNU project by David Miller. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
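+
+C Functionally this routine is just the sum of popcounts of xors; the
+C loop below must agree with this naive C reference (an illustrative
+C sketch, assuming 64-bit limbs and a GCC-style builtin):
+C
+C	uint64_t hamdist_ref (const uint64_t *up, const uint64_t *vp, size_t n)
+C	{
+C	  uint64_t cnt = 0;
+C	  for (size_t i = 0; i < n; i++)
+C	    cnt += (uint64_t) __builtin_popcountll (up[i] ^ vp[i]);
+C	  return cnt;
+C	}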
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 18 +C UltraSPARC T4: 3.5 + +C INPUT PARAMETERS +define(`up', `%o0') +define(`vp', `%o1') +define(`n', `%o2') +define(`pcnt', `%o5') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_hamdist) + subcc n, 1, n + be L(final_one) + clr pcnt +L(top): + ldx [up + 0], %g1 + ldx [vp + 0], %g2 + ldx [up + 8], %o4 + ldx [vp + 8], %g3 + sub n, 2, n + xor %g1, %g2, %g1 + add up, 16, up + popc %g1, %g2 + add vp, 16, vp + xor %o4, %g3, %o4 + add pcnt, %g2, pcnt + popc %o4, %g3 + brgz n, L(top) + add pcnt, %g3, pcnt + brlz,pt n, L(done) + nop +L(final_one): + ldx [up + 0], %g1 + ldx [vp + 0], %g2 + xor %g1,%g2, %g1 + popc %g1, %g2 + add pcnt, %g2, pcnt +L(done): + retl + mov pcnt, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/invert_limb.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/invert_limb.asm new file mode 100644 index 0000000..4da49cf --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/invert_limb.asm @@ -0,0 +1,92 @@ +dnl SPARC T3/T4/T5 mpn_invert_limb. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: ? +C UltraSPARC T4/T5: ? 
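+
+C This computes floor((B^2 - 1)/d) - B for B = 2^64 and a normalized d
+C (high bit set); the table lookup and the refinement steps below are
+C one way to evaluate that quotient.  As a C one-liner (an illustrative
+C sketch, assuming unsigned __int128; the truncation to 64 bits performs
+C the "- B", since the quotient lies in [B+1, 2B-1]):
+C
+C	uint64_t invert_limb_ref (uint64_t d)
+C	{
+C	  return (uint64_t) (~(unsigned __int128) 0 / d);
+C	}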
+ +C INPUT PARAMETERS +define(`d', `%o0') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_invert_limb) + srlx d, 54, %g1 + LEA64(approx_tab, g2, g3) + and %g1, 0x1fe, %g1 + srlx d, 24, %g4 + lduh [%g2+%g1], %g3 + add %g4, 1, %g4 + sllx %g3, 11, %g2 + add %g2, -1, %g2 + mulx %g3, %g3, %g3 + mulx %g3, %g4, %g3 + srlx %g3, 40, %g3 + sub %g2, %g3, %g2 + sllx %g2, 60, %g1 + mulx %g2, %g2, %g3 + mulx %g3, %g4, %g4 + sub %g1, %g4, %g1 + srlx %g1, 47, %g1 + sllx %g2, 13, %g2 + add %g1, %g2, %g1 + and d, 1, %g2 + srlx %g1, 1, %g4 + sub %g0, %g2, %g3 + and %g4, %g3, %g3 + srlx d, 1, %g4 + add %g4, %g2, %g2 + mulx %g1, %g2, %g2 + sub %g3, %g2, %g2 + umulxhi(%g1, %g2, %g2) + srlx %g2, 1, %g2 + sllx %g1, 31, %g1 + add %g2, %g1, %g1 + mulx %g1, d, %g3 + umulxhi(d, %g1, %g4) + addcc %g3, d, %g0 + addxc( %g4, d, %o0) + jmp %o7+8 + sub %g1, %o0, %o0 +EPILOGUE() + + RODATA + ALIGN(2) + TYPE( approx_tab, object) + SIZE( approx_tab, 512) +approx_tab: +forloop(i,256,512-1,dnl +` .half eval(0x7fd00/i) +')dnl diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.asm new file mode 100644 index 0000000..c79032d --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.asm @@ -0,0 +1,77 @@ +dnl SPARC v9-2011 simulation support. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
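+
+C __gmpn_umulh below builds the high half of a 64x64-bit product from
+C four 32-bit partial products.  The same decomposition in C (an
+C illustrative sketch; the "cannot overflow" notes follow from the
+C halves being at most 2^32 - 1):
+C
+C	uint64_t umulh_ref (uint64_t u, uint64_t v)
+C	{
+C	  uint64_t ul = (uint32_t) u, uh = u >> 32;
+C	  uint64_t vl = (uint32_t) v, vh = v >> 32;
+C	  uint64_t lo = ul * vl;
+C	  uint64_t m1 = uh * vl + (lo >> 32);	/* cannot overflow */
+C	  uint64_t m2 = ul * vh + (uint32_t) m1;	/* cannot overflow */
+C	  return uh * vh + (m1 >> 32) + (m2 >> 32);
+C	}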
+ +include(`../config.m4') + +ASM_START() +PROLOGUE(__gmpn_umulh) + save %sp, -176, %sp + ldx [%sp+2047+176+256], %o0 + ldx [%sp+2047+176+256+8], %o1 + rd %ccr, %o4 + srl %o0, 0, %l4 + srl %o1, 0, %l1 + srlx %o1, 32, %o1 + mulx %o1, %l4, %l2 + srlx %o0, 32, %o0 + mulx %o0, %l1, %l3 + mulx %l1, %l4, %l1 + srlx %l1, 32, %l1 + add %l2, %l1, %l2 + addcc %l2, %l3, %l2 + mulx %o1, %o0, %o1 + mov 0, %l1 + movcs %xcc, 1, %l1 + sllx %l1, 32, %l1 + add %o1, %l1, %o1 + srlx %l2, 32, %o0 + add %o1, %o0, %o0 + stx %o0, [%sp+2047+176+256] + wr %o4, 0, %ccr + ret + restore +EPILOGUE() + +PROLOGUE(__gmpn_lzcnt) + save %sp, -176, %sp + ldx [%sp+2047+176+256], %o0 + brz,a %o0, 2f + mov 64, %o1 + brlz %o0, 2f + mov 0, %o1 +1: sllx %o0, 1, %o0 + brgz %o0, 1b + add %o1, 1, %o1 + stx %o1, [%sp+2047+176+256] +2: ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.m4 b/gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.m4 new file mode 100644 index 0000000..e5d6d8e --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/missing.m4 @@ -0,0 +1,88 @@ +dnl SPARC v9-2011 simulation support. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
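+
+dnl The macros below emulate a few SPARC v9-2011 instructions for
+dnl simulation on older hardware.  Their intended semantics in C terms
+dnl (an illustrative sketch, assuming 64-bit registers and unsigned
+dnl __int128; addxc is addxccc without the carry-out):
+dnl
+dnl	/* addxccc: r3 = r1 + r2 + c, with c updated to the carry-out */
+dnl	uint64_t addxccc_ref (uint64_t r1, uint64_t r2, unsigned *c)
+dnl	{
+dnl	  unsigned __int128 s = (unsigned __int128) r1 + r2 + *c;
+dnl	  *c = (unsigned) (s >> 64);
+dnl	  return (uint64_t) s;
+dnl	}
+dnl
+dnl	/* umulxhi: r3 = high 64 bits of r1 * r2 */
+dnl	uint64_t umulxhi_ref (uint64_t r1, uint64_t r2)
+dnl	{
+dnl	  return (uint64_t) (((unsigned __int128) r1 * r2) >> 64);
+dnl	}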
+ + +dnl Usage addxccc(r1,r2,r3, t1) +dnl 64-bit add with carry-in and carry-out +dnl FIXME: Register g2 must not be destination + +define(`addxccc',`dnl + add %sp, -512, %sp + stx %g2, [%sp+2047+256+16] + mov 0, %g2 + movcs %xcc, -1, %g2 + addcc %g2, 1, %g0 + addccc $1, $2, $3 + ldx [%sp+2047+256+16], %g2 + sub %sp, -512, %sp +') + + +dnl Usage addxc(r1,r2,r3, t1,t2) +dnl 64-bit add with carry-in + +define(`addxc',`dnl + bcc %xcc, 1f + add $1, $2, $3 + add $3, 1, $3 +1: +') + + +dnl Usage umulxhi(r1,r2,r3) +dnl 64-bit multiply returning upper 64 bits +dnl Calls __gmpn_umulh using a non-standard calling convention + +define(`umulxhi',`dnl + add %sp, -512, %sp + stx $1, [%sp+2047+256] + stx $2, [%sp+2047+256+8] + stx %o7, [%sp+2047+256+16] + call __gmpn_umulh + nop + ldx [%sp+2047+256+16], %o7 + ldx [%sp+2047+256], $3 + sub %sp, -512, %sp +') +dnl Usage lzcnt(r1,r2) +dnl Plain count leading zeros +dnl Calls __gmpn_lzcnt using a non-standard calling convention + +define(`lzcnt',`dnl + add %sp, -512, %sp + stx %o7, [%sp+2047+256+16] + call __gmpn_lzcnt + stx $1, [%sp+2047+256] + ldx [%sp+2047+256+16], %o7 + ldx [%sp+2047+256], $2 + sub %sp, -512, %sp +') diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_1_4.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_1_4.asm new file mode 100644 index 0000000..08facbd --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_1_4.asm @@ -0,0 +1,233 @@ +dnl SPARC T3/T4/T5 mpn_mod_1s_4p. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
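+
+C mpn_mod_1s_4p folds four limbs per iteration using powers of B = 2^64
+C reduced mod d, precomputed by mpn_mod_1s_4p_cps.  Whatever the folding
+C order, the returned value must match this naive reference (an
+C illustrative sketch, assuming 64-bit limbs and unsigned __int128):
+C
+C	uint64_t mod_ref (const uint64_t *ap, size_t n, uint64_t d)
+C	{
+C	  unsigned __int128 r = 0;
+C	  while (n-- > 0)
+C	    r = ((r << 64) | ap[n]) % d;
+C	  return (uint64_t) r;
+C	}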
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 30 +C UltraSPARC T4/T5: 4 + +C INPUT PARAMETERS +define(`ap', `%o0') +define(`n', `%o1') +define(`d', `%o2') +define(`cps', `%o3') + + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_mod_1s_4p) + save %sp, -176, %sp + ldx [%i3+16], %o4 + ldx [%i3+24], %o3 + ldx [%i3+32], %o2 + ldx [%i3+40], %o1 + ldx [%i3+48], %o0 + + and %i1, 3, %g3 + sllx %i1, 3, %g1 + add %i0, %g1, %i0 + brz %g3, L(b00) + cmp %g3, 2 + bcs %xcc, L(b01) + nop + be %xcc, L(b10) + nop + +L(b11): ldx [%i0-16], %g2 + mulx %g2, %o4, %g5 + umulxhi(%g2, %o4, %g3) + ldx [%i0-24], %g4 + addcc %g5, %g4, %g5 + addxc( %g3, %g0, %g4) + ldx [%i0-8], %g2 + mulx %g2, %o3, %g1 + umulxhi(%g2, %o3, %g3) + addcc %g1, %g5, %g1 + addxc( %g3, %g4, %g2) + ba,pt %xcc, .L8 + add %i0, -32, %i0 + +L(b00): ldx [%i0-24], %g3 + mulx %g3, %o4, %g2 + umulxhi(%g3, %o4, %g5) + ldx [%i0-32], %g4 + addcc %g2, %g4, %g2 + addxc( %g5, %g0, %g3) + ldx [%i0-16], %g4 + mulx %g4, %o3, %g5 + umulxhi(%g4, %o3, %i5) + addcc %g2, %g5, %g5 + addxc( %g3, %i5, %g4) + ldx [%i0-8], %g2 + mulx %g2, %o2, %g1 + umulxhi(%g2, %o2, %g3) + addcc %g1, %g5, %g1 + addxc( %g3, %g4, %g2) + ba,pt %xcc, .L8 + add %i0, -40, %i0 + +L(b01): ldx [%i0-8], %g1 + mov 0, %g2 + ba,pt %xcc, .L8 + add %i0, -16, %i0 + +L(b10): ldx [%i0-8], %g2 + ldx [%i0-16], %g1 + add %i0, -24, %i0 + +.L8: add %i1, -5, %g3 + brlz,pn %g3, L(end) + nop + +L(top): ldx [%i0-16], %i4 + mulx %i4, %o4, %o5 + umulxhi(%i4, %o4, %i1) + ldx [%i0-24], %i5 + addcc %o5, %i5, %o5 + addxc( %i1, %g0, %i4) + ldx [%i0-8], %i5 + mulx %i5, %o3, %o7 + umulxhi(%i5, %o3, %i1) + addcc %o5, %o7, %o7 + addxc( %i4, %i1, %i5) + ldx [%i0+0], %g4 + mulx %g4, %o2, %i1 + umulxhi(%g4, %o2, %i4) + addcc %o7, %i1, %i1 + addxc( %i5, %i4, %g4) + mulx %g1, %o1, %i5 + umulxhi(%g1, %o1, %i4) + addcc %i1, %i5, %i5 + addxc( %g4, %i4, %g5) + mulx %g2, %o0, %g1 + umulxhi(%g2, %o0, %g4) + addcc %g1, %i5, %g1 + addxc( %g4, %g5, %g2) + add %g3, -4, %g3 + brgez,pt %g3, L(top) + add %i0, -32, %i0 + +L(end): mulx %g2, %o4, %g5 + umulxhi(%g2, %o4, %g3) + addcc %g1, %g5, %g5 + addxc( %g3, %g0, %g2) + ldx [%i3+8], %i0 + ldx [%i3], %g4 + sub %g0, %i0, %i5 + srlx %g5, %i5, %i5 + sllx %g2, %i0, %g2 + or %i5, %g2, %g1 + mulx %g1, %g4, %l7 + umulxhi(%g1, %g4, %g3) + sllx %g5, %i0, %g2 + add %g1, 1, %g1 + addcc %l7, %g2, %g5 + addxc( %g3, %g1, %g1) + mulx %g1, %i2, %g1 + sub %g2, %g1, %g2 + cmp %g2, %g5 + add %i2, %g2, %g1 + movlu %xcc, %g2, %g1 + subcc %g1, %i2, %g2 + movgeu %xcc, %g2, %g1 + return %i7+8 + srlx %g1, %o0, %o0 +EPILOGUE() + +PROLOGUE(mpn_mod_1s_4p_cps) + save %sp, -176, %sp + lzcnt( %i1, %i5) + sllx %i1, %i5, %i1 + call mpn_invert_limb, 0 + mov %i1, %o0 + stx %o0, [%i0] + sra %i5, 0, %g1 + stx %g1, [%i0+8] + sub %g0, %i5, %g2 + srlx %o0, %g2, %g2 + mov 1, %g1 + sllx %g1, %i5, %g1 + or %g2, %g1, %g2 + sub %g0, %i1, %g1 + mulx %g2, %g1, %g2 + srlx %g2, %i5, %g1 + stx %g1, [%i0+16] + + umulxhi(%o0, %g2, %g3) + add %g2, %g3, %g3 + xnor %g0, %g3, %g3 + mulx %g3, %i1, %g3 + mulx %g2, %o0, %g2 + cmp %g2, %g3 + add %i1, %g3, %g1 + movgeu %xcc, %g3, %g1 + srlx %g1, %i5, %g2 + stx %g2, [%i0+24] + + umulxhi(%o0, %g1, %g3) + add %g1, %g3, %g3 + xnor %g0, %g3, %g3 + mulx %g3, %i1, %g3 + mulx %g1, %o0, %g1 + cmp %g1, %g3 + add %i1, %g3, %g2 + movgeu %xcc, %g3, %g2 + srlx %g2, %i5, %g1 + stx %g1, [%i0+32] + + umulxhi(%o0, %g2, %g3) + add %g2, %g3, %g3 + xnor %g0, %g3, %g3 + mulx %g3, %i1, %g3 + mulx %g2, %o0, %g2 + cmp %g2, %g3 + add %i1, %g3, %g1 + movgeu %xcc, %g3, %g1 + srlx %g1, 
%i5, %g2 + stx %g2, [%i0+40] + + umulxhi(%o0, %g1, %g2) + add %g1, %g2, %g2 + xnor %g0, %g2, %g2 + mulx %g2, %i1, %g2 + mulx %g1, %o0, %o0 + cmp %o0, %g2 + add %i1, %g2, %g3 + movgeu %xcc, %g2, %g3 + srlx %g3, %i5, %i5 + stx %i5, [%i0+48] + + return %i7+8 + nop +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_34lsub1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_34lsub1.asm new file mode 100644 index 0000000..8744280 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/mod_34lsub1.asm @@ -0,0 +1,117 @@ +dnl SPARC v9 mpn_mod_34lsub1 for T3/T4/T5. + +dnl Copyright 2005, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T1: - +C UltraSPARC T3: 5 +C UltraSPARC T4: 1.57 + +C This is based on the powerpc64/mode64 code. + +C INPUT PARAMETERS +define(`up', `%i0') +define(`n', `%i1') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_mod_34lsub1) + save %sp, -176, %sp + + mov 0, %g1 + mov 0, %g3 + mov 0, %g4 + addcc %g0, 0, %g5 + + add n, -3, n + brlz n, L(lt3) + nop + + add n, -3, n + ldx [up+0], %l5 + ldx [up+8], %l6 + ldx [up+16], %l7 + brlz n, L(end) + add up, 24, up + + ALIGN(16) +L(top): addxccc(%g1, %l5, %g1) + ldx [up+0], %l5 + addxccc(%g3, %l6, %g3) + ldx [up+8], %l6 + addxccc(%g4, %l7, %g4) + ldx [up+16], %l7 + add n, -3, n + brgez n, L(top) + add up, 24, up + +L(end): addxccc( %g1, %l5, %g1) + addxccc(%g3, %l6, %g3) + addxccc(%g4, %l7, %g4) + addxc( %g5, %g0, %g5) + +L(lt3): cmp n, -2 + blt L(2) + nop + + ldx [up+0], %l5 + mov 0, %l6 + beq L(1) + addcc %g1, %l5, %g1 + + ldx [up+8], %l6 +L(1): addxccc(%g3, %l6, %g3) + addxccc(%g4, %g0, %g4) + addxc( %g5, %g0, %g5) + +L(2): sllx %g1, 16, %l0 + srlx %l0, 16, %l0 C %l0 = %g1 mod 2^48 + srlx %g1, 48, %l3 C %l3 = %g1 div 2^48 + srl %g3, 0, %g1 + sllx %g1, 16, %l4 C %l4 = (%g3 mod 2^32) << 16 + srlx %g3, 32, %l5 C %l5 = %g3 div 2^32 + sethi %hi(0xffff0000), %g1 + andn %g4, %g1, %g1 + sllx %g1, 32, %l6 C %l6 = (%g4 mod 2^16) << 32 + srlx %g4, 16, %l7 C %l7 = %g4 div 2^16 + + add %l0, %l3, %l0 + add %l4, %l5, %l4 + add %l6, %l7, %l6 + + add %l0, %l4, %l0 + add %l6, %g5, %l6 + + add %l0, %l6, %i0 + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/mode1o.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/mode1o.asm new file mode 100644 index 0000000..494e1d3 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/mode1o.asm @@ -0,0 +1,82 @@ +dnl SPARC T3/T4/T5 mpn_modexact_1c_odd. 
+ +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 30 +C UltraSPARC T4/T5: 26 + +C INPUT PARAMETERS +define(`ap', `%o0') +define(`n', `%o1') +define(`d', `%o2') +define(`cy', `%o3') + +define(`dinv',`%o5') +define(`a0', `%g1') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_modexact_1c_odd) + srlx d, 1, %g1 + and %g1, 127, %g1 + + LEA64(binvert_limb_table, g2, g4) + ldub [%g2+%g1], %g1 + add %g1, %g1, %g2 + mulx %g1, %g1, %g1 + mulx %g1, d, %g1 + sub %g2, %g1, %g2 + add %g2, %g2, %g1 + mulx %g2, %g2, %g2 + mulx %g2, d, %g2 + sub %g1, %g2, %g1 + add %g1, %g1, %o5 + mulx %g1, %g1, %g1 + mulx %g1, d, %g1 + sub %o5, %g1, dinv + add n, -1, n + +L(top): ldx [ap], a0 + add ap, 8, ap + subcc a0, cy, %g3 + mulx %g3, dinv, %g5 + umulxhi(d, %g5, %g5) + addxc( %g5, %g0, cy) + brnz,pt n, L(top) + add n, -1, n + + retl + mov cy, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/mul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/mul_1.asm new file mode 100644 index 0000000..af05d62 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/mul_1.asm @@ -0,0 +1,174 @@ +dnl SPARC v9 mpn_mul_1 for T3/T4/T5. + +dnl Contributed to the GNU project by David Miller and Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
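+
+C Functionally, mpn_mul_1 is the following loop (an illustrative C
+C sketch, assuming 64-bit limbs and unsigned __int128); the code below
+C computes the same thing four limbs at a time, overlapping the
+C multiplies with the carry chain:
+C
+C	uint64_t mul_1_ref (uint64_t *rp, const uint64_t *up, size_t n,
+C	                    uint64_t v0)
+C	{
+C	  uint64_t cy = 0;
+C	  for (size_t i = 0; i < n; i++)
+C	    {
+C	      unsigned __int128 p = (unsigned __int128) up[i] * v0 + cy;
+C	      rp[i] = (uint64_t) p;
+C	      cy = (uint64_t) (p >> 64);
+C	    }
+C	  return cy;
+C	}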
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 23 +C UltraSPARC T4: 3 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`v0', `%i3') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_mul_1) + save %sp, -176, %sp + + and n, 3, %g5 + add n, -4, n + brz %g5, L(b0) + cmp %g5, 2 + bcs %xcc, L(b1) + nop + be %xcc, L(b2) + nop + +L(b3): addcc %g0, %g0, %i5 + ldx [up+0], %l0 + ldx [up+8], %l1 + ldx [up+16], %l2 + mulx %l0, v0, %o0 + umulxhi(%l0, v0, %o1) + brgz n, L(gt3) + add rp, -8, rp + mulx %l1, v0, %o2 + umulxhi(%l1, v0, %o3) + b L(wd3) + nop +L(gt3): ldx [up+24], %l3 + mulx %l1, v0, %o2 + umulxhi(%l1, v0, %o3) + add up, 24, up + b L(lo3) + add n, -3, n + +L(b2): addcc %g0, %g0, %o1 + ldx [up+0], %l1 + ldx [up+8], %l2 + brgz n, L(gt2) + add rp, -16, rp + mulx %l1, v0, %o2 + umulxhi(%l1, v0, %o3) + mulx %l2, v0, %o4 + umulxhi(%l2, v0, %o5) + b L(wd2) + nop +L(gt2): ldx [up+16], %l3 + mulx %l1, v0, %o2 + umulxhi(%l1, v0, %o3) + ldx [up+24], %l0 + mulx %l2, v0, %o4 + umulxhi(%l2, v0, %o5) + add up, 16, up + b L(lo2) + add n, -2, n + +L(b1): addcc %g0, %g0, %o3 + ldx [up+0], %l2 + brgz n, L(gt1) + nop + mulx %l2, v0, %o4 + stx %o4, [rp+0] + umulxhi(%l2, v0, %i0) + ret + restore +L(gt1): ldx [up+8], %l3 + ldx [up+16], %l0 + mulx %l2, v0, %o4 + umulxhi(%l2, v0, %o5) + ldx [up+24], %l1 + mulx %l3, v0, %i4 + umulxhi(%l3, v0, %i5) + add rp, -24, rp + add up, 8, up + b L(lo1) + add n, -1, n + +L(b0): addcc %g0, %g0, %o5 + ldx [up+0], %l3 + ldx [up+8], %l0 + ldx [up+16], %l1 + mulx %l3, v0, %i4 + umulxhi(%l3, v0, %i5) + ldx [up+24], %l2 + mulx %l0, v0, %o0 + umulxhi(%l0, v0, %o1) + b L(lo0) + nop + + ALIGN(16) +L(top): ldx [up+0], %l3 C 0 + addxccc(%i4, %o5, %i4) C 0 + mulx %l1, v0, %o2 C 1 + stx %i4, [rp+0] C 1 + umulxhi(%l1, v0, %o3) C 2 +L(lo3): ldx [up+8], %l0 C 2 + addxccc(%o0, %i5, %o0) C 3 + mulx %l2, v0, %o4 C 3 + stx %o0, [rp+8] C 4 + umulxhi(%l2, v0, %o5) C 4 +L(lo2): ldx [up+16], %l1 C 5 + addxccc(%o2, %o1, %o2) C 5 + mulx %l3, v0, %i4 C 6 + stx %o2, [rp+16] C 6 + umulxhi(%l3, v0, %i5) C 7 +L(lo1): ldx [up+24], %l2 C 7 + addxccc(%o4, %o3, %o4) C 8 + mulx %l0, v0, %o0 C 8 + stx %o4, [rp+24] C 9 + umulxhi(%l0, v0, %o1) C 9 + add rp, 32, rp C 10 +L(lo0): add up, 32, up C 10 + brgz n, L(top) C 11 + add n, -4, n C 11 + +L(end): addxccc(%i4, %o5, %i4) + mulx %l1, v0, %o2 + stx %i4, [rp+0] + umulxhi(%l1, v0, %o3) + addxccc(%o0, %i5, %o0) +L(wd3): mulx %l2, v0, %o4 + stx %o0, [rp+8] + umulxhi(%l2, v0, %o5) + addxccc(%o2, %o1, %o2) +L(wd2): stx %o2, [rp+16] + addxccc(%o4, %o3, %o4) + stx %o4, [rp+24] + addxc( %g0, %o5, %i0) + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/popcount.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/popcount.asm new file mode 100644 index 0000000..de80f3c --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/popcount.asm @@ -0,0 +1,70 @@ +dnl SPARC v9 mpn_popcount for T3/T4. + +dnl Contributed to the GNU project by David Miller. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 15 +C UltraSPARC T4: 2.5 + +C INPUT PARAMETERS +define(`up', `%o0') +define(`n', `%o1') +define(`pcnt', `%o5') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_popcount) + subcc n, 1, n + be L(final_one) + clr pcnt +L(top): + ldx [up + 0], %g1 + sub n, 2, n + ldx [up + 8], %o4 + add up, 16, up + popc %g1, %g2 + popc %o4, %g3 + add pcnt, %g2, pcnt + brgz n, L(top) + add pcnt, %g3, pcnt + brlz,pt n, L(done) + nop +L(final_one): + ldx [up + 0], %g1 + popc %g1, %g2 + add pcnt, %g2, pcnt +L(done): + retl + mov pcnt, %o0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/sqr_diag_addlsh1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/sqr_diag_addlsh1.asm new file mode 100644 index 0000000..d46499f --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/sqr_diag_addlsh1.asm @@ -0,0 +1,93 @@ +dnl SPARC v9 mpn_sqr_diag_addlsh1 for T3/T4/T5. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: ? 
+C UltraSPARC T4: >= 4.5 + + +define(`rp', `%i0') +define(`tp', `%i1') +define(`up', `%i2') +define(`n', `%i3') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_sqr_diag_addlsh1) + save %sp, -176, %sp + + ldx [up+0], %g1 + mulx %g1, %g1, %o0 + umulxhi(%g1, %g1, %g2) + stx %o0, [rp+0] + + ldx [up+8], %g1 + ldx [tp+0], %g4 + ldx [tp+8], %g5 + mulx %g1, %g1, %o0 + orcc %g0, %g0, %o5 + b L(dm) + add n, -2, n + + ALIGN(16) +L(top): ldx [up+8], %g1 + addcc %g4, %o2, %o2 + addxccc(%g5, %o0, %g3) + ldx [tp+16], %g4 + ldx [tp+24], %g5 + mulx %g1, %g1, %o0 + stx %o2, [rp+8] + stx %g3, [rp+16] + add rp, 16, rp + add tp, 16, tp +L(dm): add %g2, %o5, %o2 + umulxhi(%g1, %g1, %g2) + addxccc(%g4, %g4, %g4) + addxccc(%g5, %g5, %g5) + add up, 8, up + addxc( %g0, %g0, %o5) + brnz n, L(top) + add n, -1, n + + addcc %o2, %g4, %g4 + addxccc(%o0, %g5, %g5) + stx %g4, [rp+8] + stx %g5, [rp+16] + addxc( %o5, %g2, %g2) + stx %g2, [rp+24] + + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/sub_n.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/sub_n.asm new file mode 100644 index 0000000..0e4bc93 --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/sub_n.asm @@ -0,0 +1,144 @@ +dnl SPARC v9 mpn_sub_n for T3/T4. + +dnl Contributed to the GNU project by David Miller. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
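+
+C The code below adds the one's complement of vp with an inverted carry
+C rather than subtracting, using u - v - b = u + ~v + (1 - b) mod 2^64;
+C the xnor and the initial `mov 1, cy' set that up.  The borrow-chain
+C reference it must match (an illustrative C sketch, assuming 64-bit
+C limbs):
+C
+C	uint64_t sub_n_ref (uint64_t *rp, const uint64_t *up,
+C	                    const uint64_t *vp, size_t n)
+C	{
+C	  uint64_t b = 0;
+C	  for (size_t i = 0; i < n; i++)
+C	    {
+C	      uint64_t t = up[i] - vp[i];
+C	      uint64_t b1 = up[i] < vp[i];
+C	      rp[i] = t - b;
+C	      b = b1 | (t < b);
+C	    }
+C	  return b;
+C	}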
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 8 +C UltraSPARC T4: 3 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`vp', `%i2') +define(`n', `%i3') +define(`cy', `%i4') + +define(`u0_off', `%l0') +define(`u1_off', `%l1') +define(`v0_off', `%l2') +define(`v1_off', `%l3') +define(`r0_off', `%l4') +define(`r1_off', `%l5') +define(`loop_n', `%l6') +define(`tmp', `%l7') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_sub_nc) + save %sp, -176, %sp + ba,pt %xcc, L(ent) + xor cy, 1, cy +EPILOGUE() +PROLOGUE(mpn_sub_n) + save %sp, -176, %sp + mov 1, cy +L(ent): + subcc n, 1, n + be L(final_one) + cmp %g0, cy + + ldx [up + 0], %o4 + sllx n, 3, tmp + + ldx [vp + 0], %o5 + add up, tmp, u0_off + + ldx [up + 8], %g5 + add vp, tmp, v0_off + + ldx [vp + 8], %g1 + add rp, tmp, r0_off + + neg tmp, loop_n + add u0_off, 8, u1_off + + add v0_off, 8, v1_off + sub loop_n, -(2 * 8), loop_n + + sub r0_off, 16, r0_off + brgez,pn loop_n, L(loop_tail) + sub r0_off, 8, r1_off + + b,a L(top) + ALIGN(16) +L(top): + xnor %o5, 0, tmp + ldx [loop_n + v0_off], %o5 + + addxccc(%o4, tmp, %g3) + ldx [loop_n + u0_off], %o4 + + xnor %g1, 0, %g1 + stx %g3, [loop_n + r0_off] + + addxccc(%g5, %g1, tmp) + ldx [loop_n + v1_off], %g1 + + ldx [loop_n + u1_off], %g5 + sub loop_n, -(2 * 8), loop_n + + brlz loop_n, L(top) + stx tmp, [loop_n + r1_off] + +L(loop_tail): + xnor %o5, 0, tmp + xnor %g1, 0, %g1 + + addxccc(%o4, tmp, %g3) + add loop_n, u0_off, up + + addxccc(%g5, %g1, %g5) + add loop_n, r0_off, rp + + stx %g3, [rp + 0] + add loop_n, v0_off, vp + + brgz,pt loop_n, L(done) + stx %g5, [rp + 8] + + add rp, (2 * 8), rp + +L(final_one): + ldx [up+0], %o4 + ldx [vp+0], %o5 + xnor %o5, %g0, %o5 + addxccc(%o4, %o5, %g3) + stx %g3, [rp+0] + +L(done): + clr %i0 + movcc %xcc, 1, %i0 + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct3/submul_1.asm b/gmp-6.3.0/mpn/sparc64/ultrasparct3/submul_1.asm new file mode 100644 index 0000000..5635d1b --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct3/submul_1.asm @@ -0,0 +1,170 @@ +dnl SPARC v9 mpn_submul_1 for T3/T4/T5. + +dnl Contributed to the GNU project by David Miller and Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
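+
+C mpn_submul_1 subtracts up[]*v0 from rp[] and returns the product's
+C high limb plus the final borrow.  A plain C reference (an illustrative
+C sketch, assuming 64-bit limbs and unsigned __int128):
+C
+C	uint64_t submul_1_ref (uint64_t *rp, const uint64_t *up, size_t n,
+C	                       uint64_t v0)
+C	{
+C	  uint64_t cy = 0;
+C	  for (size_t i = 0; i < n; i++)
+C	    {
+C	      unsigned __int128 p = (unsigned __int128) up[i] * v0;
+C	      uint64_t lo = (uint64_t) p + cy;
+C	      cy = (uint64_t) (p >> 64) + (lo < cy);
+C	      cy += rp[i] < lo;
+C	      rp[i] -= lo;
+C	    }
+C	  return cy;
+C	}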
+ +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 26 +C UltraSPARC T4: 4.5 + +C INPUT PARAMETERS +define(`rp', `%i0') +define(`up', `%i1') +define(`n', `%i2') +define(`v0', `%i3') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_submul_1) + save %sp, -176, %sp + ldx [up+0], %g1 + + and n, 3, %g5 + add n, -4, n + brz %g5, L(b00) + cmp %g5, 2 + bcs %xcc, L(b01) + nop + bne %xcc, L(b11) + ldx [up+8], %g4 + +L(b10): add up, 16, up + addcc %g0, 0, %g3 + mulx %g1, v0, %l4 + umulxhi(%g1, v0, %l5) + ldx [rp+0], %o2 + mulx %g4, v0, %l6 + umulxhi(%g4, v0, %l7) + brlz n, L(wd2) + nop +L(gt2): ldx [up+0], %o0 + b L(lo2) + nop + +L(b00): add rp, -16, rp + addcc %g0, 0, %g3 + ldx [up+8], %o1 + mulx %g1, v0, %l0 + umulxhi(%g1, v0, %l1) + ldx [up+16], %o0 + ldx [rp+16], %o2 + mulx %o1, v0, %l2 + umulxhi(%o1, v0, %l3) + b L(lo0) + nop + +L(b01): add up, 8, up + add rp, -8, rp + addcc %g0, 0, %g3 + ldx [rp+8], %o3 + mulx %g1, v0, %l6 + umulxhi(%g1, v0, %l7) + brlz n, L(wd1) + nop + ldx [up+0], %o0 + ldx [up+8], %o1 + mulx %o0, v0, %l0 + umulxhi(%o0, v0, %l1) + b L(lo1) + nop + +L(b11): add up, 24, up + add rp, 8, rp + addcc %g0, 0, %g3 + mulx %g1, v0, %l2 + umulxhi(%g1, v0, %l3) + ldx [up-8], %o1 + ldx [rp-8], %o3 + mulx %g4, v0, %l4 + umulxhi(%g4, v0, %l5) + brlz n, L(end) + nop + + ALIGN(16) +L(top): ldx [up+0], %o0 + addxccc(%g3, %l2, %g1) + ldx [rp+0], %o2 + addxc( %g0, %l3, %g3) + mulx %o1, v0, %l6 + subcc %o3, %g1, %g4 + umulxhi(%o1, v0, %l7) + stx %g4, [rp-8] +L(lo2): ldx [up+8], %o1 + addxccc(%g3, %l4, %g1) + ldx [rp+8], %o3 + addxc( %g0, %l5, %g3) + mulx %o0, v0, %l0 + subcc %o2, %g1, %g4 + umulxhi(%o0, v0, %l1) + stx %g4, [rp+0] +L(lo1): ldx [up+16], %o0 + addxccc(%g3, %l6, %g1) + ldx [rp+16], %o2 + addxc( %g0, %l7, %g3) + mulx %o1, v0, %l2 + subcc %o3, %g1, %g4 + umulxhi(%o1, v0, %l3) + stx %g4, [rp+8] +L(lo0): ldx [up+24], %o1 + addxccc(%g3, %l0, %g1) + ldx [rp+24], %o3 + addxc( %g0, %l1, %g3) + mulx %o0, v0, %l4 + subcc %o2, %g1, %g4 + umulxhi(%o0, v0, %l5) + stx %g4, [rp+16] + add n, -4, n + add up, 32, up + brgez n, L(top) + add rp, 32, rp + +L(end): addxccc(%g3, %l2, %g1) + ldx [rp+0], %o2 + addxc( %g0, %l3, %g3) + mulx %o1, v0, %l6 + subcc %o3, %g1, %g4 + umulxhi(%o1, v0, %l7) + stx %g4, [rp-8] +L(wd2): addxccc(%g3, %l4, %g1) + ldx [rp+8], %o3 + addxc( %g0, %l5, %g3) + subcc %o2, %g1, %g4 + stx %g4, [rp+0] +L(wd1): addxccc(%g3, %l6, %g1) + addxc( %g0, %l7, %g3) + subcc %o3, %g1, %g4 + stx %g4, [rp+8] + addxc( %g0, %g3, %i0) + ret + restore +EPILOGUE() diff --git a/gmp-6.3.0/mpn/sparc64/ultrasparct45/gmp-mparam.h b/gmp-6.3.0/mpn/sparc64/ultrasparct45/gmp-mparam.h new file mode 100644 index 0000000..c10fd0d --- /dev/null +++ b/gmp-6.3.0/mpn/sparc64/ultrasparct45/gmp-mparam.h @@ -0,0 +1,174 @@ +/* Sparc64 T4-T5 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3600 MHz ultrasparct5 running GNU/Linux */ +/* FFT tuning limit = 0.5 M */ +/* Generated by tuneup.c, 2019-10-01, gcc 7.4 */ + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 3 +#define MOD_1_1P_METHOD 2 /* 0.34% faster than 1 */ +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 3 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 +#define USE_PREINV_DIVREM_1 1 +/* From gcc105.fsffrance.org, 2023-07-25 */ +#define DIV_QR_1N_PI1_METHOD 4 /* 7.06% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 3 +#define DIV_QR_1_UNNORM_THRESHOLD 2 +#define DIV_QR_2_PI2_THRESHOLD 5 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 19 + +#define DIV_1_VS_MUL_1_PERCENT 654 + +#define MUL_TOOM22_THRESHOLD 40 +#define MUL_TOOM33_THRESHOLD 129 +#define MUL_TOOM44_THRESHOLD 372 +#define MUL_TOOM6H_THRESHOLD 494 +#define MUL_TOOM8H_THRESHOLD 656 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 126 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 247 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 225 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 219 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 188 + +#define SQR_BASECASE_THRESHOLD 20 +#define SQR_TOOM2_THRESHOLD 59 +#define SQR_TOOM3_THRESHOLD 107 +#define SQR_TOOM4_THRESHOLD 298 +#define SQR_TOOM6_THRESHOLD 399 +#define SQR_TOOM8_THRESHOLD 562 + +#define MULMID_TOOM42_THRESHOLD 48 + +#define MULMOD_BNM1_THRESHOLD 25 +#define SQRMOD_BNM1_THRESHOLD 23 + +#define MUL_FFT_MODF_THRESHOLD 555 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 555, 5}, { 29, 6}, { 31, 7}, { 31, 8}, \ + { 17, 7}, { 36, 8}, { 19, 7}, { 39, 8}, \ + { 21, 7}, { 43, 8}, { 29, 9}, { 15, 8}, \ + { 31, 7}, { 63, 8}, { 35, 9}, { 19, 8}, \ + { 43, 9}, { 23, 8}, { 51, 9}, { 27, 8}, \ + { 57,10}, { 15, 8}, { 61, 9}, { 31, 8}, \ + { 67, 9}, { 35, 8}, { 71, 9}, { 39, 8}, \ + { 81, 9}, { 43,10}, { 23, 9}, { 59,11}, \ + { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 87,10}, { 47, 9}, { 99,10}, { 55, 9}, \ + { 115,11}, { 31,10}, { 63, 9}, { 131,10}, \ + { 87,11}, { 47,10}, { 111, 9}, { 223,12}, \ + { 31,11}, { 63,10}, { 135,11}, { 79,10}, \ + { 159,11}, { 95,10}, { 191,11}, { 111,12}, \ + { 63,11}, { 143,10}, { 287,11}, { 159,12}, \ + { 95,11}, { 191,10}, { 383, 9}, { 767,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 75 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 372 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 372, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 
9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 135,11}, { 79,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383,11}, { 111,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303, 9}, { 607,11}, \ + { 159,10}, { 319, 9}, { 639,12}, { 95,11}, \ + { 191,10}, { 383, 9}, { 767,11}, { 207,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 75 +#define SQR_FFT_THRESHOLD 3776 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 35 +#define MULLO_MUL_N_THRESHOLD 11278 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 168 +#define SQRLO_SQR_THRESHOLD 7511 + +#define DC_DIV_QR_THRESHOLD 36 +#define DC_DIVAPPR_Q_THRESHOLD 103 +#define DC_BDIV_QR_THRESHOLD 28 +#define DC_BDIV_Q_THRESHOLD 88 + +#define INV_MULMOD_BNM1_THRESHOLD 78 +#define INV_NEWTON_THRESHOLD 181 +#define INV_APPR_THRESHOLD 118 + +#define BINV_NEWTON_THRESHOLD 296 +#define REDC_1_TO_REDC_2_THRESHOLD 4 +#define REDC_2_TO_REDC_N_THRESHOLD 79 + +#define MU_DIV_QR_THRESHOLD 1970 +#define MU_DIVAPPR_Q_THRESHOLD 1970 +#define MUPI_DIV_QR_THRESHOLD 82 +#define MU_BDIV_QR_THRESHOLD 1528 +#define MU_BDIV_Q_THRESHOLD 1970 + +#define POWM_SEC_TABLE 1,58,102,1509 + +#define GET_STR_DC_THRESHOLD 15 +#define GET_STR_PRECOMPUTE_THRESHOLD 29 +#define SET_STR_DC_THRESHOLD 686 +#define SET_STR_PRECOMPUTE_THRESHOLD 2717 + +#define FAC_DSC_THRESHOLD 336 +#define FAC_ODD_THRESHOLD 24 + +#define MATRIX22_STRASSEN_THRESHOLD 32 +#define HGCD2_DIV1_METHOD 1 /* 0.66% faster than 3 */ +#define HGCD_THRESHOLD 57 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 386 +#define GCDEXT_DC_THRESHOLD 288 +#define JACOBI_BASE_METHOD 4 /* 2.50% faster than 3 */ diff --git a/gmp-6.3.0/mpn/sqr.c b/gmp-6.3.0/mpn/sqr.c new file mode 120000 index 0000000..5264cae --- /dev/null +++ b/gmp-6.3.0/mpn/sqr.c @@ -0,0 +1 @@ +../mpn/generic/sqr.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sqr_basecase.asm b/gmp-6.3.0/mpn/sqr_basecase.asm new file mode 120000 index 0000000..b1fd3d6 --- /dev/null +++ b/gmp-6.3.0/mpn/sqr_basecase.asm @@ -0,0 +1 @@ +../mpn/x86/p6/sse2/sqr_basecase.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sqrlo.c b/gmp-6.3.0/mpn/sqrlo.c new file mode 120000 index 0000000..fd18efe --- /dev/null +++ b/gmp-6.3.0/mpn/sqrlo.c @@ -0,0 +1 @@ +../mpn/generic/sqrlo.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sqrlo_basecase.c b/gmp-6.3.0/mpn/sqrlo_basecase.c new file mode 120000 index 0000000..39e518b --- /dev/null +++ b/gmp-6.3.0/mpn/sqrlo_basecase.c @@ -0,0 +1 @@ +../mpn/generic/sqrlo_basecase.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sqrmod_bnm1.c b/gmp-6.3.0/mpn/sqrmod_bnm1.c new file mode 120000 index 0000000..38e7110 --- /dev/null +++ b/gmp-6.3.0/mpn/sqrmod_bnm1.c @@ -0,0 +1 @@ +../mpn/generic/sqrmod_bnm1.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sqrtrem.c b/gmp-6.3.0/mpn/sqrtrem.c new file mode 120000 index 0000000..64e7522 --- /dev/null +++ b/gmp-6.3.0/mpn/sqrtrem.c @@ -0,0 +1 @@ +../mpn/generic/sqrtrem.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/strongfibo.c b/gmp-6.3.0/mpn/strongfibo.c new file mode 120000 index 0000000..0186e69 --- /dev/null +++ b/gmp-6.3.0/mpn/strongfibo.c @@ -0,0 +1 @@ +../mpn/generic/strongfibo.c \ No newline at end of file 
diff --git a/gmp-6.3.0/mpn/sub.c b/gmp-6.3.0/mpn/sub.c new file mode 120000 index 0000000..cf48184 --- /dev/null +++ b/gmp-6.3.0/mpn/sub.c @@ -0,0 +1 @@ +../mpn/generic/sub.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sub_1.c b/gmp-6.3.0/mpn/sub_1.c new file mode 120000 index 0000000..4efdb59 --- /dev/null +++ b/gmp-6.3.0/mpn/sub_1.c @@ -0,0 +1 @@ +../mpn/generic/sub_1.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sub_err1_n.c b/gmp-6.3.0/mpn/sub_err1_n.c new file mode 120000 index 0000000..dde35ce --- /dev/null +++ b/gmp-6.3.0/mpn/sub_err1_n.c @@ -0,0 +1 @@ +../mpn/generic/sub_err1_n.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sub_err2_n.c b/gmp-6.3.0/mpn/sub_err2_n.c new file mode 120000 index 0000000..f59bfd6 --- /dev/null +++ b/gmp-6.3.0/mpn/sub_err2_n.c @@ -0,0 +1 @@ +../mpn/generic/sub_err2_n.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sub_err3_n.c b/gmp-6.3.0/mpn/sub_err3_n.c new file mode 120000 index 0000000..ea03b12 --- /dev/null +++ b/gmp-6.3.0/mpn/sub_err3_n.c @@ -0,0 +1 @@ +../mpn/generic/sub_err3_n.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/sub_n.asm b/gmp-6.3.0/mpn/sub_n.asm new file mode 120000 index 0000000..3f7fcac --- /dev/null +++ b/gmp-6.3.0/mpn/sub_n.asm @@ -0,0 +1 @@ +../mpn/x86/p6/aors_n.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/submul_1.asm b/gmp-6.3.0/mpn/submul_1.asm new file mode 120000 index 0000000..5d08935 --- /dev/null +++ b/gmp-6.3.0/mpn/submul_1.asm @@ -0,0 +1 @@ +../mpn/x86/p6/sse2/submul_1.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/tdiv_qr.c b/gmp-6.3.0/mpn/tdiv_qr.c new file mode 120000 index 0000000..a5b0b25 --- /dev/null +++ b/gmp-6.3.0/mpn/tdiv_qr.c @@ -0,0 +1 @@ +../mpn/generic/tdiv_qr.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/thumb/add_n.asm b/gmp-6.3.0/mpn/thumb/add_n.asm new file mode 100644 index 0000000..08ed60b --- /dev/null +++ b/gmp-6.3.0/mpn/thumb/add_n.asm @@ -0,0 +1,63 @@ +dnl ARM/Thumb mpn_add_n. + +dnl Copyright 1997, 2000, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
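+
+C The "tricky carry restore" below appears to work as follows: the loop
+C keeps the carry between iterations in negated form, since `sbc r6, r6'
+C leaves r6 = 0 when the carry was set and 0xffffffff when it was clear,
+C and `cmp n, r6' recreates the flag, the unsigned comparison n >= r6
+C succeeding exactly for r6 = 0 (this relies on 1 <= n < 0xffffffff).
+C The entry points only have to seed r6: cy - 1 in mpn_add_nc, and -n in
+C mpn_add_n, which for n < 2^31 likewise compares above n and so starts
+C with the carry clear.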
+ +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', r0) +define(`up', r1) +define(`vp', r2) +define(`n', r3) + +ASM_START() + .thumb +PROLOGUE(mpn_add_nc) + push {r4, r5, r6} + ldr r6, [sp, #12] C init carry save register + sub r6, #1 + b L(top) +EPILOGUE() +PROLOGUE(mpn_add_n) + push {r4, r5, r6} + neg r6, n C init carry save register + +L(top): ldmia up!, {r4} C load next limb from S1 + cmp n, r6 C tricky carry restore + ldmia vp!, {r5} C load next limb from S2 + adc r4, r5 + stmia rp!, {r4} C store result limb to RES + sbc r6, r6 C save negated carry + sub n, #1 + bne L(top) + + add r0, r6, #1 + pop {r4, r5, r6} + bx lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/thumb/sub_n.asm b/gmp-6.3.0/mpn/thumb/sub_n.asm new file mode 100644 index 0000000..a385720 --- /dev/null +++ b/gmp-6.3.0/mpn/thumb/sub_n.asm @@ -0,0 +1,63 @@ +dnl ARM/Thumb mpn_sub_n. + +dnl Copyright 1997, 2000, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
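+
+C Same negated carry-save scheme as in add_n, here applied to the
+C borrow: an ARM subtract leaves the carry set when no borrow occurred,
+C so mpn_sub_n seeds r6 = n (the first `cmp n, r6' then sets carry,
+C i.e. no incoming borrow) while mpn_sub_nc seeds r6 = -cy, giving the
+C 0 or 0xffffffff the loop expects.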
+ +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', r0) +define(`up', r1) +define(`vp', r2) +define(`n', r3) + +ASM_START() + .thumb +PROLOGUE(mpn_sub_nc) + push {r4, r5, r6} + ldr r6, [sp, #12] C init carry save register + neg r6, r6 + b L(top) +EPILOGUE() +PROLOGUE(mpn_sub_n) + push {r4, r5, r6} + mov r6, n C init carry save register + +L(top): ldmia up!, {r4} C load next limb from S1 + cmp n, r6 C tricky carry restore + ldmia vp!, {r5} C load next limb from S2 + sbc r4, r5 + stmia rp!, {r4} C store result limb to RES + sbc r6, r6 C save negated carry + sub n, #1 + bne L(top) + + neg r0, r6 + pop {r4, r5, r6} + bx lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/toom22_mul.c b/gmp-6.3.0/mpn/toom22_mul.c new file mode 120000 index 0000000..d9611f3 --- /dev/null +++ b/gmp-6.3.0/mpn/toom22_mul.c @@ -0,0 +1 @@ +../mpn/generic/toom22_mul.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom2_sqr.c b/gmp-6.3.0/mpn/toom2_sqr.c new file mode 120000 index 0000000..2f4a719 --- /dev/null +++ b/gmp-6.3.0/mpn/toom2_sqr.c @@ -0,0 +1 @@ +../mpn/generic/toom2_sqr.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom32_mul.c b/gmp-6.3.0/mpn/toom32_mul.c new file mode 120000 index 0000000..594557b --- /dev/null +++ b/gmp-6.3.0/mpn/toom32_mul.c @@ -0,0 +1 @@ +../mpn/generic/toom32_mul.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom33_mul.c b/gmp-6.3.0/mpn/toom33_mul.c new file mode 120000 index 0000000..35ca5cb --- /dev/null +++ b/gmp-6.3.0/mpn/toom33_mul.c @@ -0,0 +1 @@ +../mpn/generic/toom33_mul.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom3_sqr.c b/gmp-6.3.0/mpn/toom3_sqr.c new file mode 120000 index 0000000..7a2df45 --- /dev/null +++ b/gmp-6.3.0/mpn/toom3_sqr.c @@ -0,0 +1 @@ +../mpn/generic/toom3_sqr.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom42_mul.c b/gmp-6.3.0/mpn/toom42_mul.c new file mode 120000 index 0000000..0e2821b --- /dev/null +++ b/gmp-6.3.0/mpn/toom42_mul.c @@ -0,0 +1 @@ +../mpn/generic/toom42_mul.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom42_mulmid.c b/gmp-6.3.0/mpn/toom42_mulmid.c new file mode 120000 index 0000000..d2e2640 --- /dev/null +++ b/gmp-6.3.0/mpn/toom42_mulmid.c @@ -0,0 +1 @@ +../mpn/generic/toom42_mulmid.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom43_mul.c b/gmp-6.3.0/mpn/toom43_mul.c new file mode 120000 index 0000000..73c999a --- /dev/null +++ b/gmp-6.3.0/mpn/toom43_mul.c @@ -0,0 +1 @@ +../mpn/generic/toom43_mul.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom44_mul.c b/gmp-6.3.0/mpn/toom44_mul.c new file mode 120000 index 0000000..a470f29 --- /dev/null +++ b/gmp-6.3.0/mpn/toom44_mul.c @@ -0,0 +1 @@ +../mpn/generic/toom44_mul.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom4_sqr.c b/gmp-6.3.0/mpn/toom4_sqr.c new file mode 120000 index 0000000..59eac76 --- /dev/null +++ b/gmp-6.3.0/mpn/toom4_sqr.c @@ -0,0 +1 @@ +../mpn/generic/toom4_sqr.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom52_mul.c b/gmp-6.3.0/mpn/toom52_mul.c new file mode 120000 index 0000000..a162116 --- /dev/null +++ b/gmp-6.3.0/mpn/toom52_mul.c @@ -0,0 +1 @@ +../mpn/generic/toom52_mul.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom53_mul.c b/gmp-6.3.0/mpn/toom53_mul.c new file mode 120000 index 0000000..043bd74 --- /dev/null +++ b/gmp-6.3.0/mpn/toom53_mul.c @@ -0,0 +1 @@ +../mpn/generic/toom53_mul.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom54_mul.c b/gmp-6.3.0/mpn/toom54_mul.c new file mode 120000 index 0000000..3ccd1dc --- /dev/null +++ 
b/gmp-6.3.0/mpn/toom54_mul.c @@ -0,0 +1 @@ +../mpn/generic/toom54_mul.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom62_mul.c b/gmp-6.3.0/mpn/toom62_mul.c new file mode 120000 index 0000000..98039c1 --- /dev/null +++ b/gmp-6.3.0/mpn/toom62_mul.c @@ -0,0 +1 @@ +../mpn/generic/toom62_mul.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom63_mul.c b/gmp-6.3.0/mpn/toom63_mul.c new file mode 120000 index 0000000..f74dea1 --- /dev/null +++ b/gmp-6.3.0/mpn/toom63_mul.c @@ -0,0 +1 @@ +../mpn/generic/toom63_mul.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom6_sqr.c b/gmp-6.3.0/mpn/toom6_sqr.c new file mode 120000 index 0000000..f489350 --- /dev/null +++ b/gmp-6.3.0/mpn/toom6_sqr.c @@ -0,0 +1 @@ +../mpn/generic/toom6_sqr.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom6h_mul.c b/gmp-6.3.0/mpn/toom6h_mul.c new file mode 120000 index 0000000..dca308e --- /dev/null +++ b/gmp-6.3.0/mpn/toom6h_mul.c @@ -0,0 +1 @@ +../mpn/generic/toom6h_mul.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom8_sqr.c b/gmp-6.3.0/mpn/toom8_sqr.c new file mode 120000 index 0000000..f519bae --- /dev/null +++ b/gmp-6.3.0/mpn/toom8_sqr.c @@ -0,0 +1 @@ +../mpn/generic/toom8_sqr.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom8h_mul.c b/gmp-6.3.0/mpn/toom8h_mul.c new file mode 120000 index 0000000..d40b0cd --- /dev/null +++ b/gmp-6.3.0/mpn/toom8h_mul.c @@ -0,0 +1 @@ +../mpn/generic/toom8h_mul.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom_couple_handling.c b/gmp-6.3.0/mpn/toom_couple_handling.c new file mode 120000 index 0000000..2a4eaed --- /dev/null +++ b/gmp-6.3.0/mpn/toom_couple_handling.c @@ -0,0 +1 @@ +../mpn/generic/toom_couple_handling.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom_eval_dgr3_pm1.c b/gmp-6.3.0/mpn/toom_eval_dgr3_pm1.c new file mode 120000 index 0000000..42819a3 --- /dev/null +++ b/gmp-6.3.0/mpn/toom_eval_dgr3_pm1.c @@ -0,0 +1 @@ +../mpn/generic/toom_eval_dgr3_pm1.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom_eval_dgr3_pm2.c b/gmp-6.3.0/mpn/toom_eval_dgr3_pm2.c new file mode 120000 index 0000000..1decd34 --- /dev/null +++ b/gmp-6.3.0/mpn/toom_eval_dgr3_pm2.c @@ -0,0 +1 @@ +../mpn/generic/toom_eval_dgr3_pm2.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom_eval_pm1.c b/gmp-6.3.0/mpn/toom_eval_pm1.c new file mode 120000 index 0000000..0f34ff3 --- /dev/null +++ b/gmp-6.3.0/mpn/toom_eval_pm1.c @@ -0,0 +1 @@ +../mpn/generic/toom_eval_pm1.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom_eval_pm2.c b/gmp-6.3.0/mpn/toom_eval_pm2.c new file mode 120000 index 0000000..8b85d5b --- /dev/null +++ b/gmp-6.3.0/mpn/toom_eval_pm2.c @@ -0,0 +1 @@ +../mpn/generic/toom_eval_pm2.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom_eval_pm2exp.c b/gmp-6.3.0/mpn/toom_eval_pm2exp.c new file mode 120000 index 0000000..c4814a9 --- /dev/null +++ b/gmp-6.3.0/mpn/toom_eval_pm2exp.c @@ -0,0 +1 @@ +../mpn/generic/toom_eval_pm2exp.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom_eval_pm2rexp.c b/gmp-6.3.0/mpn/toom_eval_pm2rexp.c new file mode 120000 index 0000000..d151e02 --- /dev/null +++ b/gmp-6.3.0/mpn/toom_eval_pm2rexp.c @@ -0,0 +1 @@ +../mpn/generic/toom_eval_pm2rexp.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom_interpolate_12pts.c b/gmp-6.3.0/mpn/toom_interpolate_12pts.c new file mode 120000 index 0000000..f5999b5 --- /dev/null +++ b/gmp-6.3.0/mpn/toom_interpolate_12pts.c @@ -0,0 +1 @@ +../mpn/generic/toom_interpolate_12pts.c \ No newline at end of file 
diff --git a/gmp-6.3.0/mpn/toom_interpolate_16pts.c b/gmp-6.3.0/mpn/toom_interpolate_16pts.c new file mode 120000 index 0000000..5743b1e --- /dev/null +++ b/gmp-6.3.0/mpn/toom_interpolate_16pts.c @@ -0,0 +1 @@ +../mpn/generic/toom_interpolate_16pts.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom_interpolate_5pts.c b/gmp-6.3.0/mpn/toom_interpolate_5pts.c new file mode 120000 index 0000000..76bc0d1 --- /dev/null +++ b/gmp-6.3.0/mpn/toom_interpolate_5pts.c @@ -0,0 +1 @@ +../mpn/generic/toom_interpolate_5pts.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom_interpolate_6pts.c b/gmp-6.3.0/mpn/toom_interpolate_6pts.c new file mode 120000 index 0000000..cf8c877 --- /dev/null +++ b/gmp-6.3.0/mpn/toom_interpolate_6pts.c @@ -0,0 +1 @@ +../mpn/generic/toom_interpolate_6pts.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom_interpolate_7pts.c b/gmp-6.3.0/mpn/toom_interpolate_7pts.c new file mode 120000 index 0000000..c75a473 --- /dev/null +++ b/gmp-6.3.0/mpn/toom_interpolate_7pts.c @@ -0,0 +1 @@ +../mpn/generic/toom_interpolate_7pts.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/toom_interpolate_8pts.c b/gmp-6.3.0/mpn/toom_interpolate_8pts.c new file mode 120000 index 0000000..f23185b --- /dev/null +++ b/gmp-6.3.0/mpn/toom_interpolate_8pts.c @@ -0,0 +1 @@ +../mpn/generic/toom_interpolate_8pts.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/trialdiv.c b/gmp-6.3.0/mpn/trialdiv.c new file mode 120000 index 0000000..169e53a --- /dev/null +++ b/gmp-6.3.0/mpn/trialdiv.c @@ -0,0 +1 @@ +../mpn/generic/trialdiv.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/udiv.asm b/gmp-6.3.0/mpn/udiv.asm new file mode 120000 index 0000000..a4c3686 --- /dev/null +++ b/gmp-6.3.0/mpn/udiv.asm @@ -0,0 +1 @@ +../mpn/x86/udiv.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/umul.asm b/gmp-6.3.0/mpn/umul.asm new file mode 120000 index 0000000..f46a128 --- /dev/null +++ b/gmp-6.3.0/mpn/umul.asm @@ -0,0 +1 @@ +../mpn/x86/umul.asm \ No newline at end of file diff --git a/gmp-6.3.0/mpn/vax/add_n.asm b/gmp-6.3.0/mpn/vax/add_n.asm new file mode 100644 index 0000000..0a0bf78 --- /dev/null +++ b/gmp-6.3.0/mpn/vax/add_n.asm @@ -0,0 +1,64 @@ +dnl VAX mpn_add_n -- Add two limb vectors of the same length > 0 and store sum +dnl in a third limb vector. + +dnl Copyright 1999, 2000, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
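+
+dnl  The loop below is four-way unrolled and entered by a computed jump:
+dnl  r5 is set to (-size) mod 4, the number of limb groups to skip, and
+dnl  since each movl/adwc/movl group assembles to 9 bytes (hence the "9x"
+dnl  comment below), movaq scales r5 by 9 to get the byte offset into the
+dnl  loop.  On exit r0 is zero, so the final adwc r0,r0 returns the carry
+dnl  as 0 or 1.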
+ +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_add_n) + .word 0x0 + movl 16(ap), r0 + movl 12(ap), r1 + movl 8(ap), r2 + movl 4(ap), r3 + mnegl r0, r5 + addl2 $3, r0 + ashl $-2, r0, r0 C unroll loop count + bicl2 $-4, r5 C mask out low 2 bits + movaq (r5)[r5], r5 C 9x + jmp L(top)[r5] + +L(top): movl (r2)+, r4 + adwc (r1)+, r4 + movl r4, (r3)+ + movl (r2)+, r4 + adwc (r1)+, r4 + movl r4, (r3)+ + movl (r2)+, r4 + adwc (r1)+, r4 + movl r4, (r3)+ + movl (r2)+, r4 + adwc (r1)+, r4 + movl r4, (r3)+ + sobgtr r0, L(top) + + adwc r0, r0 + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/vax/addmul_1.asm b/gmp-6.3.0/mpn/vax/addmul_1.asm new file mode 100644 index 0000000..8a6f636 --- /dev/null +++ b/gmp-6.3.0/mpn/vax/addmul_1.asm @@ -0,0 +1,124 @@ +dnl VAX mpn_addmul_1 -- Multiply a limb vector with a limb and add the result +dnl to a second limb vector. + +dnl Copyright 1992, 1994, 1996, 2000, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
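+
+dnl  VAX emul is a signed 32x32->64 bit multiply, so the code corrects it
+dnl  to an unsigned one: when the multiplier v0 has its top bit set (the
+dnl  L(v0_big) loop) each source limb is added into the product's high
+dnl  half, and when a source limb has its top bit set (the L(1n0)/L(1n1)
+dnl  and L(2n0)/L(2n1) paths) v0 is added in.  The two unrolled loops
+dnl  differ only in these corrections.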
+ +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_addmul_1) + .word 0xfc0 + movl 12(ap), r4 + movl 8(ap), r8 + movl 4(ap), r9 + clrl r3 + incl r4 + ashl $-1, r4, r7 + clrl r11 + movl 16(ap), r6 + jlss L(v0_big) + jlbc r4, L(1) + +C Loop for v0 < 0x80000000 +L(tp1): movl (r8)+, r1 + jlss L(1n0) + emul r1, r6, $0, r2 + addl2 r11, r2 + adwc $0, r3 + addl2 r2, (r9)+ + adwc $0, r3 +L(1): movl (r8)+, r1 + jlss L(1n1) +L(1p1): emul r1, r6, $0, r10 + addl2 r3, r10 + adwc $0, r11 + addl2 r10, (r9)+ + adwc $0, r11 + + sobgtr r7, L(tp1) + movl r11, r0 + ret + +L(1n0): emul r1, r6, $0, r2 + addl2 r11, r2 + adwc r6, r3 + addl2 r2, (r9)+ + adwc $0, r3 + movl (r8)+, r1 + jgeq L(1p1) +L(1n1): emul r1, r6, $0, r10 + addl2 r3, r10 + adwc r6, r11 + addl2 r10, (r9)+ + adwc $0, r11 + + sobgtr r7, L(tp1) + movl r11, r0 + ret + +L(v0_big): + jlbc r4, L(2) + +C Loop for v0 >= 0x80000000 +L(tp2): movl (r8)+, r1 + jlss L(2n0) + emul r1, r6, $0, r2 + addl2 r11, r2 + adwc r1, r3 + addl2 r2, (r9)+ + adwc $0, r3 +L(2): movl (r8)+, r1 + jlss L(2n1) +L(2p1): emul r1, r6, $0, r10 + addl2 r3, r10 + adwc r1, r11 + addl2 r10, (r9)+ + adwc $0, r11 + + sobgtr r7, L(tp2) + movl r11, r0 + ret + +L(2n0): emul r1, r6, $0, r2 + addl2 r11, r2 + adwc r6, r3 + addl2 r2, (r9)+ + adwc r1, r3 + movl (r8)+, r1 + jgeq L(2p1) +L(2n1): emul r1, r6, $0, r10 + addl2 r3, r10 + adwc r6, r11 + addl2 r10, (r9)+ + adwc r1, r11 + + sobgtr r7, L(tp2) + movl r11, r0 + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/vax/elf.m4 b/gmp-6.3.0/mpn/vax/elf.m4 new file mode 100644 index 0000000..e04f0ba --- /dev/null +++ b/gmp-6.3.0/mpn/vax/elf.m4 @@ -0,0 +1,54 @@ +divert(-1) + +dnl m4 macros for VAX assembler. + +dnl Copyright 2001, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +defreg(r0,`%r``''0') +defreg(r1,`%r``''1') +defreg(r2,`%r``''2') +defreg(r3,`%r``''3') +defreg(r4,`%r``''4') +defreg(r5,`%r``''5') +defreg(r6,`%r``''6') +defreg(r7,`%r``''7') +defreg(r8,`%r``''8') +defreg(r9,`%r``''9') +defreg(r10,`%r``''10') +defreg(r11,`%r``''11') +defreg(r12,`%r``''12') +defreg(r13,`%r``''13') +defreg(r14,`%r``''14') +defreg(r15,`%r``''15') +defreg(ap,`%a``''p') + +define(`foo', blablabla) + +divert diff --git a/gmp-6.3.0/mpn/vax/gmp-mparam.h b/gmp-6.3.0/mpn/vax/gmp-mparam.h new file mode 100644 index 0000000..9f20b9b --- /dev/null +++ b/gmp-6.3.0/mpn/vax/gmp-mparam.h @@ -0,0 +1,60 @@ +/* VAX gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2000-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* These numbers were measured manually using the tune/speed program. + The standard tune/tuneup takes too long. (VAX 8800) */ + +#define MUL_TOOM22_THRESHOLD 14 +#define MUL_TOOM33_THRESHOLD 110 + +#define SQR_BASECASE_THRESHOLD 6 +#define SQR_TOOM2_THRESHOLD 42 +#define SQR_TOOM3_THRESHOLD 250 + +/* #define DIV_SB_PREINV_THRESHOLD */ +/* #define DIV_DC_THRESHOLD */ +/* #define POWM_THRESHOLD */ + +/* #define GCD_ACCEL_THRESHOLD */ +/* #define JACOBI_BASE_METHOD */ + +/* #define DIVREM_1_NORM_THRESHOLD */ +/* #define DIVREM_1_UNNORM_THRESHOLD */ +/* #define MOD_1_NORM_THRESHOLD */ +/* #define MOD_1_UNNORM_THRESHOLD */ +/* #define USE_PREINV_DIVREM_1 */ +/* #define USE_PREINV_MOD_1 */ +/* #define DIVREM_2_THRESHOLD */ +/* #define DIVEXACT_1_THRESHOLD */ +/* #define MODEXACT_1_ODD_THRESHOLD */ + +/* #define GET_STR_DC_THRESHOLD */ +/* #define GET_STR_PRECOMPUTE_THRESHOLD */ +#define SET_STR_THRESHOLD 3400 diff --git a/gmp-6.3.0/mpn/vax/lshift.asm b/gmp-6.3.0/mpn/vax/lshift.asm new file mode 100644 index 0000000..941e999 --- /dev/null +++ b/gmp-6.3.0/mpn/vax/lshift.asm @@ -0,0 +1,59 @@ +dnl VAX mpn_lshift -- left shift. + +dnl Copyright 1999-2001, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
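+
+dnl  The limbs are processed from the most significant end downwards,
+dnl  which makes the routine safe for the usual overlapping case
+dnl  dst >= src.  Each ashq shifts the quadword in r2 (low half) and r3
+dnl  (high half) left by the count, and the high result limb in r5 is
+dnl  stored; the very first ashq gives the return value in r0, the bits
+dnl  shifted out of the top limb.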
+ +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_lshift) + .word 0x1c0 + movl 4(ap), r7 + movl 8(ap), r6 + movl 12(ap), r1 + movl 16(ap), r8 + + moval (r6)[r1], r6 + moval (r7)[r1], r7 + clrl r3 + movl -(r6), r2 + ashq r8, r2, r4 + movl r5, r0 + movl r2, r3 + decl r1 + jeql L(end) + +L(top): movl -(r6), r2 + ashq r8, r2, r4 + movl r5, -(r7) + movl r2, r3 + sobgtr r1, L(top) + +L(end): movl r4, -4(r7) + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/vax/mul_1.asm b/gmp-6.3.0/mpn/vax/mul_1.asm new file mode 100644 index 0000000..8e4dcd2 --- /dev/null +++ b/gmp-6.3.0/mpn/vax/mul_1.asm @@ -0,0 +1,118 @@ +dnl VAX mpn_mul_1 -- Multiply a limb vector with a limb and store the result +dnl in a second limb vector. + +dnl Copyright 1992, 1994, 1996, 2000, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_mul_1) + .word 0xfc0 + movl 12(ap), r4 + movl 8(ap), r8 + movl 4(ap), r9 + clrl r3 + incl r4 + ashl $-1, r4, r7 + clrl r11 + movl 16(ap), r6 + jlss L(v0_big) + jlbc r4, L(1) + +C Loop for v0 < 0x80000000 +L(tp1): movl (r8)+, r1 + jlss L(1n0) + emul r1, r6, $0, r2 + addl2 r11, r2 + adwc $0, r3 + movl r2, (r9)+ +L(1): movl (r8)+, r1 + jlss L(1n1) +L(1p1): emul r1, r6, $0, r10 + addl2 r3, r10 + adwc $0, r11 + movl r10, (r9)+ + + sobgtr r7, L(tp1) + movl r11, r0 + ret + +L(1n0): emul r1, r6, $0, r2 + addl2 r11, r2 + adwc r6, r3 + movl r2, (r9)+ + movl (r8)+, r1 + jgeq L(1p1) +L(1n1): emul r1, r6, $0, r10 + addl2 r3, r10 + adwc r6, r11 + movl r10, (r9)+ + + sobgtr r7, L(tp1) + movl r11, r0 + ret + +L(v0_big): + jlbc r4, L(2) + +C Loop for v0 >= 0x80000000 +L(tp2): movl (r8)+, r1 + jlss L(2n0) + emul r1, r6, $0, r2 + addl2 r11, r2 + adwc r1, r3 + movl r2, (r9)+ +L(2): movl (r8)+, r1 + jlss L(2n1) +L(2p1): emul r1, r6, $0, r10 + addl2 r3, r10 + adwc r1, r11 + movl r10, (r9)+ + + sobgtr r7, L(tp2) + movl r11, r0 + ret + +L(2n0): emul r1, r6, $0, r2 + addl2 r1, r3 + addl2 r11, r2 + adwc r6, r3 + movl r2, (r9)+ + movl (r8)+, r1 + jgeq L(2p1) +L(2n1): emul r1, r6, $0, r10 + addl2 r1, r11 + addl2 r3, r10 + adwc r6, r11 + movl r10, (r9)+ + + sobgtr r7, L(tp2) + movl r11, r0 + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/vax/rshift.asm b/gmp-6.3.0/mpn/vax/rshift.asm new file mode 100644 index 0000000..00b2daa --- /dev/null +++ b/gmp-6.3.0/mpn/vax/rshift.asm @@ -0,0 +1,57 @@ +dnl VAX mpn_rshift -- right shift. + +dnl Copyright 1999-2001, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_rshift) + .word 0x1c0 + movl 4(ap), r7 + movl 8(ap), r6 + movl 12(ap), r1 + movl 16(ap), r8 + + movl (r6)+, r2 + subl3 r8, $32, r8 + ashl r8, r2, r0 + decl r1 + jeql L(end) + +L(top): movl (r6)+, r3 + ashq r8, r2, r4 + movl r5, (r7)+ + movl r3, r2 + sobgtr r1, L(top) + +L(end): clrl r3 + ashq r8, r2, r4 + movl r5, (r7) + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/vax/sub_n.asm b/gmp-6.3.0/mpn/vax/sub_n.asm new file mode 100644 index 0000000..2844ef2 --- /dev/null +++ b/gmp-6.3.0/mpn/vax/sub_n.asm @@ -0,0 +1,64 @@ +dnl VAX mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +dnl store difference in a third limb vector. + +dnl Copyright 1999, 2000, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_sub_n) + .word 0x0 + movl 16(ap), r0 + movl 12(ap), r1 + movl 8(ap), r2 + movl 4(ap), r3 + mnegl r0, r5 + addl2 $3, r0 + ashl $-2, r0, r0 C unroll loop count + bicl2 $-4, r5 C mask out low 2 bits + movaq (r5)[r5], r5 C 9x + jmp L(top)[r5] + +L(top): movl (r2)+, r4 + sbwc (r1)+, r4 + movl r4, (r3)+ + movl (r2)+, r4 + sbwc (r1)+, r4 + movl r4, (r3)+ + movl (r2)+, r4 + sbwc (r1)+, r4 + movl r4, (r3)+ + movl (r2)+, r4 + sbwc (r1)+, r4 + movl r4, (r3)+ + sobgtr r0, L(top) + + adwc r0, r0 + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/vax/submul_1.asm b/gmp-6.3.0/mpn/vax/submul_1.asm new file mode 100644 index 0000000..60d47fc --- /dev/null +++ b/gmp-6.3.0/mpn/vax/submul_1.asm @@ -0,0 +1,124 @@ +dnl VAX mpn_submul_1 -- Multiply a limb vector with a limb and subtract the +dnl result from a second limb vector. + +dnl Copyright 1992, 1994, 1996, 2000, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
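+
+dnl  Structurally identical to mpn_addmul_1 (including the corrections
+dnl  that make the signed emul behave as an unsigned multiply), except
+dnl  that each product is subtracted from the destination limb with subl2
+dnl  rather than added with addl2.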
+ +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_submul_1) + .word 0xfc0 + movl 12(ap), r4 + movl 8(ap), r8 + movl 4(ap), r9 + clrl r3 + incl r4 + ashl $-1, r4, r7 + clrl r11 + movl 16(ap), r6 + jlss L(v0_big) + jlbc r4, L(1) + +C Loop for v0 < 0x80000000 +L(tp1): movl (r8)+, r1 + jlss L(1n0) + emul r1, r6, $0, r2 + addl2 r11, r2 + adwc $0, r3 + subl2 r2, (r9)+ + adwc $0, r3 +L(1): movl (r8)+, r1 + jlss L(1n1) +L(1p1): emul r1, r6, $0, r10 + addl2 r3, r10 + adwc $0, r11 + subl2 r10, (r9)+ + adwc $0, r11 + + sobgtr r7, L(tp1) + movl r11, r0 + ret + +L(1n0): emul r1, r6, $0, r2 + addl2 r11, r2 + adwc r6, r3 + subl2 r2, (r9)+ + adwc $0, r3 + movl (r8)+, r1 + jgeq L(1p1) +L(1n1): emul r1, r6, $0, r10 + addl2 r3, r10 + adwc r6, r11 + subl2 r10, (r9)+ + adwc $0, r11 + + sobgtr r7, L(tp1) + movl r11, r0 + ret + +L(v0_big): + jlbc r4, L(2) + +C Loop for v0 >= 0x80000000 +L(tp2): movl (r8)+, r1 + jlss L(2n0) + emul r1, r6, $0, r2 + addl2 r11, r2 + adwc r1, r3 + subl2 r2, (r9)+ + adwc $0, r3 +L(2): movl (r8)+, r1 + jlss L(2n1) +L(2p1): emul r1, r6, $0, r10 + addl2 r3, r10 + adwc r1, r11 + subl2 r10, (r9)+ + adwc $0, r11 + + sobgtr r7, L(tp2) + movl r11, r0 + ret + +L(2n0): emul r1, r6, $0, r2 + addl2 r11, r2 + adwc r6, r3 + subl2 r2, (r9)+ + adwc r1, r3 + movl (r8)+, r1 + jgeq L(2p1) +L(2n1): emul r1, r6, $0, r10 + addl2 r3, r10 + adwc r6, r11 + subl2 r10, (r9)+ + adwc r1, r11 + + sobgtr r7, L(tp2) + movl r11, r0 + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/README b/gmp-6.3.0/mpn/x86/README new file mode 100644 index 0000000..8d7ac90 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/README @@ -0,0 +1,525 @@ +Copyright 1999-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + + X86 MPN SUBROUTINES + + +This directory contains mpn functions for various 80x86 chips. + + +CODE ORGANIZATION + + x86 i386, generic + x86/i486 i486 + x86/pentium Intel Pentium (P5, P54) + x86/pentium/mmx Intel Pentium with MMX (P55) + x86/p6 Intel Pentium Pro + x86/p6/mmx Intel Pentium II, III + x86/p6/p3mmx Intel Pentium III + x86/k6 \ AMD K6 + x86/k6/mmx / + x86/k6/k62mmx AMD K6-2 + x86/k7 \ AMD Athlon + x86/k7/mmx / + x86/pentium4 \ + x86/pentium4/mmx | Intel Pentium 4 + x86/pentium4/sse2 / + + +The top-level x86 directory contains blended style code, meant to be +reasonable on all x86s. + + + +STATUS + +The code is well-optimized for AMD and Intel chips, but there's nothing +specific for Cyrix chips, nor for actual 80386 and 80486 chips. + + + +ASM FILES + +The x86 .asm files are BSD style assembler code, first put through m4 for +macro processing. 
The generic mpn/asm-defs.m4 is used, together with +mpn/x86/x86-defs.m4. See comments in those files. + +The code is meant for use with GNU "gas" or a system "as". There's no +support for assemblers that demand Intel style code. + + + +STACK FRAME + +m4 macros are used to define the parameters passed on the stack, and these +act like comments on what the stack frame looks like too. For example, +mpn_mul_1() has the following. + + defframe(PARAM_MULTIPLIER, 16) + defframe(PARAM_SIZE, 12) + defframe(PARAM_SRC, 8) + defframe(PARAM_DST, 4) + +PARAM_MULTIPLIER becomes `FRAME+16(%esp)', and the others similarly. The +return address is at offset 0, but there's not normally any need to access +that. + +FRAME is redefined as necessary through the code so it's the number of bytes +pushed on the stack, and hence the offsets in the parameter macros stay +correct. At the start of a routine FRAME should be zero. + + deflit(`FRAME',0) + ... + deflit(`FRAME',4) + ... + deflit(`FRAME',8) + ... + +Helper macros FRAME_pushl(), FRAME_popl(), FRAME_addl_esp() and +FRAME_subl_esp() exist to adjust FRAME for the effect of those instructions, +and can be used instead of explicit definitions if preferred. +defframe_pushl() is a combination FRAME_pushl() and defframe(). + +There's generally some slackness in redefining FRAME. If new values aren't +going to get used then the redefinitions are omitted to keep from cluttering +up the code. This happens for instance at the end of a routine, where there +might be just four pops and then a ret, so FRAME isn't getting used. + +Local variables and saved registers can be similarly defined, with negative +offsets representing stack space below the initial stack pointer. For +example, + + defframe(SAVE_ESI, -4) + defframe(SAVE_EDI, -8) + defframe(VAR_COUNTER,-12) + + deflit(STACK_SPACE, 12) + +Here STACK_SPACE gets used in a "subl $STACK_SPACE, %esp" to allocate the +space, and that instruction must be followed by a redefinition of FRAME +(setting it equal to STACK_SPACE) to reflect the change in %esp. + +Definitions for pushed registers are only put in when they're going to be +used. If registers are just saved and restored with pushes and pops then +definitions aren't made. + + + +ASSEMBLER EXPRESSIONS + +Only addition and subtraction seem to be universally available, certainly +that's all the Solaris 8 "as" seems to accept. If expressions are wanted +then m4 eval() should be used. + +In particular note that a "/" anywhere in a line starts a comment in Solaris +"as", and in some configurations of gas too. + + addl $32/2, %eax <-- wrong + + addl $eval(32/2), %eax <-- right + +Binutils gas/config/tc-i386.c has a choice between "/" being a comment +anywhere in a line, or only at the start. FreeBSD patches 2.9.1 to select +the latter, and from 2.9.5 it's the default for GNU/Linux too. + + + +ASSEMBLER COMMENTS + +Solaris "as" doesn't support "#" commenting, using /* */ instead. For that +reason "C" commenting is used (see asm-defs.m4) and the intermediate ".s" +files have no comments. + +Any comments before include(`../config.m4') must use m4 "dnl", since it's +only after the include that "C" is available. By convention "dnl" is also +used for comments about m4 macros. + + + +TEMPORARY LABELS + +Temporary numbered labels like "1:" used as "1f" or "1b" are available in +"gas" and Solaris "as", but not in SCO "as". Normal L() labels should be +used instead, possibly with a counter to make them unique, see jadcl0() in +x86-defs.m4 for instance. 
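+
+For illustration, each expansion of jadcl0() emits a jnc/incl sequence
+along these lines, with NN drawn from a counter so that every expansion
+gets a distinct label (a sketch only, not the exact definition, see
+x86-defs.m4),
+
+	jnc	L(jadcl0_NN)	C no carry, skip the increment
+	incl	%edx		C the macro's operand
+L(jadcl0_NN):
+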
A separate counter for each macro makes it +possible to nest them, for instance movl_text_address() can be used within +an ASSERT(). + +"1:" etc must be avoided in gcc __asm__ blocks too. "%=" for generating a +unique number looks like a good alternative, but is that actually a +documented feature? In any case this problem doesn't currently arise. + + + +ZERO DISPLACEMENTS + +In a couple of places addressing modes like 0(%ebx) with a byte-sized zero +displacement are wanted, rather than (%ebx) with no displacement. These are +either for computed jumps or to get desirable code alignment. Explicit +.byte sequences are used to ensure the assembler doesn't turn 0(%ebx) into +(%ebx). The Zdisp() macro in x86-defs.m4 is used for this. + +Current gas 2.9.5 or recent 2.9.1 leave 0(%ebx) as written, but old gas +1.92.3 changes it. In general changing would be the sort of "optimization" +an assembler might perform, hence explicit ".byte"s are used where +necessary. + + + +SHLD/SHRD INSTRUCTIONS + +The %cl count forms of double shift instructions like "shldl %cl,%eax,%ebx" +must be written "shldl %eax,%ebx" for some assemblers. gas takes either, +Solaris "as" doesn't allow %cl, gcc generates %cl for gas and NeXT (which is +gas), and omits %cl elsewhere. + +For GMP an autoconf test GMP_ASM_X86_SHLDL_CL is used to determine whether +%cl should be used, and the macros shldl, shrdl, shldw and shrdw in +mpn/x86/x86-defs.m4 pass through or omit %cl as necessary. See the comments +with those macros for usage. + + + +IMUL INSTRUCTION + +GCC config/i386/i386.md (cvs rev 1.187, 21 Oct 00) under *mulsi3_1 notes +that the following two forms produce identical object code + + imul $12, %eax + imul $12, %eax, %eax + +but that the former isn't accepted by some assemblers, in particular the SCO +OSR5 COFF assembler. GMP follows GCC and uses only the latter form. + +(This applies only to immediate operands, the three operand form is only +valid with an immediate.) + + + +DIRECTION FLAG + +The x86 calling conventions say that the direction flag should be clear at +function entry and exit. (See iBCS2 and SVR4 ABI books, references below.) +Although this has been so since the year dot, it's not absolutely clear +whether it's universally respected. Since it's better to be safe than +sorry, GMP follows glibc and does a "cld" if it depends on the direction +flag being clear. This happens only in a few places. + + + +POSITION INDEPENDENT CODE + + Coding Style + + Defining the symbol PIC in m4 processing selects SVR4 / ELF style + position independent code. This is necessary for shared libraries + because they can be mapped into different processes at different virtual + addresses. Actually, relocations are allowed but text pages with + relocations aren't shared, defeating the purpose of a shared library. + + The GOT is used to access global data, and the PLT is used for + functions. The use of the PLT adds a fixed cost to every function call, + and the GOT adds a cost to any function accessing global variables. + These are small but might be noticeable when working with small + operands. + + Scope + + It's intended, as a matter of policy, that references within libgmp are + resolved within libgmp. Certainly there's no need for an application to + replace any internals, and we take the view that there's no value in an + application subverting anything documented either. 
+
+    Resolving references within libgmp in theory means calls can be made
+    with a plain PC-relative call instruction, which is faster and smaller
+    than going through the PLT, and data references can be similarly
+    PC-relative, saving a GOT entry and fetch from there.  Unfortunately
+    the normal linker behaviour doesn't allow us to do this.
+
+    By default an R_386_PC32 PC-relative reference, either for a call or
+    for data, is left in libgmp.so by the linker so that it can be resolved
+    at runtime to a location in the application or another shared library.
+    This means a text segment relocation, which we don't want.
+
+  -Bsymbolic
+
+    Under the "-Bsymbolic" option, the linker resolves references to
+    symbols within libgmp.so.  This gives us the desired effect for
+    R_386_PC32, ie. it's resolved at link time.  It also resolves
+    R_386_PLT32 calls directly to their target without creating a PLT
+    entry (though if this is done to normal compiler-generated code it
+    still leaves a setup of %ebx to _GLOBAL_OFFSET_TABLE_ which may then
+    be unnecessary).
+
+    Unfortunately -Bsymbolic does bad things to global variables defined
+    in a shared library but accessed by non-PIC code from the mainline (or
+    a static library).
+
+    The problem is that the mainline needs a fixed data address to avoid
+    text segment relocations, so space is allocated in its data segment
+    and the value from the variable is copied from the shared library's
+    data segment when the library is loaded.  Under -Bsymbolic, however,
+    references in the shared library are still resolved to the shared
+    library data area.  Not surprisingly it bombs badly to have mainline
+    code and library code accessing different locations for what should
+    be one variable.
+
+    Note that this -Bsymbolic effect for the shared library is not just
+    for R_386_PC32 offsets which might have been cooked up in assembler,
+    but is done also for the contents of GOT entries.  -Bsymbolic simply
+    applies a general rule that symbols are resolved first from the local
+    module.
+
+  Visibility Attributes
+
+    GCC __attribute__ ((visibility ("protected"))), which is available in
+    recent versions, eg. 3.3, is probably what we'd like to use.  It makes
+    gcc generate plain PC-relative calls to indicated functions, and
+    directs the linker to resolve references to the given function within
+    the link module.
+
+    Unfortunately, as of debian binutils 2.13.90.0.16 at least, the
+    resulting libgmp.so comes out with text segment relocations;
+    references are not resolved at link time.  If the gcc description is
+    to be believed, this is not how it should work.  If a symbol cannot be
+    overridden by another module then surely references within that module
+    can be resolved immediately (ie. at link time).
+
+  Present
+
+    In any case, all this means that we have no optimizations we can
+    usefully make to function or variable usages, neither for assembler
+    nor C code.  Perhaps in the future the visibility attribute will work
+    as we'd like.
+
+
+
+
+GLOBAL OFFSET TABLE
+
+The magic _GLOBAL_OFFSET_TABLE_ used by code establishing the address of the
+GOT sometimes requires an extra underscore prefix.  SVR4 systems and NetBSD
+don't need a prefix, OpenBSD does need one.  Note that NetBSD and OpenBSD
+are both a.out underscore systems, so the prefix for _GLOBAL_OFFSET_TABLE_
+is not simply the same as the prefix for ordinary globals.
+
+In any case in the asm code we write _GLOBAL_OFFSET_TABLE_ and let a macro
+in x86-defs.m4 add an extra underscore if required (according to a configure
+test).
+
+Old gas 1.92.3 which comes with FreeBSD 2.2.8 gets a segmentation fault when
+asked to assemble the following,
+
+	L1:
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L1], %ebx
+
+It seems that using the label in the same instruction it refers to is the
+problem, since a nop in between works.  But the simplest workaround is to
+follow gcc and omit the +[.-L1] since it does nothing,
+
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+
+Current gas 2.10 generates incorrect object code when %eax is used in such a
+construction (with or without +[.-L1]),
+
+	addl	$_GLOBAL_OFFSET_TABLE_, %eax
+
+The R_386_GOTPC gets a displacement of 2 rather than the 1 appropriate for
+the 1 byte opcode of "addl $n,%eax".  The best workaround is just to use any
+other register, since then it's a two byte opcode+mod/rm.  GCC for example
+always uses %ebx (which is needed for calls through the PLT).
+
+A similar problem occurs in a leal (again with or without a +[.-L1]),
+
+	leal	_GLOBAL_OFFSET_TABLE_(%edi), %ebx
+
+This time the R_386_GOTPC gets a displacement of 0 rather than the 2
+appropriate for the opcode and mod/rm, making this form unusable.
+
+
+
+SIMPLE LOOPS
+
+The overheads in setting up for an unrolled loop can mean that at small
+sizes a simple loop is faster.  Making small sizes go fast is important,
+even if it adds a cycle or two to bigger sizes.  To this end various
+routines choose between a simple loop and an unrolled loop according to
+operand size.  The path to the simple loop, or to special case code for
+small sizes, is always as fast as possible.
+
+Adding a simple loop requires a conditional jump to choose between the
+simple and unrolled code.  The size of a branch misprediction penalty
+affects whether a simple loop is worthwhile.
+
+The convention is for an m4 definition UNROLL_THRESHOLD to set the crossover
+point, with sizes < UNROLL_THRESHOLD using the simple loop, and sizes >=
+UNROLL_THRESHOLD using the unrolled loop.  If position independent code adds
+a couple of cycles to an unrolled loop setup, the threshold will vary with
+PIC or non-PIC.  Something like the following is typical.
+
+	deflit(UNROLL_THRESHOLD, ifdef(`PIC',10,8))
+
+There's no automated way to determine the threshold.  Setting it to a small
+value and then to a big value makes it possible to measure the simple and
+unrolled loops each over a range of sizes, from which the crossover point
+can be determined.  Alternatively, just adjust the threshold up or down
+until there are no more speedups.
+
+
+
+UNROLLED LOOP CODING
+
+The x86 addressing modes allow a byte displacement of -128 to +127, making
+it possible to access 256 bytes, which is 64 limbs, without adjusting
+pointer registers within the loop.  Dword sized displacements can be used
+too, but they increase code size, and unrolling to 64 ought to be enough.
+
+When unrolling to the full 64 limbs/loop, the limb at the top of the loop
+will have a displacement of -128, so the pointers have to have a
+corresponding +128 added before entering the loop.  When unrolling to 32
+limbs/loop, displacements 0 to 127 can be used, with 0 at the top of the
+loop and no adjustment needed to the pointers.
+
+Where 64 limbs/loop is supported, the +128 adjustment is done only when 64
+limbs/loop is selected.  Usually the gain in speed using 64 instead of 32 or
+16 is small, so support for 64 limbs/loop is generally only for comparison.
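+
+As an illustrative sketch (not taken from any particular routine), a 64
+limbs/loop copying inner loop would bias its pointers by 128 on entry and
+then run through the full byte displacement range,
+
+	addl	$128, %esi
+	addl	$128, %edi
+	...
+L(top):	movl	-128(%esi), %eax	C first limb of the group
+	movl	%eax, -128(%edi)
+	...				C displacements -124, -120, ..., 120
+	movl	124(%esi), %eax		C limb 63, the last byte displacement
+	movl	%eax, 124(%edi)
+	leal	256(%esi), %esi
+	leal	256(%edi), %edi
+	decl	%ecx
+	jnz	L(top)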
+ + + +COMPUTED JUMPS + +When working from least significant limb to most significant limb (most +routines) the computed jump and pointer calculations in preparation for an +unrolled loop are as follows. + + S = operand size in limbs + N = number of limbs per loop (UNROLL_COUNT) + L = log2 of unrolling (UNROLL_LOG2) + M = mask for unrolling (UNROLL_MASK) + C = code bytes per limb in the loop + B = bytes per limb (4 for x86) + + computed jump (-S & M) * C + entrypoint + subtract from pointers (-S & M) * B + initial loop counter (S-1) >> L + displacements 0 to B*(N-1) + +The loop counter is decremented at the end of each loop, and the looping +stops when the decrement takes the counter to -1. The displacements are for +the addressing accessing each limb, eg. a load with "movl disp(%ebx), %eax". + +Usually the multiply by "C" can be handled without an imul, using instead an +leal, or a shift and subtract. + +When working from most significant to least significant limb (eg. mpn_lshift +and mpn_copyd), the calculations change as follows. + + add to pointers (-S & M) * B + displacements 0 to -B*(N-1) + + + +OLD GAS 1.92.3 + +This version comes with FreeBSD 2.2.8 and has a couple of gremlins that +affect GMP code. + +Firstly, an expression involving two forward references to labels comes out +as zero. For example, + + addl $bar-foo, %eax + foo: + nop + bar: + +This should lead to "addl $1, %eax", but it comes out as "addl $0, %eax". +When only one forward reference is involved, it works correctly, as for +example, + + foo: + addl $bar-foo, %eax + nop + bar: + +Secondly, an expression involving two labels can't be used as the +displacement for an leal. For example, + + foo: + nop + bar: + leal bar-foo(%eax,%ebx,8), %ecx + +A slightly cryptic error is given, "Unimplemented segment type 0 in +parse_operand". When only one label is used it's ok, and the label can be a +forward reference too, as for example, + + leal foo(%eax,%ebx,8), %ecx + nop + foo: + +These problems only affect PIC computed jump calculations. The workarounds +are just to do an leal without a displacement and then an addl, and to make +sure the code is placed so that there's at most one forward reference in the +addl. + + + +REFERENCES + +"Intel Architecture Software Developer's Manual", volumes 1, 2a, 2b, 3a, 3b, +2006, order numbers 253665 through 253669. Available on-line, + + ftp://download.intel.com/design/Pentium4/manuals/25366518.pdf + ftp://download.intel.com/design/Pentium4/manuals/25366618.pdf + ftp://download.intel.com/design/Pentium4/manuals/25366718.pdf + ftp://download.intel.com/design/Pentium4/manuals/25366818.pdf + ftp://download.intel.com/design/Pentium4/manuals/25366918.pdf + + +"System V Application Binary Interface", Unix System Laboratories Inc, 1992, +published by Prentice Hall, ISBN 0-13-880410-9. And the "Intel386 Processor +Supplement", AT&T, 1991, ISBN 0-13-877689-X. These have details of calling +conventions and ELF shared library PIC coding. Versions of both available +on-line, + + http://www.sco.com/developer/devspecs + +"Intel386 Family Binary Compatibility Specification 2", Intel Corporation, +published by McGraw-Hill, 1991, ISBN 0-07-031219-2. (Same as the above 386 +ABI supplement.) + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/gmp-6.3.0/mpn/x86/aors_n.asm b/gmp-6.3.0/mpn/x86/aors_n.asm new file mode 100644 index 0000000..5d359f5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/aors_n.asm @@ -0,0 +1,202 @@ +dnl x86 mpn_add_n/mpn_sub_n -- mpn addition and subtraction. 
+ +dnl Copyright 1992, 1994-1996, 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P5 3.375 +C P6 3.125 +C K6 3.5 +C K7 2.25 +C P4 8.75 + + +ifdef(`OPERATION_add_n',` + define(M4_inst, adcl) + define(M4_function_n, mpn_add_n) + define(M4_function_nc, mpn_add_nc) + +',`ifdef(`OPERATION_sub_n',` + define(M4_inst, sbbl) + define(M4_function_n, mpn_sub_n) + define(M4_function_nc, mpn_sub_nc) + +',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + + +C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) + +PROLOGUE(M4_function_nc) +deflit(`FRAME',0) + + pushl %edi FRAME_pushl() + pushl %esi FRAME_pushl() + + movl PARAM_DST,%edi + movl PARAM_SRC1,%esi + movl PARAM_SRC2,%edx + movl PARAM_SIZE,%ecx + + movl %ecx,%eax + shrl $3,%ecx C compute count for unrolled loop + negl %eax + andl $7,%eax C get index where to start loop + jz L(oopgo) C necessary special case for 0 + incl %ecx C adjust loop count + shll $2,%eax C adjustment for pointers... + subl %eax,%edi C ... since they are offset ... + subl %eax,%esi C ... by a constant when we ... + subl %eax,%edx C ... enter the loop + shrl $2,%eax C restore previous value + +ifdef(`PIC',` + C Calculate start address in loop for PIC. Due to limitations in + C old gas, LF(M4_function_n,oop)-L(0a)-3 cannot be put into the leal + call L(0a) +L(0a): leal (%eax,%eax,8),%eax + addl (%esp),%eax + addl $L(oop)-L(0a)-3,%eax + addl $4,%esp +',` + C Calculate start address in loop for non-PIC. + leal L(oop)-3(%eax,%eax,8),%eax +') + + C These lines initialize carry from the 5th parameter. Should be + C possible to simplify. 
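+	C (shrl moves bit 0 of the carry parameter into the x86 carry flag;
+	C %ebp is only scratch here, hence the push/pop around it, and
+	C neither popl nor the indirect jmp disturbs the flag.)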
+ pushl %ebp FRAME_pushl() + movl PARAM_CARRY,%ebp + shrl %ebp C shift bit 0 into carry + popl %ebp FRAME_popl() + + jmp *%eax C jump into loop + +EPILOGUE() + + + ALIGN(16) +PROLOGUE(M4_function_n) +deflit(`FRAME',0) + + pushl %edi FRAME_pushl() + pushl %esi FRAME_pushl() + + movl PARAM_DST,%edi + movl PARAM_SRC1,%esi + movl PARAM_SRC2,%edx + movl PARAM_SIZE,%ecx + + movl %ecx,%eax + shrl $3,%ecx C compute count for unrolled loop + negl %eax + andl $7,%eax C get index where to start loop + jz L(oop) C necessary special case for 0 + incl %ecx C adjust loop count + shll $2,%eax C adjustment for pointers... + subl %eax,%edi C ... since they are offset ... + subl %eax,%esi C ... by a constant when we ... + subl %eax,%edx C ... enter the loop + shrl $2,%eax C restore previous value + +ifdef(`PIC',` + C Calculate start address in loop for PIC. Due to limitations in + C some assemblers, L(oop)-L(0b)-3 cannot be put into the leal + call L(0b) +L(0b): leal (%eax,%eax,8),%eax + addl (%esp),%eax + addl $L(oop)-L(0b)-3,%eax + addl $4,%esp +',` + C Calculate start address in loop for non-PIC. + leal L(oop)-3(%eax,%eax,8),%eax +') + jmp *%eax C jump into loop + +L(oopgo): + pushl %ebp FRAME_pushl() + movl PARAM_CARRY,%ebp + shrl %ebp C shift bit 0 into carry + popl %ebp FRAME_popl() + + ALIGN(16) +L(oop): movl (%esi),%eax + M4_inst (%edx),%eax + movl %eax,(%edi) + movl 4(%esi),%eax + M4_inst 4(%edx),%eax + movl %eax,4(%edi) + movl 8(%esi),%eax + M4_inst 8(%edx),%eax + movl %eax,8(%edi) + movl 12(%esi),%eax + M4_inst 12(%edx),%eax + movl %eax,12(%edi) + movl 16(%esi),%eax + M4_inst 16(%edx),%eax + movl %eax,16(%edi) + movl 20(%esi),%eax + M4_inst 20(%edx),%eax + movl %eax,20(%edi) + movl 24(%esi),%eax + M4_inst 24(%edx),%eax + movl %eax,24(%edi) + movl 28(%esi),%eax + M4_inst 28(%edx),%eax + movl %eax,28(%edi) + leal 32(%edi),%edi + leal 32(%esi),%esi + leal 32(%edx),%edx + decl %ecx + jnz L(oop) + + sbbl %eax,%eax + negl %eax + + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/aorsmul_1.asm b/gmp-6.3.0/mpn/x86/aorsmul_1.asm new file mode 100644 index 0000000..8c13997 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/aorsmul_1.asm @@ -0,0 +1,214 @@ +dnl x86 __gmpn_addmul_1 (for 386 and 486) -- Multiply a limb vector with a +dnl limb and add the result to a second limb vector. + +dnl Copyright 1992, 1994, 1997, 1999-2002, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
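+
+dnl  The loop below is unrolled four limbs per iteration.  The two low
+dnl  bits of the size select one of the four entry points L(top), L(lo0),
+dnl  L(lo3) or L(lo2), with the pointers pre-adjusted as needed, so no
+dnl  separate cleanup code is needed for sizes that are not a multiple
+dnl  of 4.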
+ +include(`../config.m4') + +C cycles/limb +C P5 +C P6 model 0-8,10-12 +C P6 model 9 (Banias) +C P6 model 13 (Dothan) +C P4 model 0 (Willamette) +C P4 model 1 (?) +C P4 model 2 (Northwood) +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C Intel Atom +C AMD K6 +C AMD K7 +C AMD K8 3.875 +C AMD K10 + + +ifdef(`OPERATION_addmul_1',` + define(ADDSUB, addl) + define(M4_function_1, mpn_addmul_1) + define(M4_function_1c, mpn_addmul_1c) + +',`ifdef(`OPERATION_submul_1',` + define(ADDSUB, subl) + define(M4_function_1, mpn_submul_1) + +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1 mpn_addmul_1c) + + +C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); + +define(PARAM_CARRY, `FRAME+20(%esp)') +define(PARAM_MULTIPLIER, `FRAME+16(%esp)') +define(PARAM_SIZE, `FRAME+12(%esp)') +define(PARAM_SRC, `FRAME+8(%esp)') +define(PARAM_DST, `FRAME+4(%esp)') + + TEXT + ALIGN(32) +PROLOGUE(M4_function_1) +deflit(`FRAME',0) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_SRC, %esi + movl PARAM_SIZE, %ecx + movl PARAM_DST, %edi + + movl (%esi), %eax + mull PARAM_MULTIPLIER + + testb $1, %cl + jnz L(bx1) + +L(bx0): movl %eax, %ebx + movl %edx, %ebp + shrl $2, %ecx + jnc L(lo0) + +L(b10): leal -8(%esi), %esi + leal -8(%edi), %edi + incl %ecx + jmp L(lo2) + +L(bx1): movl %eax, %ebp + movl %edx, %ebx + shrl $2, %ecx + jc L(b11) + +L(b01): leal 4(%edi), %edi + jz L(end) + leal 4(%esi), %esi + jmp L(top) + +L(b11): leal -4(%esi), %esi + leal -4(%edi), %edi + incl %ecx + jmp L(lo3) + + ALIGN(16) +L(top): movl (%esi), %eax + mull PARAM_MULTIPLIER + ADDSUB %ebp, -4(%edi) + adcl %eax, %ebx + movl $0, %ebp + adcl %edx, %ebp +L(lo0): movl 4(%esi), %eax + mull PARAM_MULTIPLIER + ADDSUB %ebx, (%edi) + adcl %eax, %ebp + movl $0, %ebx + adcl %edx, %ebx +L(lo3): movl 8(%esi), %eax + mull PARAM_MULTIPLIER + ADDSUB %ebp, 4(%edi) + adcl %eax, %ebx + movl $0, %ebp + adcl %edx, %ebp +L(lo2): movl 12(%esi), %eax + mull PARAM_MULTIPLIER + ADDSUB %ebx, 8(%edi) + adcl %eax, %ebp + movl $0, %ebx + adcl %edx, %ebx + + leal 16(%esi), %esi + leal 16(%edi), %edi + decl %ecx + jnz L(top) + +L(end): xor %eax, %eax + ADDSUB %ebp, -4(%edi) + adcl %ebx, %eax + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() +ifdef(`OPERATION_addmul_1',` + ALIGN(32) +PROLOGUE(M4_function_1c) +deflit(`FRAME',0) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_SRC, %esi + movl PARAM_SIZE, %ecx + movl PARAM_DST, %edi + + movl (%esi), %eax + mull PARAM_MULTIPLIER + + testb $1, %cl + jnz L(cx1) + + movl PARAM_CARRY, %ebx + xorl %ebp, %ebp + +L(cx0): addl %eax, %ebx + adcl %edx, %ebp + shrl $2, %ecx + jnc L(lo0) + +L(c10): leal -8(%esi), %esi + leal -8(%edi), %edi + incl %ecx + jmp L(lo2) + +L(cx1): movl PARAM_CARRY, %ebp + xorl %ebx, %ebx + + addl %eax, %ebp + adcl %edx, %ebx + shrl $2, %ecx + jc L(c11) + +L(c01): leal 4(%edi), %edi + jz L(end) + leal 4(%esi), %esi + jmp L(top) + +L(c11): leal -4(%esi), %esi + leal -4(%edi), %edi + incl %ecx + jmp L(lo3) +EPILOGUE() +') diff --git a/gmp-6.3.0/mpn/x86/atom/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86/atom/aorrlsh1_n.asm new file mode 100644 index 0000000..cd1a650 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/aorrlsh1_n.asm @@ -0,0 +1,53 @@ +dnl Intel Atom mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[] + +dnl Contributed to the GNU project by Marco Bodrato. + +dnl Copyright 2011 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 1) +define(RSH, 31) + +ifdef(`OPERATION_addlsh1_n', ` + define(M4_inst, adc) + define(M4_opp, sub) + define(M4_function, mpn_addlsh1_n) + define(M4_function_c, mpn_addlsh1_nc) +',`ifdef(`OPERATION_rsblsh1_n', ` + define(M4_inst, sbb) + define(M4_opp, add) + define(M4_function, mpn_rsblsh1_n) + define(M4_function_c, mpn_rsblsh1_nc) +',`m4_error(`Need OPERATION_addlsh1_n or OPERATION_rsblsh1_n +')')') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc) + +include_mpn(`x86/atom/aorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86/atom/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86/atom/aorrlsh2_n.asm new file mode 100644 index 0000000..10f4419 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/aorrlsh2_n.asm @@ -0,0 +1,53 @@ +dnl Intel Atom mpn_addlsh2_n/mpn_rsblsh2_n -- rp[] = (vp[] << 2) +- up[] + +dnl Contributed to the GNU project by Marco Bodrato. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
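+
+dnl  This wrapper only chooses the shift amount (LSH 2, RSH 32-LSH = 30)
+dnl  and the add/rsb entry points; the actual loop is the shared code in
+dnl  aorrlshC_n.asm, which scales limbs by M = 2^LSH with lea and recovers
+dnl  the bits shifted out with shr $RSH.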
+ +include(`../config.m4') + +define(LSH, 2) +define(RSH, 30) + +ifdef(`OPERATION_addlsh2_n', ` + define(M4_inst, adcl) + define(M4_opp, subl) + define(M4_function, mpn_addlsh2_n) + define(M4_function_c, mpn_addlsh2_nc) +',`ifdef(`OPERATION_rsblsh2_n', ` + define(M4_inst, sbbl) + define(M4_opp, addl) + define(M4_function, mpn_rsblsh2_n) + define(M4_function_c, mpn_rsblsh2_nc) +',`m4_error(`Need OPERATION_addlsh2_n or OPERATION_rsblsh2_n +')')') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_addlsh2_nc mpn_rsblsh2_n mpn_rsblsh2_nc) + +include_mpn(`x86/atom/aorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86/atom/aorrlshC_n.asm b/gmp-6.3.0/mpn/x86/atom/aorrlshC_n.asm new file mode 100644 index 0000000..71cfe49 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/aorrlshC_n.asm @@ -0,0 +1,156 @@ +dnl Intel Atom mpn_addlshC_n/mpn_rsblshC_n -- rp[] = (vp[] << C) +- up[] + +dnl Contributed to the GNU project by Marco Bodrato. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C mp_limb_t mpn_addlshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t mpn_addlshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); +C mp_limb_t mpn_rsblshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t mpn_rsblshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_signed_limb_t carry); + +C cycles/limb +C P5 +C P6 model 0-8,10-12 +C P6 model 9 (Banias) +C P6 model 13 (Dothan) +C P4 model 0 (Willamette) +C P4 model 1 (?) 
+C P4 model 2 (Northwood) +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C Intel Atom 6 +C AMD K6 +C AMD K7 +C AMD K8 +C AMD K10 + +defframe(PARAM_CORB, 20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_DBLD, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(VAR_COUNT,`PARAM_SIZE') +define(SAVE_EBP,`PARAM_DBLD') +define(SAVE_VP,`PARAM_SRC') +define(SAVE_UP,`PARAM_DST') + +define(M, eval(m4_lshift(1,LSH))) +define(`rp', `%edi') +define(`up', `%esi') +define(`vp', `%ebx') + +ASM_START() + TEXT + ALIGN(8) + +PROLOGUE(M4_function_c) +deflit(`FRAME',0) + movl PARAM_CORB, %eax + movl %eax, %edx + shr $LSH, %edx + andl $1, %edx + M4_opp %edx, %eax + jmp L(start_nc) +EPILOGUE() + +PROLOGUE(M4_function) +deflit(`FRAME',0) + + xor %eax, %eax + xor %edx, %edx +L(start_nc): + push rp FRAME_pushl() + + mov PARAM_SIZE, %ecx C size + mov PARAM_DST, rp + mov up, SAVE_UP + incl %ecx C size + 1 + mov PARAM_SRC, up + mov vp, SAVE_VP + shr %ecx C (size+1)\2 + mov PARAM_DBLD, vp + mov %ebp, SAVE_EBP + mov %ecx, VAR_COUNT + jnc L(entry) C size odd + + shr %edx C size even + mov (vp), %ecx + lea 4(vp), vp + lea (%eax,%ecx,M), %edx + mov %ecx, %eax + lea -4(up), up + lea -4(rp), rp + jmp L(enteven) + + ALIGN(16) +L(oop): + lea (%eax,%ecx,M), %ebp + shr $RSH, %ecx + mov 4(vp), %eax + shr %edx + lea 8(vp), vp + M4_inst (up), %ebp + lea (%ecx,%eax,M), %edx + mov %ebp, (rp) +L(enteven): + M4_inst 4(up), %edx + lea 8(up), up + mov %edx, 4(rp) + adc %edx, %edx + shr $RSH, %eax + lea 8(rp), rp +L(entry): + mov (vp), %ecx + decl VAR_COUNT + jnz L(oop) + + lea (%eax,%ecx,M), %ebp + shr $RSH, %ecx + shr %edx + mov SAVE_VP, vp + M4_inst (up), %ebp + mov %ecx, %eax + mov SAVE_UP, up + M4_inst $0, %eax + mov %ebp, (rp) + mov SAVE_EBP, %ebp + pop rp FRAME_popl() + ret +EPILOGUE() + +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/atom/aors_n.asm b/gmp-6.3.0/mpn/x86/atom/aors_n.asm new file mode 100644 index 0000000..45ec287 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/aors_n.asm @@ -0,0 +1,159 @@ +dnl Intel Atom mpn_add_n/mpn_sub_n -- rp[] = up[] +- vp[]. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Marco Bodrato. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C P5 +C P6 model 0-8,10-12 +C P6 model 9 (Banias) +C P6 model 13 (Dothan) +C P4 model 0 (Willamette) +C P4 model 1 (?) 
+C P4 model 2 (Northwood) +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C Intel Atom 3 +C AMD K6 +C AMD K7 +C AMD K8 +C AMD K10 + +ifdef(`OPERATION_add_n', ` + define(M4_inst, adcl) + define(M4_function_n, mpn_add_n) + define(M4_function_nc, mpn_add_nc) + define(M4_description, add) +',`ifdef(`OPERATION_sub_n', ` + define(M4_inst, sbbl) + define(M4_function_n, mpn_sub_n) + define(M4_function_nc, mpn_sub_nc) + define(M4_description, subtract) +',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); +C +C Calculate src1,size M4_description src2,size, and store the result in +C dst,size. The return value is the carry bit from the top of the result (1 +C or 0). +C +C The _nc version accepts 1 or 0 for an initial carry into the low limb of +C the calculation. Note values other than 1 or 0 here will lead to garbage +C results. + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(SAVE_RP,`PARAM_SIZE') +define(SAVE_VP,`PARAM_SRC1') +define(SAVE_UP,`PARAM_DST') + +define(`rp', `%edi') +define(`up', `%esi') +define(`vp', `%ebx') +define(`cy', `%ecx') +define(`r1', `%ecx') +define(`r2', `%edx') + +ASM_START() + TEXT + ALIGN(16) +deflit(`FRAME',0) + +PROLOGUE(M4_function_n) + xor cy, cy C carry +L(start): + mov PARAM_SIZE, %eax C size + mov rp, SAVE_RP + mov PARAM_DST, rp + mov up, SAVE_UP + mov PARAM_SRC1, up + shr %eax C size >> 1 + mov vp, SAVE_VP + mov PARAM_SRC2, vp + jz L(one) C size == 1 + jc L(three) C size % 2 == 1 + + shr cy + mov (up), r2 + lea 4(up), up + lea 4(vp), vp + lea -4(rp), rp + jmp L(entry) +L(one): + shr cy + mov (up), r1 + jmp L(end) +L(three): + shr cy + mov (up), r1 + + ALIGN(16) +L(oop): + M4_inst (vp), r1 + lea 8(up), up + mov -4(up), r2 + lea 8(vp), vp + mov r1, (rp) +L(entry): + M4_inst -4(vp), r2 + lea 8(rp), rp + dec %eax + mov (up), r1 + mov r2, -4(rp) + jnz L(oop) + +L(end): C %eax is zero here + mov SAVE_UP, up + M4_inst (vp), r1 + mov SAVE_VP, vp + mov r1, (rp) + adc %eax, %eax + mov SAVE_RP, rp + ret +EPILOGUE() + +PROLOGUE(M4_function_nc) + mov PARAM_CARRY, cy C carry + jmp L(start) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/atom/aorslshC_n.asm b/gmp-6.3.0/mpn/x86/atom/aorslshC_n.asm new file mode 100644 index 0000000..75ace65 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/aorslshC_n.asm @@ -0,0 +1,247 @@ +dnl Intel Atom mpn_addlshC_n/mpn_sublshC_n -- rp[] = up[] +- (vp[] << C) + +dnl Contributed to the GNU project by Marco Bodrato. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_addlshC_n_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C mp_limb_t mpn_addlshC_nc_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                               mp_limb_t carry);
+C mp_limb_t mpn_sublshC_n_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C mp_limb_t mpn_sublshC_nc_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                               mp_signed_limb_t borrow);
+
+defframe(PARAM_CORB, 16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+C mp_limb_t mpn_addlshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                          mp_size_t size);
+C mp_limb_t mpn_addlshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                           mp_size_t size, mp_limb_t carry);
+C mp_limb_t mpn_sublshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                          mp_size_t size);
+C mp_limb_t mpn_sublshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                           mp_size_t size, mp_limb_t borrow);
+
+C if src1 == dst, _ip1 is used
+
+C                           cycles/limb
+C               dst!=src1,src2  dst==src1
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom          7           6
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(GPARAM_CORB, 20)
+defframe(GPARAM_SIZE, 16)
+defframe(GPARAM_SRC2, 12)
+
+dnl re-use parameter space
+define(SAVE_EBP,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_UP,`PARAM_DST')
+
+define(M, eval(m4_lshift(1,LSH)))
+define(`rp', `%edi')
+define(`up', `%esi')
+
+ASM_START()
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(M4_ip_function_c)
+deflit(`FRAME',0)
+	movl	PARAM_CORB, %ecx
+	movl	%ecx, %edx
+	shr	$LSH, %edx
+	andl	$1, %edx
+	M4_opp	%edx, %ecx
+	jmp	L(start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_ip_function)
+deflit(`FRAME',0)
+
+	xor	%ecx, %ecx
+	xor	%edx, %edx
+L(start_nc):
+	push	rp	FRAME_pushl()
+	mov	PARAM_DST, rp
+	mov	up, SAVE_UP
+	mov	PARAM_SRC, up
+	mov	%ebx, SAVE_EBX
+	mov	PARAM_SIZE, %ebx	C size
+L(inplace):
+	incl	%ebx			C size + 1
+	shr	%ebx			C (size+1)\2
+	mov	%ebp, SAVE_EBP
+	jnc	L(entry)		C size odd
+
+	add	%edx, %edx		C size even
+	mov	%ecx, %ebp
+	mov	(up), %ecx
+	lea	-4(rp), rp
+	lea	(%ebp,%ecx,M), %eax
+	lea	4(up), up
+	jmp	L(enteven)
+
+	ALIGN(16)
+L(oop):
+	lea	(%ecx,%eax,M), %ebp
+	shr	$RSH, %eax
+	mov	4(up), %ecx
+	add	%edx, %edx
+	lea	8(up), up
+	M4_inst	%ebp, (rp)
+	lea	(%eax,%ecx,M), %eax
+
+L(enteven):
+	M4_inst	%eax, 4(rp)
+	lea	8(rp), rp
+
+	sbb	%edx, %edx
+	shr	$RSH, %ecx
+
+L(entry):
+	mov	(up), %eax
+	decl	%ebx
+	jnz	L(oop)
+
+	lea	(%ecx,%eax,M), %ebp
+	shr	$RSH, %eax
+	shr	%edx
+	M4_inst	%ebp, (rp)
+	mov	SAVE_UP, up
+	adc	$0, %eax
+	mov	SAVE_EBP, %ebp
+	mov	SAVE_EBX, %ebx
+	pop	rp	FRAME_popl()
+	ret
+EPILOGUE()
+
+PROLOGUE(M4_function_c)
+deflit(`FRAME',0)
+	movl	GPARAM_CORB, %ecx
+	movl	%ecx, %edx
+	shr	$LSH, %edx
+	andl	$1, %edx
+	M4_opp	%edx, %ecx
+	jmp	L(generic_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+	xor	%ecx, %ecx
+	xor	%edx, %edx
+L(generic_nc):
+	push	rp	FRAME_pushl()
+	mov	PARAM_DST, rp
+	mov	up, SAVE_UP
+	mov	PARAM_SRC, up
+	cmp	rp, up
+	mov	%ebx, SAVE_EBX
+	jne
L(general) + mov GPARAM_SIZE, %ebx C size + mov GPARAM_SRC2, up + jmp L(inplace) + +L(general): + mov GPARAM_SIZE, %eax C size + mov %ebx, SAVE_EBX + incl %eax C size + 1 + mov up, %ebx C vp + mov GPARAM_SRC2, up C up + shr %eax C (size+1)\2 + mov %ebp, SAVE_EBP + mov %eax, GPARAM_SIZE + jnc L(entry2) C size odd + + add %edx, %edx C size even + mov %ecx, %ebp + mov (up), %ecx + lea -4(rp), rp + lea -4(%ebx), %ebx + lea (%ebp,%ecx,M), %eax + lea 4(up), up + jmp L(enteven2) + + ALIGN(16) +L(oop2): + lea (%ecx,%eax,M), %ebp + shr $RSH, %eax + mov 4(up), %ecx + add %edx, %edx + lea 8(up), up + mov (%ebx), %edx + M4_inst %ebp, %edx + lea (%eax,%ecx,M), %eax + mov %edx, (rp) +L(enteven2): + mov 4(%ebx), %edx + lea 8(%ebx), %ebx + M4_inst %eax, %edx + mov %edx, 4(rp) + sbb %edx, %edx + shr $RSH, %ecx + lea 8(rp), rp +L(entry2): + mov (up), %eax + decl GPARAM_SIZE + jnz L(oop2) + + lea (%ecx,%eax,M), %ebp + shr $RSH, %eax + shr %edx + mov (%ebx), %edx + M4_inst %ebp, %edx + mov %edx, (rp) + mov SAVE_UP, up + adc $0, %eax + mov SAVE_EBP, %ebp + mov SAVE_EBX, %ebx + pop rp FRAME_popl() + ret +EPILOGUE() + +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/atom/bdiv_q_1.asm b/gmp-6.3.0/mpn/x86/atom/bdiv_q_1.asm new file mode 100644 index 0000000..31e908e --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/bdiv_q_1.asm @@ -0,0 +1,35 @@ +dnl Intel Atom mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel +dnl division by 1-limb divisor, returning quotient only. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) +include_mpn(`x86/pentium/bdiv_q_1.asm') diff --git a/gmp-6.3.0/mpn/x86/atom/cnd_add_n.asm b/gmp-6.3.0/mpn/x86/atom/cnd_add_n.asm new file mode 100644 index 0000000..50bf2ad --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/cnd_add_n.asm @@ -0,0 +1,113 @@ +dnl X86 mpn_cnd_add_n optimised for Intel Atom. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C P5 ? +C P6 model 0-8,10-12 ? +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) ? +C P4 model 0-1 (Willamette) ? +C P4 model 2 (Northwood) ? +C P4 model 3-4 (Prescott) ? +C Intel atom 4.67 +C AMD K6 ? +C AMD K7 ? +C AMD K8 ? + + +define(`rp', `%edi') +define(`up', `%esi') +define(`vp', `%ebp') +define(`n', `%ecx') +define(`cnd', `20(%esp)') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_cnd_add_n) + push %edi + push %esi + push %ebx + push %ebp + + mov cnd, %eax C make cnd into a mask (1) + mov 24(%esp), rp + neg %eax C make cnd into a mask (1) + mov 28(%esp), up + sbb %eax, %eax C make cnd into a mask (1) + mov 32(%esp), vp + mov %eax, cnd C make cnd into a mask (1) + mov 36(%esp), n + + xor %edx, %edx + + shr $1, n + jnc L(top) + + mov 0(vp), %eax + and cnd, %eax + lea 4(vp), vp + add 0(up), %eax + lea 4(rp), rp + lea 4(up), up + sbb %edx, %edx + mov %eax, -4(rp) + inc n + dec n + je L(end) + +L(top): sbb %edx, %edx + mov 0(vp), %eax + and cnd, %eax + lea 8(vp), vp + lea 8(rp), rp + mov -4(vp), %ebx + and cnd, %ebx + add %edx, %edx + adc 0(up), %eax + lea 8(up), up + mov %eax, -8(rp) + adc -4(up), %ebx + dec n + mov %ebx, -4(rp) + jne L(top) + +L(end): mov $0, %eax + adc %eax, %eax + + pop %ebp + pop %ebx + pop %esi + pop %edi + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/atom/cnd_sub_n.asm b/gmp-6.3.0/mpn/x86/atom/cnd_sub_n.asm new file mode 100644 index 0000000..221bedc --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/cnd_sub_n.asm @@ -0,0 +1,124 @@ +dnl X86 mpn_cnd_sub_n optimised for Intel Atom. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C P5 ? +C P6 model 0-8,10-12 ? +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) ? +C P4 model 0-1 (Willamette) ? +C P4 model 2 (Northwood) ? +C P4 model 3-4 (Prescott) ? +C Intel atom 5.67 +C AMD K6 ? +C AMD K7 ? +C AMD K8 ? 
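+
+C Both mpn_cnd_add_n and mpn_cnd_sub_n first widen the cnd argument into a
+C full-limb mask without branching: the neg/sbb pair below leaves all ones
+C when cnd != 0 and zero otherwise.  Each vp[] limb is then ANDed with that
+C mask, so the same instruction trace runs whether or not the operation
+C takes effect, which is what makes these entry points usable in
+C side-channel sensitive code.  A rough C model of the subtract case (an
+C illustrative sketch, not the GMP source):
+C
+C	mp_limb_t mask = -(mp_limb_t) (cnd != 0);
+C	mp_limb_t cy = 0;
+C	for (i = 0; i < n; i++)
+C	  {
+C	    mp_limb_t v = vp[i] & mask;	C masked operand: v or 0
+C	    rp[i] = up[i] - v - cy;
+C	    cy = (up[i] < v) | (up[i] - v < cy);	C borrow out
+C	  }
+C	return cy;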
+ + +define(`rp', `%edi') +define(`up', `%esi') +define(`vp', `%ebp') +define(`n', `%ecx') +define(`cnd', `20(%esp)') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_cnd_sub_n) + push %edi + push %esi + push %ebx + push %ebp + + mov cnd, %eax C make cnd into a mask (1) + mov 24(%esp), rp + neg %eax C make cnd into a mask (1) + mov 28(%esp), up + sbb %eax, %eax C make cnd into a mask (1) + mov 32(%esp), vp + mov %eax, cnd C make cnd into a mask (1) + mov 36(%esp), n + + xor %edx, %edx + + inc n + shr n + jnc L(ent) + + mov 0(vp), %eax + and cnd, %eax + lea 4(vp), vp + mov 0(up), %edx + sub %eax, %edx + lea 4(rp), rp + lea 4(up), up + mov %edx, -4(rp) + sbb %edx, %edx C save cy + +L(ent): mov 0(vp), %ebx + and cnd, %ebx + add %edx, %edx C restore cy + mov 0(up), %edx + dec n + je L(end) + +L(top): sbb %ebx, %edx + mov 4(vp), %eax + mov %edx, 0(rp) + sbb %edx, %edx C save cy + mov 8(vp), %ebx + lea 8(up), up + and cnd, %ebx + and cnd, %eax + add %edx, %edx C restore cy + mov -4(up), %edx + lea 8(rp), rp + sbb %eax, %edx + mov %edx, -4(rp) + dec n + mov 0(up), %edx + lea 8(vp), vp + jne L(top) + +L(end): sbb %ebx, %edx + mov %edx, 0(rp) + + mov $0, %eax + adc %eax, %eax + + pop %ebp + pop %ebx + pop %esi + pop %edi + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/atom/dive_1.asm b/gmp-6.3.0/mpn/x86/atom/dive_1.asm new file mode 100644 index 0000000..71036a1 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/dive_1.asm @@ -0,0 +1,34 @@ +dnl Intel Atom mpn_divexact_1 -- mpn by limb exact division. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_divexact_1) +include_mpn(`x86/pentium/dive_1.asm') diff --git a/gmp-6.3.0/mpn/x86/atom/gmp-mparam.h b/gmp-6.3.0/mpn/x86/atom/gmp-mparam.h new file mode 100644 index 0000000..e025bb7 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/gmp-mparam.h @@ -0,0 +1,214 @@ +/* Intel Atom/32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. 
+ +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 1600 MHz Diamondville (Atom 330) */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 5 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 11 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 17 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 72.60% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 35 + +#define DIV_1_VS_MUL_1_PERCENT 236 + +#define MUL_TOOM22_THRESHOLD 22 +#define MUL_TOOM33_THRESHOLD 81 +#define MUL_TOOM44_THRESHOLD 178 +#define MUL_TOOM6H_THRESHOLD 270 +#define MUL_TOOM8H_THRESHOLD 399 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 126 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 115 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 129 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 115 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 117 +#define SQR_TOOM4_THRESHOLD 178 +#define SQR_TOOM6_THRESHOLD 366 +#define SQR_TOOM8_THRESHOLD 527 + +#define MULMID_TOOM42_THRESHOLD 50 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 404 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 404, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 47, 8}, { 95,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 95, 9}, { 191,10}, { 111,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \ + { 143, 9}, { 287, 8}, { 575, 9}, { 303,10}, \ + { 159,11}, { 95,10}, { 191, 9}, { 383,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287, 9}, { 575,10}, \ + { 303,11}, { 159,10}, { 351, 9}, { 703,10}, \ + { 367, 9}, { 735,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 415,11}, { 223,10}, { 447,12}, \ + { 127,11}, { 255,10}, { 543,11}, { 287,10}, \ + { 607,11}, { 319,10}, { 671,11}, { 351,10}, \ + { 735,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,10}, { 831,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \ + { 319,11}, { 735,12}, { 383,11}, { 831,12}, \ + { 447,11}, { 959,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \ + { 1279,12}, { 703,11}, { 1407,13}, { 383,12}, \ + { 831,11}, { 1663,12}, { 959,14}, { 
255,13}, \ + { 511,12}, { 1215,13}, { 639,12}, { 1471,13}, \ + { 767,12}, { 1599,13}, { 895,12}, { 1791,14}, \ + { 511,13}, { 1023,12}, { 2111,13}, { 1151,12}, \ + { 2431,13}, { 1407,14}, { 767,13}, { 1663,12}, \ + { 3455,13}, { 1791,15}, { 511,14}, { 1023,13}, \ + { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \ + { 1535,13}, { 3455,14}, { 1791,13}, { 3839,15}, \ + { 1023,14}, { 2047,13}, { 4223,14}, { 2303,13}, \ + { 4991,12}, { 9983,14}, { 2815,13}, { 5887,15}, \ + { 1535,14}, { 3839,16} } +#define MUL_FFT_TABLE3_SIZE 158 +#define MUL_FFT_THRESHOLD 4544 + +#define SQR_FFT_MODF_THRESHOLD 368 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 368, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \ + { 13, 5}, { 27, 6}, { 25, 7}, { 13, 6}, \ + { 28, 7}, { 15, 6}, { 31, 7}, { 17, 6}, \ + { 35, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \ + { 47,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 135,10}, { 79, 9}, { 159, 8}, \ + { 319,10}, { 95, 9}, { 191,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511, 9}, { 271,10}, \ + { 143, 9}, { 287, 8}, { 575, 9}, { 303,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287, 9}, \ + { 575,10}, { 303, 9}, { 607,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \ + { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 415,11}, { 223,10}, { 447,12}, \ + { 127,11}, { 255,10}, { 543,11}, { 287,10}, \ + { 607,11}, { 319,10}, { 671,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,10}, { 831,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 735,12}, \ + { 383,11}, { 831,12}, { 447,11}, { 959,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,12}, { 639,11}, { 1343,12}, { 703,13}, \ + { 383,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1215,13}, { 639,12}, { 1471,13}, { 767,12}, \ + { 1599,13}, { 895,14}, { 511,13}, { 1023,12}, \ + { 2111,13}, { 1151,12}, { 2431,13}, { 1407,14}, \ + { 767,13}, { 1663,12}, { 3455,15}, { 511,14}, \ + { 1023,13}, { 2175,12}, { 4351,13}, { 2431,14}, \ + { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \ + { 3455,14}, { 1791,13}, { 3839,15}, { 1023,14}, \ + { 2047,13}, { 4351,14}, { 2303,13}, { 4991,12}, \ + { 9983,14}, { 2815,13}, { 5887,15}, { 1535,14}, \ + { 3839,16} } +#define SQR_FFT_TABLE3_SIZE 161 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 56 +#define MULLO_MUL_N_THRESHOLD 8907 +#define SQRLO_BASECASE_THRESHOLD 6 +#define SQRLO_DC_THRESHOLD 111 +#define SQRLO_SQR_THRESHOLD 6654 + +#define DC_DIV_QR_THRESHOLD 67 +#define DC_DIVAPPR_Q_THRESHOLD 252 +#define DC_BDIV_QR_THRESHOLD 63 +#define DC_BDIV_Q_THRESHOLD 172 + +#define INV_MULMOD_BNM1_THRESHOLD 42 +#define INV_NEWTON_THRESHOLD 250 +#define INV_APPR_THRESHOLD 250 + +#define BINV_NEWTON_THRESHOLD 276 +#define REDC_1_TO_REDC_N_THRESHOLD 68 + +#define MU_DIV_QR_THRESHOLD 1334 +#define MU_DIVAPPR_Q_THRESHOLD 1442 +#define MUPI_DIV_QR_THRESHOLD 116 +#define MU_BDIV_QR_THRESHOLD 1142 +#define MU_BDIV_Q_THRESHOLD 1341 + +#define POWM_SEC_TABLE 1,16,98,376,1259 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 23 +#define SET_STR_DC_THRESHOLD 298 +#define SET_STR_PRECOMPUTE_THRESHOLD 1037 + 
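+
+/* The generic mpn code compares operand sizes against these tuned cutoffs to
+   pick an algorithm.  As an illustrative sketch of the dispatch pattern (a
+   simplification, not the literal GMP multiply code):
+
+     if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
+       mpn_mul_basecase (wp, up, n, vp, n);
+     else if (BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))
+       mpn_toom22_mul (wp, up, n, vp, n, scratch);
+     else
+       ...  (Toom-3 and higher, then FFT, gated by the remaining cutoffs)
+*/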
+#define FAC_DSC_THRESHOLD 171 +#define FAC_ODD_THRESHOLD 34 + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD2_DIV1_METHOD 3 /* 3.71% faster than 1 */ +#define HGCD_THRESHOLD 128 +#define HGCD_APPR_THRESHOLD 186 +#define HGCD_REDUCE_THRESHOLD 2479 +#define GCD_DC_THRESHOLD 465 +#define GCDEXT_DC_THRESHOLD 339 +#define JACOBI_BASE_METHOD 3 /* 2.58% faster than 2 */ + +/* Tuneup completed successfully, took 214190 seconds */ diff --git a/gmp-6.3.0/mpn/x86/atom/logops_n.asm b/gmp-6.3.0/mpn/x86/atom/logops_n.asm new file mode 100644 index 0000000..3cb6d73 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/logops_n.asm @@ -0,0 +1,151 @@ +dnl Intel Atom mpn_and_n,...,mpn_xnor_n -- bitwise logical operations. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Marco Bodrato. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C op nop opn +C P5 +C P6 model 0-8,10-12 +C P6 model 9 (Banias) +C P6 model 13 (Dothan) +C P4 model 0 (Willamette) +C P4 model 1 (?) 
+C P4 model 2 (Northwood) +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C Intel Atom 3 3.5 3.5 +C AMD K6 +C AMD K7 +C AMD K8 +C AMD K10 + +define(M4_choose_op, +`ifdef(`OPERATION_$1',` +define(`M4_function', `mpn_$1') +define(`M4_want_pre', `$4') +define(`M4_inst', `$3') +define(`M4_want_post',`$2') +')') +define(M4pre, `ifelse(M4_want_pre, yes,`$1')') +define(M4post,`ifelse(M4_want_post,yes,`$1')') + +M4_choose_op( and_n, , andl, ) +M4_choose_op( andn_n, , andl, yes) +M4_choose_op( nand_n, yes, andl, ) +M4_choose_op( ior_n, , orl, ) +M4_choose_op( iorn_n, , orl, yes) +M4_choose_op( nior_n, yes, orl, ) +M4_choose_op( xor_n, , xorl, ) +M4_choose_op( xnor_n, yes, xorl, ) + +ifdef(`M4_function',, +`m4_error(`Unrecognised or undefined OPERATION symbol +')') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +C void M4_function (mp_ptr dst, mp_srcptr src2, mp_srcptr src1, mp_size_t size); +C + +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC1, 12) +defframe(PARAM_SRC2, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(SAVE_RP,`PARAM_SIZE') +define(SAVE_VP,`PARAM_SRC1') +define(SAVE_UP,`PARAM_DST') + +define(`rp', `%edi') +define(`up', `%esi') +define(`vp', `%ebx') +define(`cnt', `%eax') +define(`r1', `%ecx') +define(`r2', `%edx') + +ASM_START() + TEXT + ALIGN(16) +deflit(`FRAME',0) + +PROLOGUE(M4_function) + mov PARAM_SIZE, cnt C size + mov rp, SAVE_RP + mov PARAM_DST, rp + mov up, SAVE_UP + mov PARAM_SRC1, up + shr cnt C size >> 1 + mov vp, SAVE_VP + mov PARAM_SRC2, vp + mov (up), r1 + jz L(end) C size == 1 + jnc L(even) C size % 2 == 0 + + ALIGN(16) +L(oop): +M4pre(` notl_or_xorl_GMP_NUMB_MASK(r1)') + M4_inst (vp), r1 + lea 8(up), up + mov -4(up), r2 +M4post(` notl_or_xorl_GMP_NUMB_MASK(r1)') + lea 8(vp), vp + mov r1, (rp) +L(entry): +M4pre(` notl_or_xorl_GMP_NUMB_MASK(r2)') + M4_inst -4(vp), r2 + lea 8(rp), rp +M4post(` notl_or_xorl_GMP_NUMB_MASK(r2)') + dec cnt + mov (up), r1 + mov r2, -4(rp) + jnz L(oop) + +L(end): +M4pre(` notl_or_xorl_GMP_NUMB_MASK(r1)') + mov SAVE_UP, up + M4_inst (vp), r1 +M4post(`notl_or_xorl_GMP_NUMB_MASK(r1)') + mov SAVE_VP, vp + mov r1, (rp) + mov SAVE_RP, rp + ret + +L(even): + mov r1, r2 + lea 4(up), up + lea 4(vp), vp + lea -4(rp), rp + jmp L(entry) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/atom/lshift.asm b/gmp-6.3.0/mpn/x86/atom/lshift.asm new file mode 100644 index 0000000..f2c70dd --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/lshift.asm @@ -0,0 +1,218 @@ +dnl Intel Atom mpn_lshift -- mpn left shift. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned cnt);
+
+C                           cycles/limb
+C                       cnt!=1  cnt==1
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom      5       2.5
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CNT, 16)
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_UP,`PARAM_CNT')
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_EBP,`PARAM_DST')
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`cnt', `%ecx')
+
+ASM_START()
+	TEXT
+	ALIGN(8)
+deflit(`FRAME',0)
+PROLOGUE(mpn_lshift)
+	mov	PARAM_CNT, cnt
+	mov	PARAM_SIZE, %edx
+	mov	up, SAVE_UP
+	mov	PARAM_SRC, up
+	push	rp	FRAME_pushl()
+	mov	PARAM_DST, rp
+
+C We can use faster code for shift-by-1 under certain conditions.
+	cmp	$1,cnt
+	jne	L(normal)
+	cmpl	rp, up
+	jnc	L(special)		C jump if s_ptr + 1 >= res_ptr
+	leal	(up,%edx,4),%eax
+	cmpl	%eax,rp
+	jnc	L(special)		C jump if res_ptr >= s_ptr + size
+
+L(normal):
+	lea	-4(up,%edx,4), up
+	mov	%ebx, SAVE_EBX
+	lea	-4(rp,%edx,4), rp
+
+	shr	%edx
+	mov	(up), %eax
+	mov	%edx, VAR_COUNT
+	jnc	L(evn)
+
+	mov	%eax, %ebx
+	shl	%cl, %ebx
+	neg	cnt
+	shr	%cl, %eax
+	test	%edx, %edx
+	jnz	L(gt1)
+	mov	%ebx, (rp)
+	jmp	L(quit)
+
+L(gt1):	mov	%ebp, SAVE_EBP
+	push	%eax
+	mov	-4(up), %eax
+	mov	%eax, %ebp
+	shr	%cl, %eax
+	jmp	L(lo1)
+
+L(evn):	mov	%ebp, SAVE_EBP
+	neg	cnt
+	mov	%eax, %ebp
+	mov	-4(up), %edx
+	shr	%cl, %eax
+	mov	%edx, %ebx
+	shr	%cl, %edx
+	neg	cnt
+	decl	VAR_COUNT
+	lea	4(rp), rp
+	lea	-4(up), up
+	jz	L(end)
+	push	%eax	FRAME_pushl()
+
+	ALIGN(8)
+L(top):	shl	%cl, %ebp
+	or	%ebp, %edx
+	shl	%cl, %ebx
+	neg	cnt
+	mov	-4(up), %eax
+	mov	%eax, %ebp
+	mov	%edx, -4(rp)
+	shr	%cl, %eax
+	lea	-8(rp), rp
+L(lo1):	mov	-8(up), %edx
+	or	%ebx, %eax
+	mov	%edx, %ebx
+	shr	%cl, %edx
+	lea	-8(up), up
+	neg	cnt
+	mov	%eax, (rp)
+	decl	VAR_COUNT
+	jg	L(top)
+
+	pop	%eax	FRAME_popl()
+L(end):
+	shl	%cl, %ebp
+	shl	%cl, %ebx
+	or	%ebp, %edx
+	mov	SAVE_EBP, %ebp
+	mov	%edx, -4(rp)
+	mov	%ebx, -8(rp)
+
+L(quit):
+	mov	SAVE_UP, up
+	mov	SAVE_EBX, %ebx
+	pop	rp	FRAME_popl()
+	ret
+
+L(special):
+deflit(`FRAME',4)
+	lea	3(%edx), %eax		C size + 3
+	dec	%edx			C size - 1
+	mov	(up), %ecx
+	shr	$2, %eax		C (size + 3) / 4
+	and	$3, %edx		C (size - 1) % 4
+	jz	L(goloop)		C jump if size == 1 (mod 4)
+	shr	%edx
+	jnc	L(odd)			C jump if size == 3 (mod 4)
+
+	add	%ecx, %ecx
+	lea	4(up), up
+	mov	%ecx, (rp)
+	mov	(up), %ecx
+	lea	4(rp), rp
+
+	dec	%edx
+	jnz	L(goloop)		C jump if size == 0 (mod 4)
+L(odd):	lea	-8(up), up
+	lea	-8(rp), rp
+	jmp	L(sentry)		C reached if size == 2 or 3 (mod 4)
+
+L(sloop):
+	adc	%ecx, %ecx
+	mov	4(up), %edx
+	mov	%ecx, (rp)
+	adc	%edx, %edx
+	mov	8(up), %ecx
+	mov	%edx, 4(rp)
+L(sentry):
+	adc	%ecx, %ecx
+	mov	12(up), %edx
+	mov	%ecx, 8(rp)
+	adc	%edx, %edx
+	lea	16(up), up
+	mov	%edx, 12(rp)
+	lea	16(rp), rp
+	mov	(up), %ecx
+L(goloop):
+	decl	%eax
+	jnz	L(sloop)
+
+L(squit):
+	adc	%ecx, %ecx
+	mov	%ecx, (rp)
+	adc	%eax, %eax
+
+	mov	SAVE_UP, up
+	pop	rp	FRAME_popl()
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86/atom/lshiftc.asm b/gmp-6.3.0/mpn/x86/atom/lshiftc.asm
new file mode
100644 index 0000000..5be53ed --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/lshiftc.asm @@ -0,0 +1,159 @@ +dnl Intel Atom mpn_lshiftc -- mpn left shift with complement. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C mp_limb_t mpn_lshiftc (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned cnt); + +C cycles/limb +C P5 +C P6 model 0-8,10-12 +C P6 model 9 (Banias) +C P6 model 13 (Dothan) +C P4 model 0 (Willamette) +C P4 model 1 (?) +C P4 model 2 (Northwood) +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C Intel Atom 5.5 +C AMD K6 +C AMD K7 +C AMD K8 +C AMD K10 + +defframe(PARAM_CNT, 16) +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(SAVE_UP,`PARAM_CNT') +define(VAR_COUNT,`PARAM_SIZE') +define(SAVE_EBX,`PARAM_SRC') +define(SAVE_EBP,`PARAM_DST') + +define(`rp', `%edi') +define(`up', `%esi') +define(`cnt', `%ecx') + +ASM_START() + TEXT + +PROLOGUE(mpn_lshiftc) +deflit(`FRAME',0) + mov PARAM_CNT, cnt + mov PARAM_SIZE, %edx + mov up, SAVE_UP + mov PARAM_SRC, up + push rp FRAME_pushl() + mov PARAM_DST, rp + + lea -4(up,%edx,4), up + mov %ebx, SAVE_EBX + lea -4(rp,%edx,4), rp + + shr %edx + mov (up), %eax + mov %edx, VAR_COUNT + jnc L(evn) + + mov %eax, %ebx + shl %cl, %ebx + neg cnt + shr %cl, %eax + test %edx, %edx + jnz L(gt1) + not %ebx + mov %ebx, (rp) + jmp L(quit) + +L(gt1): mov %ebp, SAVE_EBP + push %eax + mov -4(up), %eax + mov %eax, %ebp + shr %cl, %eax + jmp L(lo1) + +L(evn): mov %ebp, SAVE_EBP + neg cnt + mov %eax, %ebp + mov -4(up), %edx + shr %cl, %eax + mov %edx, %ebx + shr %cl, %edx + neg cnt + decl VAR_COUNT + lea 4(rp), rp + lea -4(up), up + jz L(end) + push %eax FRAME_pushl() + +L(top): shl %cl, %ebp + or %ebp, %edx + shl %cl, %ebx + neg cnt + not %edx + mov -4(up), %eax + mov %eax, %ebp + mov %edx, -4(rp) + shr %cl, %eax + lea -8(rp), rp +L(lo1): mov -8(up), %edx + or %ebx, %eax + mov %edx, %ebx + shr %cl, %edx + not %eax + lea -8(up), up + neg cnt + mov %eax, (rp) + decl VAR_COUNT + jg L(top) + + pop %eax FRAME_popl() +L(end): + shl %cl, %ebp + shl %cl, %ebx + or %ebp, %edx + mov SAVE_EBP, %ebp + not %edx + not %ebx + mov %edx, -4(rp) + mov %ebx, -8(rp) + +L(quit): + mov SAVE_UP, up + mov SAVE_EBX, %ebx + pop rp FRAME_popl() + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/atom/mmx/copyd.asm b/gmp-6.3.0/mpn/x86/atom/mmx/copyd.asm new file mode 100644 index 0000000..b80fb03 
--- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/mmx/copyd.asm @@ -0,0 +1,34 @@ +dnl Intel Atom mpn_copyd -- copy limb vector, decrementing. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_copyd) +include_mpn(`x86/k7/mmx/copyd.asm') diff --git a/gmp-6.3.0/mpn/x86/atom/mmx/copyi.asm b/gmp-6.3.0/mpn/x86/atom/mmx/copyi.asm new file mode 100644 index 0000000..49b6b8d --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/mmx/copyi.asm @@ -0,0 +1,34 @@ +dnl Intel Atom mpn_copyi -- copy limb vector, incrementing. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_copyi) +include_mpn(`x86/k7/mmx/copyi.asm') diff --git a/gmp-6.3.0/mpn/x86/atom/mmx/hamdist.asm b/gmp-6.3.0/mpn/x86/atom/mmx/hamdist.asm new file mode 100644 index 0000000..3fe8253 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/mmx/hamdist.asm @@ -0,0 +1,34 @@ +dnl Intel Atom mpn_hamdist -- hamming distance. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_hamdist) +include_mpn(`x86/k7/mmx/popham.asm') diff --git a/gmp-6.3.0/mpn/x86/atom/mod_34lsub1.asm b/gmp-6.3.0/mpn/x86/atom/mod_34lsub1.asm new file mode 100644 index 0000000..6d57ba3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/mod_34lsub1.asm @@ -0,0 +1,34 @@ +dnl Intel Atom mpn_mod_34lsub1 -- remainder modulo 2^24-1. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_mod_34lsub1) +include_mpn(`x86/p6/mod_34lsub1.asm') diff --git a/gmp-6.3.0/mpn/x86/atom/mode1o.asm b/gmp-6.3.0/mpn/x86/atom/mode1o.asm new file mode 100644 index 0000000..c9ee6bd --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/mode1o.asm @@ -0,0 +1,34 @@ +dnl Intel Atom mpn_modexact_1_odd -- exact division style remainder. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_modexact_1_odd mpn_modexact_1c_odd) +include_mpn(`x86/pentium/mode1o.asm') diff --git a/gmp-6.3.0/mpn/x86/atom/rshift.asm b/gmp-6.3.0/mpn/x86/atom/rshift.asm new file mode 100644 index 0000000..1cb5dbe --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/rshift.asm @@ -0,0 +1,152 @@ +dnl Intel Atom mpn_rshift -- mpn right shift. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl Converted from AMD64 by Marco Bodrato. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned cnt); + +C cycles/limb +C P5 +C P6 model 0-8,10-12 +C P6 model 9 (Banias) +C P6 model 13 (Dothan) +C P4 model 0 (Willamette) +C P4 model 1 (?) 
+C P4 model 2 (Northwood) +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C Intel Atom 5 +C AMD K6 +C AMD K7 +C AMD K8 +C AMD K10 + +defframe(PARAM_CNT, 16) +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(SAVE_UP,`PARAM_CNT') +define(VAR_COUNT,`PARAM_SIZE') +define(SAVE_EBX,`PARAM_SRC') +define(SAVE_EBP,`PARAM_DST') + +define(`rp', `%edi') +define(`up', `%esi') +define(`cnt', `%ecx') + +ASM_START() + TEXT + ALIGN(8) +deflit(`FRAME',0) +PROLOGUE(mpn_rshift) + mov PARAM_CNT, cnt + mov PARAM_SIZE, %edx + mov up, SAVE_UP + mov PARAM_SRC, up + push rp FRAME_pushl() + mov PARAM_DST, rp + mov %ebx, SAVE_EBX + + shr %edx + mov (up), %eax + mov %edx, VAR_COUNT + jnc L(evn) + + mov %eax, %ebx + shr %cl, %ebx + neg cnt + shl %cl, %eax + test %edx, %edx + jnz L(gt1) + mov %ebx, (rp) + jmp L(quit) + +L(gt1): mov %ebp, SAVE_EBP + push %eax + mov 4(up), %eax + mov %eax, %ebp + shl %cl, %eax + jmp L(lo1) + +L(evn): mov %ebp, SAVE_EBP + neg cnt + mov %eax, %ebp + mov 4(up), %edx + shl %cl, %eax + mov %edx, %ebx + shl %cl, %edx + neg cnt + decl VAR_COUNT + lea -4(rp), rp + lea 4(up), up + jz L(end) + push %eax FRAME_pushl() + + ALIGN(8) +L(top): shr %cl, %ebp + or %ebp, %edx + shr %cl, %ebx + neg cnt + mov 4(up), %eax + mov %eax, %ebp + mov %edx, 4(rp) + shl %cl, %eax + lea 8(rp), rp +L(lo1): mov 8(up), %edx + or %ebx, %eax + mov %edx, %ebx + shl %cl, %edx + lea 8(up), up + neg cnt + mov %eax, (rp) + decl VAR_COUNT + jg L(top) + + pop %eax FRAME_popl() +L(end): + shr %cl, %ebp + shr %cl, %ebx + or %ebp, %edx + mov SAVE_EBP, %ebp + mov %edx, 4(rp) + mov %ebx, 8(rp) + +L(quit): + mov SAVE_UP, up + mov SAVE_EBX, %ebx + pop rp FRAME_popl() + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/atom/sse2/aorsmul_1.asm b/gmp-6.3.0/mpn/x86/atom/sse2/aorsmul_1.asm new file mode 100644 index 0000000..969a14a --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/sse2/aorsmul_1.asm @@ -0,0 +1,174 @@ +dnl x86-32 mpn_addmul_1 and mpn_submul_1 optimised for Intel Atom. + +dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C cycles/limb +C P5 - +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) +C P6 model 13 (Dothan) +C P4 model 0 (Willamette) +C P4 model 1 (?) 
+C P4 model 2 (Northwood) +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C Intel Atom 8 +C AMD K6 +C AMD K7 - +C AMD K8 +C AMD K10 + +define(`rp', `%edi') +define(`up', `%esi') +define(`n', `%ecx') + +ifdef(`OPERATION_addmul_1',` + define(ADDSUB, add) + define(func_1, mpn_addmul_1) + define(func_1c, mpn_addmul_1c)') +ifdef(`OPERATION_submul_1',` + define(ADDSUB, sub) + define(func_1, mpn_submul_1) + define(func_1c, mpn_submul_1c)') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) + + TEXT + ALIGN(16) +PROLOGUE(func_1) + xor %edx, %edx +L(ent): push %edi + push %esi + push %ebx + mov 16(%esp), rp + mov 20(%esp), up + mov 24(%esp), n + movd 28(%esp), %mm7 + test $1, n + jz L(fi0or2) + movd (up), %mm0 + pmuludq %mm7, %mm0 + shr $2, n + jnc L(fi1) + +L(fi3): lea -8(up), up + lea -8(rp), rp + movd 12(up), %mm1 + movd %mm0, %ebx + pmuludq %mm7, %mm1 + add $1, n C increment and clear carry + jmp L(lo3) + +L(fi1): movd %mm0, %ebx + jz L(wd1) + movd 4(up), %mm1 + pmuludq %mm7, %mm1 + jmp L(lo1) + +L(fi0or2): + movd (up), %mm1 + pmuludq %mm7, %mm1 + shr $2, n + movd 4(up), %mm0 + jc L(fi2) + lea -4(up), up + lea -4(rp), rp + movd %mm1, %eax + pmuludq %mm7, %mm0 + jmp L(lo0) + +L(fi2): lea 4(up), up + add $1, n C increment and clear carry + movd %mm1, %eax + lea -12(rp), rp + jmp L(lo2) + +C ALIGN(16) C alignment seems irrelevant +L(top): movd 4(up), %mm1 + adc $0, %edx + ADDSUB %eax, 12(rp) + movd %mm0, %ebx + pmuludq %mm7, %mm1 + lea 16(rp), rp +L(lo1): psrlq $32, %mm0 + adc %edx, %ebx + movd %mm0, %edx + movd %mm1, %eax + movd 8(up), %mm0 + pmuludq %mm7, %mm0 + adc $0, %edx + ADDSUB %ebx, (rp) +L(lo0): psrlq $32, %mm1 + adc %edx, %eax + movd %mm1, %edx + movd %mm0, %ebx + movd 12(up), %mm1 + pmuludq %mm7, %mm1 + adc $0, %edx + ADDSUB %eax, 4(rp) +L(lo3): psrlq $32, %mm0 + adc %edx, %ebx + movd %mm0, %edx + movd %mm1, %eax + lea 16(up), up + movd (up), %mm0 + adc $0, %edx + ADDSUB %ebx, 8(rp) +L(lo2): psrlq $32, %mm1 + adc %edx, %eax + movd %mm1, %edx + pmuludq %mm7, %mm0 + dec n + jnz L(top) + +L(end): adc n, %edx C n is zero here + ADDSUB %eax, 12(rp) + movd %mm0, %ebx + lea 16(rp), rp +L(wd1): psrlq $32, %mm0 + adc %edx, %ebx + movd %mm0, %eax + adc n, %eax + ADDSUB %ebx, (rp) + emms + adc n, %eax + pop %ebx + pop %esi + pop %edi + ret +EPILOGUE() +PROLOGUE(func_1c) + mov 20(%esp), %edx C carry + jmp L(ent) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/atom/sse2/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/x86/atom/sse2/bdiv_dbm1c.asm new file mode 100644 index 0000000..782e914 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/sse2/bdiv_dbm1c.asm @@ -0,0 +1,34 @@ +dnl Intel Atom mpn_bdiv_dbm1. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_bdiv_dbm1c) +include_mpn(`x86/pentium4/sse2/bdiv_dbm1c.asm') diff --git a/gmp-6.3.0/mpn/x86/atom/sse2/divrem_1.asm b/gmp-6.3.0/mpn/x86/atom/sse2/divrem_1.asm new file mode 100644 index 0000000..f84709a --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/sse2/divrem_1.asm @@ -0,0 +1,34 @@ +dnl Intel Atom mpn_divrem_1 -- mpn by limb division. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_preinv_divrem_1 mpn_divrem_1c mpn_divrem_1) +include_mpn(`x86/pentium4/sse2/divrem_1.asm') diff --git a/gmp-6.3.0/mpn/x86/atom/sse2/mod_1_1.asm b/gmp-6.3.0/mpn/x86/atom/sse2/mod_1_1.asm new file mode 100644 index 0000000..ae6581d --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/sse2/mod_1_1.asm @@ -0,0 +1,34 @@ +dnl Intel Atom/SSE2 mpn_mod_1_1. + +dnl Copyright 2009, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_mod_1_1p) +include_mpn(`x86/pentium4/sse2/mod_1_1.asm') diff --git a/gmp-6.3.0/mpn/x86/atom/sse2/mod_1_4.asm b/gmp-6.3.0/mpn/x86/atom/sse2/mod_1_4.asm new file mode 100644 index 0000000..31faa3f --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/sse2/mod_1_4.asm @@ -0,0 +1,34 @@ +dnl Intel Atom/SSE2 mpn_mod_1_4. 
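+dnl The mpn_mod_1s_4p entry point reduces up[] modulo a single limb by
+dnl folding four limbs per iteration against precomputed values B^i mod d
+dnl (B = 2^32), roughly r = c0 + c1*B1 + c2*B2 + c3*B3 per pass; the Atom
+dnl build simply reuses the Pentium-4 SSE2 implementation pulled in below.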
+ +dnl Copyright 2009, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_mod_1s_4p) +include_mpn(`x86/pentium4/sse2/mod_1_4.asm') diff --git a/gmp-6.3.0/mpn/x86/atom/sse2/mul_1.asm b/gmp-6.3.0/mpn/x86/atom/sse2/mul_1.asm new file mode 100644 index 0000000..aa3bb97 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/sse2/mul_1.asm @@ -0,0 +1,124 @@ +dnl Intel Atom mpn_mul_1. + +dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C cycles/limb +C P5 - +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) +C P6 model 13 (Dothan) +C P4 model 0 (Willamette) +C P4 model 1 (?) 
+C P4 model 2 (Northwood) +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C Intel Atom 7.5 +C AMD K6 - +C AMD K7 - +C AMD K8 +C AMD K10 + +defframe(PARAM_CARRY,20) +defframe(PARAM_MUL, 16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +define(`rp', `%edx') +define(`up', `%esi') +define(`n', `%ecx') + +ASM_START() + TEXT + ALIGN(16) +deflit(`FRAME',0) + +PROLOGUE(mpn_mul_1c) + movd PARAM_CARRY, %mm6 C carry + jmp L(ent) +EPILOGUE() + + ALIGN(8) C for compact code +PROLOGUE(mpn_mul_1) + pxor %mm6, %mm6 +L(ent): push %esi FRAME_pushl() + mov PARAM_SRC, up + mov PARAM_SIZE, %eax C size + movd PARAM_MUL, %mm7 + movd (up), %mm0 + mov %eax, n + and $3, %eax + pmuludq %mm7, %mm0 + mov PARAM_DST, rp + jz L(lo0) + cmp $2, %eax + lea -16(up,%eax,4),up + lea -16(rp,%eax,4),rp + jc L(lo1) + jz L(lo2) + jmp L(lo3) + + ALIGN(16) +L(top): movd (up), %mm0 + pmuludq %mm7, %mm0 + psrlq $32, %mm6 + lea 16(rp), rp +L(lo0): paddq %mm0, %mm6 + movd 4(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, (rp) + psrlq $32, %mm6 +L(lo3): paddq %mm0, %mm6 + movd 8(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, 4(rp) + psrlq $32, %mm6 +L(lo2): paddq %mm0, %mm6 + movd 12(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, 8(rp) + psrlq $32, %mm6 +L(lo1): paddq %mm0, %mm6 + sub $4, n + movd %mm6, 12(rp) + lea 16(up), up + ja L(top) + + psrlq $32, %mm6 + movd %mm6, %eax + emms + pop %esi FRAME_popl() + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/atom/sse2/mul_basecase.asm b/gmp-6.3.0/mpn/x86/atom/sse2/mul_basecase.asm new file mode 100644 index 0000000..97d3aeb --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/sse2/mul_basecase.asm @@ -0,0 +1,501 @@ +dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result in +dnl a third limb vector. + +dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO +C * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the +C 4 large loops into one; we could use it for the outer loop branch. +C * Optimise code outside of inner loops. +C * Write combined addmul_1 feed-in and wind-down code, and use when iterating +C each outer loop. ("Overlapping software pipelining") +C * Postpone push of ebx until we know vn > 1. Perhaps use caller-saves regs +C for inlined mul_1, allowing us to postpone all pushes. +C * Perhaps write special code for vn <= un < M, for some small M.
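+
+C An editorial aside, not upstream GMP text: the result computed here can be
+C summarised by a hedged C sketch built from the public mpn primitives
+C (names mirror the defines below; the conventional basecase requirement
+C un >= vn >= 1 is assumed):
+C
+C	void ref_mul_basecase (mp_ptr wp, mp_srcptr up, mp_size_t un,
+C			       mp_srcptr vp, mp_size_t vn)
+C	{
+C	  wp[un] = mpn_mul_1 (wp, up, un, vp[0]);	/* first row */
+C	  for (mp_size_t i = 1; i < vn; i++)		/* add one row per v limb */
+C	    wp[un + i] = mpn_addmul_1 (wp + i, up, un, vp[i]);
+C	}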
+ +C void mpn_mul_basecase (mp_ptr wp, +C mp_srcptr xp, mp_size_t xn, +C mp_srcptr yp, mp_size_t yn); +C + +define(`rp', `%edi') +define(`up', `%esi') +define(`un', `%ecx') +define(`vp', `%ebp') +define(`vn', `36(%esp)') + + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + push %edi + push %esi + push %ebx + push %ebp + mov 20(%esp), rp + mov 24(%esp), up + mov 28(%esp), un + mov 32(%esp), vp + + movd (up), %mm0 + movd (vp), %mm7 + pmuludq %mm7, %mm0 + pxor %mm6, %mm6 + + mov un, %eax + and $3, %eax + jz L(of0) + cmp $2, %eax + jc L(of1) + jz L(of2) + +C ================================================================ + jmp L(m3) + ALIGN(16) +L(lm3): movd -4(up), %mm0 + pmuludq %mm7, %mm0 + psrlq $32, %mm6 + lea 16(rp), rp + paddq %mm0, %mm6 + movd (up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, -4(rp) + psrlq $32, %mm6 +L(m3): paddq %mm0, %mm6 + movd 4(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, (rp) + psrlq $32, %mm6 + paddq %mm0, %mm6 + movd 8(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, 4(rp) + psrlq $32, %mm6 + paddq %mm0, %mm6 + sub $4, un + movd %mm6, 8(rp) + lea 16(up), up + ja L(lm3) + + psrlq $32, %mm6 + movd %mm6, 12(rp) + + decl vn + jz L(done) + lea -8(rp), rp + +L(ol3): mov 28(%esp), un + neg un + lea 4(vp), vp + movd (vp), %mm7 C read next V limb + mov 24(%esp), up + lea 16(rp,un,4), rp + + movd (up), %mm0 + pmuludq %mm7, %mm0 + sar $2, un + movd 4(up), %mm1 + movd %mm0, %ebx + pmuludq %mm7, %mm1 + lea -8(up), up + xor %edx, %edx C zero edx and CF + jmp L(a3) + +L(la3): movd 4(up), %mm1 + adc $0, %edx + add %eax, 12(rp) + movd %mm0, %ebx + pmuludq %mm7, %mm1 + lea 16(rp), rp + psrlq $32, %mm0 + adc %edx, %ebx + movd %mm0, %edx + movd %mm1, %eax + movd 8(up), %mm0 + pmuludq %mm7, %mm0 + adc $0, %edx + add %ebx, (rp) + psrlq $32, %mm1 + adc %edx, %eax + movd %mm1, %edx + movd %mm0, %ebx + movd 12(up), %mm1 + pmuludq %mm7, %mm1 + adc $0, %edx + add %eax, 4(rp) +L(a3): psrlq $32, %mm0 + adc %edx, %ebx + movd %mm0, %edx + movd %mm1, %eax + lea 16(up), up + movd (up), %mm0 + adc $0, %edx + add %ebx, 8(rp) + psrlq $32, %mm1 + adc %edx, %eax + movd %mm1, %edx + pmuludq %mm7, %mm0 + inc un + jnz L(la3) + + adc un, %edx C un is zero here + add %eax, 12(rp) + movd %mm0, %ebx + psrlq $32, %mm0 + adc %edx, %ebx + movd %mm0, %eax + adc un, %eax + add %ebx, 16(rp) + adc un, %eax + mov %eax, 20(rp) + + decl vn + jnz L(ol3) + jmp L(done) + +C ================================================================ + ALIGN(16) +L(lm0): movd (up), %mm0 + pmuludq %mm7, %mm0 + psrlq $32, %mm6 + lea 16(rp), rp +L(of0): paddq %mm0, %mm6 + movd 4(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, (rp) + psrlq $32, %mm6 + paddq %mm0, %mm6 + movd 8(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, 4(rp) + psrlq $32, %mm6 + paddq %mm0, %mm6 + movd 12(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, 8(rp) + psrlq $32, %mm6 + paddq %mm0, %mm6 + sub $4, un + movd %mm6, 12(rp) + lea 16(up), up + ja L(lm0) + + psrlq $32, %mm6 + movd %mm6, 16(rp) + + decl vn + jz L(done) + lea -4(rp), rp + +L(ol0): mov 28(%esp), un + neg un + lea 4(vp), vp + movd (vp), %mm7 C read next V limb + mov 24(%esp), up + lea 20(rp,un,4), rp + + movd (up), %mm1 + pmuludq %mm7, %mm1 + sar $2, un + movd 4(up), %mm0 + lea -4(up), up + movd %mm1, %eax + pmuludq %mm7, %mm0 + xor %edx, %edx C zero edx and CF + jmp L(a0) + +L(la0): movd 4(up), %mm1 + adc $0, %edx + add %eax, 12(rp) + movd %mm0, %ebx + pmuludq %mm7, %mm1 + lea 16(rp), rp + psrlq $32, %mm0 + adc %edx, %ebx + movd %mm0, %edx + movd %mm1, %eax + movd 8(up), %mm0 + pmuludq %mm7, %mm0 + adc $0, %edx + add 
%ebx, (rp) +L(a0): psrlq $32, %mm1 + adc %edx, %eax + movd %mm1, %edx + movd %mm0, %ebx + movd 12(up), %mm1 + pmuludq %mm7, %mm1 + adc $0, %edx + add %eax, 4(rp) + psrlq $32, %mm0 + adc %edx, %ebx + movd %mm0, %edx + movd %mm1, %eax + lea 16(up), up + movd (up), %mm0 + adc $0, %edx + add %ebx, 8(rp) + psrlq $32, %mm1 + adc %edx, %eax + movd %mm1, %edx + pmuludq %mm7, %mm0 + inc un + jnz L(la0) + + adc un, %edx C un is zero here + add %eax, 12(rp) + movd %mm0, %ebx + psrlq $32, %mm0 + adc %edx, %ebx + movd %mm0, %eax + adc un, %eax + add %ebx, 16(rp) + adc un, %eax + mov %eax, 20(rp) + + decl vn + jnz L(ol0) + jmp L(done) + +C ================================================================ + ALIGN(16) +L(lm1): movd -12(up), %mm0 + pmuludq %mm7, %mm0 + psrlq $32, %mm6 + lea 16(rp), rp + paddq %mm0, %mm6 + movd -8(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, -12(rp) + psrlq $32, %mm6 + paddq %mm0, %mm6 + movd -4(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, -8(rp) + psrlq $32, %mm6 + paddq %mm0, %mm6 + movd (up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, -4(rp) + psrlq $32, %mm6 +L(of1): paddq %mm0, %mm6 + sub $4, un + movd %mm6, (rp) + lea 16(up), up + ja L(lm1) + + psrlq $32, %mm6 + movd %mm6, 4(rp) + + decl vn + jz L(done) + lea -16(rp), rp + +L(ol1): mov 28(%esp), un + neg un + lea 4(vp), vp + movd (vp), %mm7 C read next V limb + mov 24(%esp), up + lea 24(rp,un,4), rp + + movd (up), %mm0 + pmuludq %mm7, %mm0 + sar $2, un + movd %mm0, %ebx + movd 4(up), %mm1 + pmuludq %mm7, %mm1 + xor %edx, %edx C zero edx and CF + inc un + jmp L(a1) + +L(la1): movd 4(up), %mm1 + adc $0, %edx + add %eax, 12(rp) + movd %mm0, %ebx + pmuludq %mm7, %mm1 + lea 16(rp), rp +L(a1): psrlq $32, %mm0 + adc %edx, %ebx + movd %mm0, %edx + movd %mm1, %eax + movd 8(up), %mm0 + pmuludq %mm7, %mm0 + adc $0, %edx + add %ebx, (rp) + psrlq $32, %mm1 + adc %edx, %eax + movd %mm1, %edx + movd %mm0, %ebx + movd 12(up), %mm1 + pmuludq %mm7, %mm1 + adc $0, %edx + add %eax, 4(rp) + psrlq $32, %mm0 + adc %edx, %ebx + movd %mm0, %edx + movd %mm1, %eax + lea 16(up), up + movd (up), %mm0 + adc $0, %edx + add %ebx, 8(rp) + psrlq $32, %mm1 + adc %edx, %eax + movd %mm1, %edx + pmuludq %mm7, %mm0 + inc un + jnz L(la1) + + adc un, %edx C un is zero here + add %eax, 12(rp) + movd %mm0, %ebx + psrlq $32, %mm0 + adc %edx, %ebx + movd %mm0, %eax + adc un, %eax + add %ebx, 16(rp) + adc un, %eax + mov %eax, 20(rp) + + decl vn + jnz L(ol1) + jmp L(done) + +C ================================================================ + ALIGN(16) +L(lm2): movd -8(up), %mm0 + pmuludq %mm7, %mm0 + psrlq $32, %mm6 + lea 16(rp), rp + paddq %mm0, %mm6 + movd -4(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, -8(rp) + psrlq $32, %mm6 + paddq %mm0, %mm6 + movd (up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, -4(rp) + psrlq $32, %mm6 +L(of2): paddq %mm0, %mm6 + movd 4(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, (rp) + psrlq $32, %mm6 + paddq %mm0, %mm6 + sub $4, un + movd %mm6, 4(rp) + lea 16(up), up + ja L(lm2) + + psrlq $32, %mm6 + movd %mm6, 8(rp) + + decl vn + jz L(done) + lea -12(rp), rp + +L(ol2): mov 28(%esp), un + neg un + lea 4(vp), vp + movd (vp), %mm7 C read next V limb + mov 24(%esp), up + lea 12(rp,un,4), rp + + movd (up), %mm1 + pmuludq %mm7, %mm1 + sar $2, un + movd 4(up), %mm0 + lea 4(up), up + movd %mm1, %eax + xor %edx, %edx C zero edx and CF + jmp L(lo2) + +L(la2): movd 4(up), %mm1 + adc $0, %edx + add %eax, 12(rp) + movd %mm0, %ebx + pmuludq %mm7, %mm1 + lea 16(rp), rp + psrlq $32, %mm0 + adc %edx, %ebx + movd %mm0, %edx + movd %mm1, %eax + movd 8(up), %mm0 + 
pmuludq %mm7, %mm0 + adc $0, %edx + add %ebx, (rp) + psrlq $32, %mm1 + adc %edx, %eax + movd %mm1, %edx + movd %mm0, %ebx + movd 12(up), %mm1 + pmuludq %mm7, %mm1 + adc $0, %edx + add %eax, 4(rp) + psrlq $32, %mm0 + adc %edx, %ebx + movd %mm0, %edx + movd %mm1, %eax + lea 16(up), up + movd (up), %mm0 + adc $0, %edx + add %ebx, 8(rp) +L(lo2): psrlq $32, %mm1 + adc %edx, %eax + movd %mm1, %edx + pmuludq %mm7, %mm0 + inc un + jnz L(la2) + + adc un, %edx C un is zero here + add %eax, 12(rp) + movd %mm0, %ebx + psrlq $32, %mm0 + adc %edx, %ebx + movd %mm0, %eax + adc un, %eax + add %ebx, 16(rp) + adc un, %eax + mov %eax, 20(rp) + + decl vn + jnz L(ol2) +C jmp L(done) + +C ================================================================ +L(done): + emms + pop %ebp + pop %ebx + pop %esi + pop %edi + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/atom/sse2/popcount.asm b/gmp-6.3.0/mpn/x86/atom/sse2/popcount.asm new file mode 100644 index 0000000..7847aec --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/sse2/popcount.asm @@ -0,0 +1,35 @@ +dnl Intel Atom mpn_popcount -- population count. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +MULFUNC_PROLOGUE(mpn_popcount) +include_mpn(`x86/pentium4/sse2/popcount.asm') diff --git a/gmp-6.3.0/mpn/x86/atom/sse2/sqr_basecase.asm b/gmp-6.3.0/mpn/x86/atom/sse2/sqr_basecase.asm new file mode 100644 index 0000000..af19ed8 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/sse2/sqr_basecase.asm @@ -0,0 +1,634 @@ +dnl x86 mpn_sqr_basecase -- square an mpn number, optimised for atom. + +dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO +C * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the +C 4 large loops into one; we could use it for the outer loop branch. +C * Optimise code outside of inner loops. +C * Write combined addmul_1 feed-in and wind-down code, and use when iterating +C each outer loop. ("Overlapping software pipelining") +C * Perhaps use caller-saves regs for inlined mul_1, allowing us to postpone +C all pushes. +C * Perhaps write special code for n < M, for some small M. +C * Replace inlined addmul_1 with smaller code from aorsmul_1.asm, or perhaps +C with even less pipelined code. +C * We run the outer loop until we have a 2-limb by 1-limb addmul_1 left. +C Consider breaking out earlier, saving the high cost of short loops. + +C void mpn_sqr_basecase (mp_ptr wp, +C mp_srcptr xp, mp_size_t xn); + +define(`rp', `%edi') +define(`up', `%esi') +define(`n', `%ecx') + +define(`un', `%ebp') + + TEXT + ALIGN(16) +PROLOGUE(mpn_sqr_basecase) + push %edi + push %esi + mov 12(%esp), rp + mov 16(%esp), up + mov 20(%esp), n + + lea 4(rp), rp C write triangular product starting at rp[1] + dec n + movd (up), %mm7 + + jz L(one) + lea 4(up), up + push %ebx + push %ebp + mov n, %eax + + movd (up), %mm0 + neg n + pmuludq %mm7, %mm0 + pxor %mm6, %mm6 + mov n, un + + and $3, %eax + jz L(of0) + cmp $2, %eax + jc L(of1) + jz L(of2) + +C ================================================================ + jmp L(m3) + ALIGN(16) +L(lm3): movd -4(up), %mm0 + pmuludq %mm7, %mm0 + psrlq $32, %mm6 + lea 16(rp), rp + paddq %mm0, %mm6 + movd (up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, -4(rp) + psrlq $32, %mm6 +L(m3): paddq %mm0, %mm6 + movd 4(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, (rp) + psrlq $32, %mm6 + paddq %mm0, %mm6 + movd 8(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, 4(rp) + psrlq $32, %mm6 + paddq %mm0, %mm6 + add $4, un + movd %mm6, 8(rp) + lea 16(up), up + js L(lm3) + + psrlq $32, %mm6 + movd %mm6, 12(rp) + + inc n +C jz L(done) + lea -12(up), up + lea 4(rp), rp + jmp L(ol2) + +C ================================================================ + ALIGN(16) +L(lm0): movd (up), %mm0 + pmuludq %mm7, %mm0 + psrlq $32, %mm6 + lea 16(rp), rp +L(of0): paddq %mm0, %mm6 + movd 4(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, (rp) + psrlq $32, %mm6 + paddq %mm0, %mm6 + movd 8(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, 4(rp) + psrlq $32, %mm6 + paddq %mm0, %mm6 + movd 12(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, 8(rp) + psrlq $32, %mm6 + paddq %mm0, %mm6 + add $4, un + movd %mm6, 12(rp) + lea 16(up), up + js L(lm0) + + psrlq $32, %mm6 + movd %mm6, 16(rp) + + inc n +C jz L(done) + lea -8(up), up + lea 8(rp), rp + jmp L(ol3) + +C ================================================================ + ALIGN(16) +L(lm1): movd -12(up), %mm0 + pmuludq %mm7, %mm0 + psrlq $32, %mm6 + lea 16(rp), rp + paddq %mm0, %mm6 + movd -8(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, -12(rp) + psrlq $32, %mm6 + paddq %mm0, %mm6 + movd -4(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, -8(rp) + psrlq $32, %mm6 + paddq %mm0, %mm6 + movd (up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, -4(rp) + psrlq $32, %mm6 +L(of1): paddq %mm0, %mm6 + add $4, un + movd %mm6, (rp) + lea 16(up), up + js L(lm1) + + psrlq $32, %mm6 + movd %mm6, 4(rp) + + inc n + jz L(done) C 
goes away when we add special n=2 code + lea -20(up), up + lea -4(rp), rp + jmp L(ol0) + +C ================================================================ + ALIGN(16) +L(lm2): movd -8(up), %mm0 + pmuludq %mm7, %mm0 + psrlq $32, %mm6 + lea 16(rp), rp + paddq %mm0, %mm6 + movd -4(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, -8(rp) + psrlq $32, %mm6 + paddq %mm0, %mm6 + movd (up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, -4(rp) + psrlq $32, %mm6 +L(of2): paddq %mm0, %mm6 + movd 4(up), %mm0 + pmuludq %mm7, %mm0 + movd %mm6, (rp) + psrlq $32, %mm6 + paddq %mm0, %mm6 + add $4, un + movd %mm6, 4(rp) + lea 16(up), up + js L(lm2) + + psrlq $32, %mm6 + movd %mm6, 8(rp) + + inc n +C jz L(done) + lea -16(up), up +C lea (rp), rp +C jmp L(ol1) + +C ================================================================ + +L(ol1): lea 4(up,n,4), up + movd (up), %mm7 C read next U invariant limb + lea 8(rp,n,4), rp + mov n, un + + movd 4(up), %mm1 + pmuludq %mm7, %mm1 + sar $2, un + movd %mm1, %ebx + inc un + jz L(re1) + + movd 8(up), %mm0 + pmuludq %mm7, %mm0 + xor %edx, %edx C zero edx and CF + jmp L(a1) + +L(la1): adc $0, %edx + add %ebx, 12(rp) + movd %mm0, %eax + pmuludq %mm7, %mm1 + lea 16(rp), rp + psrlq $32, %mm0 + adc %edx, %eax + movd %mm0, %edx + movd %mm1, %ebx + movd 8(up), %mm0 + pmuludq %mm7, %mm0 + adc $0, %edx + add %eax, (rp) +L(a1): psrlq $32, %mm1 + adc %edx, %ebx + movd %mm1, %edx + movd %mm0, %eax + movd 12(up), %mm1 + pmuludq %mm7, %mm1 + adc $0, %edx + add %ebx, 4(rp) + psrlq $32, %mm0 + adc %edx, %eax + movd %mm0, %edx + movd %mm1, %ebx + lea 16(up), up + movd (up), %mm0 + adc $0, %edx + add %eax, 8(rp) + psrlq $32, %mm1 + adc %edx, %ebx + movd %mm1, %edx + pmuludq %mm7, %mm0 + inc un + movd 4(up), %mm1 + jnz L(la1) + + adc un, %edx C un is zero here + add %ebx, 12(rp) + movd %mm0, %eax + pmuludq %mm7, %mm1 + lea 16(rp), rp + psrlq $32, %mm0 + adc %edx, %eax + movd %mm0, %edx + movd %mm1, %ebx + adc un, %edx + add %eax, (rp) + psrlq $32, %mm1 + adc %edx, %ebx + movd %mm1, %eax + adc un, %eax + add %ebx, 4(rp) + adc un, %eax + mov %eax, 8(rp) + + inc n + +C ================================================================ + +L(ol0): lea (up,n,4), up + movd 4(up), %mm7 C read next U invariant limb + lea 4(rp,n,4), rp + mov n, un + + movd 8(up), %mm0 + pmuludq %mm7, %mm0 + sar $2, un + movd 12(up), %mm1 + movd %mm0, %eax + pmuludq %mm7, %mm1 + xor %edx, %edx C zero edx and CF + jmp L(a0) + +L(la0): adc $0, %edx + add %ebx, 12(rp) + movd %mm0, %eax + pmuludq %mm7, %mm1 + lea 16(rp), rp + psrlq $32, %mm0 + adc %edx, %eax + movd %mm0, %edx + movd %mm1, %ebx + movd 8(up), %mm0 + pmuludq %mm7, %mm0 + adc $0, %edx + add %eax, (rp) + psrlq $32, %mm1 + adc %edx, %ebx + movd %mm1, %edx + movd %mm0, %eax + movd 12(up), %mm1 + pmuludq %mm7, %mm1 + adc $0, %edx + add %ebx, 4(rp) +L(a0): psrlq $32, %mm0 + adc %edx, %eax + movd %mm0, %edx + movd %mm1, %ebx + lea 16(up), up + movd (up), %mm0 + adc $0, %edx + add %eax, 8(rp) + psrlq $32, %mm1 + adc %edx, %ebx + movd %mm1, %edx + pmuludq %mm7, %mm0 + inc un + movd 4(up), %mm1 + jnz L(la0) + + adc un, %edx C un is zero here + add %ebx, 12(rp) + movd %mm0, %eax + pmuludq %mm7, %mm1 + lea 16(rp), rp + psrlq $32, %mm0 + adc %edx, %eax + movd %mm0, %edx + movd %mm1, %ebx + adc un, %edx + add %eax, (rp) + psrlq $32, %mm1 + adc %edx, %ebx + movd %mm1, %eax + adc un, %eax + add %ebx, 4(rp) + adc un, %eax + mov %eax, 8(rp) + + inc n + +C ================================================================ + +L(ol3): lea 12(up,n,4), up + movd -8(up), %mm7 C read next U 
invariant limb + lea (rp,n,4), rp C put rp back + mov n, un + + movd -4(up), %mm1 + pmuludq %mm7, %mm1 + sar $2, un + movd %mm1, %ebx + movd (up), %mm0 + xor %edx, %edx C zero edx and CF + jmp L(a3) + +L(la3): adc $0, %edx + add %ebx, 12(rp) + movd %mm0, %eax + pmuludq %mm7, %mm1 + lea 16(rp), rp + psrlq $32, %mm0 + adc %edx, %eax + movd %mm0, %edx + movd %mm1, %ebx + movd 8(up), %mm0 + pmuludq %mm7, %mm0 + adc $0, %edx + add %eax, (rp) + psrlq $32, %mm1 + adc %edx, %ebx + movd %mm1, %edx + movd %mm0, %eax + movd 12(up), %mm1 + pmuludq %mm7, %mm1 + adc $0, %edx + add %ebx, 4(rp) + psrlq $32, %mm0 + adc %edx, %eax + movd %mm0, %edx + movd %mm1, %ebx + lea 16(up), up + movd (up), %mm0 + adc $0, %edx + add %eax, 8(rp) +L(a3): psrlq $32, %mm1 + adc %edx, %ebx + movd %mm1, %edx + pmuludq %mm7, %mm0 + inc un + movd 4(up), %mm1 + jnz L(la3) + + adc un, %edx C un is zero here + add %ebx, 12(rp) + movd %mm0, %eax + pmuludq %mm7, %mm1 + lea 16(rp), rp + psrlq $32, %mm0 + adc %edx, %eax + movd %mm0, %edx + movd %mm1, %ebx + adc un, %edx + add %eax, (rp) + psrlq $32, %mm1 + adc %edx, %ebx + movd %mm1, %eax + adc un, %eax + add %ebx, 4(rp) + adc un, %eax + mov %eax, 8(rp) + + inc n + +C ================================================================ + +L(ol2): lea 8(up,n,4), up + movd -4(up), %mm7 C read next U invariant limb + lea 12(rp,n,4), rp + mov n, un + + movd (up), %mm0 + pmuludq %mm7, %mm0 + xor %edx, %edx + sar $2, un + movd 4(up), %mm1 + test un, un C clear carry + movd %mm0, %eax + pmuludq %mm7, %mm1 + inc un + jnz L(a2) + jmp L(re2) + +L(la2): adc $0, %edx + add %ebx, 12(rp) + movd %mm0, %eax + pmuludq %mm7, %mm1 + lea 16(rp), rp +L(a2): psrlq $32, %mm0 + adc %edx, %eax + movd %mm0, %edx + movd %mm1, %ebx + movd 8(up), %mm0 + pmuludq %mm7, %mm0 + adc $0, %edx + add %eax, (rp) + psrlq $32, %mm1 + adc %edx, %ebx + movd %mm1, %edx + movd %mm0, %eax + movd 12(up), %mm1 + pmuludq %mm7, %mm1 + adc $0, %edx + add %ebx, 4(rp) + psrlq $32, %mm0 + adc %edx, %eax + movd %mm0, %edx + movd %mm1, %ebx + lea 16(up), up + movd (up), %mm0 + adc $0, %edx + add %eax, 8(rp) + psrlq $32, %mm1 + adc %edx, %ebx + movd %mm1, %edx + pmuludq %mm7, %mm0 + inc un + movd 4(up), %mm1 + jnz L(la2) + + adc un, %edx C un is zero here + add %ebx, 12(rp) + movd %mm0, %eax + pmuludq %mm7, %mm1 + lea 16(rp), rp + psrlq $32, %mm0 + adc %edx, %eax + movd %mm0, %edx + movd %mm1, %ebx + adc un, %edx + add %eax, (rp) + psrlq $32, %mm1 + adc %edx, %ebx + movd %mm1, %eax + adc un, %eax + add %ebx, 4(rp) + adc un, %eax + mov %eax, 8(rp) + + inc n + jmp L(ol1) + +C ================================================================ +L(re2): psrlq $32, %mm0 + movd (up), %mm7 C read next U invariant limb + adc %edx, %eax + movd %mm0, %edx + movd %mm1, %ebx + adc un, %edx + add %eax, (rp) + lea 4(rp), rp + psrlq $32, %mm1 + adc %edx, %ebx + movd %mm1, %eax + movd 4(up), %mm1 + adc un, %eax + add %ebx, (rp) + pmuludq %mm7, %mm1 + adc un, %eax + mov %eax, 4(rp) + movd %mm1, %ebx + +L(re1): psrlq $32, %mm1 + add %ebx, 4(rp) + movd %mm1, %eax + adc un, %eax + xor n, n C make n zeroness assumption below true + mov %eax, 8(rp) + +L(done): C n is zero here + mov 24(%esp), up + mov 28(%esp), %eax + + movd (up), %mm0 + inc %eax + pmuludq %mm0, %mm0 + lea 4(up), up + mov 20(%esp), rp + shr %eax + movd %mm0, (rp) + psrlq $32, %mm0 + lea -12(rp), rp + mov %eax, 28(%esp) + jnc L(odd) + + movd %mm0, %ebp + movd (up), %mm0 + lea 8(rp), rp + pmuludq %mm0, %mm0 + lea -4(up), up + add 8(rp), %ebp + movd %mm0, %edx + adc 12(rp), %edx + rcr n + jmp L(ent) + 
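+
+C The loop below folds in the diagonal terms: each iteration squares two
+C source limbs with pmuludq and, in one adc chain, adds the old rp[] words
+C to the squares and then the sums back into rp[], so the cross products
+C are doubled in the same pass; the doubling carry bits are recycled
+C through `n' by the adc/rcr pair.  An editorial note, not upstream text:
+C with B = 2^32 this is the usual squaring identity
+C
+C	u^2 = sum(i) u[i]^2 * B^(2i)  +  2 * sum(i<j) u[i]*u[j] * B^(i+j)
+C
+C whose second sum is the cross-product triangle built above.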
+C ALIGN(16) C alignment seems irrelevant +L(top): movd (up), %mm1 + adc n, n + movd %mm0, %eax + pmuludq %mm1, %mm1 + movd 4(up), %mm0 + adc (rp), %eax + movd %mm1, %ebx + pmuludq %mm0, %mm0 + psrlq $32, %mm1 + adc 4(rp), %ebx + movd %mm1, %ebp + movd %mm0, %edx + adc 8(rp), %ebp + adc 12(rp), %edx + rcr n C FIXME: isn't this awfully slow on atom??? + adc %eax, (rp) + adc %ebx, 4(rp) +L(ent): lea 8(up), up + adc %ebp, 8(rp) + psrlq $32, %mm0 + adc %edx, 12(rp) +L(odd): decl 28(%esp) + lea 16(rp), rp + jnz L(top) + +L(end): adc n, n + movd %mm0, %eax + adc n, %eax + mov %eax, (rp) + +L(rtn): emms + pop %ebp + pop %ebx + pop %esi + pop %edi + ret + +L(one): pmuludq %mm7, %mm7 + movq %mm7, -4(rp) + emms + pop %esi + pop %edi + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/atom/sublsh1_n.asm b/gmp-6.3.0/mpn/x86/atom/sublsh1_n.asm new file mode 100644 index 0000000..d3e7e5b --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/sublsh1_n.asm @@ -0,0 +1,34 @@ +dnl Intel Atom mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1) + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_sublsh1_n_ip1) +include_mpn(`x86/k7/sublsh1_n.asm') diff --git a/gmp-6.3.0/mpn/x86/atom/sublsh2_n.asm b/gmp-6.3.0/mpn/x86/atom/sublsh2_n.asm new file mode 100644 index 0000000..79405cf --- /dev/null +++ b/gmp-6.3.0/mpn/x86/atom/sublsh2_n.asm @@ -0,0 +1,57 @@ +dnl Intel Atom mpn_addlsh2_n/mpn_sublsh2_n -- rp[] = up[] +- (vp[] << 2). + +dnl Contributed to the GNU project by Marco Bodrato. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 2) +define(RSH, 30) + +ifdef(`OPERATION_addlsh2_n', ` + define(M4_inst, adcl) + define(M4_opp, subl) + define(M4_function, mpn_addlsh2_n) + define(M4_function_c, mpn_addlsh2_nc) + define(M4_ip_function_c, mpn_addlsh2_nc_ip1) + define(M4_ip_function, mpn_addlsh2_n_ip1) +',`ifdef(`OPERATION_sublsh2_n', ` + define(M4_inst, sbbl) + define(M4_opp, addl) + define(M4_function, mpn_sublsh2_n) + define(M4_function_c, mpn_sublsh2_nc) + define(M4_ip_function_c, mpn_sublsh2_nc_ip1) + define(M4_ip_function, mpn_sublsh2_n_ip1) +',`m4_error(`Need OPERATION_addlsh2_n or OPERATION_sublsh2_n +')')') + +MULFUNC_PROLOGUE(mpn_sublsh2_n mpn_sublsh2_nc mpn_sublsh2_n_ip1 mpn_sublsh2_nc_ip1) + +include_mpn(`x86/atom/aorslshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86/bd1/gmp-mparam.h b/gmp-6.3.0/mpn/x86/bd1/gmp-mparam.h new file mode 100644 index 0000000..254cfea --- /dev/null +++ b/gmp-6.3.0/mpn/x86/bd1/gmp-mparam.h @@ -0,0 +1,211 @@ +/* AMD bd1 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 3600-3800 MHz Bulldozer Zambezi */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-27, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 15 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 59.59% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 5 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 27 + +#define DIV_1_VS_MUL_1_PERCENT 245 + +#define MUL_TOOM22_THRESHOLD 32 +#define MUL_TOOM33_THRESHOLD 89 +#define MUL_TOOM44_THRESHOLD 154 +#define MUL_TOOM6H_THRESHOLD 230 +#define MUL_TOOM8H_THRESHOLD 351 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 110 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 101 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 111 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 46 +#define SQR_TOOM3_THRESHOLD 87 +#define SQR_TOOM4_THRESHOLD 216 +#define SQR_TOOM6_THRESHOLD 294 +#define SQR_TOOM8_THRESHOLD 442 + +#define MULMID_TOOM42_THRESHOLD 50 + +#define MULMOD_BNM1_THRESHOLD 22 +#define SQRMOD_BNM1_THRESHOLD 26 + +#define MUL_FFT_MODF_THRESHOLD 636 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 636, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \ + { 28, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \ + { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \ + { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \ + { 191,11}, { 63, 7}, { 1023, 8}, { 543,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 335,11}, { 191,10}, \ + { 399,11}, { 223,12}, { 127,11}, { 255,10}, \ + { 543,11}, { 287,10}, { 607,11}, { 319,10}, \ + { 639,12}, { 191,11}, { 383,10}, { 799,11}, \ + { 415,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 735,12}, { 383,11}, { 799,10}, \ + { 1599,11}, { 863,12}, { 447,11}, { 895,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,12}, { 639,11}, { 1343,12}, { 703,11}, \ + { 1471,13}, { 383,12}, { 767,11}, { 1599,12}, \ + { 831,11}, { 1727,12}, { 895,14}, { 255,13}, \ + { 511,12}, { 1087,11}, { 2239,10}, { 4479,12}, \ + { 1215,13}, { 639,12}, { 1471,11}, { 2943,13}, \ + { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2239,11}, \ + { 4479,13}, { 1151,12}, { 2495,11}, { 4991,13}, \ + { 1279,12}, { 2623,13}, { 1407,12}, { 2943,14}, \ + { 767,13}, { 1663,12}, { 3455,13}, { 1919,15}, \ + { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \ + { 2431,12}, { 4991,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \ + { 3967,12}, { 7935,15}, { 1023,14}, { 2047,13}, \ + { 4479,14}, { 2303,13}, { 4991,12}, { 9983,14}, \ + { 2815,13}, { 5887,15}, { 
1535,14}, { 3327,13}, \ + { 6911,14}, { 3839,13}, { 7935,16} } +#define MUL_FFT_TABLE3_SIZE 159 +#define MUL_FFT_THRESHOLD 6784 + +#define SQR_FFT_MODF_THRESHOLD 565 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 565, 5}, { 29, 6}, { 15, 5}, { 32, 6}, \ + { 17, 5}, { 35, 6}, { 29, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 23, 6}, { 47, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \ + { 27, 7}, { 55, 8}, { 43, 9}, { 23, 8}, \ + { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 95,11}, { 63,10}, { 159,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 335, 9}, { 671,11}, { 191,10}, { 415,11}, \ + { 223,12}, { 127,11}, { 255,10}, { 543,11}, \ + { 287,10}, { 607,11}, { 319,10}, { 671,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 831,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 735,12}, { 383,11}, { 863,12}, \ + { 447,11}, { 959,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \ + { 1343,12}, { 703,13}, { 383,12}, { 767,11}, \ + { 1535,12}, { 831,11}, { 1727,12}, { 895,11}, \ + { 1791,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1087,11}, { 2239,10}, { 4479,12}, { 1215,13}, \ + { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \ + { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \ + { 1023,12}, { 2239,11}, { 4479,13}, { 1151,12}, \ + { 2495,11}, { 4991,13}, { 1279,12}, { 2623,13}, \ + { 1407,12}, { 2943,14}, { 767,13}, { 1663,12}, \ + { 3455,13}, { 1919,15}, { 511,14}, { 1023,13}, \ + { 2175,12}, { 4479,13}, { 2431,12}, { 4991,14}, \ + { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \ + { 3455,14}, { 1791,13}, { 3967,15}, { 1023,14}, \ + { 2047,13}, { 4479,14}, { 2303,13}, { 4991,12}, \ + { 9983,14}, { 2815,13}, { 5887,15}, { 1535,14}, \ + { 3327,13}, { 6783,14}, { 3839,13}, { 7679,16} } +#define SQR_FFT_TABLE3_SIZE 152 +#define SQR_FFT_THRESHOLD 5760 + +#define MULLO_BASECASE_THRESHOLD 3 +#define MULLO_DC_THRESHOLD 31 +#define MULLO_MUL_N_THRESHOLD 13463 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 33 +#define SQRLO_SQR_THRESHOLD 11278 + +#define DC_DIV_QR_THRESHOLD 52 +#define DC_DIVAPPR_Q_THRESHOLD 198 +#define DC_BDIV_QR_THRESHOLD 48 +#define DC_BDIV_Q_THRESHOLD 126 + +#define INV_MULMOD_BNM1_THRESHOLD 82 +#define INV_NEWTON_THRESHOLD 212 +#define INV_APPR_THRESHOLD 202 + +#define BINV_NEWTON_THRESHOLD 238 +#define REDC_1_TO_REDC_N_THRESHOLD 55 + +#define MU_DIV_QR_THRESHOLD 1652 +#define MU_DIVAPPR_Q_THRESHOLD 1528 +#define MUPI_DIV_QR_THRESHOLD 110 +#define MU_BDIV_QR_THRESHOLD 1442 +#define MU_BDIV_Q_THRESHOLD 1528 + +#define POWM_SEC_TABLE 1,20,96,386,1221,2698 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 21 +#define SET_STR_DC_THRESHOLD 100 +#define SET_STR_PRECOMPUTE_THRESHOLD 762 + +#define FAC_DSC_THRESHOLD 118 +#define FAC_ODD_THRESHOLD 34 + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD2_DIV1_METHOD 4 /* 1.22% faster than 3 */ +#define HGCD_THRESHOLD 67 +#define HGCD_APPR_THRESHOLD 150 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 483 +#define GCDEXT_DC_THRESHOLD 345 +#define JACOBI_BASE_METHOD 4 /* 5.07% faster than 1 */ + +/* Tuneup completed successfully, took 65358 seconds */ diff --git a/gmp-6.3.0/mpn/x86/bd2/gmp-mparam.h b/gmp-6.3.0/mpn/x86/bd2/gmp-mparam.h new file 
mode 100644 index 0000000..6893da7 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/bd2/gmp-mparam.h @@ -0,0 +1,214 @@ +/* AMD bd2 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 4000-4200 MHz Piledriver Vishera */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 18 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 40.87% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 5 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 24 + +#define DIV_1_VS_MUL_1_PERCENT 254 + +#define MUL_TOOM22_THRESHOLD 32 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 151 +#define MUL_TOOM6H_THRESHOLD 222 +#define MUL_TOOM8H_THRESHOLD 351 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 85 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 110 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 100 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 110 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 44 +#define SQR_TOOM3_THRESHOLD 93 +#define SQR_TOOM4_THRESHOLD 212 +#define SQR_TOOM6_THRESHOLD 318 +#define SQR_TOOM8_THRESHOLD 466 + +#define MULMID_TOOM42_THRESHOLD 66 + +#define MULMOD_BNM1_THRESHOLD 20 +#define SQRMOD_BNM1_THRESHOLD 23 + +#define MUL_FFT_MODF_THRESHOLD 595 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 595, 5}, { 27, 6}, { 29, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 49, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \ + { 191,11}, { 63,10}, { 143, 7}, { 1215, 9}, \ + { 319, 8}, { 639, 9}, { 335, 8}, { 671, 9}, \ + { 351,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 271,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 335,11}, { 191,10}, { 399,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 
543,11}, { 287,10}, \ + { 607,11}, { 319,10}, { 671,12}, { 191,11}, \ + { 383,10}, { 799,11}, { 415,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \ + { 1471,12}, { 383,11}, { 799,10}, { 1599,11}, \ + { 863,12}, { 447,11}, { 895,13}, { 255,12}, \ + { 511,11}, { 1087,12}, { 575,11}, { 1215,12}, \ + { 639,11}, { 1343,12}, { 703,11}, { 1471,13}, \ + { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \ + { 1727,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1087,11}, { 2239,12}, { 1215,13}, { 639,12}, \ + { 1471,11}, { 2943,13}, { 767,12}, { 1727,13}, \ + { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \ + { 2239,13}, { 1151,12}, { 2431,13}, { 1279,12}, \ + { 2623,13}, { 1407,12}, { 2943,14}, { 767,13}, \ + { 1535,12}, { 3135,13}, { 1663,12}, { 3455,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \ + { 3967,12}, { 7935,11}, { 15871,15}, { 1023,14}, \ + { 2047,13}, { 4479,14}, { 2303,13}, { 4991,12}, \ + { 9983,14}, { 2815,13}, { 5887,15}, { 1535,14}, \ + { 3839,13}, { 7935,12}, { 15871,16} } +#define MUL_FFT_TABLE3_SIZE 155 +#define MUL_FFT_THRESHOLD 6784 + +#define SQR_FFT_MODF_THRESHOLD 555 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 555, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \ + { 16, 5}, { 33, 6}, { 29, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \ + { 39, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 49, 8}, { 31, 7}, { 63, 8}, \ + { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95,11}, \ + { 63,10}, { 143, 9}, { 287,10}, { 159,11}, \ + { 95,10}, { 191, 6}, { 3071, 5}, { 6399, 6}, \ + { 3455, 7}, { 1791, 8}, { 959,10}, { 255, 9}, \ + { 511,10}, { 271,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335, 9}, { 671,10}, { 351,11}, \ + { 191,10}, { 399, 9}, { 799,10}, { 415,11}, \ + { 223,12}, { 127,11}, { 255,10}, { 543,11}, \ + { 287,10}, { 607,11}, { 319,10}, { 671,11}, \ + { 351,12}, { 191,11}, { 383,10}, { 799,11}, \ + { 415,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 735,12}, { 383,11}, { 799,10}, \ + { 1599,11}, { 863,12}, { 447,11}, { 927,13}, \ + { 255,12}, { 511,11}, { 1055,10}, { 2111,11}, \ + { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \ + { 1343,12}, { 703,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1727,10}, { 3455,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1023,11}, \ + { 2111,12}, { 1087,11}, { 2239,10}, { 4479,12}, \ + { 1215,13}, { 639,12}, { 1471,11}, { 2943,13}, \ + { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \ + { 1855,14}, { 511,13}, { 1023,12}, { 2239,13}, \ + { 1151,12}, { 2495,13}, { 1279,12}, { 2623,13}, \ + { 1407,12}, { 2943,14}, { 767,13}, { 1663,12}, \ + { 3455,13}, { 1791,15}, { 511,14}, { 1023,13}, \ + { 2175,12}, { 4479,13}, { 2431,14}, { 1279,13}, \ + { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \ + { 1791,13}, { 3967,12}, { 7935,15}, { 1023,14}, \ + { 2047,13}, { 4479,14}, { 2303,13}, { 4991,12}, \ + { 9983,14}, { 2815,13}, { 5887,15}, { 1535,14}, \ + { 3839,13}, { 7935,16} } +#define SQR_FFT_TABLE3_SIZE 166 +#define SQR_FFT_THRESHOLD 5760 + +#define MULLO_BASECASE_THRESHOLD 3 +#define MULLO_DC_THRESHOLD 34 +#define MULLO_MUL_N_THRESHOLD 13463 +#define SQRLO_BASECASE_THRESHOLD 8 +#define 
SQRLO_DC_THRESHOLD 43 +#define SQRLO_SQR_THRESHOLD 11278 + +#define DC_DIV_QR_THRESHOLD 75 +#define DC_DIVAPPR_Q_THRESHOLD 200 +#define DC_BDIV_QR_THRESHOLD 71 +#define DC_BDIV_Q_THRESHOLD 119 + +#define INV_MULMOD_BNM1_THRESHOLD 74 +#define INV_NEWTON_THRESHOLD 266 +#define INV_APPR_THRESHOLD 214 + +#define BINV_NEWTON_THRESHOLD 278 +#define REDC_1_TO_REDC_N_THRESHOLD 71 + +#define MU_DIV_QR_THRESHOLD 1652 +#define MU_DIVAPPR_Q_THRESHOLD 1589 +#define MUPI_DIV_QR_THRESHOLD 122 +#define MU_BDIV_QR_THRESHOLD 1442 +#define MU_BDIV_Q_THRESHOLD 1597 + +#define POWM_SEC_TABLE 1,22,96,289,1259 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 20 +#define SET_STR_DC_THRESHOLD 173 +#define SET_STR_PRECOMPUTE_THRESHOLD 454 + +#define FAC_DSC_THRESHOLD 90 +#define FAC_ODD_THRESHOLD 34 + +#define MATRIX22_STRASSEN_THRESHOLD 19 +#define HGCD2_DIV1_METHOD 1 /* 5.80% faster than 3 */ +#define HGCD_THRESHOLD 74 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 456 +#define GCDEXT_DC_THRESHOLD 345 +#define JACOBI_BASE_METHOD 4 /* 17.07% faster than 1 */ + +/* Tuneup completed successfully, took 53914 seconds */ diff --git a/gmp-6.3.0/mpn/x86/bd4/gmp-mparam.h b/gmp-6.3.0/mpn/x86/bd4/gmp-mparam.h new file mode 100644 index 0000000..6c20d0f --- /dev/null +++ b/gmp-6.3.0/mpn/x86/bd4/gmp-mparam.h @@ -0,0 +1,225 @@ +/* AMD bd4 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 3800-4200 MHz Excavator/Bristol Ridge */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 27 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 50 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 28.45% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 4 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 13 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 28 + +#define DIV_1_VS_MUL_1_PERCENT 314 + +#define MUL_TOOM22_THRESHOLD 32 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 166 +#define MUL_TOOM6H_THRESHOLD 270 +#define MUL_TOOM8H_THRESHOLD 357 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 69 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 103 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 121 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 154 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 42 +#define SQR_TOOM3_THRESHOLD 89 +#define SQR_TOOM4_THRESHOLD 208 +#define SQR_TOOM6_THRESHOLD 306 +#define SQR_TOOM8_THRESHOLD 454 + +#define MULMID_TOOM42_THRESHOLD 68 + +#define MULMOD_BNM1_THRESHOLD 19 +#define SQRMOD_BNM1_THRESHOLD 18 + +#define MUL_FFT_MODF_THRESHOLD 570 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 570, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47, 8}, { 95,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \ + { 191,11}, { 63,10}, { 143, 6}, { 2303, 5}, \ + { 4735, 4}, { 9471, 5}, { 4863, 7}, { 1279, 9}, \ + { 335, 8}, { 671, 9}, { 351, 8}, { 703,10}, \ + { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 335, 9}, { 671, 8}, \ + { 1343,10}, { 351, 9}, { 703,10}, { 367, 9}, \ + { 735,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 399, 9}, { 799, 8}, { 1599,10}, { 415,11}, \ + { 223,12}, { 127,11}, { 255,10}, { 543, 9}, \ + { 1087,11}, { 287,10}, { 607, 9}, { 1215,11}, \ + { 319,10}, { 671, 9}, { 1343,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 863,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,11}, { 607,10}, { 1215, 9}, { 2431,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \ + { 1471, 9}, { 2943,12}, { 383,11}, { 799,10}, \ + { 1599,11}, { 863,10}, { 1727,12}, { 447,11}, \ + { 959,10}, { 1919,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1215,10}, { 2431,12}, \ + { 639,11}, { 1343,12}, { 703,11}, { 1471,10}, \ + { 2943,13}, { 383,12}, { 767,11}, { 1599,12}, \ + { 831,11}, { 1727,10}, { 3455,12}, { 959,11}, \ + { 1919,10}, { 3839,13}, { 511,12}, { 1087,11}, \ + { 2239,12}, { 1215,11}, { 2431,13}, { 639,12}, \ + { 1471,11}, { 2943,10}, { 5887,13}, { 767,12}, \ + { 1727,11}, { 3455,13}, { 895,12}, { 1919,11}, \ + { 3839,14}, { 511,13}, { 1023,12}, { 2239,13}, \ + { 1151,12}, { 2431,13}, { 
1279,12}, { 2559,13}, \ + { 1407,12}, { 2943,11}, { 5887,14}, { 767,13}, \ + { 1663,12}, { 3455,13}, { 1919,12}, { 3839,15}, \ + { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \ + { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \ + { 1535,13}, { 3455,14}, { 1791,13}, { 3967,12}, \ + { 7935,15}, { 1023,14}, { 2047,13}, { 4479,14}, \ + { 2303,13}, { 4991,12}, { 9983,14}, { 2815,13}, \ + { 5887,15}, { 1535,14}, { 3839,13}, { 7935,16} } +#define MUL_FFT_TABLE3_SIZE 192 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 476 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 476, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \ + { 16, 5}, { 33, 6}, { 29, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \ + { 39, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 95,10}, { 31, 9}, \ + { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 63, 9}, { 135,10}, { 95, 9}, { 191,10}, \ + { 111,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 143, 9}, { 287, 8}, { 575,10}, { 159,11}, \ + { 95,10}, { 191,12}, { 63,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287, 9}, \ + { 575,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 335, 9}, { 671,10}, { 351, 9}, { 735,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \ + { 799,10}, { 415, 9}, { 863,12}, { 127,11}, \ + { 255,10}, { 511, 9}, { 1023,10}, { 543,11}, \ + { 287,10}, { 607, 9}, { 1215,11}, { 319,10}, \ + { 671, 9}, { 1343,11}, { 351,10}, { 735,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 863,13}, { 127,12}, { 255,11}, { 511,10}, \ + { 1055,11}, { 543,10}, { 1087,11}, { 607,10}, \ + { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \ + { 735,10}, { 1471,12}, { 383,11}, { 799,10}, \ + { 1599,11}, { 863,10}, { 1727,12}, { 447,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1727,12}, { 959,11}, \ + { 1919,14}, { 255,13}, { 511,12}, { 1023,11}, \ + { 2047,12}, { 1087,11}, { 2239,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \ + { 767,12}, { 1727,13}, { 895,12}, { 1983,14}, \ + { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \ + { 2431,13}, { 1279,12}, { 2559,13}, { 1407,12}, \ + { 2943,14}, { 767,13}, { 1663,12}, { 3455,13}, \ + { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \ + { 2175,12}, { 4479,13}, { 2431,14}, { 1279,13}, \ + { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \ + { 1791,13}, { 3967,15}, { 1023,14}, { 2047,13}, \ + { 4479,14}, { 2303,13}, { 4991,12}, { 9983,14}, \ + { 2815,13}, { 5887,15}, { 1535,14}, { 3839,16} } +#define SQR_FFT_TABLE3_SIZE 176 +#define SQR_FFT_THRESHOLD 4736 + +#define MULLO_BASECASE_THRESHOLD 3 +#define MULLO_DC_THRESHOLD 54 +#define MULLO_MUL_N_THRESHOLD 10950 +#define SQRLO_BASECASE_THRESHOLD 10 +#define SQRLO_DC_THRESHOLD 77 +#define SQRLO_SQR_THRESHOLD 9449 + +#define DC_DIV_QR_THRESHOLD 84 +#define DC_DIVAPPR_Q_THRESHOLD 252 +#define DC_BDIV_QR_THRESHOLD 79 +#define DC_BDIV_Q_THRESHOLD 80 + +#define INV_MULMOD_BNM1_THRESHOLD 71 +#define INV_NEWTON_THRESHOLD 254 +#define INV_APPR_THRESHOLD 266 + +#define BINV_NEWTON_THRESHOLD 294 +#define REDC_1_TO_REDC_N_THRESHOLD 79 + +#define MU_DIV_QR_THRESHOLD 1652 +#define MU_DIVAPPR_Q_THRESHOLD 1528 +#define MUPI_DIV_QR_THRESHOLD 122 +#define MU_BDIV_QR_THRESHOLD 1387 +#define MU_BDIV_Q_THRESHOLD 1528 + +#define POWM_SEC_TABLE 1,16,96,480,960 + +#define GET_STR_DC_THRESHOLD 12 
+#define GET_STR_PRECOMPUTE_THRESHOLD 19 +#define SET_STR_DC_THRESHOLD 264 +#define SET_STR_PRECOMPUTE_THRESHOLD 542 + +#define FAC_DSC_THRESHOLD 91 +#define FAC_ODD_THRESHOLD 29 + +#define MATRIX22_STRASSEN_THRESHOLD 19 +#define HGCD2_DIV1_METHOD 1 /* 9.73% faster than 3 */ +#define HGCD_THRESHOLD 55 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 562 +#define GCDEXT_DC_THRESHOLD 416 +#define JACOBI_BASE_METHOD 4 /* 16.50% faster than 1 */ + +/* Tuneup completed successfully, took 49179 seconds */ diff --git a/gmp-6.3.0/mpn/x86/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/x86/bdiv_dbm1c.asm new file mode 100644 index 0000000..0288c47 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/bdiv_dbm1c.asm @@ -0,0 +1,129 @@ +dnl x86 mpn_bdiv_dbm1c. + +dnl Copyright 2008, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C P5 +C P6 model 0-8,10-12 +C P6 model 9 (Banias) +C P6 model 13 (Dothan) 5.1 +C P4 model 0 (Willamette) +C P4 model 1 (?)
+C P4 model 2 (Northwood) 13.67 +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C Intel Atom +C AMD K6 +C AMD K7 3.5 +C AMD K8 +C AMD K10 + + +C TODO +C * Optimize for more x86 processors + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_bdiv_dbm1c) + mov 16(%esp), %ecx C d + push %esi + mov 12(%esp), %esi C ap + push %edi + mov 12(%esp), %edi C qp + push %ebp + mov 24(%esp), %ebp C n + push %ebx + + mov (%esi), %eax + mul %ecx + mov 36(%esp), %ebx + sub %eax, %ebx + mov %ebx, (%edi) + sbb %edx, %ebx + + mov %ebp, %eax + and $3, %eax + jz L(b0) + cmp $2, %eax + jc L(b1) + jz L(b2) + +L(b3): lea -8(%esi), %esi + lea 8(%edi), %edi + add $-3, %ebp + jmp L(3) + +L(b0): mov 4(%esi), %eax + lea -4(%esi), %esi + lea 12(%edi), %edi + add $-4, %ebp + jmp L(0) + +L(b2): mov 4(%esi), %eax + lea 4(%esi), %esi + lea 4(%edi), %edi + add $-2, %ebp + jmp L(2) + + ALIGN(8) +L(top): mov 4(%esi), %eax + mul %ecx + lea 16(%edi), %edi + sub %eax, %ebx + mov 8(%esi), %eax + mov %ebx, -12(%edi) + sbb %edx, %ebx +L(0): mul %ecx + sub %eax, %ebx + mov %ebx, -8(%edi) + sbb %edx, %ebx +L(3): mov 12(%esi), %eax + mul %ecx + sub %eax, %ebx + mov %ebx, -4(%edi) + mov 16(%esi), %eax + lea 16(%esi), %esi + sbb %edx, %ebx +L(2): mul %ecx + sub %eax, %ebx + mov %ebx, 0(%edi) + sbb %edx, %ebx +L(b1): add $-4, %ebp + jns L(top) + + mov %ebx, %eax + pop %ebx + pop %ebp + pop %edi + pop %esi + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/bdiv_q_1.asm b/gmp-6.3.0/mpn/x86/bdiv_q_1.asm new file mode 100644 index 0000000..132de06 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/bdiv_q_1.asm @@ -0,0 +1,208 @@ +dnl x86 mpn_bdiv_q_1 -- mpn by limb exact division. + +dnl Rearranged from mpn/x86/dive_1.asm by Marco Bodrato. + +dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
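+
+dnl An editorial sketch, not upstream text: for the odd part d of the
+dnl divisor, the code below forms inv = 1/d mod 2^32 from an 8-bit
+dnl binvert_limb_table entry refined by two Newton steps, each of which
+dnl doubles the number of correct low bits (8 -> 16 -> 32).  A hedged C
+dnl equivalent:
+dnl
+dnl	uint32_t binvert32 (uint32_t d)	/* d must be odd */
+dnl	{
+dnl	  uint32_t inv = binvert_limb_table[(d >> 1) & 127];	/* 8 bits */
+dnl	  inv = 2*inv - inv*inv*d;				/* 16 bits */
+dnl	  inv = 2*inv - inv*inv*d;				/* 32 bits */
+dnl	  return inv;
+dnl	}
+dnl
+dnl Each quotient limb is then q = x * inv mod 2^32 (the imull in the main
+dnl loop), and the high half of q * d from the following mull supplies the
+dnl borrow into the next limb.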
+ +include(`../config.m4') + + +C cycles/limb +C P54 30.0 +C P55 29.0 +C P6 13.0 odd divisor, 12.0 even (strangely) +C K6 14.0 +C K7 12.0 +C P4 42.0 + +MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) + +defframe(PARAM_SHIFT, 24) +defframe(PARAM_INVERSE,20) +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(VAR_INVERSE,`PARAM_SRC') + + TEXT + +C mp_limb_t +C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t inverse, int shift) + + ALIGN(16) +PROLOGUE(mpn_pi1_bdiv_q_1) +deflit(`FRAME',0) + + movl PARAM_SHIFT, %ecx + pushl %ebp FRAME_pushl() + + movl PARAM_INVERSE, %eax + movl PARAM_SIZE, %ebp + pushl %ebx FRAME_pushl() +L(common): + pushl %edi FRAME_pushl() + pushl %esi FRAME_pushl() + + movl PARAM_SRC, %esi + movl PARAM_DST, %edi + + leal (%esi,%ebp,4), %esi C src end + leal (%edi,%ebp,4), %edi C dst end + negl %ebp C -size + + movl %eax, VAR_INVERSE + movl (%esi,%ebp,4), %eax C src[0] + + xorl %ebx, %ebx + xorl %edx, %edx + + incl %ebp + jz L(one) + + movl (%esi,%ebp,4), %edx C src[1] + + shrdl( %cl, %edx, %eax) + + movl VAR_INVERSE, %edx + jmp L(entry) + + + ALIGN(8) + nop C k6 code alignment + nop +L(top): + C eax q + C ebx carry bit, 0 or -1 + C ecx shift + C edx carry limb + C esi src end + C edi dst end + C ebp counter, limbs, negative + + movl -4(%esi,%ebp,4), %eax + subl %ebx, %edx C accumulate carry bit + + movl (%esi,%ebp,4), %ebx + + shrdl( %cl, %ebx, %eax) + + subl %edx, %eax C apply carry limb + movl VAR_INVERSE, %edx + + sbbl %ebx, %ebx + +L(entry): + imull %edx, %eax + + movl %eax, -4(%edi,%ebp,4) + movl PARAM_DIVISOR, %edx + + mull %edx + + incl %ebp + jnz L(top) + + + movl -4(%esi), %eax C src high limb +L(one): + shrl %cl, %eax + popl %esi FRAME_popl() + + addl %ebx, %eax C apply carry bit + + subl %edx, %eax C apply carry limb + + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi) + + popl %edi + popl %ebx + popl %ebp + + ret + +EPILOGUE() + +C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C + + ALIGN(16) +PROLOGUE(mpn_bdiv_q_1) +deflit(`FRAME',0) + + movl PARAM_DIVISOR, %eax + pushl %ebp FRAME_pushl() + + movl $-1, %ecx C shift count + movl PARAM_SIZE, %ebp + + pushl %ebx FRAME_pushl() + +L(strip_twos): + incl %ecx + + shrl %eax + jnc L(strip_twos) + + leal 1(%eax,%eax), %ebx C d without twos + andl $127, %eax C d/2, 7 bits + +ifdef(`PIC',` + LEA( binvert_limb_table, %edx) + movzbl (%eax,%edx), %eax C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %eax C inv 8 bits +') + + leal (%eax,%eax), %edx C 2*inv + movl %ebx, PARAM_DIVISOR C d without twos + imull %eax, %eax C inv*inv + imull %ebx, %eax C inv*inv*d + subl %eax, %edx C inv = 2*inv - inv*inv*d + + leal (%edx,%edx), %eax C 2*inv + imull %edx, %edx C inv*inv + imull %ebx, %edx C inv*inv*d + subl %edx, %eax C inv = 2*inv - inv*inv*d + + ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + pushl %eax FRAME_pushl() + imull PARAM_DIVISOR, %eax + cmpl $1, %eax + popl %eax FRAME_popl()') + + jmp L(common) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/bt1/gmp-mparam.h b/gmp-6.3.0/mpn/x86/bt1/gmp-mparam.h new file mode 100644 index 0000000..302dbc6 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/bt1/gmp-mparam.h @@ -0,0 +1,218 @@ +/* x86/bobcat gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
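The strip_twos path above seeds an inverse of the divisor's odd part from the 8-bit binvert_limb_table, then applies two Newton (Hensel) steps of the form inv = 2*inv - inv*inv*d, each step doubling the number of correct low bits (8 -> 16 -> 32). A minimal table-free C sketch of the same 2-adic inversion, assuming 32-bit limbs; the (3*d)^2 seed is a classic trick giving 5 correct bits, so it needs three lifts instead of two:

#include <stdint.h>

static uint32_t binvert_u32 (uint32_t d)  /* d must be odd */
{
  uint32_t inv = (3 * d) ^ 2;             /*  5 correct low bits */
  inv *= 2 - d * inv;                     /* 10 bits */
  inv *= 2 - d * inv;                     /* 20 bits */
  inv *= 2 - d * inv;                     /* 40 >= 32 bits */
  return inv;                             /* d * inv == 1 (mod 2^32) */
}

With such an inverse, exact division by d needs only a multiply per limb: when d divides a, the quotient limb is a * inv mod 2^32, which is what the imull against VAR_INVERSE in the loop above exploits; the following mull recovers the high half to form the next borrow.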
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be greater than + the value in mpn/x86/k7/gmp-mparam.h. The latter is used as a hard limit in + k7/sqr_basecase.asm. */ + +/* 1600 MHz AMD Bobcat Zacate E-350 */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-17, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 10 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 16 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 21 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 57.16% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 3 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 36 + +#define DIV_1_VS_MUL_1_PERCENT 199 + +#define MUL_TOOM22_THRESHOLD 28 +#define MUL_TOOM33_THRESHOLD 93 +#define MUL_TOOM44_THRESHOLD 166 +#define MUL_TOOM6H_THRESHOLD 270 +#define MUL_TOOM8H_THRESHOLD 478 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 102 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 177 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 169 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 143 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 50 +#define SQR_TOOM3_THRESHOLD 89 +#define SQR_TOOM4_THRESHOLD 248 +#define SQR_TOOM6_THRESHOLD 342 +#define SQR_TOOM8_THRESHOLD 470 + +#define MULMID_TOOM42_THRESHOLD 72 + +#define MULMOD_BNM1_THRESHOLD 20 +#define SQRMOD_BNM1_THRESHOLD 21 + +#define MUL_FFT_MODF_THRESHOLD 630 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 630, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 15, 5}, { 31, 6}, { 27, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 49, 8}, { 27, 7}, { 55, 9}, \ + { 15, 8}, { 31, 7}, { 63, 8}, { 43, 9}, \ + { 23, 8}, { 55, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 6}, \ + { 767, 7}, { 399, 6}, { 799, 7}, { 415, 8}, \ + { 235, 7}, { 479, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \ + { 159,11}, { 95,10}, { 191,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 335, 9}, { 671,11}, { 191,10}, { 383, 
9}, \ + { 767,10}, { 399, 9}, { 799,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 543, 9}, { 1087,11}, \ + { 287,10}, { 607, 9}, { 1215,11}, { 319,10}, \ + { 671,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,13}, { 127,12}, { 255,11}, \ + { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \ + { 1471,12}, { 383,11}, { 799,10}, { 1599,11}, \ + { 863,12}, { 447,11}, { 991,13}, { 255,12}, \ + { 511,11}, { 1087,12}, { 575,11}, { 1215,12}, \ + { 639,11}, { 1343,12}, { 703,11}, { 1471,13}, \ + { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \ + { 1727,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1215,13}, { 639,12}, { 1471,13}, { 767,12}, \ + { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \ + { 1023,12}, { 2111,13}, { 1151,12}, { 2431,13}, \ + { 1407,14}, { 767,13}, { 1663,12}, { 3455,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \ + { 3967,15}, { 1023,14}, { 2047,13}, { 4479,14}, \ + { 2303,13}, { 4991,12}, { 9983,14}, { 2815,13}, \ + { 5887,15}, { 1535,14}, { 3839,16} } +#define MUL_FFT_TABLE3_SIZE 159 +#define MUL_FFT_THRESHOLD 7424 + +#define SQR_FFT_MODF_THRESHOLD 500 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 500, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 28, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \ + { 47, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 127, 6}, { 1087, 7}, { 575, 8}, { 303, 9}, \ + { 159,10}, { 95,11}, { 63,10}, { 127, 9}, \ + { 255,10}, { 143, 9}, { 287,10}, { 159,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335, 9}, { 671,10}, { 351,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \ + { 799,10}, { 415, 9}, { 831,12}, { 127,11}, \ + { 255,10}, { 543,11}, { 287,10}, { 607,11}, \ + { 319,10}, { 671,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 831,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \ + { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1727,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1215,13}, { 639,12}, \ + { 1471,13}, { 767,12}, { 1727,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2111,13}, \ + { 1151,12}, { 2431,13}, { 1407,14}, { 767,13}, \ + { 1663,12}, { 3455,13}, { 1919,15}, { 511,14}, \ + { 1023,13}, { 2175,12}, { 4479,13}, { 2431,14}, \ + { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \ + { 3455,14}, { 1791,13}, { 3839,15}, { 1023,14}, \ + { 2047,13}, { 4479,14}, { 2303,13}, { 4991,12}, \ + { 9983,14}, { 2815,13}, { 5887,15}, { 1535,14}, \ + { 3839,16} } +#define SQR_FFT_TABLE3_SIZE 161 +#define SQR_FFT_THRESHOLD 5760 + +#define MULLO_BASECASE_THRESHOLD 9 +#define MULLO_DC_THRESHOLD 48 +#define MULLO_MUL_N_THRESHOLD 14281 +#define SQRLO_BASECASE_THRESHOLD 7 +#define SQRLO_DC_THRESHOLD 146 +#define SQRLO_SQR_THRESHOLD 11278 + +#define DC_DIV_QR_THRESHOLD 77 +#define 
DC_DIVAPPR_Q_THRESHOLD 240 +#define DC_BDIV_QR_THRESHOLD 83 +#define DC_BDIV_Q_THRESHOLD 182 + +#define INV_MULMOD_BNM1_THRESHOLD 74 +#define INV_NEWTON_THRESHOLD 252 +#define INV_APPR_THRESHOLD 252 + +#define BINV_NEWTON_THRESHOLD 252 +#define REDC_1_TO_REDC_N_THRESHOLD 79 + +#define MU_DIV_QR_THRESHOLD 1787 +#define MU_DIVAPPR_Q_THRESHOLD 1718 +#define MUPI_DIV_QR_THRESHOLD 122 +#define MU_BDIV_QR_THRESHOLD 1470 +#define MU_BDIV_Q_THRESHOLD 1713 + +#define POWM_SEC_TABLE 1,16,96,563,1317,1867 + +#define GET_STR_DC_THRESHOLD 19 +#define GET_STR_PRECOMPUTE_THRESHOLD 32 +#define SET_STR_DC_THRESHOLD 254 +#define SET_STR_PRECOMPUTE_THRESHOLD 907 + +#define FAC_DSC_THRESHOLD 224 +#define FAC_ODD_THRESHOLD 55 + +#define MATRIX22_STRASSEN_THRESHOLD 23 +#define HGCD2_DIV1_METHOD 3 /* 3.59% faster than 5 */ +#define HGCD_THRESHOLD 85 +#define HGCD_APPR_THRESHOLD 152 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 531 +#define GCDEXT_DC_THRESHOLD 386 +#define JACOBI_BASE_METHOD 3 /* 0.92% faster than 1 */ + +/* Tuneup completed successfully, took 159946 seconds */ diff --git a/gmp-6.3.0/mpn/x86/bt2/gmp-mparam.h b/gmp-6.3.0/mpn/x86/bt2/gmp-mparam.h new file mode 100644 index 0000000..f936cb7 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/bt2/gmp-mparam.h @@ -0,0 +1,214 @@ +/* x86/bobcat gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be greater than + the value in mpn/x86/k7/gmp-mparam.h. The latter is used as a hard limit in + k7/sqr_basecase.asm. 
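Thresholds like these are consumed by plain size dispatch: below the cutoff the low-overhead routine wins, at or above it the asymptotically faster one does. A schematic sketch (names are illustrative; the real mpn routines have different signatures and take scratch space), which also shows why the fat-binary NOTE must cap SQR_TOOM2_THRESHOLD -- the k7 asm basecase is only valid below the limit it was assembled with:

void
sqr_n (mp_ptr rp, mp_srcptr ap, mp_size_t n)
{
  if (n < SQR_TOOM2_THRESHOLD)
    sqr_basecase (rp, ap, n);   /* O(n^2) schoolbook squaring */
  else
    sqr_toom2 (rp, ap, n);      /* Karatsuba-style recursion */
}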
*/ + +/* 2050 MHz AMD Jaguar/Kabini */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-24, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 4 +#define MOD_1_UNNORM_THRESHOLD 6 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 18 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 47.53% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 27 + +#define DIV_1_VS_MUL_1_PERCENT 243 + +#define MUL_TOOM22_THRESHOLD 32 +#define MUL_TOOM33_THRESHOLD 90 +#define MUL_TOOM44_THRESHOLD 154 +#define MUL_TOOM6H_THRESHOLD 286 +#define MUL_TOOM8H_THRESHOLD 478 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 152 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 103 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 154 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 38 +#define SQR_TOOM3_THRESHOLD 126 +#define SQR_TOOM4_THRESHOLD 220 +#define SQR_TOOM6_THRESHOLD 318 +#define SQR_TOOM8_THRESHOLD 502 + +#define MULMID_TOOM42_THRESHOLD 68 + +#define MULMOD_BNM1_THRESHOLD 19 +#define SQRMOD_BNM1_THRESHOLD 25 + +#define MUL_FFT_MODF_THRESHOLD 570 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 570, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 15, 5}, { 31, 6}, { 28, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 49, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95,11}, \ + { 63,10}, { 159,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 335, 9}, { 671,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \ + { 799,10}, { 415,11}, { 223,12}, { 127,11}, \ + { 255,10}, { 543,11}, { 287,10}, { 607, 9}, \ + { 1215,11}, { 319,10}, { 671,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,13}, \ + { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \ + { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \ + { 991,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1727,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1215,13}, { 639,12}, \ + { 1471,13}, { 767,12}, { 1727,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2111,13}, \ + { 1151,12}, { 2431,13}, { 1407,14}, { 767,13}, \ + { 1663,12}, { 3455,13}, { 1919,15}, { 511,14}, \ + { 1023,13}, { 2175,12}, { 4479,13}, { 2431,14}, \ + { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \ + { 3455,14}, { 1791,13}, { 3967,15}, { 1023,14}, \ + { 2047,13}, { 4479,14}, { 2303,13}, { 4991,12}, \ + { 9983,14}, { 2815,13}, { 5887,15}, { 1535,14}, \ + { 3839,16} } +#define MUL_FFT_TABLE3_SIZE 153 +#define MUL_FFT_THRESHOLD 5760 + +#define 
SQR_FFT_MODF_THRESHOLD 530 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 530, 5}, { 27, 6}, { 15, 5}, { 31, 6}, \ + { 28, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \ + { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \ + { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \ + { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 95,11}, \ + { 63,10}, { 143, 9}, { 287,10}, { 159,11}, \ + { 95,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671,10}, { 351,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 399, 9}, { 799,12}, { 127,11}, \ + { 255,10}, { 543,11}, { 287,10}, { 607, 9}, \ + { 1215,11}, { 319,10}, { 671,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 831,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,11}, { 607,10}, { 1215,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,10}, { 1471,12}, \ + { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \ + { 447,11}, { 991,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 831,11}, { 1727,12}, \ + { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \ + { 1215,13}, { 639,12}, { 1471,13}, { 767,12}, \ + { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \ + { 1023,12}, { 2111,13}, { 1151,12}, { 2495,13}, \ + { 1407,14}, { 767,13}, { 1663,12}, { 3455,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \ + { 3967,15}, { 1023,14}, { 2047,13}, { 4479,14}, \ + { 2303,13}, { 4991,12}, { 9983,14}, { 2815,13}, \ + { 5887,15}, { 1535,14}, { 3839,16} } +#define SQR_FFT_TABLE3_SIZE 151 +#define SQR_FFT_THRESHOLD 4736 + +#define MULLO_BASECASE_THRESHOLD 8 +#define MULLO_DC_THRESHOLD 44 +#define MULLO_MUL_N_THRESHOLD 11278 +#define SQRLO_BASECASE_THRESHOLD 13 +#define SQRLO_DC_THRESHOLD 62 +#define SQRLO_SQR_THRESHOLD 8907 + +#define DC_DIV_QR_THRESHOLD 79 +#define DC_DIVAPPR_Q_THRESHOLD 228 +#define DC_BDIV_QR_THRESHOLD 75 +#define DC_BDIV_Q_THRESHOLD 136 + +#define INV_MULMOD_BNM1_THRESHOLD 90 +#define INV_NEWTON_THRESHOLD 260 +#define INV_APPR_THRESHOLD 236 + +#define BINV_NEWTON_THRESHOLD 294 +#define REDC_1_TO_REDC_N_THRESHOLD 80 + +#define MU_DIV_QR_THRESHOLD 1787 +#define MU_DIVAPPR_Q_THRESHOLD 1718 +#define MUPI_DIV_QR_THRESHOLD 118 +#define MU_BDIV_QR_THRESHOLD 1442 +#define MU_BDIV_Q_THRESHOLD 1652 + +#define POWM_SEC_TABLE 1,16,96,615,865,1442 + +#define GET_STR_DC_THRESHOLD 16 +#define GET_STR_PRECOMPUTE_THRESHOLD 27 +#define SET_STR_DC_THRESHOLD 252 +#define SET_STR_PRECOMPUTE_THRESHOLD 638 + +#define FAC_DSC_THRESHOLD 141 +#define FAC_ODD_THRESHOLD 39 + +#define MATRIX22_STRASSEN_THRESHOLD 19 +#define HGCD2_DIV1_METHOD 1 /* 13.65% faster than 3 */ +#define HGCD_THRESHOLD 81 +#define HGCD_APPR_THRESHOLD 66 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 531 +#define GCDEXT_DC_THRESHOLD 345 +#define JACOBI_BASE_METHOD 1 /* 0.84% faster than 4 */ + +/* Tuneup completed successfully, took 103818 seconds */ diff --git a/gmp-6.3.0/mpn/x86/cnd_aors_n.asm b/gmp-6.3.0/mpn/x86/cnd_aors_n.asm new file mode 100644 index 0000000..74f4917 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/cnd_aors_n.asm @@ -0,0 +1,124 @@ +dnl X86 mpn_cnd_add_n, mpn_cnd_sub_n + +dnl Copyright 2013 Free 
Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C P5 ? +C P6 model 0-8,10-12 ? +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) 5.4 +C P4 model 0-1 (Willamette) ? +C P4 model 2 (Northwood) 14.5 +C P4 model 3-4 (Prescott) 21 +C Intel atom 11 +C AMD K6 ? +C AMD K7 3.4 +C AMD K8 ? + + +define(`rp', `%edi') +define(`up', `%esi') +define(`vp', `%ebp') +define(`n', `%ecx') +define(`cnd', `20(%esp)') +define(`cy', `%edx') + +ifdef(`OPERATION_cnd_add_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_cnd_add_n)') +ifdef(`OPERATION_cnd_sub_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_cnd_sub_n)') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + add $-16, %esp + mov %ebp, (%esp) + mov %ebx, 4(%esp) + mov %esi, 8(%esp) + mov %edi, 12(%esp) + + C make cnd into a full mask + mov cnd, %eax + neg %eax + sbb %eax, %eax + mov %eax, cnd + + C load parameters into registers + mov 24(%esp), rp + mov 28(%esp), up + mov 32(%esp), vp + mov 36(%esp), n + + mov (vp), %eax + mov (up), %ebx + + C put operand pointers just beyond their last limb + lea (vp,n,4), vp + lea (up,n,4), up + lea -4(rp,n,4), rp + neg n + + and cnd, %eax + ADDSUB %eax, %ebx + sbb cy, cy + inc n + je L(end) + + ALIGN(16) +L(top): mov (vp,n,4), %eax + and cnd, %eax + mov %ebx, (rp,n,4) + mov (up,n,4), %ebx + add cy, cy + ADCSBB %eax, %ebx + sbb cy, cy + inc n + jne L(top) + +L(end): mov %ebx, (rp) + xor %eax, %eax + sub cy, %eax + + mov (%esp), %ebp + mov 4(%esp), %ebx + mov 8(%esp), %esi + mov 12(%esp), %edi + add $16, %esp + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/copyd.asm b/gmp-6.3.0/mpn/x86/copyd.asm new file mode 100644 index 0000000..51fa195 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/copyd.asm @@ -0,0 +1,91 @@ +dnl x86 mpn_copyd -- copy limb vector, decrementing. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
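The neg/sbb pair above turns cnd into an all-zeros or all-ones mask, so each addend limb is selected with an AND rather than a data-dependent branch -- the point of mpn_cnd_add_n/mpn_cnd_sub_n for side-channel-sensitive code such as sec powm. A C sketch of the same constant-time pattern for the add case, assuming 32-bit limbs (cnd_add_n32 is an illustrative name):

#include <stdint.h>
#include <stddef.h>

static uint32_t cnd_add_n32 (uint32_t cnd, uint32_t *rp,
                             const uint32_t *up, const uint32_t *vp, size_t n)
{
  uint32_t mask = -(uint32_t) (cnd != 0);   /* 0 or 0xFFFFFFFF, branch-free */
  uint32_t cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint32_t v = vp[i] & mask;            /* vp[i] if cnd, else 0 */
      uint64_t s = (uint64_t) up[i] + v + cy;
      rp[i] = (uint32_t) s;
      cy = (uint32_t) (s >> 32);            /* carry out of this limb */
    }
  return cy;
}

Either way the same instruction and memory-access sequence executes, so timing does not reveal whether the addition took place.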
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb startup (approx) +C P5 1.0 40 +C P6 2.4 70 +C K6 1.0 55 +C K7 1.3 75 +C P4 2.6 175 +C +C (Startup time includes some function call overheads.) + + +C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Copy src,size to dst,size, working from high to low addresses. +C +C The code here is very generic and can be expected to be reasonable on all +C the x86 family. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + TEXT + ALIGN(32) + +PROLOGUE(mpn_copyd) + C eax saved esi + C ebx + C ecx counter + C edx saved edi + C esi src + C edi dst + C ebp + + movl PARAM_SIZE, %ecx + movl %esi, %eax + + movl PARAM_SRC, %esi + movl %edi, %edx + + movl PARAM_DST, %edi + leal -4(%esi,%ecx,4), %esi + + leal -4(%edi,%ecx,4), %edi + + std + + rep + movsl + + cld + + movl %eax, %esi + movl %edx, %edi + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/copyi.asm b/gmp-6.3.0/mpn/x86/copyi.asm new file mode 100644 index 0000000..f6b0354 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/copyi.asm @@ -0,0 +1,99 @@ +dnl x86 mpn_copyi -- copy limb vector, incrementing. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb startup (approx) +C P5 1.0 35 +C P6 0.75 45 +C K6 1.0 30 +C K7 1.3 65 +C P4 1.0 120 +C +C (Startup time includes some function call overheads.) + + +C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Copy src,size to dst,size, working from low to high addresses. +C +C The code here is very generic and can be expected to be reasonable on all +C the x86 family. +C +C P6 - An MMX based copy was tried, but was found to be slower than a rep +C movs in all cases. The fastest MMX found was 0.8 cycles/limb (when +C fully aligned). A rep movs seems to have a startup time of about 15 +C cycles, but doing something special for small sizes could lead to a +C branch misprediction that would destroy any saving. For now a plain +C rep movs seems ok. 
+C +C K62 - We used to have a big chunk of code doing an MMX copy at 0.56 c/l if +C aligned or a 1.0 rep movs if not. But that seemed excessive since +C it only got an advantage half the time, and even then only showed it +C above 50 limbs or so. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + TEXT + ALIGN(32) + + C eax saved esi + C ebx + C ecx counter + C edx saved edi + C esi src + C edi dst + C ebp + +PROLOGUE(mpn_copyi) + + movl PARAM_SIZE, %ecx + movl %esi, %eax + + movl PARAM_SRC, %esi + movl %edi, %edx + + movl PARAM_DST, %edi + + cld C better safe than sorry, see mpn/x86/README + + rep + movsl + + movl %eax, %esi + movl %edx, %edi + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/core2/gmp-mparam.h b/gmp-6.3.0/mpn/x86/core2/gmp-mparam.h new file mode 100644 index 0000000..8a44ad1 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/core2/gmp-mparam.h @@ -0,0 +1,210 @@ +/* x86/core2 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 3000 MHz Penryn */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 9 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 3 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 2 /* 22.20% faster than 1 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 9 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 18 + +#define DIV_1_VS_MUL_1_PERCENT 277 + +#define MUL_TOOM22_THRESHOLD 24 +#define MUL_TOOM33_THRESHOLD 93 +#define MUL_TOOM44_THRESHOLD 136 +#define MUL_TOOM6H_THRESHOLD 300 +#define MUL_TOOM8H_THRESHOLD 478 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 91 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 153 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 93 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 94 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 34 +#define SQR_TOOM3_THRESHOLD 117 +#define SQR_TOOM4_THRESHOLD 184 +#define SQR_TOOM6_THRESHOLD 262 +#define SQR_TOOM8_THRESHOLD 597 + +#define MULMID_TOOM42_THRESHOLD 70 + +#define MULMOD_BNM1_THRESHOLD 17 +#define SQRMOD_BNM1_THRESHOLD 25 + +#define MUL_FFT_MODF_THRESHOLD 505 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 505, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \ + { 29, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \ + { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95,11}, \ + { 63, 9}, { 255,10}, { 159,11}, { 95,10}, \ + { 191,12}, { 63,11}, { 127,10}, { 271, 9}, \ + { 543,10}, { 287,11}, { 159,10}, { 335, 9}, \ + { 671,10}, { 351,11}, { 191,10}, { 399, 9}, \ + { 799,11}, { 223,12}, { 127,11}, { 255,10}, \ + { 543,11}, { 287,10}, { 607,11}, { 319,10}, \ + { 671,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,13}, { 127,12}, { 255,11}, \ + { 543,10}, { 1087,11}, { 607,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,10}, { 1471,12}, \ + { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \ + { 447,11}, { 959,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 831,11}, { 1727,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1087,11}, \ + { 2239,12}, { 1215,13}, { 639,12}, { 1471,11}, \ + { 2943,13}, { 767,12}, { 1727,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2239,13}, \ + { 1151,12}, { 2431,13}, { 1407,12}, { 2943,14}, \ + { 767,13}, { 1663,12}, { 3455,13}, { 1919,15}, \ + { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \ + { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \ + { 1535,13}, { 3455,14}, { 1791,13}, { 3967,12}, \ + { 7935,15}, { 1023,14}, { 2047,13}, { 4479,14}, \ + { 2303,13}, { 4991,12}, { 9983,14}, { 2815,13}, \ + { 5887,15}, { 1535,14}, { 3839,16} } +#define MUL_FFT_TABLE3_SIZE 147 +#define MUL_FFT_THRESHOLD 6784 + +#define 
SQR_FFT_MODF_THRESHOLD 464 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 464, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \ + { 29, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \ + { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 63, 9}, { 127,10}, { 79, 9}, { 159,10}, \ + { 95,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 143, 9}, { 287, 5}, { 4863, 6}, { 2495, 7}, \ + { 1343, 8}, { 703, 9}, { 367,12}, { 63,11}, \ + { 127,10}, { 303,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335, 9}, { 671,10}, { 351, 9}, \ + { 703,10}, { 367,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 399, 9}, { 799,10}, { 415, 9}, \ + { 831,12}, { 127,11}, { 255,10}, { 543,11}, \ + { 287,10}, { 607,11}, { 319,10}, { 671,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 863,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \ + { 1471,12}, { 383,11}, { 799,10}, { 1599,11}, \ + { 863,12}, { 447,11}, { 959,13}, { 255,12}, \ + { 511,11}, { 1087,12}, { 575,11}, { 1215,12}, \ + { 639,11}, { 1343,12}, { 703,11}, { 1407,13}, \ + { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \ + { 1727,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1215,13}, { 639,12}, { 1471,11}, { 2943,13}, \ + { 767,12}, { 1727,13}, { 895,12}, { 1919,14}, \ + { 511,13}, { 1023,12}, { 2111,13}, { 1151,12}, \ + { 2431,13}, { 1407,12}, { 2943,14}, { 767,13}, \ + { 1663,12}, { 3455,13}, { 1919,15}, { 511,14}, \ + { 1023,13}, { 2175,12}, { 4479,13}, { 2431,14}, \ + { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \ + { 3455,14}, { 1791,13}, { 3967,15}, { 1023,14}, \ + { 2047,13}, { 4479,14}, { 2303,13}, { 4991,12}, \ + { 9983,14}, { 2815,13}, { 5887,15}, { 1535,14}, \ + { 3839,16} } +#define SQR_FFT_TABLE3_SIZE 157 +#define SQR_FFT_THRESHOLD 5312 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 36 +#define MULLO_MUL_N_THRESHOLD 13463 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 140 +#define SQRLO_SQR_THRESHOLD 10393 + +#define DC_DIV_QR_THRESHOLD 32 +#define DC_DIVAPPR_Q_THRESHOLD 116 +#define DC_BDIV_QR_THRESHOLD 76 +#define DC_BDIV_Q_THRESHOLD 180 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define INV_NEWTON_THRESHOLD 138 +#define INV_APPR_THRESHOLD 123 + +#define BINV_NEWTON_THRESHOLD 306 +#define REDC_1_TO_REDC_N_THRESHOLD 82 + +#define MU_DIV_QR_THRESHOLD 1499 +#define MU_DIVAPPR_Q_THRESHOLD 1442 +#define MUPI_DIV_QR_THRESHOLD 63 +#define MU_BDIV_QR_THRESHOLD 1442 +#define MU_BDIV_Q_THRESHOLD 1589 + +#define POWM_SEC_TABLE 1,22,66,428,1035 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 18 +#define SET_STR_DC_THRESHOLD 732 +#define SET_STR_PRECOMPUTE_THRESHOLD 1118 + +#define FAC_DSC_THRESHOLD 115 +#define FAC_ODD_THRESHOLD 50 + +#define MATRIX22_STRASSEN_THRESHOLD 25 +#define HGCD2_DIV1_METHOD 1 /* 5.78% faster than 3 */ +#define HGCD_THRESHOLD 121 +#define HGCD_APPR_THRESHOLD 151 +#define HGCD_REDUCE_THRESHOLD 3259 +#define GCD_DC_THRESHOLD 368 +#define GCDEXT_DC_THRESHOLD 306 +#define JACOBI_BASE_METHOD 4 /* 14.19% faster than 1 */ + +/* Tuneup completed successfully, took 67142 seconds */ diff --git a/gmp-6.3.0/mpn/x86/coreibwl/gmp-mparam.h b/gmp-6.3.0/mpn/x86/coreibwl/gmp-mparam.h new file mode 100644 index 0000000..7b58cad --- /dev/null +++ 
b/gmp-6.3.0/mpn/x86/coreibwl/gmp-mparam.h @@ -0,0 +1,216 @@ +/* x86/coreibwl gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 3400-3800 MHz Intel Xeon E3-1285Lv4 Broadwell */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 15 +#define MOD_1_UNNORM_THRESHOLD 16 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 10 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 10 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 21.34% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 14 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 29 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 19 + +#define DIV_1_VS_MUL_1_PERCENT 295 + +#define MUL_TOOM22_THRESHOLD 26 +#define MUL_TOOM33_THRESHOLD 97 +#define MUL_TOOM44_THRESHOLD 220 +#define MUL_TOOM6H_THRESHOLD 306 +#define MUL_TOOM8H_THRESHOLD 454 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 93 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 153 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 154 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 169 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 136 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 44 +#define SQR_TOOM3_THRESHOLD 134 +#define SQR_TOOM4_THRESHOLD 242 +#define SQR_TOOM6_THRESHOLD 342 +#define SQR_TOOM8_THRESHOLD 502 + +#define MULMID_TOOM42_THRESHOLD 98 + +#define MULMOD_BNM1_THRESHOLD 20 +#define SQRMOD_BNM1_THRESHOLD 23 + +#define MUL_FFT_MODF_THRESHOLD 540 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 540, 5}, { 29, 6}, { 15, 5}, { 31, 6}, \ + { 16, 5}, { 33, 6}, { 17, 5}, { 36, 6}, \ + { 25, 7}, { 13, 6}, { 29, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \ + { 39, 7}, { 21, 6}, { 43, 7}, { 23, 6}, \ + { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 43, 8}, { 23, 7}, { 49, 8}, \ + { 27, 7}, { 55, 9}, { 15, 8}, { 31, 7}, \ + { 63, 8}, { 43, 9}, { 23, 8}, { 55,10}, \ + { 15, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 83, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 95, 9}, { 191,10}, { 111,11}, \ + { 63,10}, { 143, 9}, { 287,10}, { 159,11}, \ + { 95, 7}, { 1599, 8}, { 831, 9}, { 431, 8}, \ + { 863, 9}, { 447,10}, { 239, 9}, { 479,10}, 
\ + { 255, 9}, { 511,10}, { 287,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 335, 9}, { 671,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 399,11}, \ + { 223,12}, { 127,11}, { 255,10}, { 511, 9}, \ + { 1023,11}, { 287,10}, { 607,11}, { 319,10}, \ + { 671,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,13}, { 127,12}, { 255,11}, \ + { 543,10}, { 1119,11}, { 607,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,12}, { 383,11}, \ + { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1119,12}, \ + { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1407,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1727,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1215,13}, { 639,12}, \ + { 1471,13}, { 767,12}, { 1727,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2239,13}, \ + { 1151,12}, { 2431,13}, { 1279,12}, { 2623,13}, \ + { 1407,12}, { 2815,14}, { 767,13}, { 1535,12}, \ + { 3135,13}, { 1663,12}, { 3455,13}, { 1919,15}, \ + { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \ + { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \ + { 1535,13}, { 3455,14}, { 1791,13}, { 3839,15}, \ + { 1023,14}, { 2047,13}, { 4479,14}, { 2303,13}, \ + { 4991,12}, { 9983,14}, { 2559,13}, { 5247,14}, \ + { 2815,13}, { 5887,15}, { 1535,14}, { 3839,16} } +#define MUL_FFT_TABLE3_SIZE 172 +#define MUL_FFT_THRESHOLD 7424 + +#define SQR_FFT_MODF_THRESHOLD 472 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 472, 5}, { 29, 6}, { 15, 5}, { 33, 6}, \ + { 37, 7}, { 19, 6}, { 40, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \ + { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \ + { 55,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 83, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \ + { 159,11}, { 95,12}, { 63,11}, { 127,10}, \ + { 271, 9}, { 543, 6}, { 4479, 7}, { 2431, 8}, \ + { 1247, 7}, { 2495, 8}, { 1279,10}, { 351,11}, \ + { 191,10}, { 399, 9}, { 799,10}, { 415,12}, \ + { 127,11}, { 255,10}, { 543,11}, { 287,10}, \ + { 607,11}, { 319,10}, { 639,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 831,13}, { 127,12}, { 255,11}, { 511,10}, \ + { 1023,11}, { 543,10}, { 1087,11}, { 607,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 735,12}, \ + { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \ + { 447,11}, { 927,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 831,11}, { 1663,12}, \ + { 895,11}, { 1855,14}, { 255,13}, { 511,12}, \ + { 1023,11}, { 2047,12}, { 1087,11}, { 2239,12}, \ + { 1215,13}, { 639,12}, { 1471,13}, { 767,12}, \ + { 1663,13}, { 895,12}, { 1983,14}, { 511,13}, \ + { 1023,12}, { 2239,13}, { 1151,12}, { 2495,13}, \ + { 1279,12}, { 2623,13}, { 1407,14}, { 767,13}, \ + { 1535,12}, { 3135,13}, { 1663,12}, { 3455,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \ + { 3839,15}, { 1023,14}, { 2047,13}, { 4479,14}, \ + { 2303,13}, { 4991,12}, { 9983,14}, { 2815,13}, \ + { 5887,15}, { 1535,14}, { 3327,13}, { 6783,14}, \ + { 3839,16} } +#define SQR_FFT_TABLE3_SIZE 157 +#define SQR_FFT_THRESHOLD 5568 + +#define MULLO_BASECASE_THRESHOLD 16 +#define MULLO_DC_THRESHOLD 37 +#define MULLO_MUL_N_THRESHOLD 14281 +#define SQRLO_BASECASE_THRESHOLD 
0 /* always */ +#define SQRLO_DC_THRESHOLD 137 +#define SQRLO_SQR_THRESHOLD 10821 + +#define DC_DIV_QR_THRESHOLD 54 +#define DC_DIVAPPR_Q_THRESHOLD 146 +#define DC_BDIV_QR_THRESHOLD 98 +#define DC_BDIV_Q_THRESHOLD 218 + +#define INV_MULMOD_BNM1_THRESHOLD 50 +#define INV_NEWTON_THRESHOLD 173 +#define INV_APPR_THRESHOLD 165 + +#define BINV_NEWTON_THRESHOLD 278 +#define REDC_1_TO_REDC_N_THRESHOLD 79 + +#define MU_DIV_QR_THRESHOLD 1787 +#define MU_DIVAPPR_Q_THRESHOLD 1787 +#define MUPI_DIV_QR_THRESHOLD 78 +#define MU_BDIV_QR_THRESHOLD 1589 +#define MU_BDIV_Q_THRESHOLD 1830 + +#define POWM_SEC_TABLE 1,16,126,416,932 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 17 +#define SET_STR_DC_THRESHOLD 306 +#define SET_STR_PRECOMPUTE_THRESHOLD 894 + +#define FAC_DSC_THRESHOLD 141 +#define FAC_ODD_THRESHOLD 34 + +#define MATRIX22_STRASSEN_THRESHOLD 20 +#define HGCD2_DIV1_METHOD 3 /* 5.97% faster than 1 */ +#define HGCD_THRESHOLD 73 +#define HGCD_APPR_THRESHOLD 123 +#define HGCD_REDUCE_THRESHOLD 3664 +#define GCD_DC_THRESHOLD 562 +#define GCDEXT_DC_THRESHOLD 465 +#define JACOBI_BASE_METHOD 1 /* 31.16% faster than 3 */ + +/* Tuneup completed successfully, took 35114 seconds */ diff --git a/gmp-6.3.0/mpn/x86/coreihwl/gmp-mparam.h b/gmp-6.3.0/mpn/x86/coreihwl/gmp-mparam.h new file mode 100644 index 0000000..4c1b388 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/coreihwl/gmp-mparam.h @@ -0,0 +1,216 @@ +/* x86/coreihwl gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 3600-4000 MHz Intel Xeon E3-1271v3 Haswell */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-21, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 17 +#define MOD_1_UNNORM_THRESHOLD 17 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 10 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 9 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 10 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 5 +#define USE_PREINV_DIVREM_1 1 /* native */ +/* From sky.gmplib.org, 2023-07-20 */ +#define DIV_QR_1N_PI1_METHOD 3 /* 5.86% faster than 1 */ +#define DIV_QR_1_NORM_THRESHOLD 13 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 21 + +#define DIV_1_VS_MUL_1_PERCENT 296 + +#define MUL_TOOM22_THRESHOLD 28 +#define MUL_TOOM33_THRESHOLD 108 +#define MUL_TOOM44_THRESHOLD 232 +#define MUL_TOOM6H_THRESHOLD 306 +#define MUL_TOOM8H_THRESHOLD 478 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 109 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 183 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 113 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 136 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 44 +#define SQR_TOOM3_THRESHOLD 141 +#define SQR_TOOM4_THRESHOLD 384 +#define SQR_TOOM6_THRESHOLD 517 +#define SQR_TOOM8_THRESHOLD 698 + +#define MULMID_TOOM42_THRESHOLD 98 + +#define MULMOD_BNM1_THRESHOLD 20 +#define SQRMOD_BNM1_THRESHOLD 23 + +#define MUL_FFT_MODF_THRESHOLD 565 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 565, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \ + { 16, 5}, { 33, 6}, { 29, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \ + { 39, 7}, { 21, 6}, { 43, 7}, { 23, 6}, \ + { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 43, 8}, { 23, 7}, { 47, 8}, \ + { 27, 7}, { 55, 9}, { 15, 8}, { 31, 7}, \ + { 63, 8}, { 43, 9}, { 23, 8}, { 55, 9}, \ + { 31, 8}, { 71, 9}, { 39, 8}, { 83, 9}, \ + { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \ + { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \ + { 95, 9}, { 191,10}, { 111,11}, { 63,10}, \ + { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \ + { 191, 6}, { 3199, 7}, { 1727, 9}, { 447,10}, \ + { 239, 9}, { 479,10}, { 287,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 335, 9}, { 671,11}, \ + { 191,10}, { 399, 9}, { 799,10}, { 415,11}, \ + { 223,12}, { 127,11}, { 255,10}, { 511, 9}, \ + { 1023,10}, { 527,11}, { 287,10}, { 607,11}, \ + { 319,10}, { 671,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,10}, { 1087,11}, \ + { 607,12}, { 319,11}, { 671,10}, { 1343,11}, \ + { 735,12}, { 383,11}, { 799,10}, { 1599,11}, \ + { 863,12}, { 447,11}, { 991,13}, { 255,12}, \ + { 511,11}, { 1087,12}, { 575,11}, { 1215,12}, \ + { 639,11}, { 1343,12}, { 703,11}, { 1407,13}, \ + { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \ + { 1727,12}, { 959,11}, { 1919,14}, { 255,13}, \ + { 511,12}, { 1087,11}, { 2239,12}, { 1215,13}, \ + { 639,12}, { 1471,13}, { 767,12}, { 1727,13}, \ + { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \ + { 2239,13}, { 1151,12}, { 2431,13}, { 1279,12}, \ + { 2623,13}, { 1407,14}, { 767,13}, { 1663,12}, \ + { 3455,13}, { 1919,15}, { 511,14}, { 1023,13}, \ + { 2175,12}, { 4479,13}, { 2431,14}, { 1279,13}, \ + { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \ + { 1791,13}, { 
3967,15}, { 1023,14}, { 2047,13}, \ + { 4479,14}, { 2303,13}, { 4991,14}, { 2559,13}, \ + { 5375,14}, { 2815,13}, { 5887,15}, { 1535,14}, \ + { 3839,16} } +#define MUL_FFT_TABLE3_SIZE 165 +#define MUL_FFT_THRESHOLD 7808 + +#define SQR_FFT_MODF_THRESHOLD 560 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 560, 5}, { 29, 6}, { 15, 5}, { 31, 6}, \ + { 16, 5}, { 33, 6}, { 17, 5}, { 36, 6}, \ + { 29, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ + { 36, 7}, { 19, 6}, { 40, 7}, { 21, 6}, \ + { 43, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 49, 8}, { 27, 7}, { 55, 9}, \ + { 15, 8}, { 31, 7}, { 63, 8}, { 43, 9}, \ + { 23, 8}, { 55,10}, { 15, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95,11}, \ + { 63,10}, { 143, 9}, { 287,10}, { 159,11}, \ + { 95,12}, { 63,11}, { 127, 9}, { 511, 5}, \ + { 8959, 7}, { 2431, 8}, { 1247, 7}, { 2495, 8}, \ + { 1279, 9}, { 671,10}, { 367,11}, { 191,10}, \ + { 399, 9}, { 799,10}, { 415,12}, { 127,11}, \ + { 255,10}, { 527,11}, { 287,10}, { 607,11}, \ + { 319,10}, { 671,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 831,13}, { 127,11}, { 543,10}, { 1119,11}, \ + { 607,12}, { 319,11}, { 671,10}, { 1343,11}, \ + { 735,12}, { 383,11}, { 863,12}, { 447,11}, \ + { 991,12}, { 511,11}, { 1119,12}, { 575,11}, \ + { 1215,12}, { 639,11}, { 1343,12}, { 703,13}, \ + { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \ + { 1727,12}, { 959,11}, { 1983,13}, { 511,12}, \ + { 1087,11}, { 2239,12}, { 1215,13}, { 639,12}, \ + { 1471,13}, { 767,12}, { 1727,13}, { 895,12}, \ + { 1983,14}, { 511,13}, { 1023,12}, { 2239,13}, \ + { 1151,12}, { 2495,13}, { 1279,12}, { 2623,13}, \ + { 1407,14}, { 767,13}, { 1663,12}, { 3455,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \ + { 3967,15}, { 1023,14}, { 2047,13}, { 4479,14}, \ + { 2303,13}, { 4991,12}, { 9983,14}, { 2559,13}, \ + { 5119,14}, { 2815,13}, { 5887,15}, { 1535,14}, \ + { 3327,13}, { 6911,14}, { 3839,16} } +#define SQR_FFT_TABLE3_SIZE 159 +#define SQR_FFT_THRESHOLD 5568 + +#define MULLO_BASECASE_THRESHOLD 17 +#define MULLO_DC_THRESHOLD 40 +#define MULLO_MUL_N_THRESHOLD 14281 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 141 +#define SQRLO_SQR_THRESHOLD 10821 + +#define DC_DIV_QR_THRESHOLD 30 +#define DC_DIVAPPR_Q_THRESHOLD 190 +#define DC_BDIV_QR_THRESHOLD 67 +#define DC_BDIV_Q_THRESHOLD 254 + +#define INV_MULMOD_BNM1_THRESHOLD 54 +#define INV_NEWTON_THRESHOLD 157 +#define INV_APPR_THRESHOLD 163 + +#define BINV_NEWTON_THRESHOLD 236 +#define REDC_1_TO_REDC_N_THRESHOLD 79 + +#define MU_DIV_QR_THRESHOLD 1895 +#define MU_DIVAPPR_Q_THRESHOLD 1718 +#define MUPI_DIV_QR_THRESHOLD 54 +#define MU_BDIV_QR_THRESHOLD 1589 +#define MU_BDIV_Q_THRESHOLD 1898 + +#define POWM_SEC_TABLE 1,16,95,480,1442 + +#define GET_STR_DC_THRESHOLD 10 +#define GET_STR_PRECOMPUTE_THRESHOLD 16 +#define SET_STR_DC_THRESHOLD 372 +#define SET_STR_PRECOMPUTE_THRESHOLD 1037 + +#define FAC_DSC_THRESHOLD 141 +#define FAC_ODD_THRESHOLD 34 + +#define MATRIX22_STRASSEN_THRESHOLD 21 +#define HGCD2_DIV1_METHOD 3 /* 6.26% faster than 1 */ +#define HGCD_THRESHOLD 70 +#define HGCD_APPR_THRESHOLD 129 +#define HGCD_REDUCE_THRESHOLD 3664 +#define GCD_DC_THRESHOLD 573 +#define GCDEXT_DC_THRESHOLD 483 +#define JACOBI_BASE_METHOD 1 /* 27.01% faster 
than 3 */ + +/* Tuneup completed successfully, took 35232 seconds */ diff --git a/gmp-6.3.0/mpn/x86/coreinhm/gmp-mparam.h b/gmp-6.3.0/mpn/x86/coreinhm/gmp-mparam.h new file mode 100644 index 0000000..4428b4b --- /dev/null +++ b/gmp-6.3.0/mpn/x86/coreinhm/gmp-mparam.h @@ -0,0 +1,223 @@ +/* x86/coreinhm gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 2933-3200 MHz Intel Xeon X3470 Nehalem */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 36 +#define MOD_1_UNNORM_THRESHOLD 40 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 3 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 42.59% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 9 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 17 + +#define DIV_1_VS_MUL_1_PERCENT 288 + +#define MUL_TOOM22_THRESHOLD 24 +#define MUL_TOOM33_THRESHOLD 93 +#define MUL_TOOM44_THRESHOLD 214 +#define MUL_TOOM6H_THRESHOLD 306 +#define MUL_TOOM8H_THRESHOLD 430 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 93 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 134 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 145 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 94 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 118 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 38 +#define SQR_TOOM3_THRESHOLD 133 +#define SQR_TOOM4_THRESHOLD 212 +#define SQR_TOOM6_THRESHOLD 318 +#define SQR_TOOM8_THRESHOLD 620 + +#define MULMID_TOOM42_THRESHOLD 68 + +#define MULMOD_BNM1_THRESHOLD 17 +#define SQRMOD_BNM1_THRESHOLD 23 + +#define MUL_FFT_MODF_THRESHOLD 595 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 595, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \ + { 17, 5}, { 35, 6}, { 28, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \ + { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \ + { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \ + { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 99, 9}, { 55,10}, \ + { 31, 9}, { 63, 8}, { 127, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 
95, 9}, \ + { 191,11}, { 63, 9}, { 255,10}, { 159,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,11}, \ + { 159,10}, { 335,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 399,12}, { 127,11}, { 255,10}, \ + { 511, 9}, { 1023,10}, { 543, 9}, { 1087,11}, \ + { 287,10}, { 607,11}, { 319,10}, { 671,12}, \ + { 191,11}, { 383,10}, { 767,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1119,11}, { 607,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \ + { 799,10}, { 1599,11}, { 863,10}, { 1727,12}, \ + { 447,11}, { 959,13}, { 255,12}, { 511,11}, \ + { 1119,12}, { 575,11}, { 1215,10}, { 2431,12}, \ + { 639,11}, { 1343,12}, { 703,11}, { 1471,10}, \ + { 2943,13}, { 383,12}, { 767,11}, { 1599,12}, \ + { 831,11}, { 1727,10}, { 3455,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,11}, { 2239,10}, \ + { 4479,12}, { 1215,11}, { 2431,13}, { 639,12}, \ + { 1471,11}, { 2943,13}, { 767,12}, { 1727,11}, \ + { 3455,13}, { 895,12}, { 1983,14}, { 511,13}, \ + { 1023,12}, { 2239,11}, { 4479,13}, { 1151,12}, \ + { 2431,13}, { 1279,12}, { 2559,13}, { 1407,12}, \ + { 2943,11}, { 5887,14}, { 767,13}, { 1663,12}, \ + { 3455,13}, { 1919,12}, { 3839,15}, { 511,14}, \ + { 1023,13}, { 2175,12}, { 4479,13}, { 2431,14}, \ + { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \ + { 3455,14}, { 1791,13}, { 3967,15}, { 1023,14}, \ + { 2047,13}, { 4479,14}, { 2303,13}, { 4991,12}, \ + { 9983,14}, { 2815,13}, { 6015,15}, { 1535,14}, \ + { 3839,13}, { 7679,16} } +#define MUL_FFT_TABLE3_SIZE 170 +#define MUL_FFT_THRESHOLD 6784 + +#define SQR_FFT_MODF_THRESHOLD 525 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 525, 5}, { 29, 6}, { 15, 5}, { 33, 6}, \ + { 17, 5}, { 35, 6}, { 29, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 49, 8}, { 27, 7}, { 55, 9}, \ + { 15, 8}, { 31, 7}, { 63, 8}, { 39, 9}, \ + { 23, 8}, { 55, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \ + { 143, 9}, { 287,10}, { 159, 6}, { 2687, 7}, \ + { 1407, 9}, { 367, 8}, { 735, 9}, { 383,10}, \ + { 207, 9}, { 415,11}, { 127,10}, { 271, 9}, \ + { 543,10}, { 287,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335, 9}, { 671,10}, { 351,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \ + { 799,10}, { 415,12}, { 127,11}, { 255,10}, \ + { 511, 9}, { 1023,10}, { 543,11}, { 287,10}, \ + { 607,11}, { 319,10}, { 671,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,13}, \ + { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \ + { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \ + { 1471,12}, { 383,11}, { 799,10}, { 1599,11}, \ + { 863,10}, { 1727,12}, { 447,11}, { 991,10}, \ + { 1983,13}, { 255,12}, { 511,11}, { 1119,12}, \ + { 575,11}, { 1215,10}, { 2431,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 831,11}, { 1727,10}, \ + { 3455,12}, { 895,11}, { 1791,12}, { 959,11}, \ + { 1983,14}, { 255,13}, { 511,12}, { 1023,11}, \ + { 2047,12}, { 1087,11}, { 2239,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \ + { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \ + { 1983,11}, { 3967,14}, { 511,13}, { 1023,12}, \ + { 2239,13}, { 1151,12}, { 2495,13}, { 1279,12}, \ + { 2623,13}, { 1407,12}, { 2943,14}, { 767,13}, \ + { 
1663,12}, { 3455,13}, { 1919,12}, { 3967,15}, \ + { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \ + { 2431,12}, { 4863,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \ + { 3967,12}, { 7935,15}, { 1023,14}, { 2047,13}, \ + { 4479,14}, { 2303,13}, { 4991,12}, { 9983,14}, \ + { 2815,13}, { 5887,15}, { 1535,14}, { 3327,13}, \ + { 6655,14}, { 3839,13}, { 7935,16} } +#define SQR_FFT_TABLE3_SIZE 187 +#define SQR_FFT_THRESHOLD 5312 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 43 +#define MULLO_MUL_N_THRESHOLD 13463 +#define SQRLO_BASECASE_THRESHOLD 9 +#define SQRLO_DC_THRESHOLD 42 +#define SQRLO_SQR_THRESHOLD 10323 + +#define DC_DIV_QR_THRESHOLD 43 +#define DC_DIVAPPR_Q_THRESHOLD 132 +#define DC_BDIV_QR_THRESHOLD 83 +#define DC_BDIV_Q_THRESHOLD 130 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define INV_NEWTON_THRESHOLD 189 +#define INV_APPR_THRESHOLD 167 + +#define BINV_NEWTON_THRESHOLD 372 +#define REDC_1_TO_REDC_N_THRESHOLD 83 + +#define MU_DIV_QR_THRESHOLD 1589 +#define MU_DIVAPPR_Q_THRESHOLD 1589 +#define MUPI_DIV_QR_THRESHOLD 97 +#define MU_BDIV_QR_THRESHOLD 1589 +#define MU_BDIV_Q_THRESHOLD 1718 + +#define POWM_SEC_TABLE 1,28,96,473,803 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 16 +#define SET_STR_DC_THRESHOLD 145 +#define SET_STR_PRECOMPUTE_THRESHOLD 419 + +#define FAC_DSC_THRESHOLD 114 +#define FAC_ODD_THRESHOLD 57 + +#define MATRIX22_STRASSEN_THRESHOLD 20 +#define HGCD2_DIV1_METHOD 1 /* 1.03% faster than 3 */ +#define HGCD_THRESHOLD 117 +#define HGCD_APPR_THRESHOLD 137 +#define HGCD_REDUCE_THRESHOLD 3524 +#define GCD_DC_THRESHOLD 389 +#define GCDEXT_DC_THRESHOLD 318 +#define JACOBI_BASE_METHOD 4 /* 6.10% faster than 1 */ + +/* Tuneup completed successfully, took 67994 seconds */ diff --git a/gmp-6.3.0/mpn/x86/coreisbr/gmp-mparam.h b/gmp-6.3.0/mpn/x86/coreisbr/gmp-mparam.h new file mode 100644 index 0000000..23d708a --- /dev/null +++ b/gmp-6.3.0/mpn/x86/coreisbr/gmp-mparam.h @@ -0,0 +1,215 @@ +/* x86/coreisbr gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 3400-3800 MHz Intel Xeon E3-1270 Sandy Bridge */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-24, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 28 +#define MOD_1_UNNORM_THRESHOLD 26 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 9 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 4 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 2 /* 88.29% faster than 1 */ +#define DIV_QR_1_NORM_THRESHOLD 21 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 14 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 20 + +#define DIV_1_VS_MUL_1_PERCENT 297 + +#define MUL_TOOM22_THRESHOLD 32 +#define MUL_TOOM33_THRESHOLD 105 +#define MUL_TOOM44_THRESHOLD 190 +#define MUL_TOOM6H_THRESHOLD 294 +#define MUL_TOOM8H_THRESHOLD 478 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 109 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 144 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 116 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 129 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 160 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 48 +#define SQR_TOOM3_THRESHOLD 163 +#define SQR_TOOM4_THRESHOLD 250 +#define SQR_TOOM6_THRESHOLD 354 +#define SQR_TOOM8_THRESHOLD 502 + +#define MULMID_TOOM42_THRESHOLD 98 + +#define MULMOD_BNM1_THRESHOLD 19 +#define SQRMOD_BNM1_THRESHOLD 23 + +#define MUL_FFT_MODF_THRESHOLD 666 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 666, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \ + { 28, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ + { 36, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \ + { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \ + { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \ + { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ + { 71, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 99, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \ + { 191,11}, { 63,10}, { 159, 7}, { 1343, 8}, \ + { 703, 9}, { 367, 8}, { 735, 9}, { 383,10}, \ + { 207,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 335,11}, { 191,10}, \ + { 383, 9}, { 767,11}, { 223,12}, { 127,11}, \ + { 255,10}, { 543,11}, { 287,10}, { 607, 9}, \ + { 1215,11}, { 319,10}, { 671,12}, { 191,11}, \ + { 383,10}, { 799,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,10}, { 1087,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \ + { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1727,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,11}, { 2239,12}, \ + { 1215,13}, { 639,12}, { 1471,11}, { 2943,13}, \ + { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \ + { 1983,14}, { 511,13}, { 1023,12}, { 2239,13}, \ + { 1151,12}, { 2495,13}, { 1279,12}, { 2623,13}, \ + { 1407,12}, { 2943,14}, { 767,13}, { 1535,12}, \ + { 3071,13}, { 1663,12}, { 3455,13}, { 1919,15}, \ + { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \ + { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \ + { 1535,13}, { 3455,14}, { 1791,13}, { 3967,15}, \ + { 1023,14}, { 2047,13}, { 4479,14}, { 2303,13}, \ + { 
4991,12}, { 9983,14}, { 2815,13}, { 5887,15}, \ + { 1535,14}, { 3839,13}, { 7679,16} } +#define MUL_FFT_TABLE3_SIZE 163 +#define MUL_FFT_THRESHOLD 7552 + +#define SQR_FFT_MODF_THRESHOLD 570 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 570, 5}, { 28, 6}, { 15, 5}, { 32, 6}, \ + { 17, 5}, { 35, 6}, { 29, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \ + { 40, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 49, 8}, { 27, 7}, { 55, 8}, \ + { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \ + { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 95,11}, { 63,10}, { 159,11}, \ + { 95,10}, { 191,12}, { 63, 8}, { 1023, 9}, \ + { 543,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 335,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 399, 9}, { 799,12}, { 127,11}, { 255,10}, \ + { 511, 9}, { 1023,10}, { 543,11}, { 287,10}, \ + { 607,11}, { 319,10}, { 671,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 799,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \ + { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \ + { 991,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1727,12}, { 959,11}, \ + { 1919,14}, { 255,13}, { 511,12}, { 1023,11}, \ + { 2047,12}, { 1087,11}, { 2239,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1471,13}, { 767,12}, \ + { 1727,13}, { 895,12}, { 1983,14}, { 511,13}, \ + { 1023,12}, { 2239,13}, { 1151,12}, { 2495,13}, \ + { 1279,12}, { 2623,13}, { 1407,12}, { 2943,14}, \ + { 767,13}, { 1663,12}, { 3455,13}, { 1919,12}, \ + { 3967,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4479,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \ + { 1791,13}, { 3967,15}, { 1023,14}, { 2047,13}, \ + { 4479,14}, { 2303,13}, { 4991,12}, { 9983,14}, \ + { 2559,13}, { 5119,14}, { 2815,13}, { 5887,15}, \ + { 1535,14}, { 3839,13}, { 7679,16} } +#define SQR_FFT_TABLE3_SIZE 163 +#define SQR_FFT_THRESHOLD 5760 + +#define MULLO_BASECASE_THRESHOLD 16 +#define MULLO_DC_THRESHOLD 46 +#define MULLO_MUL_N_THRESHOLD 14281 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 159 +#define SQRLO_SQR_THRESHOLD 11317 + +#define DC_DIV_QR_THRESHOLD 47 +#define DC_DIVAPPR_Q_THRESHOLD 191 +#define DC_BDIV_QR_THRESHOLD 107 +#define DC_BDIV_Q_THRESHOLD 232 + +#define INV_MULMOD_BNM1_THRESHOLD 62 +#define INV_NEWTON_THRESHOLD 181 +#define INV_APPR_THRESHOLD 182 + +#define BINV_NEWTON_THRESHOLD 378 +#define REDC_1_TO_REDC_N_THRESHOLD 91 + +#define MU_DIV_QR_THRESHOLD 1858 +#define MU_DIVAPPR_Q_THRESHOLD 1858 +#define MUPI_DIV_QR_THRESHOLD 77 +#define MU_BDIV_QR_THRESHOLD 1830 +#define MU_BDIV_Q_THRESHOLD 2166 + +#define POWM_SEC_TABLE 1,16,126,428,1442 + +#define GET_STR_DC_THRESHOLD 10 +#define GET_STR_PRECOMPUTE_THRESHOLD 16 +#define SET_STR_DC_THRESHOLD 418 +#define SET_STR_PRECOMPUTE_THRESHOLD 1104 + +#define FAC_DSC_THRESHOLD 149 +#define FAC_ODD_THRESHOLD 34 + +#define MATRIX22_STRASSEN_THRESHOLD 21 +#define HGCD2_DIV1_METHOD 1 /* 5.54% faster than 4 */ +#define HGCD_THRESHOLD 66 +#define HGCD_APPR_THRESHOLD 135 +#define HGCD_REDUCE_THRESHOLD 4284 +#define GCD_DC_THRESHOLD 642 +#define GCDEXT_DC_THRESHOLD 465 +#define JACOBI_BASE_METHOD 3 /* 14.76% faster than 4 
*/ + +/* Tuneup completed successfully, took 44241 seconds */ diff --git a/gmp-6.3.0/mpn/x86/darwin.m4 b/gmp-6.3.0/mpn/x86/darwin.m4 new file mode 100644 index 0000000..c449216 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/darwin.m4 @@ -0,0 +1,102 @@ +divert(-1)
+dnl Copyright 2007, 2011, 2012, 2014 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+define(`DARWIN')
+
+
+dnl Usage: LEA(symbol,reg)
+dnl Usage: LEAL(symbol_local_to_file,reg)
+dnl
+dnl We maintain lists of stuff to append in load_eip and darwin_bd. The
+dnl `index' stuff is needed to suppress repeated definitions. To avoid
+dnl getting fooled by "var" and "var1", we add 'bol ' (the end of
+dnl 'indirect_symbol') at the beginning and a newline at the end. This
+dnl might be a bit fragile.
+
+define(`LEA',
+m4_assert_numargs(2)
+`ifdef(`PIC',`
+ifelse(index(defn(`load_eip'), `$2'),-1,
+`m4append(`load_eip',
+` TEXT
+ ALIGN(16)
+L(movl_eip_`'substr($2,1)):
+ movl (%esp), $2
+ ret_internal
+')')
+ifelse(index(defn(`darwin_bd'), `bol $1
+'),-1,
+`m4append(`darwin_bd',
+` .section __IMPORT,__pointers,non_lazy_symbol_pointers
+L($1`'$non_lazy_ptr):
+ .indirect_symbol $1
+ .long 0
+')')
+ call L(movl_eip_`'substr($2,1))
+ movl L($1`'$non_lazy_ptr)-.($2), $2
+',`
+ movl `$'$1, $2
+')')
+
+define(`LEAL',
+m4_assert_numargs(2)
+`ifdef(`PIC',`
+ifelse(index(defn(`load_eip'), `$2'),-1,
+`m4append(`load_eip',
+` TEXT
+ ALIGN(16)
+L(movl_eip_`'substr($2,1)):
+ movl (%esp), $2
+ ret_internal
+')')
+ call L(movl_eip_`'substr($2,1))
+ leal $1-.($2), $2
+',`
+ movl `$'$1, $2
+')')
+
+
+dnl ASM_END
+
+define(`ASM_END',`load_eip`'darwin_bd')
+
+define(`load_eip', `') dnl updated in LEA
+define(`darwin_bd', `') dnl updated in LEA
+
+
+dnl Usage: CALL(funcname)
+dnl
+
+define(`CALL',
+m4_assert_numargs(1)
+`call GSYM_PREFIX`'$1')
+
+undefine(`PIC_WITH_EBX')
+
+divert`'dnl diff --git a/gmp-6.3.0/mpn/x86/dive_1.asm b/gmp-6.3.0/mpn/x86/dive_1.asm new file mode 100644 index 0000000..5bb0f45 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/dive_1.asm @@ -0,0 +1,190 @@ +dnl x86 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P54 30.0 +C P55 29.0 +C P6 13.0 odd divisor, 12.0 even (strangely) +C K6 14.0 +C K7 12.0 +C P4 42.0 + + +C mp_limb_t mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C + +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(VAR_INVERSE,`PARAM_SRC') + + TEXT + + ALIGN(16) +PROLOGUE(mpn_divexact_1) +deflit(`FRAME',0) + + movl PARAM_DIVISOR, %eax + pushl %ebp FRAME_pushl() + + movl PARAM_SIZE, %ebp + pushl %edi FRAME_pushl() + + pushl %ebx FRAME_pushl() + movl $-1, %ecx C shift count + + pushl %esi FRAME_pushl() + +L(strip_twos): + incl %ecx + + shrl %eax + jnc L(strip_twos) + + leal 1(%eax,%eax), %ebx C d without twos + andl $127, %eax C d/2, 7 bits + +ifdef(`PIC',` + LEA( binvert_limb_table, %edx) + movzbl (%eax,%edx), %eax C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %eax C inv 8 bits +') + + leal (%eax,%eax), %edx C 2*inv + movl %ebx, PARAM_DIVISOR C d without twos + + imull %eax, %eax C inv*inv + + movl PARAM_SRC, %esi + movl PARAM_DST, %edi + + imull %ebx, %eax C inv*inv*d + + subl %eax, %edx C inv = 2*inv - inv*inv*d + leal (%edx,%edx), %eax C 2*inv + + imull %edx, %edx C inv*inv + + leal (%esi,%ebp,4), %esi C src end + leal (%edi,%ebp,4), %edi C dst end + negl %ebp C -size + + imull %ebx, %edx C inv*inv*d + + subl %edx, %eax C inv = 2*inv - inv*inv*d + + ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + pushl %eax FRAME_pushl() + imull PARAM_DIVISOR, %eax + cmpl $1, %eax + popl %eax FRAME_popl()') + + movl %eax, VAR_INVERSE + movl (%esi,%ebp,4), %eax C src[0] + + xorl %ebx, %ebx + xorl %edx, %edx + + incl %ebp + jz L(one) + + movl (%esi,%ebp,4), %edx C src[1] + + shrdl( %cl, %edx, %eax) + + movl VAR_INVERSE, %edx + jmp L(entry) + + + ALIGN(8) + nop C k6 code alignment + nop +L(top): + C eax q + C ebx carry bit, 0 or -1 + C ecx shift + C edx carry limb + C esi src end + C edi dst end + C ebp counter, limbs, negative + + movl -4(%esi,%ebp,4), %eax + subl %ebx, %edx C accumulate carry bit + + movl (%esi,%ebp,4), %ebx + + shrdl( %cl, %ebx, %eax) + + subl %edx, %eax C apply carry limb + movl VAR_INVERSE, %edx + + sbbl %ebx, %ebx + +L(entry): + imull %edx, %eax + + movl %eax, -4(%edi,%ebp,4) + movl PARAM_DIVISOR, %edx + + mull %edx + + incl %ebp + jnz L(top) + + + movl -4(%esi), %eax C src high limb +L(one): + shrl %cl, %eax + popl %esi FRAME_popl() + + addl %ebx, %eax C apply carry bit + popl %ebx FRAME_popl() + + subl %edx, %eax C apply carry limb + + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi) + + popl %edi + popl %ebp + + ret + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/divrem_1.asm b/gmp-6.3.0/mpn/x86/divrem_1.asm new file mode 100644 index 0000000..255d493 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/divrem_1.asm @@ -0,0 
+1,233 @@ +dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
+
+dnl Copyright 1999-2003, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C 486 approx 43 maybe
+C P5 44
+C P6 39
+C P6MMX 39
+C K6 22
+C K7 42
+C P4 58
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t carry);
+C
+C Divide src,size by divisor and store the quotient in dst+xsize,size.
+C Extend the division to fractional quotient limbs in dst,xsize. Return the
+C remainder. Either or both xsize and size can be 0.
+C
+C mpn_divrem_1c takes a carry parameter which is an initial high limb,
+C effectively one extra limb at the top of src,size. Must have
+C carry<divisor. diff --git a/gmp-6.3.0/mpn/x86/fat/fat.c b/gmp-6.3.0/mpn/x86/fat/fat.c new file mode 100644 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/fat/fat.c +#include <stdio.h> /* for printf */
+#include <stdlib.h> /* for getenv */
+#include <string.h>
+
+#include "gmp-impl.h"
+
+/* Change this to "#define TRACE(x) x" for some traces. */
+#define TRACE(x)
+
+
+/* fat_entry.asm */
+long __gmpn_cpuid (char [12], int);
+int __gmpn_cpuid_available (void);
+
+
+#if WANT_FAKE_CPUID
+/* The "name"s in the table are values for the GMP_CPU_TYPE environment
+ variable. Anything can be used, but for now it's the canonical cpu types
+ as per config.guess/config.sub.
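+
+ For example, with a fat build that has WANT_FAKE_CPUID defined, the
+ dispatch can be steered from the environment (a hypothetical session,
+ assuming the tune/speed utility has been built):
+
+ $ GMP_CPU_TYPE=pentium3 ./tune/speed -s 1000 mpn_mul_1
+
+ An unrecognized name makes fake_cpuid_lookup() below print a diagnostic
+ and abort. The fms values pack family and model in essentially the bit
+ layout cpuid leaf 1 returns in eax; for instance MAKE_FMS (6, 0x5e)
+ comes out as 0x506e0, matching a genuine Skylake signature apart from
+ the low stepping bits.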
*/ + +#define __gmpn_cpuid fake_cpuid +#define __gmpn_cpuid_available fake_cpuid_available + +#define MAKE_FMS(family, model) \ + ((((family) & 0xf) << 8) + (((family) & 0xff0) << 20) \ + + (((model) & 0xf) << 4) + (((model) & 0xf0) << 12)) + +static struct { + const char *name; + const char *vendor; + unsigned fms; +} fake_cpuid_table[] = { + { "i386", "" }, + { "i486", "GenuineIntel", MAKE_FMS (4, 0) }, + { "pentium", "GenuineIntel", MAKE_FMS (5, 0) }, + { "pentiummmx", "GenuineIntel", MAKE_FMS (5, 4) }, + { "pentiumpro", "GenuineIntel", MAKE_FMS (6, 0) }, + { "pentium2", "GenuineIntel", MAKE_FMS (6, 2) }, + { "pentium3", "GenuineIntel", MAKE_FMS (6, 7) }, + { "pentium4", "GenuineIntel", MAKE_FMS (15, 2) }, + { "prescott", "GenuineIntel", MAKE_FMS (15, 3) }, + { "nocona", "GenuineIntel", MAKE_FMS (15, 4) }, + { "core2", "GenuineIntel", MAKE_FMS (6, 0xf) }, + { "nehalem", "GenuineIntel", MAKE_FMS (6, 0x1a) }, + { "nhm", "GenuineIntel", MAKE_FMS (6, 0x1a) }, + { "atom", "GenuineIntel", MAKE_FMS (6, 0x1c) }, + { "westmere", "GenuineIntel", MAKE_FMS (6, 0x25) }, + { "wsm", "GenuineIntel", MAKE_FMS (6, 0x25) }, + { "sandybridge","GenuineIntel", MAKE_FMS (6, 0x2a) }, + { "sbr", "GenuineIntel", MAKE_FMS (6, 0x2a) }, + { "silvermont", "GenuineIntel", MAKE_FMS (6, 0x37) }, + { "slm", "GenuineIntel", MAKE_FMS (6, 0x37) }, + { "haswell", "GenuineIntel", MAKE_FMS (6, 0x3c) }, + { "hwl", "GenuineIntel", MAKE_FMS (6, 0x3c) }, + { "broadwell", "GenuineIntel", MAKE_FMS (6, 0x3d) }, + { "bwl", "GenuineIntel", MAKE_FMS (6, 0x3d) }, + { "skylake", "GenuineIntel", MAKE_FMS (6, 0x5e) }, + { "sky", "GenuineIntel", MAKE_FMS (6, 0x5e) }, + + { "k5", "AuthenticAMD", MAKE_FMS (5, 0) }, + { "k6", "AuthenticAMD", MAKE_FMS (5, 3) }, + { "k62", "AuthenticAMD", MAKE_FMS (5, 8) }, + { "k63", "AuthenticAMD", MAKE_FMS (5, 9) }, + { "athlon", "AuthenticAMD", MAKE_FMS (6, 0) }, + { "k8", "AuthenticAMD", MAKE_FMS (15, 0) }, + { "k10", "AuthenticAMD", MAKE_FMS (16, 0) }, + { "bobcat", "AuthenticAMD", MAKE_FMS (20, 1) }, + { "bulldozer", "AuthenticAMD", MAKE_FMS (21, 1) }, + { "piledriver", "AuthenticAMD", MAKE_FMS (21, 2) }, + { "steamroller","AuthenticAMD", MAKE_FMS (21, 0x30) }, + { "excavator", "AuthenticAMD", MAKE_FMS (21, 0x60) }, + { "jaguar", "AuthenticAMD", MAKE_FMS (22, 1) }, + { "zen", "AuthenticAMD", MAKE_FMS (23, 1) }, + + { "viac3", "CentaurHauls", MAKE_FMS (6, 0) }, + { "viac32", "CentaurHauls", MAKE_FMS (6, 9) }, + { "nano", "CentaurHauls", MAKE_FMS (6, 15) }, +}; + +static int +fake_cpuid_lookup (void) +{ + char *s; + int i; + + s = getenv ("GMP_CPU_TYPE"); + if (s == NULL) + { + printf ("Need GMP_CPU_TYPE environment variable for fake cpuid\n"); + abort (); + } + + for (i = 0; i < numberof (fake_cpuid_table); i++) + if (strcmp (s, fake_cpuid_table[i].name) == 0) + return i; + + printf ("GMP_CPU_TYPE=%s unknown\n", s); + abort (); +} + +static int +fake_cpuid_available (void) +{ + return fake_cpuid_table[fake_cpuid_lookup()].vendor[0] != '\0'; +} + +static long +fake_cpuid (char dst[12], int id) +{ + int i = fake_cpuid_lookup(); + + switch (id) { + case 0: + memcpy (dst, fake_cpuid_table[i].vendor, 12); + return 0; + case 1: + return fake_cpuid_table[i].fms; + default: + printf ("fake_cpuid(): oops, unknown id %d\n", id); + abort (); + } +} +#endif + + +typedef DECL_preinv_divrem_1 ((*preinv_divrem_1_t)); +typedef DECL_preinv_mod_1 ((*preinv_mod_1_t)); + +struct cpuvec_t __gmpn_cpuvec = { + __MPN(add_n_init), + 0, + 0, + __MPN(addmul_1_init), + 0, + __MPN(bdiv_dbm1c_init), + __MPN(cnd_add_n_init), + 
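/* a zero in this table is a slot with no x86 fat entrypoint stub;
+ presumably such a slot is only ever reached after the cpu-specific
+ CPUVEC_SETUP code in __gmpn_cpuvec_init has filled it in */
+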
__MPN(cnd_sub_n_init), + __MPN(com_init), + __MPN(copyd_init), + __MPN(copyi_init), + __MPN(divexact_1_init), + __MPN(divrem_1_init), + __MPN(gcd_11_init), + __MPN(lshift_init), + __MPN(lshiftc_init), + __MPN(mod_1_init), + __MPN(mod_1_1p_init), + __MPN(mod_1_1p_cps_init), + __MPN(mod_1s_2p_init), + __MPN(mod_1s_2p_cps_init), + __MPN(mod_1s_4p_init), + __MPN(mod_1s_4p_cps_init), + __MPN(mod_34lsub1_init), + __MPN(modexact_1c_odd_init), + __MPN(mul_1_init), + __MPN(mul_basecase_init), + __MPN(mullo_basecase_init), + __MPN(preinv_divrem_1_init), + __MPN(preinv_mod_1_init), + __MPN(redc_1_init), + __MPN(redc_2_init), + __MPN(rshift_init), + __MPN(sqr_basecase_init), + __MPN(sub_n_init), + 0, + __MPN(submul_1_init), + 0 +}; + +int __gmpn_cpuvec_initialized = 0; + +/* The following setups start with generic x86, then overwrite with + specifics for a chip, and higher versions of that chip. + + The arrangement of the setups here will normally be the same as the $path + selections in configure.in for the respective chips. + + This code is reentrant and thread safe. We always calculate the same + decided_cpuvec, so if two copies of the code are running it doesn't + matter which completes first, both write the same to __gmpn_cpuvec. + + We need to go via decided_cpuvec because if one thread has completed + __gmpn_cpuvec then it may be making use of the threshold values in that + vector. If another thread is still running __gmpn_cpuvec_init then we + don't want it to write different values to those fields since some of the + asm routines only operate correctly up to their own defined threshold, + not an arbitrary value. */ + +void +__gmpn_cpuvec_init (void) +{ + struct cpuvec_t decided_cpuvec; + + TRACE (printf ("__gmpn_cpuvec_init:\n")); + + memset (&decided_cpuvec, '\0', sizeof (decided_cpuvec)); + + CPUVEC_SETUP_x86; + CPUVEC_SETUP_fat; + + if (! 
__gmpn_cpuid_available ()) + { + TRACE (printf (" 80386, or early 80486 without cpuid\n")); + } + else + { + char vendor_string[13]; + char dummy_string[12]; + long fms; + int family, model; + + __gmpn_cpuid (vendor_string, 0); + vendor_string[12] = 0; + + fms = __gmpn_cpuid (dummy_string, 1); + family = ((fms >> 8) & 0xf) + ((fms >> 20) & 0xff); + model = ((fms >> 4) & 0xf) + ((fms >> 12) & 0xf0); + + if (strcmp (vendor_string, "GenuineIntel") == 0) + { + switch (family) + { + case 4: + TRACE (printf (" 80486 with cpuid\n")); + break; + + case 5: + TRACE (printf (" pentium\n")); + CPUVEC_SETUP_pentium; + if (model == 4 || model == 8) + { + TRACE (printf (" pentiummmx\n")); + CPUVEC_SETUP_pentium_mmx; + } + break; + + case 6: + TRACE (printf (" p6\n")); + CPUVEC_SETUP_p6; + switch (model) + { + case 0x00: + case 0x01: + TRACE (printf (" pentiumpro\n")); + break; + + case 0x02: + case 0x03: + case 0x04: + case 0x05: + case 0x06: + TRACE (printf (" pentium2\n")); + CPUVEC_SETUP_p6_mmx; + break; + + case 0x07: + case 0x08: + case 0x0a: + case 0x0b: + case 0x0c: + TRACE (printf (" pentium3\n")); + CPUVEC_SETUP_p6_mmx; + CPUVEC_SETUP_p6_p3mmx; + break; + + case 0x09: /* Banias */ + case 0x0d: /* Dothan */ + case 0x0e: /* Yonah */ + TRACE (printf (" Banias/Dothan/Yonah\n")); + CPUVEC_SETUP_p6_mmx; + CPUVEC_SETUP_p6_p3mmx; + CPUVEC_SETUP_p6_sse2; + break; + + case 0x0f: /* Conroe Merom Kentsfield Allendale */ + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + case 0x16: + case 0x17: /* PNR Wolfdale Yorkfield */ + case 0x18: + case 0x19: + case 0x1d: /* PNR Dunnington */ + TRACE (printf (" Conroe\n")); + CPUVEC_SETUP_p6_mmx; + CPUVEC_SETUP_p6_p3mmx; + CPUVEC_SETUP_p6_sse2; + CPUVEC_SETUP_core2; + break; + + case 0x1c: /* Atom Silverthorne */ + case 0x26: /* Atom Lincroft */ + case 0x27: /* Atom Saltwell */ + case 0x36: /* Atom Cedarview/Saltwell */ + TRACE (printf (" atom\n")); + CPUVEC_SETUP_atom; + CPUVEC_SETUP_atom_mmx; + CPUVEC_SETUP_atom_sse2; + break; + + case 0x37: /* Silvermont */ + case 0x4a: /* Silvermont */ + case 0x4c: /* Airmont */ + case 0x4d: /* Silvermont/Avoton */ + case 0x5a: /* Silvermont */ + TRACE (printf (" silvermont\n")); + CPUVEC_SETUP_atom; + CPUVEC_SETUP_atom_mmx; + CPUVEC_SETUP_atom_sse2; + CPUVEC_SETUP_silvermont; + break; + + case 0x5c: /* Goldmont */ + case 0x5f: /* Goldmont */ + case 0x7a: /* Goldmont Plus */ + TRACE (printf (" goldmont\n")); + CPUVEC_SETUP_atom; + CPUVEC_SETUP_atom_mmx; + CPUVEC_SETUP_atom_sse2; + CPUVEC_SETUP_goldmont; + break; + + case 0x1a: /* NHM Gainestown */ + case 0x1b: + case 0x1e: /* NHM Lynnfield/Jasper */ + case 0x1f: + case 0x20: + case 0x21: + case 0x22: + case 0x23: + case 0x24: + case 0x25: /* WSM Clarkdale/Arrandale */ + case 0x28: + case 0x29: + case 0x2b: + case 0x2c: /* WSM Gulftown */ + case 0x2e: /* NHM Beckton */ + case 0x2f: /* WSM Eagleton */ + TRACE (printf (" nehalem/westmere\n")); + CPUVEC_SETUP_p6_mmx; + CPUVEC_SETUP_p6_p3mmx; + CPUVEC_SETUP_p6_sse2; + CPUVEC_SETUP_core2; + CPUVEC_SETUP_coreinhm; + break; + + case 0x2a: /* SBR */ + case 0x2d: /* SBR-EP */ + case 0x3a: /* IBR */ + case 0x3e: /* IBR Ivytown */ + case 0x3c: /* Haswell client */ + case 0x3f: /* Haswell server */ + case 0x45: /* Haswell ULT */ + case 0x46: /* Crystal Well */ + case 0x3d: /* Broadwell */ + case 0x47: /* Broadwell */ + case 0x4f: /* Broadwell server */ + case 0x56: /* Broadwell microserver */ + case 0x4e: /* Skylake client */ + case 0x55: /* Skylake server */ + case 0x5e: /* Skylake */ + case 0x8e: /* Kabylake */ + 
case 0x9e: /* Kabylake */ + TRACE (printf (" sandybridge\n")); + CPUVEC_SETUP_p6_mmx; + CPUVEC_SETUP_p6_p3mmx; + CPUVEC_SETUP_p6_sse2; + CPUVEC_SETUP_core2; + CPUVEC_SETUP_coreinhm; + CPUVEC_SETUP_coreisbr; + break; + } + break; + + case 15: + TRACE (printf (" pentium4\n")); + CPUVEC_SETUP_pentium4; + CPUVEC_SETUP_pentium4_mmx; + CPUVEC_SETUP_pentium4_sse2; + break; + } + } + else if (strcmp (vendor_string, "AuthenticAMD") == 0) + { + switch (family) + { + case 5: + if (model <= 3) + { + TRACE (printf (" k5\n")); + } + else + { + TRACE (printf (" k6\n")); + CPUVEC_SETUP_k6; + CPUVEC_SETUP_k6_mmx; + if (model >= 8) + { + TRACE (printf (" k62\n")); + CPUVEC_SETUP_k6_k62mmx; + } + if (model >= 9) + { + TRACE (printf (" k63\n")); + } + } + break; + case 6: + TRACE (printf (" athlon\n")); + CPUVEC_SETUP_k7; + CPUVEC_SETUP_k7_mmx; + break; + + case 0x0f: /* k8 */ + case 0x11: /* "fam 11h", mix of k8 and k10 */ + case 0x13: /* unknown, conservatively assume k8 */ + TRACE (printf (" k8\n")); + CPUVEC_SETUP_k7; + CPUVEC_SETUP_k7_mmx; + CPUVEC_SETUP_k8; + break; + + case 0x10: /* k10 */ + case 0x12: /* k10 (llano) */ + TRACE (printf (" k10\n")); + CPUVEC_SETUP_k7; + CPUVEC_SETUP_k7_mmx; + break; + + case 0x14: /* bobcat */ + case 0x16: /* jaguar */ + TRACE (printf (" bobcat\n")); + CPUVEC_SETUP_k7; + CPUVEC_SETUP_k7_mmx; + CPUVEC_SETUP_bt1; + break; + + case 0x15: /* bulldozer */ + TRACE (printf (" bulldozer\n")); + CPUVEC_SETUP_k7; + CPUVEC_SETUP_k7_mmx; + CPUVEC_SETUP_bd1; + break; + + case 0x17: /* zen */ + case 0x19: /* zen3 */ + TRACE (printf (" zen\n")); + CPUVEC_SETUP_k7; + CPUVEC_SETUP_k7_mmx; + break; + } + } + else if (strcmp (vendor_string, "CentaurHauls") == 0) + { + switch (family) + { + case 6: + TRACE (printf (" viac3\n")); + if (model >= 9) + { + TRACE (printf (" viac32\n")); + } + if (model >= 15) + { + TRACE (printf (" nano\n")); + CPUVEC_SETUP_nano; + } + break; + } + } + else if (strcmp (vendor_string, "CyrixInstead") == 0) + { + /* Should recognize Cyrix' processors too. */ + TRACE (printf (" cyrix something\n")); + } + } + + /* There's no x86 generic mpn_preinv_divrem_1 or mpn_preinv_mod_1. + Instead default to the plain versions from whichever CPU we detected. + The function arguments are compatible, no need for any glue code. */ + if (decided_cpuvec.preinv_divrem_1 == NULL) + decided_cpuvec.preinv_divrem_1 =(preinv_divrem_1_t)decided_cpuvec.divrem_1; + if (decided_cpuvec.preinv_mod_1 == NULL) + decided_cpuvec.preinv_mod_1 =(preinv_mod_1_t) decided_cpuvec.mod_1; + + ASSERT_CPUVEC (decided_cpuvec); + CPUVEC_INSTALL (decided_cpuvec); + + /* Set this once the threshold fields are ready. + Use volatile to prevent it getting moved. */ + *((volatile int *) &__gmpn_cpuvec_initialized) = 1; +} diff --git a/gmp-6.3.0/mpn/x86/fat/fat_entry.asm b/gmp-6.3.0/mpn/x86/fat/fat_entry.asm new file mode 100644 index 0000000..25655cf --- /dev/null +++ b/gmp-6.3.0/mpn/x86/fat/fat_entry.asm @@ -0,0 +1,243 @@ +dnl x86 fat binary entrypoints. + +dnl Copyright 2003, 2012, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +dnl Forcibly disable profiling. +dnl +dnl The entrypoints and inits are small enough not to worry about, the real +dnl routines arrived at will have any profiling. Also, the way the code +dnl here ends with a jump means we won't work properly with the +dnl "instrument" profiling scheme anyway. + +define(`WANT_PROFILING',no) + + + TEXT + + +dnl Usage: FAT_ENTRY(name, offset) +dnl +dnl Emit a fat binary entrypoint function of the given name. This is the +dnl normal entry for applications, eg. __gmpn_add_n. +dnl +dnl The code simply jumps through the function pointer in __gmpn_cpuvec at +dnl the given "offset" (in bytes). +dnl +dnl For non-PIC, the jumps are 5 bytes each, aligning them to 8 should be +dnl fine for all x86s. +dnl +dnl For PIC, the jumps are 20 bytes each, and are best aligned to 16 to +dnl ensure at least the first two instructions don't cross a cache line +dnl boundary. +dnl +dnl Note the extra `' ahead of PROLOGUE obscures it from the HAVE_NATIVE +dnl grepping in configure, stopping that code trying to eval something with +dnl $1 in it. + +define(FAT_ENTRY, +m4_assert_numargs(2) +` ALIGN(ifdef(`PIC',16,8)) +`'PROLOGUE($1)dnl +ifdef(`PIC',`dnl +ifdef(`DARWIN',` + call L(movl_eip_edx) + movl L(___gmpn_cpuvec)$non_lazy_ptr-.(%edx), %edx + jmp *m4_empty_if_zero($2)(%edx) +',`dnl + call L(movl_eip_edx) +L(entry_here$2): + addl $_GLOBAL_OFFSET_TABLE_+[.-L(entry_here$2)], %edx + movl GSYM_PREFIX`'__gmpn_cpuvec@GOT(%edx), %edx + jmp *m4_empty_if_zero($2)(%edx) +') +',`dnl non-PIC + jmp *GSYM_PREFIX`'__gmpn_cpuvec+$2 +') +EPILOGUE() +') + + +dnl FAT_ENTRY for each CPUVEC_FUNCS_LIST +dnl + +define(`CPUVEC_offset',0) +foreach(i, +`FAT_ENTRY(MPN(i),CPUVEC_offset) +define(`CPUVEC_offset',eval(CPUVEC_offset + 4))', +CPUVEC_FUNCS_LIST) + +ifdef(`PIC',` + ALIGN(8) +L(movl_eip_edx): + movl (%esp), %edx + ret_internal +ifdef(`DARWIN',` + .section __IMPORT,__pointers,non_lazy_symbol_pointers +L(___gmpn_cpuvec)$non_lazy_ptr: + .indirect_symbol ___gmpn_cpuvec + .long 0 + TEXT +') +') + + +dnl Usage: FAT_INIT(name, offset) +dnl +dnl Emit a fat binary initializer function of the given name. These +dnl functions are the initial values for the pointers in __gmpn_cpuvec. +dnl +dnl The code simply calls __gmpn_cpuvec_init, and then jumps back through +dnl the __gmpn_cpuvec pointer, at the given "offset" (in bytes). +dnl __gmpn_cpuvec_init will have stored the address of the selected +dnl implementation there. +dnl +dnl Only one of these routines will be executed, and only once, since after +dnl that all the __gmpn_cpuvec pointers go to real routines. So there's no +dnl need for anything special here, just something small and simple. To +dnl keep code size down, "fat_init" is a shared bit of code, arrived at +dnl with the offset in %al. 
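+dnl For example the first CPUVEC_FUNCS_LIST entry, add_n, produces an
+dnl init stub along these lines (a sketch; the PROLOGUE/EPILOGUE
+dnl expansion is omitted):
+dnl
+dnl __gmpn_add_n_init:
+dnl movb $0, %al
+dnl jmp L(fat_init)
+dnl
+dnl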
%al is used since the movb instruction is 2 +dnl bytes where %eax would be 4. +dnl +dnl Note having `PROLOGUE in FAT_INIT obscures that PROLOGUE from the +dnl HAVE_NATIVE grepping in configure, preventing that code trying to eval +dnl something with $1 in it. + +define(FAT_INIT, +m4_assert_numargs(2) +`PROLOGUE($1)dnl + movb $`'$2, %al + jmp L(fat_init) +EPILOGUE() +') + +L(fat_init): + C al __gmpn_cpuvec byte offset + + movzbl %al, %eax + pushl %eax + +ifdef(`PIC',`dnl +ifdef(`DARWIN',` + sub $8, %esp + CALL( __gmpn_cpuvec_init) + add $8, %esp + call L(movl_eip_edx) + movl L(___gmpn_cpuvec)$non_lazy_ptr-.(%edx), %edx +',`dnl + pushl %ebx + call L(movl_eip_ebx) +L(init_here): + addl $_GLOBAL_OFFSET_TABLE_+[.-L(init_here)], %ebx + CALL( __gmpn_cpuvec_init) + movl GSYM_PREFIX`'__gmpn_cpuvec@GOT(%ebx), %edx + popl %ebx +') + popl %eax + jmp *(%edx,%eax) + +L(movl_eip_ebx): + movl (%esp), %ebx + ret_internal +',`dnl non-PIC + sub $8, %esp C needed on Darwin, harmless elsewhere + CALL( __gmpn_cpuvec_init) + add $8, %esp C needed on Darwin, harmless elsewhere + popl %eax + jmp *GSYM_PREFIX`'__gmpn_cpuvec(%eax) +') + +dnl FAT_INIT for each CPUVEC_FUNCS_LIST +dnl + +define(`CPUVEC_offset',0) +foreach(i, +`FAT_INIT(MPN(i`'_init),CPUVEC_offset) +define(`CPUVEC_offset',eval(CPUVEC_offset + 4))', +CPUVEC_FUNCS_LIST) + + + +C long __gmpn_cpuid (char dst[12], int id); +C +C This is called only once, so just something simple and compact is fine. + +defframe(PARAM_ID, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +PROLOGUE(__gmpn_cpuid) + pushl %esi FRAME_pushl() + pushl %ebx FRAME_pushl() + movl PARAM_ID, %eax + cpuid + movl PARAM_DST, %esi + movl %ebx, (%esi) + movl %edx, 4(%esi) + movl %ecx, 8(%esi) + popl %ebx + popl %esi + ret +EPILOGUE() + + +C int __gmpn_cpuid_available (void); +C +C Return non-zero if the cpuid instruction is available, which means late +C model 80486 and higher. 80386 and early 80486 don't have cpuid. +C +C The test follows Intel AP-485 application note, namely that if bit 21 is +C modifiable then cpuid is supported. This test is reentrant and thread +C safe, since of course any interrupt or context switch will preserve the +C flags while we're tinkering with them. +C +C This is called only once, so just something simple and compact is fine. + +PROLOGUE(__gmpn_cpuid_available) + pushf + popl %ecx C old flags + + movl %ecx, %edx + xorl $0x200000, %edx + pushl %edx + popf + pushf + popl %edx C tweaked flags + + movl $1, %eax + cmpl %ecx, %edx + jne L(available) + xorl %eax, %eax C not changed, so cpuid not available + +L(available): + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/fat/gmp-mparam.h b/gmp-6.3.0/mpn/x86/fat/gmp-mparam.h new file mode 100644 index 0000000..3641a6b --- /dev/null +++ b/gmp-6.3.0/mpn/x86/fat/gmp-mparam.h @@ -0,0 +1,71 @@ +/* Fat binary x86 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000-2003, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + + +/* mpn_divexact_1 is faster than mpn_divrem_1 at all sizes. The only time + this might not be true currently is for actual 80386 and 80486 chips, + where mpn/x86/dive_1.asm might be slower than mpn/x86/divrem_1.asm, but + that's not worth worrying about. */ +#define DIVEXACT_1_THRESHOLD 0 + +/* Only some of the x86s have an mpn_preinv_divrem_1, but we set + USE_PREINV_DIVREM_1 so that all callers use it, and then let the + __gmpn_cpuvec pointer go to plain mpn_divrem_1 if there's not an actual + preinv. */ +#define USE_PREINV_DIVREM_1 1 + +#define BMOD_1_TO_MOD_1_THRESHOLD 20 + +/* mpn_sqr_basecase is faster than mpn_mul_basecase at all sizes, no need + for mpn_sqr to call the latter. */ +#define SQR_BASECASE_THRESHOLD 0 + +/* Sensible fallbacks for these, when not taken from a cpu-specific + gmp-mparam.h. */ +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 130 +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 200 + +/* These are values more or less in the middle of what the typical x86 chips + come out as. For a fat binary it's necessary to have values for these, + since the defaults for MUL_FFT_TABLE and SQR_FFT_TABLE otherwise come out + as non-constant array initializers. FIXME: Perhaps these should be done + in the cpuvec structure like other thresholds. */ +#define MUL_FFT_TABLE { 464, 928, 1920, 3584, 10240, 40960, 0 } +#define MUL_FFT_MODF_THRESHOLD 400 +#define MUL_FFT_THRESHOLD 2000 + +#define SQR_FFT_TABLE { 528, 1184, 1920, 4608, 14336, 40960, 0 } +#define SQR_FFT_MODF_THRESHOLD 500 +#define SQR_FFT_THRESHOLD 3000 diff --git a/gmp-6.3.0/mpn/x86/fat/lshiftc.c b/gmp-6.3.0/mpn/x86/fat/lshiftc.c new file mode 100644 index 0000000..9ecf489 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/fat/lshiftc.c @@ -0,0 +1,32 @@ +/* Fat binary fallback mpn_lshiftc. + +Copyright 2003, 2009, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#include "mpn/generic/lshiftc.c" diff --git a/gmp-6.3.0/mpn/x86/fat/mod_1.c b/gmp-6.3.0/mpn/x86/fat/mod_1.c new file mode 100644 index 0000000..4f149cc --- /dev/null +++ b/gmp-6.3.0/mpn/x86/fat/mod_1.c @@ -0,0 +1,32 @@ +/* Fat binary fallback mpn_mod_1. + +Copyright 2003, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "mpn/generic/mod_1.c" diff --git a/gmp-6.3.0/mpn/x86/fat/mod_1_1.c b/gmp-6.3.0/mpn/x86/fat/mod_1_1.c new file mode 100644 index 0000000..92eaa7a --- /dev/null +++ b/gmp-6.3.0/mpn/x86/fat/mod_1_1.c @@ -0,0 +1,36 @@ +/* Fat binary fallback mpn_mod_1_1p. + +Copyright 2003, 2009, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* +PROLOGUE(mpn_mod_1_1p_cps) +*/ + +#define OPERATION_mod_1_1_cps 1 +#include "mpn/generic/mod_1_1.c" diff --git a/gmp-6.3.0/mpn/x86/fat/mod_1_2.c b/gmp-6.3.0/mpn/x86/fat/mod_1_2.c new file mode 100644 index 0000000..9095a61 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/fat/mod_1_2.c @@ -0,0 +1,36 @@ +/* Fat binary fallback mpn_mod_1s_2p. + +Copyright 2003, 2009, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* +PROLOGUE(mpn_mod_1s_2p_cps) +*/ + +#define OPERATION_mod_1_2_cps 1 +#include "mpn/generic/mod_1_2.c" diff --git a/gmp-6.3.0/mpn/x86/fat/mod_1_4.c b/gmp-6.3.0/mpn/x86/fat/mod_1_4.c new file mode 100644 index 0000000..51c0def --- /dev/null +++ b/gmp-6.3.0/mpn/x86/fat/mod_1_4.c @@ -0,0 +1,36 @@ +/* Fat binary fallback mpn_mod_1s_4p. + +Copyright 2003, 2009, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* +PROLOGUE(mpn_mod_1s_4p_cps) +*/ + +#define OPERATION_mod_1_4_cps 1 +#include "mpn/generic/mod_1_4.c" diff --git a/gmp-6.3.0/mpn/x86/fat/mode1o.c b/gmp-6.3.0/mpn/x86/fat/mode1o.c new file mode 100644 index 0000000..870ddb8 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/fat/mode1o.c @@ -0,0 +1,32 @@ +/* Fat binary fallback mpn_modexact_1c_odd. + +Copyright 2003 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "mpn/generic/mode1o.c" diff --git a/gmp-6.3.0/mpn/x86/fat/mullo_basecase.c b/gmp-6.3.0/mpn/x86/fat/mullo_basecase.c new file mode 100644 index 0000000..7f86be6 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/fat/mullo_basecase.c @@ -0,0 +1,32 @@ +/* Fat binary fallback mpn_mullo_basecase. + +Copyright 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "mpn/generic/mullo_basecase.c" diff --git a/gmp-6.3.0/mpn/x86/fat/redc_1.c b/gmp-6.3.0/mpn/x86/fat/redc_1.c new file mode 100644 index 0000000..0025403 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/fat/redc_1.c @@ -0,0 +1,32 @@ +/* Fat binary fallback mpn_redc_1. + +Copyright 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "mpn/generic/redc_1.c" diff --git a/gmp-6.3.0/mpn/x86/fat/redc_2.c b/gmp-6.3.0/mpn/x86/fat/redc_2.c new file mode 100644 index 0000000..1932d58 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/fat/redc_2.c @@ -0,0 +1,32 @@ +/* Fat binary fallback mpn_redc_2. + +Copyright 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#include "mpn/generic/redc_2.c" diff --git a/gmp-6.3.0/mpn/x86/gcd_11.asm b/gmp-6.3.0/mpn/x86/gcd_11.asm new file mode 100644 index 0000000..af69135 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/gcd_11.asm @@ -0,0 +1,126 @@ +dnl x86 mpn_gcd_11 optimised for processors with slow BSF.
+
+dnl Based on C version.
+
+dnl Copyright 2019 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl Rudimentary code for x86-32, i.e. for CPUs without cmov. Also, the bsf
+dnl instruction is assumed to be so slow it is useless. Instead a table is
+dnl used.
+dnl
+dnl The loop benefits from OoO; in-order CPUs might want a different loop.
+dnl The ebx and ecx registers could be combined if the assignment of ecx were
+dnl postponed until ebx died, but that would at least hurt in-order CPUs.
+
+C cycles/bit (approx)
+C AMD K7 ?
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bd1 ?
+C AMD bd2 ?
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD bt1 ?
+C AMD bt2 ?
+C AMD zn1 ?
+C AMD zn2 ?
+C Intel P4-2 ?
+C Intel P4-3/4 ?
+C Intel P6/13 ?
+C Intel CNR ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel SKL ?
+C Intel atom ?
+C Intel SLM ?
+C Intel GLM ?
+C Intel GLM+ ?
+C VIA nano ?
+C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1
+
+deflit(MAXSHIFT, 6)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+DEF_OBJECT(ctz_table,64)
+ .byte MAXSHIFT
+forloop(i,1,MASK,
+` .byte m4_count_trailing_zeros(i)
+')
+END_OBJECT(ctz_table)
+
+define(`u0', `%eax')
+define(`v0', `%edx')
+
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_gcd_11)
+ push %edi
+ push %esi
+ push %ebx
+
+ mov 16(%esp), u0
+ mov 20(%esp), v0
+ LEAL( ctz_table, %esi)
+ sub v0, u0 C u = u - v 0
+ jz L(end)
+
+ ALIGN(16)
+L(top): sbb %ebx, %ebx C mask 1
+ mov u0, %edi C 1
+ mov u0, %ecx C 1
+ and %ebx, %edi C 2
+ xor %ebx, u0 C 2
+ add %edi, v0 C v = min(u,v) 3
+ sub %ebx, u0 C u = |u - v| 3
+L(mid): and $MASK, %ecx C 2
+ movzbl (%esi,%ecx), %ecx C 3
+ jz L(shift_alot)
+ shr %cl, u0 C 4
+ sub v0, u0 C u = u - v 0,5
+ jnz L(top)
+
+L(end): mov v0, %eax
+ pop %ebx
+ pop %esi
+ pop %edi
+ ret
+
+L(shift_alot):
+ shr $MAXSHIFT, u0
+ mov u0, %ecx
+ jmp L(mid)
+EPILOGUE()
+ASM_END() diff --git a/gmp-6.3.0/mpn/x86/geode/gmp-mparam.h b/gmp-6.3.0/mpn/x86/geode/gmp-mparam.h new file mode 100644 index 0000000..cc9c9f1 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/geode/gmp-mparam.h @@ -0,0 +1,141 @@ +/* x86/geode gmp-mparam.h -- Compiler/machine parameter header file.
+ +Copyright 1991, 1993, 1994, 2000-2002, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* Generated by tuneup.c, 2011-01-30, gcc 3.4 */ + +#define MOD_1_NORM_THRESHOLD 6 +#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 17 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 9 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 14 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ +#define USE_PREINV_DIVREM_1 0 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 42 + +#define MUL_TOOM22_THRESHOLD 18 +#define MUL_TOOM33_THRESHOLD 66 +#define MUL_TOOM44_THRESHOLD 105 +#define MUL_TOOM6H_THRESHOLD 141 +#define MUL_TOOM8H_THRESHOLD 212 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 62 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 69 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 65 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 67 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 33 +#define SQR_TOOM3_THRESHOLD 60 +#define SQR_TOOM4_THRESHOLD 136 +#define SQR_TOOM6_THRESHOLD 196 +#define SQR_TOOM8_THRESHOLD 292 + +#define MULMOD_BNM1_THRESHOLD 14 +#define SQRMOD_BNM1_THRESHOLD 16 + +#define MUL_FFT_MODF_THRESHOLD 468 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 468, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \ + { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 159,10}, { 95, 9}, { 191,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 143, 9}, \ + { 287,10}, { 159,11}, { 95,10}, { 191, 9}, \ + { 383,12}, { 4096,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 61 +#define MUL_FFT_THRESHOLD 5504 + +#define SQR_FFT_MODF_THRESHOLD 396 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 396, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 
95,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 135,10}, { 79, 9}, { 159, 8}, \ + { 319,10}, { 95, 9}, { 191,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \ + { 287, 8}, { 575,10}, { 159,11}, { 95,10}, \ + { 191,12}, { 4096,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 61 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 3 +#define MULLO_DC_THRESHOLD 37 +#define MULLO_MUL_N_THRESHOLD 10950 + +#define DC_DIV_QR_THRESHOLD 59 +#define DC_DIVAPPR_Q_THRESHOLD 189 +#define DC_BDIV_QR_THRESHOLD 55 +#define DC_BDIV_Q_THRESHOLD 136 + +#define INV_MULMOD_BNM1_THRESHOLD 50 +#define INV_NEWTON_THRESHOLD 183 +#define INV_APPR_THRESHOLD 181 + +#define BINV_NEWTON_THRESHOLD 204 +#define REDC_1_TO_REDC_N_THRESHOLD 54 + +#define MU_DIV_QR_THRESHOLD 1142 +#define MU_DIVAPPR_Q_THRESHOLD 1142 +#define MUPI_DIV_QR_THRESHOLD 81 +#define MU_BDIV_QR_THRESHOLD 889 +#define MU_BDIV_Q_THRESHOLD 998 + +#define MATRIX22_STRASSEN_THRESHOLD 13 +#define HGCD_THRESHOLD 133 +#define GCD_DC_THRESHOLD 451 +#define GCDEXT_DC_THRESHOLD 318 +#define JACOBI_BASE_METHOD 1 + +#define GET_STR_DC_THRESHOLD 15 +#define GET_STR_PRECOMPUTE_THRESHOLD 30 +#define SET_STR_DC_THRESHOLD 547 +#define SET_STR_PRECOMPUTE_THRESHOLD 1049 diff --git a/gmp-6.3.0/mpn/x86/gmp-mparam.h b/gmp-6.3.0/mpn/x86/gmp-mparam.h new file mode 100644 index 0000000..2cb1984 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/gmp-mparam.h @@ -0,0 +1,38 @@ +/* Generic x86 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + + +/* Generic x86 mpn_divexact_1 is faster than generic x86 mpn_divrem_1 on all + of p5, p6, k6 and k7, so use it always. It's probably slower on 386 and + 486, but that's too bad. */ +#define DIVEXACT_1_THRESHOLD 0 diff --git a/gmp-6.3.0/mpn/x86/goldmont/gmp-mparam.h b/gmp-6.3.0/mpn/x86/goldmont/gmp-mparam.h new file mode 100644 index 0000000..3d37fa3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/goldmont/gmp-mparam.h @@ -0,0 +1,219 @@ +/* Intel Goldmont/32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 2200 MHz Intel Atom C3758 Goldmont/Denverton */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-22, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 7 +#define MOD_1_UNNORM_THRESHOLD 12 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 9 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 32.79% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 32 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 23 + +#define DIV_1_VS_MUL_1_PERCENT 228 + +#define MUL_TOOM22_THRESHOLD 18 +#define MUL_TOOM33_THRESHOLD 81 +#define MUL_TOOM44_THRESHOLD 193 +#define MUL_TOOM6H_THRESHOLD 286 +#define MUL_TOOM8H_THRESHOLD 399 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 125 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 137 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 185 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 113 +#define SQR_TOOM4_THRESHOLD 280 +#define SQR_TOOM6_THRESHOLD 399 +#define SQR_TOOM8_THRESHOLD 547 + +#define MULMID_TOOM42_THRESHOLD 60 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 15 + +#define MUL_FFT_MODF_THRESHOLD 368 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 368, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 47,10}, { 15, 9}, { 31, 8}, { 63, 9}, \ + { 39, 8}, { 79, 9}, { 47,10}, { 31, 9}, \ + { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 63, 9}, { 127, 8}, { 255, 9}, { 135,10}, \ + { 79, 9}, { 159,10}, { 95, 9}, { 191,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \ + { 143, 9}, { 287, 8}, { 575, 9}, { 303,10}, \ + { 159,11}, { 95,10}, { 191,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,10}, { 287, 9}, { 575,10}, { 303, 9}, \ + { 607,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447,12}, { 127,11}, { 255,10}, { 543, 9}, \ + { 1087,11}, { 287,10}, { 607, 9}, { 1215,11}, \ + { 319,10}, { 671,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 703,10}, \ + { 1407,11}, { 735,12}, { 
383,11}, { 831,12}, \ + { 447,11}, { 959,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1215,10}, { 2431,12}, \ + { 639,11}, { 1343,12}, { 703,11}, { 1407,13}, \ + { 383,12}, { 831,11}, { 1663,12}, { 959,11}, \ + { 1919,14}, { 255,13}, { 511,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \ + { 767,12}, { 1727,13}, { 895,12}, { 1919,11}, \ + { 3839,14}, { 511,13}, { 1023,12}, { 2111,13}, \ + { 1151,12}, { 2431,13}, { 1407,12}, { 2943,14}, \ + { 767,13}, { 1663,12}, { 3455,13}, { 1919,12}, \ + { 3839,15}, { 511,14}, { 1023,13}, { 2431,14}, \ + { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \ + { 3455,14}, { 1791,13}, { 3839,12}, { 7679,15}, \ + { 1023,14}, { 2303,13}, { 4991,12}, { 9983,14}, \ + { 2559,13}, { 5119,14}, { 2815,13}, { 5887,15}, \ + { 1535,14}, { 3839,13}, { 7679,16} } +#define MUL_FFT_TABLE3_SIZE 171 +#define MUL_FFT_THRESHOLD 3712 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 21, 7}, { 11, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47,10}, { 31, 9}, { 79,10}, { 47,11}, \ + { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \ + { 79, 9}, { 159, 8}, { 319,10}, { 95, 9}, \ + { 191,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \ + { 575, 9}, { 303, 8}, { 607, 9}, { 319,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,10}, { 303, 9}, { 607,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671,10}, { 351, 9}, { 703,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 415, 9}, { 831,11}, \ + { 223,10}, { 479,12}, { 127,11}, { 255,10}, \ + { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \ + { 1215,11}, { 319,10}, { 671,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,10}, { 831,11}, { 479,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \ + { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \ + { 735,12}, { 383,11}, { 831,12}, { 447,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 831,11}, \ + { 1663,12}, { 959,11}, { 1919,14}, { 255,13}, \ + { 511,12}, { 1215,13}, { 639,12}, { 1471,11}, \ + { 2943,13}, { 767,12}, { 1727,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2111,13}, \ + { 1151,12}, { 2431,13}, { 1407,12}, { 2943,14}, \ + { 767,13}, { 1663,12}, { 3455,13}, { 1919,15}, \ + { 511,14}, { 1023,13}, { 2431,14}, { 1279,13}, \ + { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \ + { 1791,13}, { 3839,12}, { 7679,15}, { 1023,14}, \ + { 2047,13}, { 4095,14}, { 2303,13}, { 4991,12}, \ + { 9983,14}, { 2815,13}, { 5887,15}, { 1535,14}, \ + { 3839,13}, { 7679,16} } +#define SQR_FFT_TABLE3_SIZE 170 +#define SQR_FFT_THRESHOLD 3520 + +#define MULLO_BASECASE_THRESHOLD 5 +#define MULLO_DC_THRESHOLD 50 +#define MULLO_MUL_N_THRESHOLD 6633 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 95 +#define SQRLO_SQR_THRESHOLD 6633 + +#define DC_DIV_QR_THRESHOLD 68 +#define DC_DIVAPPR_Q_THRESHOLD 204 +#define DC_BDIV_QR_THRESHOLD 64 +#define DC_BDIV_Q_THRESHOLD 108 + +#define INV_MULMOD_BNM1_THRESHOLD 34 +#define INV_NEWTON_THRESHOLD 276 +#define INV_APPR_THRESHOLD 226 + +#define 
BINV_NEWTON_THRESHOLD 298 +#define REDC_1_TO_REDC_N_THRESHOLD 65 + +#define MU_DIV_QR_THRESHOLD 1528 +#define MU_DIVAPPR_Q_THRESHOLD 1589 +#define MUPI_DIV_QR_THRESHOLD 140 +#define MU_BDIV_QR_THRESHOLD 1334 +#define MU_BDIV_Q_THRESHOLD 1499 + +#define POWM_SEC_TABLE 3,16,96,428,1317 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 18 +#define SET_STR_DC_THRESHOLD 704 +#define SET_STR_PRECOMPUTE_THRESHOLD 1358 + +#define FAC_DSC_THRESHOLD 95 +#define FAC_ODD_THRESHOLD 29 + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD2_DIV1_METHOD 1 /* 5.53% faster than 3 */ +#define HGCD_THRESHOLD 172 +#define HGCD_APPR_THRESHOLD 204 +#define HGCD_REDUCE_THRESHOLD 2479 +#define GCD_DC_THRESHOLD 610 +#define GCDEXT_DC_THRESHOLD 443 +#define JACOBI_BASE_METHOD 4 /* 6.53% faster than 3 */ + +/* Tuneup completed successfully, took 101563 seconds */ diff --git a/gmp-6.3.0/mpn/x86/i486/gmp-mparam.h b/gmp-6.3.0/mpn/x86/i486/gmp-mparam.h new file mode 100644 index 0000000..aa7dbad --- /dev/null +++ b/gmp-6.3.0/mpn/x86/i486/gmp-mparam.h @@ -0,0 +1,69 @@ +/* 80486 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2001-2003 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + + +/* 100MHz DX4 */ + +/* Generated by tuneup.c, 2003-02-13, gcc 2.95 */ + +#define MUL_TOOM22_THRESHOLD 18 +#define MUL_TOOM33_THRESHOLD 228 + +#define SQR_BASECASE_THRESHOLD 13 +#define SQR_TOOM2_THRESHOLD 49 +#define SQR_TOOM3_THRESHOLD 238 + +#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_DC_THRESHOLD 72 +#define POWM_THRESHOLD 38 + +#define GCD_ACCEL_THRESHOLD 3 +#define JACOBI_BASE_METHOD 2 + +#define USE_PREINV_DIVREM_1 0 +#define USE_PREINV_MOD_1 0 +#define DIVREM_2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define MODEXACT_1_ODD_THRESHOLD 17 + +#define GET_STR_DC_THRESHOLD 32 +#define GET_STR_PRECOMPUTE_THRESHOLD 82 +#define SET_STR_THRESHOLD 3524 + +#define MUL_FFT_TABLE { 464, 928, 1920, 4608, 10240, 40960, 0 } +#define MUL_FFT_MODF_THRESHOLD 392 +#define MUL_FFT_THRESHOLD 2816 + +#define SQR_FFT_TABLE { 432, 928, 1920, 4608, 14336, 40960, 0 } +#define SQR_FFT_MODF_THRESHOLD 392 +#define SQR_FFT_THRESHOLD 2816 diff --git a/gmp-6.3.0/mpn/x86/k10/gmp-mparam.h b/gmp-6.3.0/mpn/x86/k10/gmp-mparam.h new file mode 100644 index 0000000..eceaaae --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k10/gmp-mparam.h @@ -0,0 +1,217 @@ +/* x86/k10 gmp-mparam.h -- Compiler/machine parameter header file. 
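A note on how headers like these are consumed: each gmp-mparam.h is a pure table of measured crossover points, and generic code compares operand sizes against them when choosing an algorithm. The sketch below is illustrative only (assumed names; GMP's real dispatch has more Toom variants, the 6H/8H steps, and scratch-space handling):

    /* Roughly how tuned thresholds gate multiplication strategy for an
       n-limb operand; an illustrative sketch, not GMP source. */
    enum mul_algo { USE_BASECASE, USE_TOOM22, USE_TOOM33, USE_TOOM44, USE_FFT };

    enum mul_algo
    choose_mul_algo (long n)
    {
      if (n < MUL_TOOM22_THRESHOLD) return USE_BASECASE; /* schoolbook */
      if (n < MUL_TOOM33_THRESHOLD) return USE_TOOM22;   /* Karatsuba-class */
      if (n < MUL_TOOM44_THRESHOLD) return USE_TOOM33;
      if (n < MUL_FFT_THRESHOLD)    return USE_TOOM44;   /* 6H/8H omitted */
      return USE_FFT;
    }

With the 80486 values above, for example, Karatsuba only pays from 18 limbs and Toom-3 from 228; the Goldmont and K10 tables move those crossover points to match each pipeline.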
+ +Copyright 1991, 1993, 1994, 2000-2011, 2014-2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 3200-3600 MHz K10 Thuban */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-19, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 14 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 22 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 29.33% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 2 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 35 + +#define DIV_1_VS_MUL_1_PERCENT 258 + +#define MUL_TOOM22_THRESHOLD 22 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 124 +#define MUL_TOOM6H_THRESHOLD 274 +#define MUL_TOOM8H_THRESHOLD 430 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 85 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 113 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 26 +#define SQR_TOOM3_THRESHOLD 105 +#define SQR_TOOM4_THRESHOLD 154 +#define SQR_TOOM6_THRESHOLD 238 +#define SQR_TOOM8_THRESHOLD 309 + +#define MULMID_TOOM42_THRESHOLD 50 + +#define MULMOD_BNM1_THRESHOLD 15 +#define SQRMOD_BNM1_THRESHOLD 18 + +#define MUL_FFT_MODF_THRESHOLD 570 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 570, 5}, { 21, 6}, { 11, 5}, { 25, 6}, \ + { 13, 5}, { 27, 6}, { 15, 5}, { 31, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \ + { 191,11}, { 63,10}, { 143, 9}, { 287,10}, \ + { 159,11}, { 95,10}, { 191,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,10}, { 287,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335, 9}, { 671,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 399, 9}, { 799,11}, \ + { 223,12}, { 
127,11}, { 255,10}, { 543,11}, \ + { 287,10}, { 607, 9}, { 1215,11}, { 319,10}, \ + { 671,12}, { 191,11}, { 383,10}, { 799,11}, \ + { 415,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,11}, { 607,10}, { 1215,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,10}, { 1471, 9}, \ + { 2943,12}, { 383,11}, { 799,10}, { 1599,11}, \ + { 863,12}, { 447,11}, { 959,13}, { 255,12}, \ + { 511,11}, { 1087,12}, { 575,11}, { 1215,10}, \ + { 2431,12}, { 639,11}, { 1343,12}, { 703,11}, \ + { 1471,10}, { 2943,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1727,10}, { 3455,12}, \ + { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \ + { 1087,11}, { 2239,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \ + { 1727,11}, { 3455,13}, { 895,12}, { 1983,14}, \ + { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \ + { 2431,13}, { 1407,12}, { 2943,14}, { 767,13}, \ + { 1663,12}, { 3455,13}, { 1919,15}, { 511,14}, \ + { 1023,13}, { 2175,12}, { 4479,13}, { 2431,14}, \ + { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \ + { 3455,14}, { 1791,13}, { 3967,15}, { 1023,14}, \ + { 2047,13}, { 4479,14}, { 2303,13}, { 4991,14}, \ + { 2815,13}, { 5887,15}, { 1535,14}, { 3839,16} } +#define MUL_FFT_TABLE3_SIZE 168 +#define MUL_FFT_THRESHOLD 7424 + +#define SQR_FFT_MODF_THRESHOLD 525 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 525, 5}, { 25, 6}, { 13, 5}, { 28, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 55,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 127,10}, { 79, 9}, \ + { 159,10}, { 95,11}, { 63,10}, { 143, 9}, \ + { 287,10}, { 159,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \ + { 351,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 399, 9}, { 799,10}, { 415,12}, { 127,11}, \ + { 255,10}, { 543,11}, { 287,10}, { 607,11}, \ + { 319,10}, { 671, 9}, { 1343,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 799, 9}, \ + { 1599,11}, { 415,10}, { 831,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \ + { 1471,12}, { 383,11}, { 799,10}, { 1599,11}, \ + { 863,10}, { 1727,12}, { 447,11}, { 959,10}, \ + { 1919,11}, { 991,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,10}, { 2431,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 831,11}, { 1727,10}, \ + { 3455,12}, { 959,11}, { 1919,13}, { 511,12}, \ + { 1087,11}, { 2239,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \ + { 1727,11}, { 3455,13}, { 895,12}, { 1919,14}, \ + { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \ + { 2495,13}, { 1279,12}, { 2623,13}, { 1407,12}, \ + { 2943,14}, { 767,13}, { 1663,12}, { 3455,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4351,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \ + { 3967,15}, { 1023,14}, { 2047,13}, { 4351,14}, \ + { 2303,13}, { 4991,14}, { 2815,13}, { 5887,15}, \ + { 1535,14}, { 3839,16} } +#define SQR_FFT_TABLE3_SIZE 166 +#define SQR_FFT_THRESHOLD 5312 + +#define MULLO_BASECASE_THRESHOLD 6 +#define MULLO_DC_THRESHOLD 40 +#define MULLO_MUL_N_THRESHOLD 14281 +#define SQRLO_BASECASE_THRESHOLD 8 
+#define SQRLO_DC_THRESHOLD 113 +#define SQRLO_SQR_THRESHOLD 10323 + +#define DC_DIV_QR_THRESHOLD 56 +#define DC_DIVAPPR_Q_THRESHOLD 248 +#define DC_BDIV_QR_THRESHOLD 55 +#define DC_BDIV_Q_THRESHOLD 158 + +#define INV_MULMOD_BNM1_THRESHOLD 42 +#define INV_NEWTON_THRESHOLD 254 +#define INV_APPR_THRESHOLD 252 + +#define BINV_NEWTON_THRESHOLD 292 +#define REDC_1_TO_REDC_N_THRESHOLD 67 + +#define MU_DIV_QR_THRESHOLD 1589 +#define MU_DIVAPPR_Q_THRESHOLD 1558 +#define MUPI_DIV_QR_THRESHOLD 114 +#define MU_BDIV_QR_THRESHOLD 1442 +#define MU_BDIV_Q_THRESHOLD 1524 + +#define POWM_SEC_TABLE 1,16,102,416,1378 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 21 +#define SET_STR_DC_THRESHOLD 270 +#define SET_STR_PRECOMPUTE_THRESHOLD 1105 + +#define FAC_DSC_THRESHOLD 159 +#define FAC_ODD_THRESHOLD 34 + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD2_DIV1_METHOD 3 /* 0.70% faster than 4 */ +#define HGCD_THRESHOLD 130 +#define HGCD_APPR_THRESHOLD 163 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 573 +#define GCDEXT_DC_THRESHOLD 393 +#define JACOBI_BASE_METHOD 4 /* 9.13% faster than 1 */ + +/* Tuneup completed successfully, took 52901 seconds */ diff --git a/gmp-6.3.0/mpn/x86/k6/README b/gmp-6.3.0/mpn/x86/k6/README new file mode 100644 index 0000000..1d65af3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k6/README @@ -0,0 +1,251 @@ +Copyright 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + AMD K6 MPN SUBROUTINES + + + +This directory contains code optimized for AMD K6 CPUs, meaning K6, K6-2 and +K6-3. + +The mmx subdirectory has MMX code suiting plain K6, the k62mmx subdirectory +has MMX code suiting K6-2 and K6-3. All chips in the K6 family have MMX, +the separate directories are just so that ./configure can omit them if the +assembler doesn't support MMX. + + + + +STATUS + +Times for the loops, with all code and data in L1 cache, are as follows. + + cycles/limb + + mpn_add_n/sub_n 3.25 normal, 2.75 in-place + + mpn_mul_1 6.25 + mpn_add/submul_1 7.65-8.4 (varying with data values) + + mpn_mul_basecase 9.25 cycles/crossproduct (approx) + mpn_sqr_basecase 4.7 cycles/crossproduct (approx) + or 9.2 cycles/triangleproduct (approx) + + mpn_l/rshift 3.0 + + mpn_divrem_1 20.0 + mpn_mod_1 20.0 + mpn_divexact_by3 11.0 + + mpn_copyi 1.0 + mpn_copyd 1.0 + + +K6-2 and K6-3 have dual-issue MMX and get the following improvements. + + mpn_l/rshift 1.75 + + +Prefetching of sources hasn't yet given any joy. With the 3DNow "prefetch" +instruction, code seems to run slower, and with just "mov" loads it doesn't +seem faster. 
Results so far are inconsistent. The K6 does a hardware +prefetch of the second cache line in a sector, so the penalty for not +prefetching in software is reduced. + + + + +NOTES + +All K6 family chips have MMX, but only K6-2 and K6-3 have 3DNow. + +Plain K6 executes MMX instructions only in the X pipe, but K6-2 and K6-3 can +execute them in both X and Y (and in both together). + +Branch misprediction penalty is 1 to 4 cycles (Optimization Manual +chapter 6 table 12). + +Write-allocate L1 data cache means prefetching of destinations is unnecessary. +Store queue is 7 entries of 64 bits each. + +Floating point multiplications can be done in parallel with integer +multiplications, but there doesn't seem to be any way to make use of this. + + + +OPTIMIZATIONS + +Unrolled loops are used to reduce looping overhead. The unrolling is +configurable up to 32 limbs/loop for most routines, up to 64 for some. + +Sometimes computed jumps into the unrolling are used to handle sizes not a +multiple of the unrolling. An attractive feature of this is that times +smoothly increase with operand size, but an indirect jump is about 6 cycles +and the setups about another 6, so it depends on how much the unrolled code +is faster than a simple loop as to whether a computed jump ought to be used. + +Position independent code is implemented using a call to get eip for +computed jumps and a ret is always done, rather than an addl $4,%esp or a +popl, so the CPU return address branch prediction stack stays synchronised +with the actual stack in memory. Such a call however still costs 4 to 7 +cycles. + +Branch prediction, in absence of any history, will guess forward jumps are +not taken and backward jumps are taken. Where possible it's arranged that +the less likely or less important case is under a taken forward jump. + + + +MMX + +Putting emms or femms as late as possible in a routine seems to be fastest. +Perhaps an emms or femms stalls until all outstanding MMX instructions have +completed, so putting it later gives them a chance to complete on their own, +in parallel with other operations (like register popping). + +The Optimization Manual chapter 5 recommends using a femms on K6-2 and K6-3 +at the start of a routine, in case it's been preceded by x87 floating point +operations. This isn't done because in gmp programs it's expected that x87 +floating point won't be much used and that chances are an mpn routine won't +have been preceded by any x87 code. + + + +CODING + +Instructions in general code are shown paired if they can decode and execute +together, meaning two short decode instructions with the second not +depending on the first, only the first using the shifter, no more than one +load, and no more than one store. + +K6 does some out of order execution so the pairings aren't essential, they +just show what slots might be available. When decoding is the limiting +factor things can be scheduled that might not execute until later. + + + +NOTES + +Code alignment + +- if an opcode/modrm or 0Fh/opcode/modrm crosses a cache line boundary, + short decode is inhibited. The cross.pl script detects this. + +- loops and branch targets should be aligned to 16 bytes, or ensure at least + 2 instructions before a 32 byte boundary. This makes use of the 16 byte + cache in the BTB. + +Addressing modes + +- (%esi) degrades decoding from short to vector. 0(%esi) doesn't have this + problem, and can be used as an equivalent, or easier is just to use a + different register, like %ebx. 
+ +- K6 and pre-CXT core K6-2 have the following problem. (K6-2 CXT and K6-3 + have it fixed, these being cpuid function 1 signatures 0x588 to 0x58F). + + If more than 3 bytes are needed to determine instruction length then + decoding degrades from direct to long, or from long to vector. This + happens with forms like "0F opcode mod/rm" with mod/rm=00-xxx-100 since + with mod=00 the sib determines whether there's a displacement. + + This affects all MMX and 3DNow instructions, and others with an 0F prefix, + like movzbl. The modes affected are anything with an index and no + displacement, or an index but no base, and this includes (%esp) which is + really (,%esp,1). + + The cross.pl script detects problem cases. The workaround is to always + use a displacement, and to do this with Zdisp if it's zero so the + assembler doesn't discard it. + + See Optimization Manual rev D page 67 and 3DNow Porting Guide rev B pages + 13-14 and 36-37. + +Calls + +- indirect jumps and calls are not branch predicted, they measure about 6 + cycles. + +Various + +- adcl 2 cycles of decode, maybe 2 cycles executing in the X pipe +- bsf 12-27 cycles +- emms 5 cycles +- femms 3 cycles +- jecxz 2 cycles taken, 13 not taken (optimization manual says 7 not taken) +- divl 20 cycles back-to-back +- imull 2 decode, 3 execute +- mull 2 decode, 3 execute (optimization manual decoding sample) +- prefetch 2 cycles +- rcll/rcrl implicit by one bit: 2 cycles + immediate or %cl count: 11 + 2 per bit for dword + 13 + 4 per bit for byte +- setCC 2 cycles +- xchgl %eax,reg 1.5 cycles, back-to-back (strange) + reg,reg 2 cycles, back-to-back + + + + +REFERENCES + +"AMD-K6 Processor Code Optimization Application Note", AMD publication +number 21924, revision D amendment 0, January 2000. This describes K6-2 and +K6-3. Available on-line, + +http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21924.pdf + +"AMD-K6 MMX Enhanced Processor x86 Code Optimization Application Note", AMD +publication number 21828, revision A amendment 0, August 1997. This is an +older edition of the above document, describing plain K6. Available +on-line, + +http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21828.pdf + +"3DNow Technology Manual", AMD publication number 21928G/0-March 2000. +This describes the femms and prefetch instructions, but nothing else from +3DNow has been used. Available on-line, + +http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21928.pdf + +"3DNow Instruction Porting Guide", AMD publication number 22621, revision B, +August 1999. This has some notes on general K6 optimizations as well as +3DNow. Available on-line, + +http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22621.pdf + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/gmp-6.3.0/mpn/x86/k6/aors_n.asm b/gmp-6.3.0/mpn/x86/k6/aors_n.asm new file mode 100644 index 0000000..168f9b4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k6/aors_n.asm @@ -0,0 +1,337 @@ +dnl AMD K6 mpn_add/sub_n -- mpn addition or subtraction. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb. + + +ifdef(`OPERATION_add_n', ` + define(M4_inst, adcl) + define(M4_function_n, mpn_add_n) + define(M4_function_nc, mpn_add_nc) + define(M4_description, add) +',`ifdef(`OPERATION_sub_n', ` + define(M4_inst, sbbl) + define(M4_function_n, mpn_sub_n) + define(M4_function_nc, mpn_sub_nc) + define(M4_description, subtract) +',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + + +C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); +C +C Calculate src1,size M4_description src2,size, and store the result in +C dst,size. The return value is the carry bit from the top of the result +C (1 or 0). +C +C The _nc version accepts 1 or 0 for an initial carry into the low limb of +C the calculation. Note values other than 1 or 0 here will lead to garbage +C results. +C +C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and +C an in-place dst+=src to 2.5 c/l. The unrolled loops have 1 cycle/loop of +C loop control, which with 4 limbs/loop means an extra 0.25 c/l. + +define(PARAM_CARRY, `FRAME+20(%esp)') +define(PARAM_SIZE, `FRAME+16(%esp)') +define(PARAM_SRC2, `FRAME+12(%esp)') +define(PARAM_SRC1, `FRAME+8(%esp)') +define(PARAM_DST, `FRAME+4(%esp)') +deflit(`FRAME',0) + +dnl minimum 5 because the unrolled code can't handle less +deflit(UNROLL_THRESHOLD, 5) + + TEXT + ALIGN(32) + +PROLOGUE(M4_function_nc) + movl PARAM_CARRY, %eax + jmp L(start) +EPILOGUE() + + +PROLOGUE(M4_function_n) + xorl %eax, %eax +L(start): + movl PARAM_SIZE, %ecx + pushl %ebx +FRAME_pushl() + + movl PARAM_SRC1, %ebx + pushl %edi +FRAME_pushl() + + movl PARAM_SRC2, %edx + cmpl $UNROLL_THRESHOLD, %ecx + + movl PARAM_DST, %edi + jae L(unroll) + + + shrl %eax C initial carry flag + + C offset 0x21 here, close enough to aligned +L(simple): + C eax scratch + C ebx src1 + C ecx counter + C edx src2 + C esi + C edi dst + C ebp + C + C The store to (%edi) could be done with a stosl; it'd be smaller + C code, but there's no speed gain and a cld would have to be added + C (per mpn/x86/README). 
+ + movl (%ebx), %eax + leal 4(%ebx), %ebx + + M4_inst (%edx), %eax + + movl %eax, (%edi) + leal 4(%edi), %edi + + leal 4(%edx), %edx + loop L(simple) + + + movl $0, %eax + popl %edi + + setc %al + + popl %ebx + ret + + +C ----------------------------------------------------------------------------- +L(unroll): + C eax carry + C ebx src1 + C ecx counter + C edx src2 + C esi + C edi dst + C ebp + + cmpl %edi, %ebx + pushl %esi + + je L(inplace) + +ifdef(`OPERATION_add_n',` + cmpl %edi, %edx + + je L(inplace_reverse) +') + + movl %ecx, %esi + + andl $-4, %ecx + andl $3, %esi + + leal (%ebx,%ecx,4), %ebx + leal (%edx,%ecx,4), %edx + leal (%edi,%ecx,4), %edi + + negl %ecx + shrl %eax + + ALIGN(32) +L(normal_top): + C eax counter, qwords, negative + C ebx src1 + C ecx scratch + C edx src2 + C esi + C edi dst + C ebp + + movl (%ebx,%ecx,4), %eax + leal 5(%ecx), %ecx + M4_inst -20(%edx,%ecx,4), %eax + movl %eax, -20(%edi,%ecx,4) + + movl 4-20(%ebx,%ecx,4), %eax + M4_inst 4-20(%edx,%ecx,4), %eax + movl %eax, 4-20(%edi,%ecx,4) + + movl 8-20(%ebx,%ecx,4), %eax + M4_inst 8-20(%edx,%ecx,4), %eax + movl %eax, 8-20(%edi,%ecx,4) + + movl 12-20(%ebx,%ecx,4), %eax + M4_inst 12-20(%edx,%ecx,4), %eax + movl %eax, 12-20(%edi,%ecx,4) + + loop L(normal_top) + + + decl %esi + jz L(normal_finish_one) + js L(normal_done) + + C two or three more limbs + + movl (%ebx), %eax + M4_inst (%edx), %eax + movl %eax, (%edi) + + movl 4(%ebx), %eax + M4_inst 4(%edx), %eax + decl %esi + movl %eax, 4(%edi) + + jz L(normal_done) + movl $2, %ecx + +L(normal_finish_one): + movl (%ebx,%ecx,4), %eax + M4_inst (%edx,%ecx,4), %eax + movl %eax, (%edi,%ecx,4) + +L(normal_done): + popl %esi + popl %edi + + movl $0, %eax + popl %ebx + + setc %al + + ret + + +C ----------------------------------------------------------------------------- + +ifdef(`OPERATION_add_n',` +L(inplace_reverse): + C dst==src2 + + movl %ebx, %edx +') + +L(inplace): + C eax initial carry + C ebx + C ecx size + C edx src + C esi + C edi dst + C ebp + + leal -1(%ecx), %esi + decl %ecx + + andl $-4, %ecx + andl $3, %esi + + movl (%edx), %ebx C src low limb + leal (%edx,%ecx,4), %edx + + leal (%edi,%ecx,4), %edi + negl %ecx + + shrl %eax + + + ALIGN(32) +L(inplace_top): + C eax + C ebx next src limb + C ecx size + C edx src + C esi + C edi dst + C ebp + + M4_inst %ebx, (%edi,%ecx,4) + + movl 4(%edx,%ecx,4), %eax + leal 5(%ecx), %ecx + + M4_inst %eax, 4-20(%edi,%ecx,4) + + movl 8-20(%edx,%ecx,4), %eax + movl 12-20(%edx,%ecx,4), %ebx + + M4_inst %eax, 8-20(%edi,%ecx,4) + M4_inst %ebx, 12-20(%edi,%ecx,4) + + movl 16-20(%edx,%ecx,4), %ebx + loop L(inplace_top) + + + C now %esi is 0 to 3 representing respectively 1 to 4 limbs more + + M4_inst %ebx, (%edi) + + decl %esi + jz L(inplace_finish_one) + js L(inplace_done) + + C two or three more limbs + + movl 4(%edx), %eax + movl 8(%edx), %ebx + M4_inst %eax, 4(%edi) + M4_inst %ebx, 8(%edi) + + decl %esi + movl $2, %ecx + + jz L(normal_done) + +L(inplace_finish_one): + movl 4(%edx,%ecx,4), %eax + M4_inst %eax, 4(%edi,%ecx,4) + +L(inplace_done): + popl %esi + popl %edi + + movl $0, %eax + popl %ebx + + setc %al + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k6/aorsmul_1.asm b/gmp-6.3.0/mpn/x86/k6/aorsmul_1.asm new file mode 100644 index 0000000..eaa92eb --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k6/aorsmul_1.asm @@ -0,0 +1,391 @@ +dnl AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. + +dnl Copyright 1999-2003, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
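With the add/sub file complete, it is worth restating what it computes apart from all the scheduling detail: limb-wise addition with carry propagation. A hedged C reference for the add case (hypothetical name, 32-bit limbs; the subtract flavour is identical with borrows, sbbl in place of adcl):

    #include <stdint.h>

    uint32_t
    ref_add_n (uint32_t *dst, const uint32_t *src1,
               const uint32_t *src2, long n)
    {
      uint32_t cy = 0;                 /* carry in, 0 or 1 */
      for (long i = 0; i < n; i++)
        {
          uint32_t a = src1[i];
          uint32_t s = a + src2[i];
          uint32_t c1 = s < a;         /* carry out of a + b */
          s += cy;
          cy = c1 | (s < cy);          /* a+b+cy <= 2^33-1: one carry at most */
          dst[i] = s;
        }
      return cy;                       /* the value setc %al materialises */
    }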
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P5 +C P6 model 0-8,10-12 5.94 +C P6 model 9 (Banias) 5.51 +C P6 model 13 (Dothan) 5.57 +C P4 model 0 (Willamette) +C P4 model 1 (?) +C P4 model 2 (Northwood) +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C AMD K6 7.65-8.5 (data dependent) +C AMD K7 +C AMD K8 + + +dnl K6: large multipliers small multipliers +dnl UNROLL_COUNT cycles/limb cycles/limb +dnl 4 9.5 7.78 +dnl 8 9.0 7.78 +dnl 16 8.4 7.65 +dnl 32 8.4 8.2 +dnl +dnl Maximum possible unrolling with the current code is 32. +dnl +dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256 +dnl byte block, which might explain the good speed at that unrolling. + +deflit(UNROLL_COUNT, 16) + + +ifdef(`OPERATION_addmul_1', ` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + define(M4_function_1c, mpn_addmul_1c) +',`ifdef(`OPERATION_submul_1', ` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + define(M4_function_1c, mpn_submul_1c) +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) + + +C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C +C The jadcl0()s in the unrolled loop makes the speed data dependent. Small +C multipliers (most significant few bits clear) result in few carry bits and +C speeds up to 7.65 cycles/limb are attained. Large multipliers (most +C significant few bits set) make the carry bits 50/50 and lead to something +C more like 8.4 c/l. With adcl's both of these would be 9.3 c/l. +C +C It's important that the gains for jadcl0 on small multipliers don't come +C at the cost of slowing down other data. Tests on uniformly distributed +C random data, designed to confound branch prediction, show about a 7% +C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all +C overheads included). +C +C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus +C 11.0 cycles/limb), and hence isn't used. +C +C In the simple loop, note that running ecx from negative to zero and using +C it as an index in the two movs wouldn't help. 
It would save one
+C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
+C that would be collapsed by this.
+C
+C Attempts at a simpler main loop, with less unrolling, haven't yielded much
+C success, generally running over 9 c/l.
+C
+C
+C jadcl0
+C ------
+C
+C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
+C firstly the instruction decoding and secondly the fact that there's a
+C carry bit for the jadcl0 only on average about 1/4 of the time.
+C
+C The code in the unrolled loop decodes something like the following.
+C
+C                                     decode cycles
+C     mull %ebp                          2
+C     M4_inst %esi, disp(%edi)           1
+C     adcl %eax, %ecx                    2
+C     movl %edx, %esi         \          1
+C     jnc 1f                  /
+C     incl %esi               \          1
+C 1:  movl disp(%ebx), %eax   /
+C                                       ---
+C                                        7
+C
+C In a back-to-back style test this measures 7 with the jnc not taken, or 8
+C with it taken (both when correctly predicted). This is opposite to the
+C measurements showing small multipliers running faster than large ones.
+C Don't really know why.
+C
+C It's not clear how much branch misprediction might be costing. The K6
+C doco says it will be 1 to 4 cycles, but presumably it's near the low end
+C of that range to get the measured results.
+C
+C
+C In the code the two carries are more or less the preceding mul product and
+C the calculation is roughly
+C
+C     x*y + u*b+v
+C
+C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
+C v are the two limbs it's added to (being the low of the next mul, and a
+C limb from the destination).
+C
+C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
+C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
+C x*y/b^2. If x, y, u and v are random and uniformly distributed between 0
+C and b-1, then the total probability can be summed over x and y,
+C
+C      1    b-1 b-1 x*y    1    b*(b-1)   b*(b-1)
+C     --- * sum sum --- = --- * ------- * ------- = 1/4
+C     b^2   x=0 y=1 b^2   b^4      2         2
+C
+C Actually it's a very tiny bit less than 1/4 of course. If y is fixed,
+C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
+
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 9)
+',`
+deflit(UNROLL_THRESHOLD, 6)
+')
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(M4_function_1c)
+ pushl %esi
+deflit(`FRAME',4)
+ movl PARAM_CARRY, %esi
+ jmp L(start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function_1)
+ push %esi
+deflit(`FRAME',4)
+ xorl %esi, %esi C initial carry
+
+L(start_nc):
+ movl PARAM_SIZE, %ecx
+ pushl %ebx
+deflit(`FRAME',8)
+
+ movl PARAM_SRC, %ebx
+ pushl %edi
+deflit(`FRAME',12)
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_DST, %edi
+
+ pushl %ebp
+deflit(`FRAME',16)
+ jae L(unroll)
+
+
+ C simple loop
+
+ movl PARAM_MULTIPLIER, %ebp
+
+L(simple):
+ C eax scratch
+ C ebx src
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi dst
+ C ebp multiplier
+
+ movl (%ebx), %eax
+ addl $4, %ebx
+
+ mull %ebp
+
+ addl $4, %edi
+ addl %esi, %eax
+
+ adcl $0, %edx
+
+ M4_inst %eax, -4(%edi)
+
+ adcl $0, %edx
+
+ movl %edx, %esi
+ loop L(simple)
+
+
+ popl %ebp
+ popl %edi
+
+ popl %ebx
+ movl %esi, %eax
+
+ popl %esi
+ ret
+
+
+C -----------------------------------------------------------------------------
+C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop
+C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
+C For the computed jump an odd size means they start one way around, an even
+C size the other.
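The x*y + u*b + v calculation analysed above is easiest to see in a plain C reference for the addmul case (hypothetical name, 32-bit limbs assumed; submul subtracts from dst instead). The running carry cy below is what the unrolled code keeps split across ecx and esi:

    #include <stdint.h>

    uint32_t
    ref_addmul_1 (uint32_t *dst, const uint32_t *src, long n, uint32_t mult)
    {
      uint32_t cy = 0;                           /* carry limb */
      for (long i = 0; i < n; i++)
        {
          uint64_t p = (uint64_t) src[i] * mult; /* the mull */
          uint32_t lo = (uint32_t) p;
          uint32_t hi = (uint32_t) (p >> 32);
          lo += cy;
          hi += lo < cy;                         /* carry-in propagation */
          uint32_t d = dst[i] + lo;
          hi += d < lo;                          /* carry from the dst add */
          dst[i] = d;
          cy = hi;
        }
      return cy;    /* total < b^2, so the carry limb never overflows */
    }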
+C +C VAR_JUMP holds the computed jump temporarily because there's not enough +C registers at the point of doing the mul for the initial two carry limbs. +C +C The add/adc for the initial carry in %esi is necessary only for the +C mpn_addmul/submul_1c entry points. Duplicating the startup code to +C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good +C idea. + +dnl overlapping with parameters already fetched +define(VAR_COUNTER, `PARAM_SIZE') +define(VAR_JUMP, `PARAM_DST') + +L(unroll): + C eax + C ebx src + C ecx size + C edx + C esi initial carry + C edi dst + C ebp + + movl %ecx, %edx + decl %ecx + + subl $2, %edx + negl %ecx + + shrl $UNROLL_LOG2, %edx + andl $UNROLL_MASK, %ecx + + movl %edx, VAR_COUNTER + movl %ecx, %edx + + shll $4, %edx + negl %ecx + + C 15 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%edx,%ecx,1), %edx +') + movl (%ebx), %eax C src low limb + + movl PARAM_MULTIPLIER, %ebp + movl %edx, VAR_JUMP + + mull %ebp + + addl %esi, %eax C initial carry (from _1c) + jadcl0( %edx) + + + leal 4(%ebx,%ecx,4), %ebx + movl %edx, %esi C high carry + + movl VAR_JUMP, %edx + leal (%edi,%ecx,4), %edi + + testl $1, %ecx + movl %eax, %ecx C low carry + + jz L(noswap) + movl %esi, %ecx C high,low carry other way around + + movl %eax, %esi +L(noswap): + + jmp *%edx + + +ifdef(`PIC',` +L(pic_calc): + C See mpn/x86/README about old gas bugs + leal (%edx,%ecx,1), %edx + addl $L(entry)-L(here), %edx + addl (%esp), %edx + ret_internal +') + + +C ----------------------------------------------------------- + ALIGN(32) +L(top): +deflit(`FRAME',16) + C eax scratch + C ebx src + C ecx carry lo + C edx scratch + C esi carry hi + C edi dst + C ebp multiplier + C + C 15 code bytes per limb + + leal UNROLL_BYTES(%edi), %edi + +L(entry): +forloop(`i', 0, UNROLL_COUNT/2-1, ` + deflit(`disp0', eval(2*i*4)) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%ebx), %eax) + mull %ebp +Zdisp( M4_inst,%ecx, disp0,(%edi)) + adcl %eax, %esi + movl %edx, %ecx + jadcl0( %ecx) + + movl disp1(%ebx), %eax + mull %ebp + M4_inst %esi, disp1(%edi) + adcl %eax, %ecx + movl %edx, %esi + jadcl0( %esi) +') + + decl VAR_COUNTER + + leal UNROLL_BYTES(%ebx), %ebx + jns L(top) + + + popl %ebp + M4_inst %ecx, UNROLL_BYTES(%edi) + + popl %edi + movl %esi, %eax + + popl %ebx + jadcl0( %eax) + + popl %esi + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k6/cross.pl b/gmp-6.3.0/mpn/x86/k6/cross.pl new file mode 100755 index 0000000..fc921a5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k6/cross.pl @@ -0,0 +1,182 @@ +#! /usr/bin/perl + +# Copyright 2000, 2001 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of either: +# +# * the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# or +# +# * the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# or both in parallel, as here. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. 
+#
+# You should have received copies of the GNU General Public License and the
+# GNU Lesser General Public License along with the GNU MP Library. If not,
+# see https://www.gnu.org/licenses/.
+
+
+# Usage: cross.pl [filename.o]...
+#
+# Produce an annotated disassembly of the given object files, indicating
+# certain code alignment and addressing mode problems afflicting K6 chips.
+# "ZZ" is used on all annotations, so this can be searched for.
+#
+# With no arguments, all .o files corresponding to .asm files are processed.
+# This is good in the mpn object directory of a k6*-*-* build.
+#
+# Code alignments of 8 bytes or more are handled. When 32 is used, cache
+# line boundaries will fall in at offsets 0x20,0x40,etc and problems are
+# flagged at those locations. When 16 is used, the line boundaries can also
+# fall at offsets 0x10,0x30,0x50,etc, depending where the file is loaded, so
+# problems are identified there too. Likewise when 8 byte alignment is used
+# problems are flagged additionally at 0x08,0x18,0x28,etc.
+#
+# Usually 32 byte alignment is used for k6 routines, but less is certainly
+# possible if through good luck, or a little tweaking, cache line crossing
+# problems can be avoided at the extra locations.
+#
+# Bugs:
+#
+# Instructions without mod/rm bytes or which are already vector decoded are
+# unaffected by cache line boundary crossing, but not all of these have yet
+# been put in as exceptions. All that occur in practice in GMP are present
+# though.
+#
+# There are no messages for using the vector decoded addressing mode (%esi),
+# but that's easy to avoid when coding.
+#
+# Future:
+#
+# Warn about jump targets that are poorly aligned (less than 2 instructions
+# before a cache line boundary).
+
+use strict;
+
+sub disassemble {
+  my ($file) = @_;
+  my ($addr,$b1,$b2,$b3, $prefix,$opcode,$modrm);
+  my $align;
+
+  open (IN, "objdump -Srfh $file |")
+    || die "Cannot open pipe from objdump\n";
+  while (<IN>) {
+    print;
+
+    if (/^[ \t]*[0-9]+[ \t]+\.text[ \t]/ && /2\*\*([0-9]+)$/) {
+      $align = 1 << $1;
+      if ($align < 8) {
+        print "ZZ cross.pl cannot handle alignment < 2**3\n";
+        $align = 8
+      }
+    }
+
+    if (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)[ \t]+([0-9a-f]+)/) {
+      ($addr,$b1,$b2,$b3) = ($1,$2,$3,$4);
+
+    } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)/) {
+      ($addr,$b1,$b2,$b3) = ($1,$2,$3,'');
+
+    } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)/) {
+      ($addr,$b1,$b2,$b3) = ($1,$2,'','');
+
+    } else {
+      next;
+    }
+
+    if ($b1 =~ /0f/) {
+      $prefix = $b1;
+      $opcode = $b2;
+      $modrm = $b3;
+    } else {
+      $prefix = '';
+      $opcode = $b1;
+      $modrm = $b2;
+    }
+
+    # modrm of the form 00-xxx-100 with an 0F prefix is the problem case
+    # for K6 and pre-CXT K6-2
+    if ($prefix =~ /0f/
+        && $opcode !~ /^8/          # jcond disp32
+        && $modrm =~ /^[0-3][4c]/) {
+      print "ZZ ($file) >3 bytes to determine instruction length [K6]\n";
+    }
+
+    # with just an opcode, starting 1f mod 20h
+    if (($align==32 && $addr =~ /[13579bdf]f$/
+         || $align==16 && $addr =~ /f$/
+         || $align==8 && $addr =~ /[7f]$/)
+        && $prefix !~ /0f/
+        && $opcode !~ /1[012345]/   # adc
+        && $opcode !~ /1[89abcd]/   # sbb
+        && $opcode !~ /^4/          # inc/dec reg
+        && $opcode !~ /^5/          # push/pop reg
+        && $opcode !~ /68/          # push $imm32
+        && $opcode !~ /^7/          # jcond disp8
+        && $opcode !~ /a[89]/       # test+imm
+        && $opcode !~ /a[a-f]/      # stos/lods/scas
+        && $opcode !~ /b8/          # movl $imm32,%eax
+        && $opcode !~ /d[0123]/     # rcl
+        && $opcode !~ /e[0123]/     # loop/loopz/loopnz/jcxz
+        && $opcode !~ /e8/          # call disp32
+        && $opcode !~
/e[9b]/ # jmp disp32/disp8 + && $opcode !~ /f[89abcd]/ # clc,stc,cli,sti,cld,std + && !($opcode =~ /f[67]/ # grp 1 + && $modrm =~ /^[2367abef]/) # mul, imul, div, idiv + && $modrm !~ /^$/) { + print "ZZ ($file) opcode/modrm cross 32-byte boundary\n"; + } + + # with an 0F prefix, anything starting at 1f mod 20h + if (($align==32 && $addr =~ /[13579bdf][f]$/ + || $align==16 && $addr =~ /f$/ + || $align==8 && $addr =~ /[7f]$/) + && $prefix =~ /0f/ + && $opcode !~ /af/ # imul + && $opcode !~ /a[45]/ # shldl + && $opcode !~ /a[cd]/ # shrdl + ) { + print "ZZ ($file) prefix/opcode cross 32-byte boundary\n"; + } + + # with an 0F prefix, anything with mod/rm starting at 1e mod 20h + if (($align==32 && $addr =~ /[13579bdf][e]$/ + || $align==16 && $addr =~ /[e]$/ + || $align==8 && $addr =~ /[6e]$/) + && $prefix =~ /0f/ + && $opcode !~ /^8/ # jcond disp32 + && $opcode !~ /af/ # imull reg,reg + && $opcode !~ /a[45]/ # shldl + && $opcode !~ /a[cd]/ # shrdl + && $modrm !~ /^$/) { + print "ZZ ($file) prefix/opcode/modrm cross 32-byte boundary\n"; + } + } + close IN || die "Error from objdump (or objdump not available)\n"; +} + + +my @files; +if ($#ARGV >= 0) { + @files = @ARGV; +} else { + @files = glob "*.asm"; + map {s/.asm/.o/} @files; +} + +foreach (@files) { + disassemble($_); +} diff --git a/gmp-6.3.0/mpn/x86/k6/divrem_1.asm b/gmp-6.3.0/mpn/x86/k6/divrem_1.asm new file mode 100644 index 0000000..b4cea4f --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k6/divrem_1.asm @@ -0,0 +1,203 @@ +dnl AMD K6 mpn_divrem_1 -- mpn by limb division. + +dnl Copyright 1999-2003, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K6: 20 cycles/limb + + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t carry); +C +C The code here is basically the same as mpn/x86/divrem_1.asm, but uses loop +C instead of decl+jnz, since it comes out 2 cycles/limb faster. +C +C A test is done to see if the high limb is less than the divisor, and if so +C one less div is done. A div is 20 cycles, so assuming high= 0 and + C and leaves 0 to 3 which can be tested with test $1 and $2. 
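For reference, the operation mpn_divrem_1 implements (the integer part, xsize = 0) is one machine divide per limb, most significant limb first, with the remainder rippling down -- each iteration being the roughly 20-cycle divl the timing comment above counts. A hedged C version, hypothetical name and 32-bit limbs assumed:

    #include <stdint.h>

    uint32_t
    ref_divrem_1 (uint32_t *qp, const uint32_t *up, long n, uint32_t d)
    {
      uint64_t r = 0;                     /* running remainder, r < d */
      for (long i = n - 1; i >= 0; i--)
        {
          uint64_t x = (r << 32) | up[i]; /* r < d ensures x/d fits a limb */
          qp[i] = (uint32_t) (x / d);
          r = x % d;
        }
      return (uint32_t) r;
    }

The high-limb test mentioned above falls out of this shape: when up[n-1] < d the first quotient limb is zero and that divide can be skipped.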
+ +L(top): + C eax counter, -(size-7) step by +4 until >=0 + C ebx src end - 32 + C ecx dst end - 32 + C edx retval + C + C mm0 src next qword + C mm1 scratch + C mm2 src prev qword + C mm6 shift + C mm7 64-shift + + psrlq %mm6, %mm2 + addl $4, %eax + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + por %mm0, %mm2 + movq 4(%ebx,%eax,4), %mm0 + + psrlq %mm6, %mm1 + movq %mm2, -12(%ecx,%eax,4) + + movq %mm0, %mm2 + psllq %mm7, %mm0 + + por %mm0, %mm1 + movq 12(%ebx,%eax,4), %mm0 + + movq %mm1, -4(%ecx,%eax,4) + ja L(top) C jump if no carry and not zero + + + + C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0 + C to 3 representing respectively 3 to 0 further limbs. + + testl $2, %eax C testl to avoid bad cache line crossings + jnz L(finish_nottwo) + + C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0 + C becomes new mm2 and a new mm0 is loaded. + + psrlq %mm6, %mm2 + movq %mm0, %mm1 + + psllq %mm7, %mm0 + addl $2, %eax + + por %mm0, %mm2 + movq 12(%ebx,%eax,4), %mm0 + + movq %mm2, -4(%ecx,%eax,4) + movq %mm1, %mm2 +L(finish_nottwo): + + + testb $1, %al + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + por %mm0, %mm2 + psrlq %mm6, %mm1 + + movq %mm2, 4(%ecx,%eax,4) + jnz L(finish_even) + + + C one further extra limb to process + + movd 32-4(%ebx), %mm0 C src[size-1], most significant limb + popl %ebx + + movq %mm0, %mm2 + psllq %mm7, %mm0 + + por %mm0, %mm1 + psrlq %mm6, %mm2 + + movq %mm1, 32-12(%ecx) C dst[size-3,size-2] + movd %mm2, 32-4(%ecx) C dst[size-1] + + movl %edx, %eax C retval + + femms + ret + + + nop C avoid bad cache line crossing +L(finish_even): + C no further extra limbs + + movq %mm1, 32-8(%ecx) C dst[size-2,size-1] + movl %edx, %eax C retval + + popl %ebx + + femms + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k6/mmx/com.asm b/gmp-6.3.0/mpn/x86/k6/mmx/com.asm new file mode 100644 index 0000000..b747454 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k6/mmx/com.asm @@ -0,0 +1,103 @@ +dnl AMD K6-2 mpn_com -- mpn bitwise one's complement. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +NAILS_SUPPORT(0-31) + + +C alignment dst/src, A=0mod8 N=4mod8 +C A/A A/N N/A N/N +C K6-2 1.0 1.18 1.18 1.18 cycles/limb +C K6 1.5 1.85 1.75 1.85 + + +C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Take the bitwise ones-complement of src,size and write it to dst,size. 
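The same operation in C, for reference (hypothetical name; numb_mask is all ones in the normal no-nails build, and otherwise has the nail bits clear, which is what the psrld adjustment of mm7 below arranges):

    #include <stdint.h>

    void
    ref_com (uint32_t *dst, const uint32_t *src, long n, uint32_t numb_mask)
    {
      for (long i = 0; i < n; i++)
        dst[i] = ~src[i] & numb_mask;  /* pxor against all-ones == complement */
    }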
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(16) +PROLOGUE(mpn_com) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %eax + movl PARAM_DST, %edx + shrl %ecx + jnz L(two_or_more) + + movl (%eax), %eax + notl_or_xorl_GMP_NUMB_MASK( %eax) + movl %eax, (%edx) + ret + + +L(two_or_more): + pushl %ebx FRAME_pushl() + pcmpeqd %mm7, %mm7 C all ones + + movl %ecx, %ebx +ifelse(GMP_NAIL_BITS,0,, +` psrld $GMP_NAIL_BITS, %mm7') C clear nails + + + + ALIGN(8) +L(top): + C eax src + C ebx floor(size/2) + C ecx counter + C edx dst + C + C mm0 scratch + C mm7 mask + + movq -8(%eax,%ecx,8), %mm0 + pxor %mm7, %mm0 + movq %mm0, -8(%edx,%ecx,8) + loop L(top) + + + jnc L(no_extra) + movl (%eax,%ebx,8), %eax + notl_or_xorl_GMP_NUMB_MASK( %eax) + movl %eax, (%edx,%ebx,8) +L(no_extra): + + popl %ebx + emms_or_femms + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm b/gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm new file mode 100644 index 0000000..1bbad3a --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm @@ -0,0 +1,282 @@ +dnl AMD K6 mpn_divexact_1 -- mpn by limb exact division. + +dnl Copyright 2000-2002, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C divisor +C odd even +C K6: 10.0 12.0 cycles/limb +C K6-2: 10.0 11.5 + + +C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C +C A simple divl is used for size==1. This is about 10 cycles faster for an +C odd divisor or 20 cycles for an even divisor. +C +C The loops are quite sensitive to code alignment, speeds should be +C rechecked (odd and even divisor, pic and non-pic) if contemplating +C changing anything. 
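The routine divides by stripping the twos from the divisor and then multiplying by the inverse of the odd part modulo 2^32. That inverse comes from an 8-bit table lookup lifted by two Newton steps, each inv = 2*inv - inv*inv*d doubling the number of correct low bits (if d*inv == 1 mod 2^k, then d*(2*inv - inv*inv*d) == 1 mod 2^(2k)). A C sketch of the lifting, assuming table[] holds the 8-bit inverses of the odd bytes indexed by (d/2) & 127, which is how binvert_limb_table is consulted in the code below:

    #include <stdint.h>

    /* d must be odd; returns inv with d*inv == 1 (mod 2^32). */
    uint32_t binvert32(uint32_t d, const uint8_t table[128])
    {
        uint32_t inv = table[(d / 2) & 127];   /* 8 correct bits  */
        inv = 2 * inv - inv * inv * d;         /* 16 correct bits */
        inv = 2 * inv - inv * inv * d;         /* 32 correct bits */
        return inv;
    }

Each quotient limb is then formed as (source limb minus carries) * inv mod 2^32, with the mull against the divisor producing the carry limb for the next step; that is what the odd and even loops below implement.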
+ +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(VAR_INVERSE,`PARAM_DST') + + TEXT + + ALIGN(32) +PROLOGUE(mpn_divexact_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + + movl PARAM_SRC, %eax + xorl %edx, %edx + + cmpl $1, %ecx + jnz L(two_or_more) + + movl (%eax), %eax + + divl PARAM_DIVISOR + + movl PARAM_DST, %ecx + movl %eax, (%ecx) + + ret + + +L(two_or_more): + movl PARAM_DIVISOR, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + pushl %ebp FRAME_pushl() + +L(strip_twos): + shrl %eax + incl %edx C will get shift+1 + + jnc L(strip_twos) + pushl %esi FRAME_pushl() + + leal 1(%eax,%eax), %esi C d without twos + andl $127, %eax C d/2, 7 bits + +ifdef(`PIC',` + LEA( binvert_limb_table, %ebp) +Zdisp( movzbl, 0,(%eax,%ebp), %eax) +',` + movzbl binvert_limb_table(%eax), %eax C inv 8 bits +') + pushl %edi FRAME_pushl() + + leal (%eax,%eax), %ebp C 2*inv + + imull %eax, %eax C inv*inv + + movl PARAM_DST, %edi + + imull %esi, %eax C inv*inv*d + + subl %eax, %ebp C inv = 2*inv - inv*inv*d + leal (%ebp,%ebp), %eax C 2*inv + + imull %ebp, %ebp C inv*inv + + movl %esi, PARAM_DIVISOR C d without twos + leal (%ebx,%ecx,4), %ebx C src end + + imull %esi, %ebp C inv*inv*d + + leal (%edi,%ecx,4), %edi C dst end + negl %ecx C -size + + subl %ebp, %eax C inv = 2*inv - inv*inv*d + subl $1, %edx C shift amount, and clear carry + + ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + pushl %eax FRAME_pushl() + imull PARAM_DIVISOR, %eax + cmpl $1, %eax + popl %eax FRAME_popl()') + + movl %eax, VAR_INVERSE + jnz L(even) + + movl (%ebx,%ecx,4), %esi C src low limb + jmp L(odd_entry) + + + ALIGN(16) + nop C code alignment +L(odd_top): + C eax scratch + C ebx src end + C ecx counter, limbs, negative + C edx inverse + C esi next limb, adjusted for carry + C edi dst end + C ebp carry bit, 0 or -1 + + imull %edx, %esi + + movl PARAM_DIVISOR, %eax + movl %esi, -4(%edi,%ecx,4) + + mull %esi C carry limb in edx + + subl %ebp, %edx C apply carry bit + movl (%ebx,%ecx,4), %esi + +L(odd_entry): + subl %edx, %esi C apply carry limb + movl VAR_INVERSE, %edx + + sbbl %ebp, %ebp C 0 or -1 + + incl %ecx + jnz L(odd_top) + + + imull %edx, %esi + + movl %esi, -4(%edi,%ecx,4) + + popl %edi + popl %esi + + popl %ebp + popl %ebx + + ret + + +L(even): + C eax + C ebx src end + C ecx -size + C edx twos + C esi + C edi dst end + C ebp + + xorl %ebp, %ebp +Zdisp( movq, 0,(%ebx,%ecx,4), %mm0) C src[0,1] + + movd %edx, %mm7 + movl VAR_INVERSE, %edx + + addl $2, %ecx + psrlq %mm7, %mm0 + + movd %mm0, %esi + jz L(even_two) C if only two limbs + + +C Out-of-order execution is good enough to hide the load/rshift/movd +C latency. Having imul at the top of the loop gives 11.5 c/l instead of 12, +C on K6-2. In fact there's only 11 of decode, but nothing running at 11 has +C been found. Maybe the fact every second movq is unaligned costs the extra +C 0.5. 
+ +L(even_top): + C eax scratch + C ebx src end + C ecx counter, limbs, negative + C edx inverse + C esi next limb, adjusted for carry + C edi dst end + C ebp carry bit, 0 or -1 + C + C mm0 scratch, source limbs + C mm7 twos + + imull %edx, %esi + + movl %esi, -8(%edi,%ecx,4) + movl PARAM_DIVISOR, %eax + + mull %esi C carry limb in edx + + movq -4(%ebx,%ecx,4), %mm0 + psrlq %mm7, %mm0 + + movd %mm0, %esi + subl %ebp, %edx C apply carry bit + + subl %edx, %esi C apply carry limb + movl VAR_INVERSE, %edx + + sbbl %ebp, %ebp C 0 or -1 + + incl %ecx + jnz L(even_top) + + +L(even_two): + movd -4(%ebx), %mm0 C src high limb + psrlq %mm7, %mm0 + + imull %edx, %esi + + movl %esi, -8(%edi) + movl PARAM_DIVISOR, %eax + + mull %esi C carry limb in edx + + movd %mm0, %esi + subl %ebp, %edx C apply carry bit + + movl VAR_INVERSE, %eax + subl %edx, %esi C apply carry limb + + imull %eax, %esi + + movl %esi, -4(%edi) + + popl %edi + popl %esi + + popl %ebp + popl %ebx + + emms_or_femms + + ret + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm b/gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm new file mode 100644 index 0000000..e17930b --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm @@ -0,0 +1,226 @@ +dnl AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n, +dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +NAILS_SUPPORT(0-31) + + +C alignment dst/src1/src2, A=0mod8, N=4mod8 +C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N +C +C K6-2 1.2 1.5 1.5 1.2 1.2 1.5 1.5 1.2 and,andn,ior,xor +C K6-2 1.5 1.75 2.0 1.75 1.75 2.0 1.75 1.5 iorn,xnor +C K6-2 1.75 2.0 2.0 2.0 2.0 2.0 2.0 1.75 nand,nior +C +C K6 1.5 1.68 1.75 1.2 1.75 1.75 1.68 1.5 and,andn,ior,xor +C K6 2.0 2.0 2.25 2.25 2.25 2.25 2.0 2.0 iorn,xnor +C K6 2.0 2.25 2.25 2.25 2.25 2.25 2.25 2.0 nand,nior + + +dnl M4_p and M4_i are the MMX and integer instructions +dnl M4_*_neg_dst means whether to negate the final result before writing +dnl M4_*_neg_src2 means whether to negate the src2 values before using them + +define(M4_choose_op, +m4_assert_numargs(7) +`ifdef(`OPERATION_$1',` +define(`M4_function', `mpn_$1') +define(`M4_operation', `$1') +define(`M4_p', `$2') +define(`M4_p_neg_dst', `$3') +define(`M4_p_neg_src2',`$4') +define(`M4_i', `$5') +define(`M4_i_neg_dst', `$6') +define(`M4_i_neg_src2',`$7') +')') + +dnl xnor is done in "iorn" style because it's a touch faster than "nior" +dnl style (the two are equivalent for xor). +dnl +dnl pandn can't be used with nails. + +M4_choose_op( and_n, pand,0,0, andl,0,0) +ifelse(GMP_NAIL_BITS,0, +`M4_choose_op(andn_n, pandn,0,0, andl,0,1)', +`M4_choose_op(andn_n, pand,0,1, andl,0,1)') +M4_choose_op( nand_n, pand,1,0, andl,1,0) +M4_choose_op( ior_n, por,0,0, orl,0,0) +M4_choose_op( iorn_n, por,0,1, orl,0,1) +M4_choose_op( nior_n, por,1,0, orl,1,0) +M4_choose_op( xor_n, pxor,0,0, xorl,0,0) +M4_choose_op( xnor_n, pxor,0,1, xorl,0,1) + +ifdef(`M4_function',, +`m4_error(`Unrecognised or undefined OPERATION symbol +')') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + + +C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C +C Do src1,size M4_operation src2,size, storing the result in dst,size. +C +C Unaligned movq loads and stores are a bit slower than aligned ones. The +C test at the start of the routine checks the alignment of src1 and if +C necessary processes one limb separately at the low end to make it aligned. +C +C The raw speeds without this alignment switch are as follows. +C +C alignment dst/src1/src2, A=0mod8, N=4mod8 +C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N +C +C K6 1.5 2.0 1.5 2.0 and,andn,ior,xor +C K6 1.75 2.2 2.0 2.28 iorn,xnor +C K6 2.0 2.25 2.35 2.28 nand,nior +C +C +C Future: +C +C K6 can do one 64-bit load per cycle so each of these routines should be +C able to approach 1.0 c/l, if aligned. The basic and/andn/ior/xor might be +C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs. +C The others are 4 instructions per 2 limbs, and so can only approach 1.0 +C because there's nowhere to hide some loop control. 
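All eight entry points expanded from this file are one base AND/OR/XOR plus an optional complement of src2 going in and/or of the result coming out; that is precisely what the M4_*_neg_src2 and M4_*_neg_dst flags in the table above select (pandn covers the andn case directly when there are no nails). A hypothetical C helper making the semantics explicit:

    #include <stdint.h>

    typedef uint32_t limb;

    enum base_op { OP_AND, OP_IOR, OP_XOR };

    static limb logop(enum base_op op, int neg_src2, int neg_dst,
                      limb a, limb b)
    {
        if (neg_src2)
            b = ~b;
        limb r = op == OP_AND ? (a & b)
               : op == OP_IOR ? (a | b)
               :                (a ^ b);
        return neg_dst ? ~r : r;
    }

    /* andn: logop(OP_AND,1,0,..)   nand: logop(OP_AND,0,1,..)
       iorn: logop(OP_IOR,1,0,..)   nior: logop(OP_IOR,0,1,..)
       xnor: logop(OP_XOR,1,0,..), equivalent to the neg_dst form,
       as the "iorn style" note above says. */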
+ +defframe(PARAM_SIZE,16) +defframe(PARAM_SRC2,12) +defframe(PARAM_SRC1,8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + TEXT + ALIGN(32) +PROLOGUE(M4_function) + movl PARAM_SIZE, %ecx + pushl %ebx FRAME_pushl() + + movl PARAM_SRC1, %eax + + movl PARAM_SRC2, %ebx + cmpl $1, %ecx + + movl PARAM_DST, %edx + ja L(two_or_more) + + + movl (%ebx), %ecx + popl %ebx +ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ecx)') + M4_i (%eax), %ecx +ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ecx)') + movl %ecx, (%edx) + + ret + + +L(two_or_more): + C eax src1 + C ebx src2 + C ecx size + C edx dst + C esi + C edi + C ebp + + pushl %esi FRAME_pushl() + testl $4, %eax + jz L(alignment_ok) + + movl (%ebx), %esi + addl $4, %ebx +ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %esi)') + M4_i (%eax), %esi + addl $4, %eax +ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %esi)') + movl %esi, (%edx) + addl $4, %edx + decl %ecx + +L(alignment_ok): + movl %ecx, %esi + shrl %ecx + jnz L(still_two_or_more) + + movl (%ebx), %ecx + popl %esi +ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ecx)') + M4_i (%eax), %ecx +ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ecx)') + popl %ebx + movl %ecx, (%edx) + ret + + +L(still_two_or_more): +ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,` + pcmpeqd %mm7, %mm7 C all ones +ifelse(GMP_NAIL_BITS,0,,`psrld $GMP_NAIL_BITS, %mm7') C clear nails +') + + ALIGN(16) +L(top): + C eax src1 + C ebx src2 + C ecx counter + C edx dst + C esi + C edi + C ebp + C + C carry bit is low of size + + movq -8(%ebx,%ecx,8), %mm0 +ifelse(M4_p_neg_src2,1,`pxor %mm7, %mm0') + M4_p -8(%eax,%ecx,8), %mm0 +ifelse(M4_p_neg_dst,1,` pxor %mm7, %mm0') + movq %mm0, -8(%edx,%ecx,8) + + loop L(top) + + + jnc L(no_extra) + + movl -4(%ebx,%esi,4), %ebx +ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ebx)') + M4_i -4(%eax,%esi,4), %ebx +ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ebx)') + movl %ebx, -4(%edx,%esi,4) +L(no_extra): + + popl %esi + popl %ebx + emms_or_femms + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm b/gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm new file mode 100644 index 0000000..45be582 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm @@ -0,0 +1,130 @@ +dnl AMD K6 mpn_lshift -- mpn left shift. + +dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C K6: 3.0 cycles/limb + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx +C instructions. This is despite every second fetch being unaligned. + + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(32) + +PROLOGUE(mpn_lshift) +deflit(`FRAME',0) + + C The 1 limb case can be done without the push %ebx, but it's then + C still the same speed. The push is left as a free helping hand for + C the two_or_more code. + + movl PARAM_SIZE, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + decl %eax + + movl PARAM_SHIFT, %ecx + jnz L(two_or_more) + + movl (%ebx), %edx C src limb + movl PARAM_DST, %ebx + + shldl( %cl, %edx, %eax) C return value + + shll %cl, %edx + + movl %edx, (%ebx) C dst limb + popl %ebx + + ret + + + ALIGN(16) C avoid offset 0x1f + nop C avoid bad cache line crossing +L(two_or_more): + C eax size-1 + C ebx src + C ecx shift + C edx + + movl (%ebx,%eax,4), %edx C src high limb + negl %ecx + + movd PARAM_SHIFT, %mm6 + addl $32, %ecx C 32-shift + + shrl %cl, %edx + + movd %ecx, %mm7 + movl PARAM_DST, %ecx + +L(top): + C eax counter, size-1 to 1 + C ebx src + C ecx dst + C edx retval + C + C mm0 scratch + C mm6 shift + C mm7 32-shift + + movq -4(%ebx,%eax,4), %mm0 + decl %eax + + psrlq %mm7, %mm0 + + movd %mm0, 4(%ecx,%eax,4) + jnz L(top) + + + movd (%ebx), %mm0 + popl %ebx + + psllq %mm6, %mm0 + movl %edx, %eax + + movd %mm0, (%ecx) + + emms + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k6/mmx/popham.asm b/gmp-6.3.0/mpn/x86/k6/mmx/popham.asm new file mode 100644 index 0000000..2b19d0b --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k6/mmx/popham.asm @@ -0,0 +1,236 @@ +dnl AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and +dnl hamming distance. + +dnl Copyright 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C popcount hamdist +C K6-2: 9.0 11.5 cycles/limb +C K6: 12.5 13.0 + + +C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); +C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size); +C +C The code here isn't optimal, but it's already a 2x speedup over the plain +C integer mpn/generic/popcount.c,hamdist.c. 
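The loop below is the classic parallel (SWAR) bit count done two limbs at a time in an MMX register: fold into 2-bit sums, then 4-bit sums, then byte, word and finally dword sums, using the four masks held in mm4-mm7. The same ladder for a single 32-bit limb, as a C sketch (hamdist is the identical ladder applied to src[i] ^ src2[i], which is what the pxor at the top of the loop provides):

    #include <stdint.h>

    unsigned popcount32(uint32_t x)
    {
        x -= (x & 0xAAAAAAAA) >> 1;                       /* 2-bit sums */
        x = (x & 0x33333333) + ((x >> 2) & 0x33333333);   /* 4-bit sums */
        x = (x & 0x0F0F0F0F) + ((x >> 4) & 0x0F0F0F0F);   /* byte sums  */
        x += x >> 8;                                      /* word sums  */
        x += x >> 16;                                     /* dword sum  */
        return x & 0xFF;
    }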
+ + +ifdef(`OPERATION_popcount',, +`ifdef(`OPERATION_hamdist',, +`m4_error(`Need OPERATION_popcount or OPERATION_hamdist +')m4exit(1)')') + +define(HAM, +m4_assert_numargs(1) +`ifdef(`OPERATION_hamdist',`$1')') + +define(POP, +m4_assert_numargs(1) +`ifdef(`OPERATION_popcount',`$1')') + +HAM(` +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC2, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_hamdist) +') +POP(` +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_popcount) +') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) + + +ifdef(`PIC',,` + dnl non-PIC + + RODATA + ALIGN(8) + +L(rodata_AAAAAAAAAAAAAAAA): + .long 0xAAAAAAAA + .long 0xAAAAAAAA + +L(rodata_3333333333333333): + .long 0x33333333 + .long 0x33333333 + +L(rodata_0F0F0F0F0F0F0F0F): + .long 0x0F0F0F0F + .long 0x0F0F0F0F + +L(rodata_000000FF000000FF): + .long 0x000000FF + .long 0x000000FF +') + + TEXT + ALIGN(32) + +POP(`ifdef(`PIC', ` + C avoid shrl crossing a 32-byte boundary + nop')') + +PROLOGUE(M4_function) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + +ifdef(`PIC',` + movl $0xAAAAAAAA, %eax + movl $0x33333333, %edx + + movd %eax, %mm7 + movd %edx, %mm6 + + movl $0x0F0F0F0F, %eax + movl $0x000000FF, %edx + + punpckldq %mm7, %mm7 + punpckldq %mm6, %mm6 + + movd %eax, %mm5 + movd %edx, %mm4 + + punpckldq %mm5, %mm5 + punpckldq %mm4, %mm4 +',` + + movq L(rodata_AAAAAAAAAAAAAAAA), %mm7 + movq L(rodata_3333333333333333), %mm6 + movq L(rodata_0F0F0F0F0F0F0F0F), %mm5 + movq L(rodata_000000FF000000FF), %mm4 +') + +define(REG_AAAAAAAAAAAAAAAA, %mm7) +define(REG_3333333333333333, %mm6) +define(REG_0F0F0F0F0F0F0F0F, %mm5) +define(REG_000000FF000000FF, %mm4) + + + movl PARAM_SRC, %eax +HAM(` movl PARAM_SRC2, %edx') + + pxor %mm2, %mm2 C total + + shrl %ecx + jnc L(top) + +Zdisp( movd, 0,(%eax,%ecx,8), %mm1) + +HAM(` +Zdisp( movd, 0,(%edx,%ecx,8), %mm0) + pxor %mm0, %mm1 +') + + incl %ecx + jmp L(loaded) + + + ALIGN(16) +POP(` nop C alignment to avoid crossing 32-byte boundaries') + +L(top): + C eax src + C ebx + C ecx counter, qwords, decrementing + C edx [hamdist] src2 + C + C mm0 (scratch) + C mm1 (scratch) + C mm2 total (low dword) + C mm3 + C mm4 \ + C mm5 | special constants + C mm6 | + C mm7 / + + movq -8(%eax,%ecx,8), %mm1 +HAM(` pxor -8(%edx,%ecx,8), %mm1') + +L(loaded): + movq %mm1, %mm0 + pand REG_AAAAAAAAAAAAAAAA, %mm1 + + psrlq $1, %mm1 +HAM(` nop C code alignment') + + psubd %mm1, %mm0 C bit pairs +HAM(` nop C code alignment') + + + movq %mm0, %mm1 + psrlq $2, %mm0 + + pand REG_3333333333333333, %mm0 + pand REG_3333333333333333, %mm1 + + paddd %mm1, %mm0 C nibbles + + + movq %mm0, %mm1 + psrlq $4, %mm0 + + pand REG_0F0F0F0F0F0F0F0F, %mm0 + pand REG_0F0F0F0F0F0F0F0F, %mm1 + + paddd %mm1, %mm0 C bytes + + movq %mm0, %mm1 + psrlq $8, %mm0 + + + paddb %mm1, %mm0 C words + + + movq %mm0, %mm1 + psrlq $16, %mm0 + + paddd %mm1, %mm0 C dwords + + pand REG_000000FF000000FF, %mm0 + + paddd %mm0, %mm2 C low to total + psrlq $32, %mm0 + + paddd %mm0, %mm2 C high to total + loop L(top) + + + + movd %mm2, %eax + emms_or_femms + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm b/gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm new file mode 100644 index 0000000..cd0382f --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm @@ -0,0 +1,130 @@ +dnl AMD K6 mpn_rshift -- mpn right shift. + +dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K6: 3.0 cycles/limb + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx +C instructions. This is despite every second fetch being unaligned. + + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + TEXT + ALIGN(32) + +PROLOGUE(mpn_rshift) +deflit(`FRAME',0) + + C The 1 limb case can be done without the push %ebx, but it's then + C still the same speed. The push is left as a free helping hand for + C the two_or_more code. + + movl PARAM_SIZE, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + decl %eax + + movl PARAM_SHIFT, %ecx + jnz L(two_or_more) + + movl (%ebx), %edx C src limb + movl PARAM_DST, %ebx + + shrdl( %cl, %edx, %eax) C return value + + shrl %cl, %edx + + movl %edx, (%ebx) C dst limb + popl %ebx + + ret + + + ALIGN(16) C avoid offset 0x1f +L(two_or_more): + C eax size-1 + C ebx src + C ecx shift + C edx + + movl (%ebx), %edx C src low limb + negl %ecx + + addl $32, %ecx C 32-shift + movd PARAM_SHIFT, %mm6 + + shll %cl, %edx C retval + movl PARAM_DST, %ecx + + leal (%ebx,%eax,4), %ebx + + leal -4(%ecx,%eax,4), %ecx + negl %eax + + +L(simple): + C eax counter (negative) + C ebx &src[size-1] + C ecx &dst[size-1] + C edx retval + C + C mm0 scratch + C mm6 shift + +Zdisp( movq, 0,(%ebx,%eax,4), %mm0) + incl %eax + + psrlq %mm6, %mm0 + +Zdisp( movd, %mm0, 0,(%ecx,%eax,4)) + jnz L(simple) + + + movq %mm0, (%ecx) + movl %edx, %eax + + popl %ebx + + emms + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k6/mod_34lsub1.asm b/gmp-6.3.0/mpn/x86/k6/mod_34lsub1.asm new file mode 100644 index 0000000..7e30503 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k6/mod_34lsub1.asm @@ -0,0 +1,190 @@ +dnl AMD K6 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1. + +dnl Copyright 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K6: 2.66 cycles/limb + + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size) +C +C An attempt was made to use a loop like +C +C L(top): +C adcl (%edx), %eax +C adcl 4(%edx), %ebx +C adcl 8(%edx), %esi +C leal 12(%edx), %edx +C loop L(top) +C +C with %ecx starting from floor(size/3), but it still measured 2.66 c/l. +C The form used instead can save about 6 cycles by not dividing by 3. +C +C In the code used, putting the "leal"s at the top of the loop is necessary +C for the claimed speed, anywhere else costs an extra cycle per loop. +C Perhaps a tight loop like this needs short decode instructions at the +C branch target, which would explain the leal/loop form above taking 8 +C cycles instead of 7 too. + +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + +dnl re-use parameter space +define(SAVE_EBX, `PARAM_SIZE') +define(SAVE_ESI, `PARAM_SRC') + + TEXT + ALIGN(16) +PROLOGUE(mpn_mod_34lsub1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %eax + movl PARAM_SRC, %edx + + subl $2, %eax + ja L(three_or_more) + +Zdisp( movl, 0,(%edx), %eax) C avoid code cache line boundary + jne L(one) + + movl %eax, %ecx + movl 4(%edx), %edx + + shrl $24, %eax C src[0] high + andl $0x00FFFFFF, %ecx C src[0] low + + addl %ecx, %eax + movl %edx, %ecx + + shll $8, %edx + andl $0x00FFFF00, %edx C src[1] high + + shrl $16, %ecx C src[1] low + addl %ecx, %eax + + addl %edx, %eax + +L(one): + ret + + +L(three_or_more): + C eax size-2 + C ebx + C ecx + C edx src + + movl %ebx, SAVE_EBX + xorl %ebx, %ebx + + movl %esi, SAVE_ESI + pushl %edi FRAME_pushl() + + xorl %esi, %esi + xorl %edi, %edi C and clear carry flag + +L(top): + C eax counter, limbs + C ebx acc 0mod3 + C ecx + C edx src, incrementing + C esi acc 1mod3 + C edi acc 2mod3 + C ebp + + leal -2(%eax), %eax + leal 12(%edx), %edx + + adcl -12(%edx), %ebx + adcl -8(%edx), %esi + adcl -4(%edx), %edi + + decl %eax + jg L(top) + + + C ecx is -3, -2 or -1 representing 0, 1 or 2 more limbs, respectively + + movb $0, %cl + incl %eax + + js L(combine) C 0 more + +Zdisp( adcl, 0,(%edx), %ebx) C avoid code cache line crossings + + movb $8, %cl + decl %eax + + js L(combine) C 1 more + + adcl 4(%edx), %esi + + movb $16, %cl + + +L(combine): + sbbl %edx, %edx + + shll %cl, %edx C carry + movl %ebx, %eax C 0mod3 + + shrl $24, %eax C 0mod3 high + andl $0x00FFFFFF, %ebx C 0mod3 low + + subl %edx, %eax C apply carry + movl %esi, %ecx C 1mod3 + + shrl $16, %esi C 1mod3 high + addl %ebx, %eax C apply 0mod3 low + + andl $0x0000FFFF, %ecx + addl %esi, %eax C apply 1mod3 high + + shll $8, %ecx C 1mod3 low + movl %edi, %edx C 2mod3 + + shrl $8, %edx C 2mod3 high + addl %ecx, %eax C apply 1mod3 low + + addl %edx, %eax C apply 2mod3 high + andl $0x000000FF, %edi + + shll $16, %edi C 2mod3 low + movl SAVE_EBX, %ebx + + addl %edi, %eax C apply 2mod3 low + movl SAVE_ESI, %esi + + popl %edi + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k6/mode1o.asm b/gmp-6.3.0/mpn/x86/k6/mode1o.asm new file mode 100644 index 0000000..4a338bd --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k6/mode1o.asm @@ -0,0 +1,176 
@@ +dnl AMD K6 mpn_modexact_1_odd -- exact division style remainder. + +dnl Copyright 2000-2003, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K6: 10.0 cycles/limb + + +C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C +C A special case for high31),1, +eval((UNROLL_COUNT-31)*4), +0)) + + C eax + C ebx &src[size] + C ecx + C edx + C esi carry + C edi &dst[size] + C ebp + + movl PARAM_SIZE, %ecx + movl %esi, (%edi) + + subl $4, %ecx + jz L(corner) + + movl %ecx, %edx +ifelse(OFFSET,0,, +` subl $OFFSET, %ebx') + + shll $4, %ecx +ifelse(OFFSET,0,, +` subl $OFFSET, %edi') + + negl %ecx + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx +') + negl %edx + + + C The calculated jump mustn't be before the start of the available + C code. This is the limitation UNROLL_COUNT puts on the src operand + C size, but checked here using the jump address directly. + C + ASSERT(ae,` + movl_text_address( L(unroll_inner_start), %eax) + cmpl %eax, %ecx + ') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll_outer_top): + C eax + C ebx &src[size], constant + C ecx VAR_JMP + C edx VAR_COUNTER, limbs, negative + C esi high limb to store + C edi dst ptr, high of last addmul + C ebp + + movl -12+OFFSET(%ebx,%edx,4), %ebp C multiplier + movl %edx, VAR_COUNTER + + movl -8+OFFSET(%ebx,%edx,4), %eax C first limb of multiplicand + + mull %ebp + + testb $1, %cl + + movl %edx, %esi C high carry + movl %ecx, %edx C jump + + movl %eax, %ecx C low carry + leal CODE_BYTES_PER_LIMB(%edx), %edx + + movl %edx, VAR_JMP + leal 4(%edi), %edi + + C A branch-free version of this using some xors was found to be a + C touch slower than just a conditional jump, despite the jump + C switching between taken and not taken on every loop. + +ifelse(eval(UNROLL_COUNT%2),0, + jz,jnz) L(unroll_noswap) + movl %esi, %eax C high,low carry other way around + + movl %ecx, %esi + movl %eax, %ecx +L(unroll_noswap): + + jmp *%edx + + + C Must be on an even address here so the low bit of the jump address + C will indicate which way around ecx/esi should start. + C + C An attempt was made at padding here to get the end of the unrolled + C code to come out on a good alignment, to save padding before + C L(corner). 
This worked, but turned out to run slower than just an + C ALIGN(2). The reason for this is not clear, it might be related + C to the different speeds on different UNROLL_COUNTs noted above. + + ALIGN(2) + +L(unroll_inner_start): + C eax scratch + C ebx src + C ecx carry low + C edx scratch + C esi carry high + C edi dst + C ebp multiplier + C + C 15 code bytes each limb + C ecx/esi swapped on each chunk + +forloop(`i', UNROLL_COUNT, 1, ` + deflit(`disp_src', eval(-i*4 + OFFSET)) + deflit(`disp_dst', eval(disp_src - 4)) + + m4_assert(`disp_src>=-128 && disp_src<128') + m4_assert(`disp_dst>=-128 && disp_dst<128') + +ifelse(eval(i%2),0,` +Zdisp( movl, disp_src,(%ebx), %eax) + mull %ebp +Zdisp( addl, %esi, disp_dst,(%edi)) + adcl %eax, %ecx + movl %edx, %esi + jadcl0( %esi) +',` + dnl this one comes out last +Zdisp( movl, disp_src,(%ebx), %eax) + mull %ebp +Zdisp( addl, %ecx, disp_dst,(%edi)) + adcl %eax, %esi + movl %edx, %ecx + jadcl0( %ecx) +') +') +L(unroll_inner_end): + + addl %esi, -4+OFFSET(%edi) + + movl VAR_COUNTER, %edx + jadcl0( %ecx) + + movl %ecx, m4_empty_if_zero(OFFSET)(%edi) + movl VAR_JMP, %ecx + + incl %edx + jnz L(unroll_outer_top) + + +ifelse(OFFSET,0,,` + addl $OFFSET, %ebx + addl $OFFSET, %edi +') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(corner): + C ebx &src[size] + C edi &dst[2*size-5] + + movl -12(%ebx), %ebp + + movl -8(%ebx), %eax + movl %eax, %ecx + + mull %ebp + + addl %eax, -4(%edi) + adcl $0, %edx + + movl -4(%ebx), %eax + movl %edx, %esi + movl %eax, %ebx + + mull %ebp + + addl %esi, %eax + adcl $0, %edx + + addl %eax, (%edi) + adcl $0, %edx + + movl %edx, %esi + movl %ebx, %eax + + mull %ecx + + addl %esi, %eax + movl %eax, 4(%edi) + + adcl $0, %edx + + movl %edx, 8(%edi) + + +C ----------------------------------------------------------------------------- +C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1]. +C The loop measures about 6 cycles/iteration, though it looks like it should +C decode in 5. + +L(lshift_start): + movl PARAM_SIZE, %ecx + + movl PARAM_DST, %edi + subl $1, %ecx C size-1 and clear carry + + movl PARAM_SRC, %ebx + movl %ecx, %edx + + xorl %eax, %eax C ready for adcl + + + ALIGN(16) +L(lshift): + C eax + C ebx src (for later use) + C ecx counter, decrementing + C edx size-1 (for later use) + C esi + C edi dst, incrementing + C ebp + + rcll 4(%edi) + rcll 8(%edi) + leal 8(%edi), %edi + loop L(lshift) + + + adcl %eax, %eax + + movl %eax, 4(%edi) C dst most significant limb + movl (%ebx), %eax C src[0] + + leal 4(%ebx,%edx,4), %ebx C &src[size] + subl %edx, %ecx C -(size-1) + + +C ----------------------------------------------------------------------------- +C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ..., +C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the +C low limb of src[0]^2. 
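Taken together, the three phases compute x^2 = 2*(sum over i<j of s_i*s_j*B^(i+j)) + (sum over i of s_i^2*B^(2i)): the cross products accumulated by the unrolled loop, the one-bit left shift that doubles them, and the diagonal squares added last. A portable C sketch of the same structure (32-bit limbs, 64-bit intermediates; an illustration, not the unrolled K6 code):

    #include <stdint.h>
    #include <string.h>

    void sqr_ref(uint32_t *dst, const uint32_t *src, int n)  /* n >= 1 */
    {
        memset(dst, 0, 2 * n * sizeof *dst);

        /* phase 1: cross products src[i]*src[j] for i < j */
        for (int i = 0; i < n - 1; i++) {
            uint32_t cy = 0;
            for (int j = i + 1; j < n; j++) {
                uint64_t t = (uint64_t)src[i] * src[j] + dst[i + j] + cy;
                dst[i + j] = (uint32_t)t;
                cy = (uint32_t)(t >> 32);
            }
            dst[i + n] = cy;
        }

        /* phase 2: double them with a one-bit left shift */
        uint32_t bit = 0;
        for (int k = 0; k < 2 * n; k++) {
            uint32_t top = dst[k] >> 31;
            dst[k] = (dst[k] << 1) | bit;
            bit = top;
        }

        /* phase 3: add the diagonal squares src[i]^2 */
        uint64_t c = 0;
        for (int i = 0; i < n; i++) {
            uint64_t sq = (uint64_t)src[i] * src[i];
            c += (uint64_t)dst[2 * i] + (uint32_t)sq;
            dst[2 * i] = (uint32_t)c;  c >>= 32;
            c += (uint64_t)dst[2 * i + 1] + (uint32_t)(sq >> 32);
            dst[2 * i + 1] = (uint32_t)c;  c >>= 32;
        }
    }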
+ + + mull %eax + + movl %eax, (%edi,%ecx,8) C dst[0] + + + ALIGN(16) +L(diag): + C eax scratch + C ebx &src[size] + C ecx counter, negative + C edx carry + C esi scratch + C edi dst[2*size-2] + C ebp + + movl (%ebx,%ecx,4), %eax + movl %edx, %esi + + mull %eax + + addl %esi, 4(%edi,%ecx,8) + adcl %eax, 8(%edi,%ecx,8) + adcl $0, %edx + + incl %ecx + jnz L(diag) + + + movl SAVE_EBX, %ebx + movl SAVE_ESI, %esi + + addl %edx, 4(%edi) C dst most significant limb + + movl SAVE_EDI, %edi + movl SAVE_EBP, %ebp + addl $FRAME, %esp + ret + + + +C ----------------------------------------------------------------------------- +ifdef(`PIC',` +L(pic_calc): + C See mpn/x86/README about old gas bugs + addl (%esp), %ecx + addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx + addl %edx, %ecx + ret_internal +') + + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k7/README b/gmp-6.3.0/mpn/x86/k7/README new file mode 100644 index 0000000..5711b61 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/README @@ -0,0 +1,174 @@ +Copyright 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + AMD K7 MPN SUBROUTINES + + +This directory contains code optimized for the AMD Athlon CPU. + +The mmx subdirectory has routines using MMX instructions. All Athlons have +MMX, the separate directory is just so that configure can omit it if the +assembler doesn't support MMX. + + + +STATUS + +Times for the loops, with all code and data in L1 cache. + + cycles/limb + mpn_add/sub_n 1.6 + + mpn_copyi 0.75 or 1.0 \ varying with data alignment + mpn_copyd 0.75 or 1.0 / + + mpn_divrem_1 17.0 integer part, 15.0 fractional part + mpn_mod_1 17.0 + mpn_divexact_by3 8.0 + + mpn_l/rshift 1.2 + + mpn_mul_1 3.4 + mpn_addmul/submul_1 3.9 + + mpn_mul_basecase 4.42 cycles/crossproduct (approx) + mpn_sqr_basecase 2.3 cycles/crossproduct (approx) + or 4.55 cycles/triangleproduct (approx) + +Prefetching of sources hasn't yet been tried. + + + +NOTES + +cmov, MMX, 3DNow and some extensions to MMX and 3DNow are available. + +Write-allocate L1 data cache means prefetching of destinations is unnecessary. + +Floating point multiplications can be done in parallel with integer +multiplications, but there doesn't seem to be any way to make use of this. + +Unsigned "mul"s can be issued every 3 cycles. This suggests 3 is a limit on +the speed of the multiplication routines. The documentation shows mul +executing in IEU0 (or maybe in IEU0 and IEU1 together), so it might be that, +to get near 3 cycles code has to be arranged so that nothing else is issued +to IEU0. 
A busy IEU0 could explain why some code takes 4 cycles and other +apparently equivalent code takes 5. + + + +OPTIMIZATIONS + +Unrolled loops are used to reduce looping overhead. The unrolling is +configurable up to 32 limbs/loop for most routines and up to 64 for some. +The K7 has 64k L1 code cache so quite big unrolling is allowable. + +Computed jumps into the unrolling are used to handle sizes not a multiple of +the unrolling. An attractive feature of this is that times increase +smoothly with operand size, but it may be that some routines should just +have simple loops to finish up, especially when PIC adds between 2 and 16 +cycles to get %eip. + +Position independent code is implemented using a call to get %eip for the +computed jumps and a ret is always done, rather than an addl $4,%esp or a +popl, so the CPU return address branch prediction stack stays synchronised +with the actual stack in memory. + +Branch prediction, in absence of any history, will guess forward jumps are +not taken and backward jumps are taken. Where possible it's arranged that +the less likely or less important case is under a taken forward jump. + + + +CODING + +Instructions in general code have been shown grouped if they can execute +together, which means up to three direct-path instructions which have no +successive dependencies. K7 always decodes three and has out-of-order +execution, but the groupings show what slots might be available and what +dependency chains exist. + +When there's vector-path instructions an effort is made to get triplets of +direct-path instructions in between them, even if there's dependencies, +since this maximizes decoding throughput and might save a cycle or two if +decoding is the limiting factor. + + + +INSTRUCTIONS + +adcl direct +divl 39 cycles back-to-back +lodsl,etc vector +loop 1 cycle vector (decl/jnz opens up one decode slot) +movd reg vector +movd mem direct +mull issue every 3 cycles, latency 4 cycles low word, 6 cycles high word +popl vector (use movl for more than one pop) +pushl direct, will pair with a load +shrdl %cl vector, 3 cycles, seems to be 3 decode too +xorl r,r false read dependency recognised + + + +REFERENCES + +"AMD Athlon Processor X86 Code Optimization Guide", AMD publication number +22007, revision K, February 2002. Available on-line, + +http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22007.pdf + +"3DNow Technology Manual", AMD publication number 21928G/0-March 2000. +This describes the femms and prefetch instructions. Available on-line, + +http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21928.pdf + +"AMD Extensions to the 3DNow and MMX Instruction Sets Manual", AMD +publication number 22466, revision D, March 2000. This describes +instructions added in the Athlon processor, such as pswapd and the extra +prefetch forms. Available on-line, + +http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22466.pdf + +"3DNow Instruction Porting Guide", AMD publication number 22621, revision B, +August 1999. This has some notes on general Athlon optimizations as well as +3DNow. 
Available on-line, + +http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22621.pdf + + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/gmp-6.3.0/mpn/x86/k7/addlsh1_n.asm b/gmp-6.3.0/mpn/x86/k7/addlsh1_n.asm new file mode 100644 index 0000000..2cba1eb --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/addlsh1_n.asm @@ -0,0 +1,196 @@ +dnl AMD K7 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1) + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C This is an attempt at an addlsh1_n for x86-32, not relying on sse2 insns. +C The innerloop is 2*3-way unrolled, which is best we can do with the available +C registers. It seems tricky to use the same structure for rsblsh1_n, since we +C cannot feed carry between operations there. + +C cycles/limb +C P5 +C P6 model 0-8,10-12 +C P6 model 9 (Banias) +C P6 model 13 (Dothan) 5.4 (worse than add_n + lshift) +C P4 model 0 (Willamette) +C P4 model 1 (?) +C P4 model 2 (Northwood) +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C Intel Atom 6 +C AMD K6 ? +C AMD K7 2.5 +C AMD K8 + +C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32 +C processors. It uses 2*3-way unrolling, for good reasons. Unfortunately, +C that means we need an initial magic multiply. +C +C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern. We +C cannot do rsblsh1_n since we feed carry from the shift blocks to the +C add/subtract blocks, which is right for addition but reversed for +C subtraction. We could perhaps do sublsh1_n, with some extra move insns, +C without losing any time, since we're not issue limited but carry recurrency +C latency. +C +C Breaking carry recurrency might be a good idea. We would then need separate +C registers for the shift carry and add/subtract carry, which in turn would +C force us to 2*2-way unrolling. 
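The "initial magic multiply" mentioned above is a reciprocal multiplication. The loop consumes 6 limbs per pass (2*3-way unrolled), and 0x2aaaaaab is ceil(2^32/6), so the high half of the 32x32->64 product at the top of the routine yields floor(size/6) exactly for any size below 2^31. (The "-(size\8)-1" annotation on the not reads like a slip for "-(size\6)-1", which is what the later count*3+3 comments assume.) A C sketch of the trick:

    #include <stdint.h>

    /* High half of size * ceil(2^32/6): floor(size/6) for size < 2^31. */
    static uint32_t size_div_6(uint32_t size)
    {
        return (uint32_t)(((uint64_t)size * 0x2aaaaaab) >> 32);
    }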
+ +defframe(PARAM_SIZE, 16) +defframe(PARAM_DBLD, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(VAR_COUNT,`PARAM_DST') +define(VAR_TMP,`PARAM_DBLD') + +ASM_START() + TEXT + ALIGN(8) +PROLOGUE(mpn_addlsh1_n) +deflit(`FRAME',0) + +define(`rp', `%edi') +define(`up', `%esi') +define(`vp', `%ebp') + + mov $0x2aaaaaab, %eax + + push %ebx FRAME_pushl() + mov PARAM_SIZE, %ebx C size + + push rp FRAME_pushl() + mov PARAM_DST, rp + + mul %ebx + + push up FRAME_pushl() + mov PARAM_SRC, up + + not %edx C count = -(size\8)-1 + mov %edx, VAR_COUNT + + push vp FRAME_pushl() + mov PARAM_DBLD, vp + + lea 3(%edx,%edx,2), %ecx C count*3+3 = -(size\6)*3 + xor %edx, %edx + lea (%ebx,%ecx,2), %ebx C size + (count*3+3)*2 = size % 6 + or %ebx, %ebx + jz L(exact) + +L(oop): +ifdef(`CPU_P6',` + shr %edx ') C restore 2nd saved carry bit + mov (vp), %eax + adc %eax, %eax + rcr %edx C restore 1st saved carry bit + lea 4(vp), vp + adc (up), %eax + lea 4(up), up + adc %edx, %edx C save a carry bit in edx +ifdef(`CPU_P6',` + adc %edx, %edx ') C save another carry bit in edx + dec %ebx + mov %eax, (rp) + lea 4(rp), rp + jnz L(oop) + mov vp, VAR_TMP +L(exact): + incl VAR_COUNT + jz L(end) + + ALIGN(16) +L(top): +ifdef(`CPU_P6',` + shr %edx ') C restore 2nd saved carry bit + mov (vp), %eax + adc %eax, %eax + mov 4(vp), %ebx + adc %ebx, %ebx + mov 8(vp), %ecx + adc %ecx, %ecx + + rcr %edx C restore 1st saved carry bit + + adc (up), %eax + mov %eax, (rp) + adc 4(up), %ebx + mov %ebx, 4(rp) + adc 8(up), %ecx + mov %ecx, 8(rp) + + mov 12(vp), %eax + adc %eax, %eax + mov 16(vp), %ebx + adc %ebx, %ebx + mov 20(vp), %ecx + adc %ecx, %ecx + + lea 24(vp), vp + adc %edx, %edx C save a carry bit in edx + + adc 12(up), %eax + mov %eax, 12(rp) + adc 16(up), %ebx + mov %ebx, 16(rp) + adc 20(up), %ecx + + lea 24(up), up + +ifdef(`CPU_P6',` + adc %edx, %edx ') C save another carry bit in edx + mov %ecx, 20(rp) + incl VAR_COUNT + lea 24(rp), rp + jne L(top) + +L(end): + pop vp FRAME_popl() + pop up FRAME_popl() + +ifdef(`CPU_P6',` + xor %eax, %eax + shr $1, %edx + adc %edx, %eax +',` + adc $0, %edx + mov %edx, %eax +') + pop rp FRAME_popl() + pop %ebx FRAME_popl() + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/k7/aors_n.asm b/gmp-6.3.0/mpn/x86/k7/aors_n.asm new file mode 100644 index 0000000..1a08072 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/aors_n.asm @@ -0,0 +1,258 @@ +dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract. + +dnl Copyright 1999-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K7: 1.64 cycles/limb (at 16 limbs/loop). + + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 8 1.9 +dnl 16 1.64 +dnl 32 1.7 +dnl 64 2.0 +dnl Maximum possible with the current code is 64. + +deflit(UNROLL_COUNT, 16) + + +ifdef(`OPERATION_add_n', ` + define(M4_inst, adcl) + define(M4_function_n, mpn_add_n) + define(M4_function_nc, mpn_add_nc) + define(M4_description, add) +',`ifdef(`OPERATION_sub_n', ` + define(M4_inst, sbbl) + define(M4_function_n, mpn_sub_n) + define(M4_function_nc, mpn_sub_nc) + define(M4_description, subtract) +',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + + +C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); +C +C Calculate src1,size M4_description src2,size, and store the result in +C dst,size. The return value is the carry bit from the top of the result (1 +C or 0). +C +C The _nc version accepts 1 or 0 for an initial carry into the low limb of +C the calculation. Note values other than 1 or 0 here will lead to garbage +C results. +C +C This code runs at 1.64 cycles/limb, which might be the best possible with +C plain integer operations. Each limb is 2 loads and 1 store, any 2 of +C which can be done each cycle, leading to 1.5 c/l. + +dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1. +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 8) +',` +deflit(UNROLL_THRESHOLD, 8) +') + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBP, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EBX, -12) +defframe(SAVE_EDI, -16) +deflit(STACK_SPACE, 16) + + TEXT + ALIGN(32) +deflit(`FRAME',0) + +PROLOGUE(M4_function_nc) + movl PARAM_CARRY, %eax + jmp L(start) +EPILOGUE() + +PROLOGUE(M4_function_n) + + xorl %eax, %eax C carry +L(start): + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %edi, SAVE_EDI + movl %ebx, SAVE_EBX + cmpl $UNROLL_THRESHOLD, %ecx + + movl PARAM_SRC2, %edx + movl PARAM_SRC1, %ebx + jae L(unroll) + + movl PARAM_DST, %edi + leal (%ebx,%ecx,4), %ebx + leal (%edx,%ecx,4), %edx + + leal (%edi,%ecx,4), %edi + negl %ecx + shrl %eax + + C This loop in in a single 16 byte code block already, so no + C alignment necessary. +L(simple): + C eax scratch + C ebx src1 + C ecx counter + C edx src2 + C esi + C edi dst + C ebp + + movl (%ebx,%ecx,4), %eax + M4_inst (%edx,%ecx,4), %eax + movl %eax, (%edi,%ecx,4) + incl %ecx + jnz L(simple) + + movl $0, %eax + movl SAVE_EDI, %edi + + movl SAVE_EBX, %ebx + setc %al + addl $STACK_SPACE, %esp + + ret + + +C ----------------------------------------------------------------------------- + C This is at 0x55, close enough to aligned. 
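The jump through %esi below enters the unrolled body at an offset computed from the size remainder (9 code bytes per two-limb chunk), so no separate fix-up loop is needed. The C analogue of a computed entry into an unrolled loop is Duff's device; a sketch of the idea only, without the carry chain the real loop threads through adcl/sbbl:

    #include <stdint.h>

    /* 8-way unrolled copy entered at n % 8; requires n > 0. */
    static void copy_duff(uint32_t *dst, const uint32_t *src, long n)
    {
        long iters = (n + 7) / 8;
        switch (n % 8) {
        case 0: do { *dst++ = *src++;
        case 7:      *dst++ = *src++;
        case 6:      *dst++ = *src++;
        case 5:      *dst++ = *src++;
        case 4:      *dst++ = *src++;
        case 3:      *dst++ = *src++;
        case 2:      *dst++ = *src++;
        case 1:      *dst++ = *src++;
                } while (--iters > 0);
        }
    }

The assembly computes the entry address arithmetically instead of through a switch, and a ret-paired call recovers %eip in the PIC case so the return-address branch prediction stack stays in sync, as the K7 README above explains.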
+L(unroll): +deflit(`FRAME',STACK_SPACE) + movl %ebp, SAVE_EBP + andl $-2, %ecx C size low bit masked out + andl $1, PARAM_SIZE C size low bit kept + + movl %ecx, %edi + decl %ecx + movl PARAM_DST, %ebp + + shrl $UNROLL_LOG2, %ecx + negl %edi + movl %esi, SAVE_ESI + + andl $UNROLL_MASK, %edi + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%edi,%edi,8), %esi C 9 bytes per +') + negl %edi + shrl %eax + + leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx + leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx + leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi + + jmp *%esi + + +ifdef(`PIC',` +L(pic_calc): + C See mpn/x86/README about old gas bugs + leal (%edi,%edi,8), %esi + addl $L(entry)-L(here), %esi + addl (%esp), %esi + ret_internal +') + + +C ----------------------------------------------------------------------------- + ALIGN(32) +L(top): + C eax zero + C ebx src1 + C ecx counter + C edx src2 + C esi scratch (was computed jump) + C edi dst + C ebp scratch + + leal UNROLL_BYTES(%edx), %edx + +L(entry): +deflit(CHUNK_COUNT, 2) +forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%ebx), %esi) + movl disp1(%ebx), %ebp +Zdisp( M4_inst,disp0,(%edx), %esi) +Zdisp( movl, %esi, disp0,(%edi)) + M4_inst disp1(%edx), %ebp + movl %ebp, disp1(%edi) +') + + decl %ecx + leal UNROLL_BYTES(%ebx), %ebx + leal UNROLL_BYTES(%edi), %edi + jns L(top) + + + mov PARAM_SIZE, %esi + movl SAVE_EBP, %ebp + movl $0, %eax + + decl %esi + js L(even) + + movl (%ebx), %ecx + M4_inst UNROLL_BYTES(%edx), %ecx + movl %ecx, (%edi) +L(even): + + movl SAVE_EDI, %edi + movl SAVE_EBX, %ebx + setc %al + + movl SAVE_ESI, %esi + addl $STACK_SPACE, %esp + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k7/aorsmul_1.asm b/gmp-6.3.0/mpn/x86/k7/aorsmul_1.asm new file mode 100644 index 0000000..eec8df6 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/aorsmul_1.asm @@ -0,0 +1,167 @@ +dnl AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. + +dnl Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P5 +C P6 model 0-8,10-12 +C P6 model 9 (Banias) 6.5 +C P6 model 13 (Dothan) +C P4 model 0 (Willamette) +C P4 model 1 (?) +C P4 model 2 (Northwood) +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C AMD K6 +C AMD K7 3.75 +C AMD K8 + +C TODO +C * Improve feed-in and wind-down code. 
We beat the old code for all n != 1, +C but lose by 2x for n == 1. + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') +') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + add $-16, %esp + mov %ebp, (%esp) + mov %ebx, 4(%esp) + mov %esi, 8(%esp) + mov %edi, 12(%esp) + + mov 20(%esp), %edi + mov 24(%esp), %esi + mov 28(%esp), %eax + mov 32(%esp), %ecx + mov %eax, %ebx + shr $2, %eax + mov %eax, 28(%esp) + mov (%esi), %eax + and $3, %ebx + jz L(b0) + cmp $2, %ebx + jz L(b2) + jg L(b3) + +L(b1): lea -4(%esi), %esi + lea -4(%edi), %edi + mul %ecx + mov %eax, %ebx + mov %edx, %ebp + cmpl $0, 28(%esp) + jz L(cj1) + mov 8(%esi), %eax + jmp L(1) + +L(b2): mul %ecx + mov %eax, %ebp + mov 4(%esi), %eax + mov %edx, %ebx + cmpl $0, 28(%esp) + jne L(2) + jmp L(cj2) + +L(b3): lea -12(%esi), %esi + lea -12(%edi), %edi + mul %ecx + mov %eax, %ebx + mov %edx, %ebp + mov 16(%esi), %eax + incl 28(%esp) + jmp L(3) + +L(b0): lea -8(%esi), %esi + lea -8(%edi), %edi + mul %ecx + mov %eax, %ebp + mov 12(%esi), %eax + mov %edx, %ebx + jmp L(0) + + ALIGN(16) +L(top): lea 16(%edi), %edi +L(2): mul %ecx + ADDSUB %ebp, 0(%edi) + mov $0, %ebp + adc %eax, %ebx + mov 8(%esi), %eax + adc %edx, %ebp +L(1): mul %ecx + ADDSUB %ebx, 4(%edi) + mov $0, %ebx + adc %eax, %ebp + mov 12(%esi), %eax + adc %edx, %ebx +L(0): mul %ecx + ADDSUB %ebp, 8(%edi) + mov $0, %ebp + adc %eax, %ebx + adc %edx, %ebp + mov 16(%esi), %eax +L(3): mul %ecx + ADDSUB %ebx, 12(%edi) + adc %eax, %ebp + mov 20(%esi), %eax + lea 16(%esi), %esi + mov $0, %ebx + adc %edx, %ebx + decl 28(%esp) + jnz L(top) + +L(end): lea 16(%edi), %edi +L(cj2): mul %ecx + ADDSUB %ebp, (%edi) + adc %eax, %ebx + adc $0, %edx +L(cj1): ADDSUB %ebx, 4(%edi) + adc $0, %edx + mov %edx, %eax + mov (%esp), %ebp + mov 4(%esp), %ebx + mov 8(%esp), %esi + mov 12(%esp), %edi + add $16, %esp + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/k7/bdiv_q_1.asm b/gmp-6.3.0/mpn/x86/k7/bdiv_q_1.asm new file mode 100644 index 0000000..2af7bb9 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/bdiv_q_1.asm @@ -0,0 +1,245 @@ +dnl AMD K7 mpn_bdiv_q_1 -- mpn by limb exact division. + +dnl Rearranged from mpn/x86/k7/dive_1.asm by Marco Bodrato. + +dnl Copyright 2001, 2002, 2004, 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C cycles/limb +C Athlon: 11.0 +C Hammer: 9.0 + + +C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C +C The dependent chain is mul+imul+sub for 11 cycles and that speed is +C achieved with no special effort. The load and shrld latencies are hidden +C by out of order execution. +C +C It's a touch faster on size==1 to use the mul-by-inverse than divl. + +defframe(PARAM_SHIFT, 24) +defframe(PARAM_INVERSE,20) +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +defframe(VAR_INVERSE, -20) +defframe(VAR_DST_END, -24) + +deflit(STACK_SPACE, 24) + + TEXT + +C mp_limb_t +C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t inverse, int shift) + ALIGN(16) +PROLOGUE(mpn_pi1_bdiv_q_1) +deflit(`FRAME',0) + + subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE) + movl PARAM_SHIFT, %ecx C shift count + + movl %ebp, SAVE_EBP + movl PARAM_SIZE, %ebp + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl %ebx, SAVE_EBX + + leal (%esi,%ebp,4), %esi C src end + leal (%edi,%ebp,4), %edi C dst end + negl %ebp C -size + + movl PARAM_INVERSE, %eax C inv + +L(common): + movl %eax, VAR_INVERSE + movl (%esi,%ebp,4), %eax C src[0] + + incl %ebp + jz L(one) + + movl (%esi,%ebp,4), %edx C src[1] + + shrdl( %cl, %edx, %eax) + + movl %edi, VAR_DST_END + xorl %ebx, %ebx + jmp L(entry) + + ALIGN(8) +L(top): + C eax q + C ebx carry bit, 0 or 1 + C ecx shift + C edx + C esi src end + C edi dst end + C ebp counter, limbs, negative + + mull PARAM_DIVISOR C carry limb in edx + + movl -4(%esi,%ebp,4), %eax + movl (%esi,%ebp,4), %edi + + shrdl( %cl, %edi, %eax) + + subl %ebx, %eax C apply carry bit + setc %bl + movl VAR_DST_END, %edi + + subl %edx, %eax C apply carry limb + adcl $0, %ebx + +L(entry): + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi,%ebp,4) + incl %ebp + jnz L(top) + + + mull PARAM_DIVISOR C carry limb in edx + + movl -4(%esi), %eax C src high limb + shrl %cl, %eax + movl SAVE_ESI, %esi + + subl %ebx, %eax C apply carry bit + movl SAVE_EBX, %ebx + movl SAVE_EBP, %ebp + + subl %edx, %eax C apply carry limb + + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi) + movl SAVE_EDI, %edi + addl $STACK_SPACE, %esp + + ret + +L(one): + shrl %cl, %eax + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + + imull VAR_INVERSE, %eax + + movl SAVE_EBP, %ebp + + movl %eax, -4(%edi) + movl SAVE_EDI, %edi + addl $STACK_SPACE, %esp + + ret +EPILOGUE() + +C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C + + ALIGN(16) +PROLOGUE(mpn_bdiv_q_1) +deflit(`FRAME',0) + + movl PARAM_DIVISOR, %eax + subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE) + movl $-1, %ecx C shift count + + movl %ebp, SAVE_EBP + movl PARAM_SIZE, %ebp + + movl %esi, SAVE_ESI + movl %edi, SAVE_EDI + + C If there's usually only one or two trailing zero bits then this + C should be faster than bsfl. 
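The L(strip_twos) loop and table-driven inversion that follow correspond to the C sketch below (illustrative; bdiv_setup is a hypothetical name, while binvert_limb_table is GMP's real table of inverses of odd values mod 2^8, declared in gmp-impl.h). Each Newton step inv = 2*inv - inv*inv*d doubles the number of correct low bits, so two steps lift the 8-bit table value to a full 32-bit inverse, matching the ASSERT in the code.

#include <stdint.h>

extern const unsigned char binvert_limb_table[128]; /* from gmp-impl.h */

/* Strip d's trailing zeros (cheap when there are only a few, as the
   comment above notes), then compute inv with d*inv == 1 mod 2^32.
   Hypothetical helper, shown for illustration. */
static uint32_t
bdiv_setup (uint32_t d, int *shift)
{
  int c = 0;
  while ((d & 1) == 0)          /* cf. L(strip_twos) */
    { d >>= 1; c++; }
  *shift = c;

  uint32_t inv = binvert_limb_table[(d >> 1) & 0x7f]; /* 8 good bits */
  inv = 2 * inv - inv * inv * d;                      /* 16 good bits */
  inv = 2 * inv - inv * inv * d;                      /* 32 good bits */
  return inv;                   /* d * inv == 1 (mod 2^32) */
}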
+L(strip_twos): + incl %ecx + shrl %eax + jnc L(strip_twos) + + movl %ebx, SAVE_EBX + leal 1(%eax,%eax), %ebx C d without twos + andl $127, %eax C d/2, 7 bits + +ifdef(`PIC',` + LEA( binvert_limb_table, %edx) + movzbl (%eax,%edx), %eax C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %eax C inv 8 bits +') + + leal (%eax,%eax), %edx C 2*inv + movl %ebx, PARAM_DIVISOR C d without twos + + imull %eax, %eax C inv*inv + + movl PARAM_SRC, %esi + movl PARAM_DST, %edi + + imull %ebx, %eax C inv*inv*d + + subl %eax, %edx C inv = 2*inv - inv*inv*d + leal (%edx,%edx), %eax C 2*inv + + imull %edx, %edx C inv*inv + + leal (%esi,%ebp,4), %esi C src end + leal (%edi,%ebp,4), %edi C dst end + negl %ebp C -size + + imull %ebx, %edx C inv*inv*d + + subl %edx, %eax C inv = 2*inv - inv*inv*d + + ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + pushl %eax FRAME_pushl() + imull PARAM_DIVISOR, %eax + cmpl $1, %eax + popl %eax FRAME_popl()') + + jmp L(common) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/k7/dive_1.asm b/gmp-6.3.0/mpn/x86/k7/dive_1.asm new file mode 100644 index 0000000..458bd02 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/dive_1.asm @@ -0,0 +1,208 @@ +dnl AMD K7 mpn_divexact_1 -- mpn by limb exact division. + +dnl Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C Athlon: 11.0 +C Hammer: 9.0 + + +C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C +C The dependent chain is mul+imul+sub for 11 cycles and that speed is +C achieved with no special effort. The load and shrld latencies are hidden +C by out of order execution. +C +C It's a touch faster on size==1 to use the mul-by-inverse than divl. + +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +defframe(VAR_INVERSE, -20) +defframe(VAR_DST_END, -24) + +deflit(STACK_SPACE, 24) + + TEXT + + ALIGN(16) +PROLOGUE(mpn_divexact_1) +deflit(`FRAME',0) + + movl PARAM_DIVISOR, %eax + subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE) + movl $-1, %ecx C shift count + + movl %ebp, SAVE_EBP + movl PARAM_SIZE, %ebp + + movl %esi, SAVE_ESI + movl %edi, SAVE_EDI + + C If there's usually only one or two trailing zero bits then this + C should be faster than bsfl. 
+L(strip_twos): + incl %ecx + shrl %eax + jnc L(strip_twos) + + movl %ebx, SAVE_EBX + leal 1(%eax,%eax), %ebx C d without twos + andl $127, %eax C d/2, 7 bits + +ifdef(`PIC',` + LEA( binvert_limb_table, %edx) + movzbl (%eax,%edx), %eax C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %eax C inv 8 bits +') + + leal (%eax,%eax), %edx C 2*inv + movl %ebx, PARAM_DIVISOR C d without twos + + imull %eax, %eax C inv*inv + + movl PARAM_SRC, %esi + movl PARAM_DST, %edi + + imull %ebx, %eax C inv*inv*d + + subl %eax, %edx C inv = 2*inv - inv*inv*d + leal (%edx,%edx), %eax C 2*inv + + imull %edx, %edx C inv*inv + + leal (%esi,%ebp,4), %esi C src end + leal (%edi,%ebp,4), %edi C dst end + negl %ebp C -size + + imull %ebx, %edx C inv*inv*d + + subl %edx, %eax C inv = 2*inv - inv*inv*d + + ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + pushl %eax FRAME_pushl() + imull PARAM_DIVISOR, %eax + cmpl $1, %eax + popl %eax FRAME_popl()') + + movl %eax, VAR_INVERSE + movl (%esi,%ebp,4), %eax C src[0] + + incl %ebp + jz L(one) + + movl (%esi,%ebp,4), %edx C src[1] + + shrdl( %cl, %edx, %eax) + + movl %edi, VAR_DST_END + xorl %ebx, %ebx + jmp L(entry) + + ALIGN(8) +L(top): + C eax q + C ebx carry bit, 0 or 1 + C ecx shift + C edx + C esi src end + C edi dst end + C ebp counter, limbs, negative + + mull PARAM_DIVISOR C carry limb in edx + + movl -4(%esi,%ebp,4), %eax + movl (%esi,%ebp,4), %edi + + shrdl( %cl, %edi, %eax) + + subl %ebx, %eax C apply carry bit + setc %bl + movl VAR_DST_END, %edi + + subl %edx, %eax C apply carry limb + adcl $0, %ebx + +L(entry): + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi,%ebp,4) + incl %ebp + jnz L(top) + + + mull PARAM_DIVISOR C carry limb in edx + + movl -4(%esi), %eax C src high limb + shrl %cl, %eax + movl SAVE_ESI, %esi + + subl %ebx, %eax C apply carry bit + movl SAVE_EBX, %ebx + movl SAVE_EBP, %ebp + + subl %edx, %eax C apply carry limb + + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi) + movl SAVE_EDI, %edi + addl $STACK_SPACE, %esp + + ret + + +L(one): + shrl %cl, %eax + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + + imull VAR_INVERSE, %eax + + movl SAVE_EBP, %ebp + movl %eax, -4(%edi) + + movl SAVE_EDI, %edi + addl $STACK_SPACE, %esp + + ret + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/k7/gcd_11.asm b/gmp-6.3.0/mpn/x86/k7/gcd_11.asm new file mode 100644 index 0000000..2648dfd --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/gcd_11.asm @@ -0,0 +1,107 @@ +dnl x86 mpn_gcd_11 optimised for AMD K7. + +dnl Contributed to the GNU project by by Kevin Ryde. Rehacked by Torbjorn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2014, 2015 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C AMD K7 5.31 +C AMD K8,K9 5.33 +C AMD K10 5.30 +C AMD bd1 ? +C AMD bobcat 7.02 +C Intel P4-2 10.1 +C Intel P4-3/4 10.0 +C Intel P6/13 5.88 +C Intel core2 6.26 +C Intel NHM 6.83 +C Intel SBR 8.50 +C Intel atom 8.90 +C VIA nano ? +C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1 + + +C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. + +deflit(MAXSHIFT, 6) +deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) + +DEF_OBJECT(ctz_table,64) + .byte MAXSHIFT +forloop(i,1,MASK, +` .byte m4_count_trailing_zeros(i) +') +END_OBJECT(ctz_table) + + +define(`u0', `%eax') +define(`v0', `%edx') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_gcd_11) + push %edi + push %esi + + mov 12(%esp), %eax + mov 16(%esp), %edx + + LEAL( ctz_table, %esi) + jmp L(odd) + + ALIGN(16) C +L(top): cmovc( %ecx, %eax) C u = |v - u| + cmovc( %edi, %edx) C v = min(u,v) +L(mid): and $MASK, %ecx C + movzbl (%esi,%ecx), %ecx C + jz L(shift_alot) C + shr %cl, %eax C +L(odd): mov %eax, %edi C + mov %edx, %ecx C + sub %eax, %ecx C + sub %edx, %eax C + jnz L(top) C + +L(end): mov %edx, %eax + pop %esi + pop %edi + ret + +L(shift_alot): + shr $MAXSHIFT, %eax + mov %eax, %ecx + jmp L(mid) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/k7/gmp-mparam.h b/gmp-6.3.0/mpn/x86/k7/gmp-mparam.h new file mode 100644 index 0000000..a09507d --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/gmp-mparam.h @@ -0,0 +1,263 @@ +/* AMD K7 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 2083 MHz K7 Barton */ +/* FFT tuning limit = 49,770,069 */ +/* Generated by tuneup.c, 2019-11-09, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 3 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 24 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 +#define USE_PREINV_DIVREM_1 1 /* native */ +/* From mati.gmplib.org, 2023-07-21 */ +#define DIV_QR_1N_PI1_METHOD 3 /* 9.52% faster than 1 */ +#define DIV_QR_1_NORM_THRESHOLD 4 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 26 + +#define DIV_1_VS_MUL_1_PERCENT 182 + +#define MUL_TOOM22_THRESHOLD 28 +#define MUL_TOOM33_THRESHOLD 85 +#define MUL_TOOM44_THRESHOLD 154 +#define MUL_TOOM6H_THRESHOLD 208 +#define MUL_TOOM8H_THRESHOLD 309 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 102 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 121 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 50 +#define SQR_TOOM3_THRESHOLD 86 +#define SQR_TOOM4_THRESHOLD 220 +#define SQR_TOOM6_THRESHOLD 270 +#define SQR_TOOM8_THRESHOLD 446 + +#define MULMID_TOOM42_THRESHOLD 50 + +#define MULMOD_BNM1_THRESHOLD 18 +#define SQRMOD_BNM1_THRESHOLD 19 + +#define MUL_FFT_MODF_THRESHOLD 606 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 606, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 15, 5}, { 31, 6}, { 28, 7}, { 15, 6}, \ + { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 49, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ + { 63, 7}, { 127, 8}, { 71, 9}, { 39, 6}, \ + { 319, 9}, { 47, 8}, { 99, 6}, { 399, 9}, \ + { 55,10}, { 31, 9}, { 63, 8}, { 127, 9}, \ + { 79,10}, { 47, 9}, { 95, 8}, { 191, 4}, \ + { 3135, 5}, { 1599, 4}, { 3455, 6}, { 959, 8}, \ + { 247,10}, { 79, 9}, { 167,10}, { 95, 9}, \ + { 199,10}, { 111,11}, { 63,10}, { 127, 9}, \ + { 255,10}, { 143, 9}, { 287, 8}, { 575,10}, \ + { 159, 9}, { 319, 8}, { 639, 7}, { 1279,11}, \ + { 95,10}, { 191, 9}, { 383, 8}, { 799,10}, \ + { 207,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511, 8}, { 1023,10}, { 271, 9}, { 543, 8}, \ + { 1087, 9}, { 575,11}, { 159, 9}, { 639,10}, \ + { 335, 9}, { 671, 8}, { 1343,10}, { 351, 9}, \ + { 703,11}, { 191,10}, { 383, 9}, { 799, 8}, \ + { 1599,11}, { 223,10}, { 447,12}, { 127,11}, \ + { 255,10}, { 511, 9}, { 1023,10}, { 543, 9}, \ + { 1087,10}, { 575, 9}, { 1151,10}, { 607, 9}, \ + { 1215,11}, { 319,10}, { 639, 9}, { 1343,10}, \ + { 703, 9}, { 1407,12}, { 191,11}, { 383,10}, \ + { 767, 9}, { 1535,10}, { 799, 9}, { 1599,10}, \ + { 831, 9}, { 1727, 8}, { 3455,11}, { 447,13}, \ + { 127,12}, { 255,11}, { 511,10}, { 1023, 9}, \ + { 2047,11}, { 543,10}, { 1087,11}, { 575,10}, \ + { 1151, 9}, { 2303,11}, { 607,10}, { 1215,12}, \ + { 319,11}, { 639,10}, { 1279,11}, { 671,10}, \ + { 1343,11}, { 703,10}, { 1407,11}, { 735,10}, \ + { 1471, 9}, { 2943,12}, { 383,11}, { 767,10}, \ + { 1535,11}, { 799,10}, { 1599,11}, { 831,10}, \ + { 1663,11}, { 863,10}, { 1727,12}, { 447,11}, \ + { 895,10}, { 1791,11}, { 959,10}, { 1919,13}, \ + { 255,12}, { 511,11}, { 
1023,10}, { 2111,11}, \ + { 1087,10}, { 2175,12}, { 575,11}, { 1151,10}, \ + { 2303,11}, { 1215,10}, { 2431,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1407,10}, { 2815,11}, \ + { 1471,10}, { 2943,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1663,10}, { 3327,11}, \ + { 1727,10}, { 3455,12}, { 895,11}, { 1855,12}, \ + { 959,11}, { 1919,10}, { 3839,14}, { 255,13}, \ + { 511,12}, { 1023,11}, { 2111,12}, { 1087,11}, \ + { 2239,12}, { 1151,11}, { 2303,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1343,11}, { 2687,12}, \ + { 1407,11}, { 2815,12}, { 1471,11}, { 2943,13}, \ + { 767,12}, { 1663,11}, { 3327,12}, { 1727,11}, \ + { 3455,13}, { 895,12}, { 1919,11}, { 3839,12}, \ + { 1983,14}, { 511,13}, { 1023,12}, { 2239,13}, \ + { 1151,12}, { 2495,13}, { 1279,12}, { 2687,13}, \ + { 1407,12}, { 2943,14}, { 767,13}, { 1535,12}, \ + { 3135,13}, { 1663,12}, { 3455,13}, { 1791,12}, \ + { 3583,13}, { 1919,12}, { 3967,15}, { 511,14}, \ + { 1023,13}, { 2047,12}, { 4095,13}, { 2175,12}, \ + { 4479,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2559,12}, { 5119,13}, { 2943,12}, { 5887,14}, \ + { 1535,13}, { 3455,14}, { 1791,13}, { 3967,15}, \ + { 1023,14}, { 2047,13}, { 4479,14}, { 2303,13}, \ + { 4991,14}, { 2559,13}, { 5119,14}, { 2815,13}, \ + { 5887,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 254 +#define MUL_FFT_THRESHOLD 7552 + +#define SQR_FFT_MODF_THRESHOLD 492 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 492, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 28, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \ + { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 103,11}, \ + { 31,10}, { 63, 9}, { 135, 8}, { 271, 9}, \ + { 143,10}, { 79, 9}, { 167,10}, { 95, 9}, \ + { 191, 8}, { 383,10}, { 111,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \ + { 303,10}, { 159, 9}, { 319, 8}, { 639,11}, \ + { 95,10}, { 191, 9}, { 383, 8}, { 767, 9}, \ + { 399,10}, { 207,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543, 8}, \ + { 1087,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671, 8}, { 1343, 9}, { 703,11}, { 191,10}, \ + { 383, 9}, { 767, 8}, { 1535,10}, { 399, 9}, \ + { 799, 8}, { 1599, 9}, { 863,11}, { 223,10}, \ + { 447,12}, { 127,11}, { 255,10}, { 511, 9}, \ + { 1087,10}, { 575, 9}, { 1215,10}, { 639, 9}, \ + { 1279,10}, { 671, 9}, { 1343,11}, { 351,10}, \ + { 703, 9}, { 1407,10}, { 735, 9}, { 1471,12}, \ + { 191,11}, { 383,10}, { 767, 9}, { 1535,10}, \ + { 799, 9}, { 1599,11}, { 415,10}, { 831, 9}, \ + { 1663,10}, { 863, 9}, { 1727, 8}, { 3455,11}, \ + { 447,10}, { 895,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023, 9}, { 2047,11}, { 543,10}, \ + { 1087, 9}, { 2175,11}, { 575,10}, { 1151, 9}, \ + { 2303,11}, { 607,10}, { 1215, 9}, { 2431,12}, \ + { 319,11}, { 639,10}, { 1279,11}, { 671,10}, \ + { 1343,11}, { 703,10}, { 1407, 9}, { 2815,11}, \ + { 735,10}, { 1471, 9}, { 2943,12}, { 383,11}, \ + { 767,10}, { 1599,11}, { 831,10}, { 1663, 9}, \ + { 3327,10}, { 1727,12}, { 447,11}, { 895,10}, \ + { 1791,11}, { 959,10}, { 1919,13}, { 255,12}, \ + { 511,11}, { 1023,10}, { 2111,11}, { 1087,10}, \ + { 2175,12}, { 575,11}, { 1151,10}, { 2303,11}, \ + { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1407,10}, { 2815,11}, { 1471,10}, \ + { 2943,13}, { 383,12}, { 767,11}, { 1599,12}, \ + { 
831,11}, { 1663,10}, { 3327,11}, { 1727,10}, \ + { 3455,12}, { 895,11}, { 1791,12}, { 959,11}, \ + { 1919,10}, { 3839,14}, { 255,13}, { 511,12}, \ + { 1023,11}, { 2111,12}, { 1087,11}, { 2239,12}, \ + { 1151,11}, { 2303,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1343,11}, { 2687,12}, { 1407,11}, \ + { 2815,12}, { 1471,11}, { 2943,13}, { 767,12}, \ + { 1599,11}, { 3199,12}, { 1663,11}, { 3327,12}, \ + { 1727,11}, { 3455,13}, { 895,12}, { 1791,11}, \ + { 3583,12}, { 1919,11}, { 3839,12}, { 1983,14}, \ + { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \ + { 2431,13}, { 1279,12}, { 2687,13}, { 1407,12}, \ + { 2943,14}, { 767,13}, { 1535,12}, { 3199,13}, \ + { 1663,12}, { 3455,13}, { 1791,12}, { 3583,13}, \ + { 1919,12}, { 3967,15}, { 511,14}, { 1023,13}, \ + { 2047,12}, { 4095,13}, { 2175,12}, { 4351,13}, \ + { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \ + { 1535,13}, { 3455,14}, { 1791,13}, { 3967,15}, \ + { 1023,14}, { 2047,13}, { 4351,14}, { 2303,13}, \ + { 4991,14}, { 2559,13}, { 5119,14}, { 2815,13}, \ + { 5887,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 258 +#define SQR_FFT_THRESHOLD 5504 + +#define MULLO_BASECASE_THRESHOLD 3 +#define MULLO_DC_THRESHOLD 34 +#define MULLO_MUL_N_THRESHOLD 14281 +#define SQRLO_BASECASE_THRESHOLD 6 +#define SQRLO_DC_THRESHOLD 137 +#define SQRLO_SQR_THRESHOLD 10821 + +#define DC_DIV_QR_THRESHOLD 45 +#define DC_DIVAPPR_Q_THRESHOLD 206 +#define DC_BDIV_QR_THRESHOLD 39 +#define DC_BDIV_Q_THRESHOLD 144 + +#define INV_MULMOD_BNM1_THRESHOLD 54 +#define INV_NEWTON_THRESHOLD 202 +#define INV_APPR_THRESHOLD 206 + +#define BINV_NEWTON_THRESHOLD 224 +#define REDC_1_TO_REDC_N_THRESHOLD 63 + +#define MU_DIV_QR_THRESHOLD 1442 +#define MU_DIVAPPR_Q_THRESHOLD 1387 +#define MUPI_DIV_QR_THRESHOLD 82 +#define MU_BDIV_QR_THRESHOLD 1308 +#define MU_BDIV_Q_THRESHOLD 1387 + +#define POWM_SEC_TABLE 1,16,102,428,1221 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 28 +#define SET_STR_DC_THRESHOLD 254 +#define SET_STR_PRECOMPUTE_THRESHOLD 890 + +#define FAC_DSC_THRESHOLD 206 +#define FAC_ODD_THRESHOLD 29 + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD2_DIV1_METHOD 3 /* 3.84% faster than 4 */ +#define HGCD_THRESHOLD 123 +#define HGCD_APPR_THRESHOLD 151 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 435 +#define GCDEXT_DC_THRESHOLD 318 +#define JACOBI_BASE_METHOD 4 /* 8.04% faster than 3 */ + +/* Tuneup completed successfully, took 175382 seconds */ diff --git a/gmp-6.3.0/mpn/x86/k7/invert_limb.asm b/gmp-6.3.0/mpn/x86/k7/invert_limb.asm new file mode 100644 index 0000000..31a867e --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/invert_limb.asm @@ -0,0 +1,194 @@ +dnl x86 mpn_invert_limb + +dnl Contributed to the GNU project by Niels Möller + +dnl Copyright 2009, 2011, 2015 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles (approx) div +C P5 ? +C P6 model 0-8,10-12 ? +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) ? +C P4 model 0 (Willamette) ? +C P4 model 1 (?) ? +C P4 model 2 (Northwood) ? +C P4 model 3 (Prescott) ? +C P4 model 4 (Nocona) ? +C AMD K6 ? +C AMD K7 41 53 +C AMD K8 ? + +C TODO +C * These c/l numbers are for a non-PIC build. Consider falling back to using +C the 'div' instruction for PIC builds. +C * Perhaps use this file--or at least the algorithm--for more machines than k7. + +C Register usage: +C Input D in %edi +C Current approximation is in %eax and/or %ecx +C %ebx and %edx are temporaries +C %esi and %ebp are unused + +defframe(PARAM_DIVISOR,4) + +ASM_START() + +C Make approx_tab global to work around Apple relocation bug. +ifdef(`DARWIN',` + deflit(`approx_tab', MPN(invert_limb_tab)) + GLOBL approx_tab') + + TEXT + ALIGN(16) +PROLOGUE(mpn_invert_limb) +deflit(`FRAME', 0) + mov PARAM_DIVISOR, %eax + C Avoid push/pop on k7. + sub $8, %esp FRAME_subl_esp(8) + mov %ebx, (%esp) + mov %edi, 4(%esp) + + mov %eax, %edi + shr $22, %eax +ifdef(`PIC',` + LEAL( approx_tab, %ebx) + movzwl -1024(%ebx, %eax, 2), %eax +',` + movzwl -1024+approx_tab(%eax, %eax), %eax C %eax = v0 +') + + C v1 = (v0 << 4) - ((v0*v0*d_21) >> 32) - 1 + mov %eax, %ecx + imul %eax, %eax + mov %edi, %ebx + shr $11, %ebx + inc %ebx + mul %ebx + mov %edi, %ebx C Prepare + shr %ebx + sbb %eax, %eax + sub %eax, %ebx C %ebx = d_31, %eax = mask + shl $4, %ecx + dec %ecx + sub %edx, %ecx C %ecx = v1 + + C v_2 = (v1 << 15) + ((v1 *(2^48 - v1 * d31 + (v1 >> 1) & mask)) >> 33) + imul %ecx, %ebx + and %ecx, %eax + shr %eax + sub %ebx, %eax + mul %ecx + mov %edi, %eax C Prepare for next mul + shl $15, %ecx + shr %edx + add %edx, %ecx C %ecx = v2 + + mul %ecx + add %edi, %eax + mov %ecx, %eax + adc %edi, %edx + sub %edx, %eax C %eax = v3 + + mov (%esp), %ebx + mov 4(%esp), %edi + add $8, %esp + + ret + +EPILOGUE() + +DEF_OBJECT(approx_tab,2) + .value 0x7fe1,0x7fa1,0x7f61,0x7f22,0x7ee3,0x7ea4,0x7e65,0x7e27 + .value 0x7de9,0x7dab,0x7d6d,0x7d30,0x7cf3,0x7cb6,0x7c79,0x7c3d + .value 0x7c00,0x7bc4,0x7b89,0x7b4d,0x7b12,0x7ad7,0x7a9c,0x7a61 + .value 0x7a27,0x79ec,0x79b2,0x7979,0x793f,0x7906,0x78cc,0x7894 + .value 0x785b,0x7822,0x77ea,0x77b2,0x777a,0x7742,0x770b,0x76d3 + .value 0x769c,0x7665,0x762f,0x75f8,0x75c2,0x758c,0x7556,0x7520 + .value 0x74ea,0x74b5,0x7480,0x744b,0x7416,0x73e2,0x73ad,0x7379 + .value 0x7345,0x7311,0x72dd,0x72aa,0x7277,0x7243,0x7210,0x71de + .value 0x71ab,0x7179,0x7146,0x7114,0x70e2,0x70b1,0x707f,0x704e + .value 0x701c,0x6feb,0x6fba,0x6f8a,0x6f59,0x6f29,0x6ef9,0x6ec8 + .value 0x6e99,0x6e69,0x6e39,0x6e0a,0x6ddb,0x6dab,0x6d7d,0x6d4e + .value 0x6d1f,0x6cf1,0x6cc2,0x6c94,0x6c66,0x6c38,0x6c0a,0x6bdd + .value 0x6bb0,0x6b82,0x6b55,0x6b28,0x6afb,0x6acf,0x6aa2,0x6a76 + .value 0x6a49,0x6a1d,0x69f1,0x69c6,0x699a,0x696e,0x6943,0x6918 + .value 0x68ed,0x68c2,0x6897,0x686c,0x6842,0x6817,0x67ed,0x67c3 + .value 0x6799,0x676f,0x6745,0x671b,0x66f2,0x66c8,0x669f,0x6676 + .value 0x664d,0x6624,0x65fc,0x65d3,0x65aa,0x6582,0x655a,0x6532 + .value 
0x650a,0x64e2,0x64ba,0x6493,0x646b,0x6444,0x641c,0x63f5 + .value 0x63ce,0x63a7,0x6381,0x635a,0x6333,0x630d,0x62e7,0x62c1 + .value 0x629a,0x6275,0x624f,0x6229,0x6203,0x61de,0x61b8,0x6193 + .value 0x616e,0x6149,0x6124,0x60ff,0x60da,0x60b6,0x6091,0x606d + .value 0x6049,0x6024,0x6000,0x5fdc,0x5fb8,0x5f95,0x5f71,0x5f4d + .value 0x5f2a,0x5f07,0x5ee3,0x5ec0,0x5e9d,0x5e7a,0x5e57,0x5e35 + .value 0x5e12,0x5def,0x5dcd,0x5dab,0x5d88,0x5d66,0x5d44,0x5d22 + .value 0x5d00,0x5cde,0x5cbd,0x5c9b,0x5c7a,0x5c58,0x5c37,0x5c16 + .value 0x5bf5,0x5bd4,0x5bb3,0x5b92,0x5b71,0x5b51,0x5b30,0x5b10 + .value 0x5aef,0x5acf,0x5aaf,0x5a8f,0x5a6f,0x5a4f,0x5a2f,0x5a0f + .value 0x59ef,0x59d0,0x59b0,0x5991,0x5972,0x5952,0x5933,0x5914 + .value 0x58f5,0x58d6,0x58b7,0x5899,0x587a,0x585b,0x583d,0x581f + .value 0x5800,0x57e2,0x57c4,0x57a6,0x5788,0x576a,0x574c,0x572e + .value 0x5711,0x56f3,0x56d5,0x56b8,0x569b,0x567d,0x5660,0x5643 + .value 0x5626,0x5609,0x55ec,0x55cf,0x55b2,0x5596,0x5579,0x555d + .value 0x5540,0x5524,0x5507,0x54eb,0x54cf,0x54b3,0x5497,0x547b + .value 0x545f,0x5443,0x5428,0x540c,0x53f0,0x53d5,0x53b9,0x539e + .value 0x5383,0x5368,0x534c,0x5331,0x5316,0x52fb,0x52e0,0x52c6 + .value 0x52ab,0x5290,0x5276,0x525b,0x5240,0x5226,0x520c,0x51f1 + .value 0x51d7,0x51bd,0x51a3,0x5189,0x516f,0x5155,0x513b,0x5121 + .value 0x5108,0x50ee,0x50d5,0x50bb,0x50a2,0x5088,0x506f,0x5056 + .value 0x503c,0x5023,0x500a,0x4ff1,0x4fd8,0x4fbf,0x4fa6,0x4f8e + .value 0x4f75,0x4f5c,0x4f44,0x4f2b,0x4f13,0x4efa,0x4ee2,0x4eca + .value 0x4eb1,0x4e99,0x4e81,0x4e69,0x4e51,0x4e39,0x4e21,0x4e09 + .value 0x4df1,0x4dda,0x4dc2,0x4daa,0x4d93,0x4d7b,0x4d64,0x4d4d + .value 0x4d35,0x4d1e,0x4d07,0x4cf0,0x4cd8,0x4cc1,0x4caa,0x4c93 + .value 0x4c7d,0x4c66,0x4c4f,0x4c38,0x4c21,0x4c0b,0x4bf4,0x4bde + .value 0x4bc7,0x4bb1,0x4b9a,0x4b84,0x4b6e,0x4b58,0x4b41,0x4b2b + .value 0x4b15,0x4aff,0x4ae9,0x4ad3,0x4abd,0x4aa8,0x4a92,0x4a7c + .value 0x4a66,0x4a51,0x4a3b,0x4a26,0x4a10,0x49fb,0x49e5,0x49d0 + .value 0x49bb,0x49a6,0x4990,0x497b,0x4966,0x4951,0x493c,0x4927 + .value 0x4912,0x48fe,0x48e9,0x48d4,0x48bf,0x48ab,0x4896,0x4881 + .value 0x486d,0x4858,0x4844,0x482f,0x481b,0x4807,0x47f3,0x47de + .value 0x47ca,0x47b6,0x47a2,0x478e,0x477a,0x4766,0x4752,0x473e + .value 0x472a,0x4717,0x4703,0x46ef,0x46db,0x46c8,0x46b4,0x46a1 + .value 0x468d,0x467a,0x4666,0x4653,0x4640,0x462c,0x4619,0x4606 + .value 0x45f3,0x45e0,0x45cd,0x45ba,0x45a7,0x4594,0x4581,0x456e + .value 0x455b,0x4548,0x4536,0x4523,0x4510,0x44fe,0x44eb,0x44d8 + .value 0x44c6,0x44b3,0x44a1,0x448f,0x447c,0x446a,0x4458,0x4445 + .value 0x4433,0x4421,0x440f,0x43fd,0x43eb,0x43d9,0x43c7,0x43b5 + .value 0x43a3,0x4391,0x437f,0x436d,0x435c,0x434a,0x4338,0x4327 + .value 0x4315,0x4303,0x42f2,0x42e0,0x42cf,0x42bd,0x42ac,0x429b + .value 0x4289,0x4278,0x4267,0x4256,0x4244,0x4233,0x4222,0x4211 + .value 0x4200,0x41ef,0x41de,0x41cd,0x41bc,0x41ab,0x419a,0x418a + .value 0x4179,0x4168,0x4157,0x4147,0x4136,0x4125,0x4115,0x4104 + .value 0x40f4,0x40e3,0x40d3,0x40c2,0x40b2,0x40a2,0x4091,0x4081 + .value 0x4071,0x4061,0x4050,0x4040,0x4030,0x4020,0x4010,0x4000 +END_OBJECT(approx_tab) +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/k7/mmx/com.asm b/gmp-6.3.0/mpn/x86/k7/mmx/com.asm new file mode 100644 index 0000000..a258c22 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/mmx/com.asm @@ -0,0 +1,125 @@ +dnl AMD Athlon mpn_com -- mpn bitwise one's complement. + +dnl Copyright 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K7: 1.0 cycles/limb + + +C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C The loop form below is necessary for the claimed speed. It needs to be +C aligned to a 16 byte boundary and only 16 bytes long. Maybe that's so it +C fits in a BTB entry. The adjustments to %eax and %edx avoid offsets on +C the movq's and achieve the necessary size. +C +C If both src and dst are 4mod8, the loop runs at 1.5 c/l. So long as one +C of the two is 0mod8, it runs at 1.0 c/l. On that basis dst is checked +C (offset by the size, as per the loop addressing) and one high limb +C processed separately to get alignment. +C +C The padding for the nails case is unattractive, but shouldn't cost any +C cycles. Explicit .byte's guarantee the desired instructions, at a point +C where we're probably stalled waiting for loads anyway. +C +C Enhancements: +C +C The combination load/pxor/store might be able to be unrolled to approach +C 0.5 c/l if desired. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(16) + +PROLOGUE(mpn_com) +deflit(`FRAME',0) + + movl PARAM_DST, %edx + movl PARAM_SIZE, %ecx + pcmpeqd %mm7, %mm7 + + leal (%edx,%ecx,4), %eax + andl $4, %eax +ifelse(GMP_NAIL_BITS,0,, +` psrld $GMP_NAIL_BITS, %mm7') C GMP_NUMB_MASK + + movl PARAM_SRC, %eax + movd -4(%eax,%ecx,4), %mm0 C src high limb + +ifelse(GMP_NAIL_BITS,0,, +` C padding for alignment below + .byte 0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00 C lea 0(%esi),%esi + .byte 0x8d, 0xbf, 0x00, 0x00, 0x00, 0x00 C lea 0(%edi),%edi +') + + jz L(aligned) + + pxor %mm7, %mm0 + movd %mm0, -4(%edx,%ecx,4) C dst high limb + decl %ecx + jz L(done) +L(aligned): + + addl $4, %eax + addl $4, %edx + decl %ecx + jz L(one) + + C offset 0x30 for no nails, or 0x40 for nails + ALIGN(16) +L(top): + C eax src + C ebx + C ecx counter + C edx dst + + subl $2, %ecx + movq (%eax,%ecx,4), %mm0 + pxor %mm7, %mm0 + movq %mm0, (%edx,%ecx,4) + jg L(top) + + jnz L(done) C if size even + +L(one): + movd -4(%eax), %mm0 C src low limb + pxor %mm7, %mm0 + movd %mm0, -4(%edx) C dst low limb + +L(done): + emms + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k7/mmx/copyd.asm b/gmp-6.3.0/mpn/x86/k7/mmx/copyd.asm new file mode 100644 index 0000000..59ece40 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/mmx/copyd.asm @@ -0,0 +1,144 @@ +dnl AMD K7 mpn_copyd -- copy limb vector, decrementing. + +dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C alignment dst/src, A=0mod8 N=4mod8 +C A/A A/N N/A N/N +C K7 0.75 1.0 1.0 0.75 + + +C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C The various comments in mpn/x86/k7/copyi.asm apply here too. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl parameter space reused +define(SAVE_EBX,`PARAM_SIZE') +define(SAVE_ESI,`PARAM_SRC') + +dnl minimum 5 since the unrolled code can't handle less than 5 +deflit(UNROLL_THRESHOLD, 5) + + TEXT + ALIGN(32) +PROLOGUE(mpn_copyd) + + movl PARAM_SIZE, %ecx + movl %ebx, SAVE_EBX + + movl PARAM_SRC, %eax + movl PARAM_DST, %edx + + cmpl $UNROLL_THRESHOLD, %ecx + jae L(unroll) + + orl %ecx, %ecx + jz L(simple_done) + +L(simple): + C eax src + C ebx scratch + C ecx counter + C edx dst + C + C this loop is 2 cycles/limb + + movl -4(%eax,%ecx,4), %ebx + movl %ebx, -4(%edx,%ecx,4) + decl %ecx + jnz L(simple) + +L(simple_done): + movl SAVE_EBX, %ebx + ret + + +L(unroll): + movl %esi, SAVE_ESI + leal (%eax,%ecx,4), %ebx + leal (%edx,%ecx,4), %esi + + andl %esi, %ebx + movl SAVE_ESI, %esi + subl $4, %ecx C size-4 + + testl $4, %ebx C testl to pad code closer to 16 bytes for L(top) + jz L(aligned) + + C both src and dst unaligned, process one limb to align them + movl 12(%eax,%ecx,4), %ebx + movl %ebx, 12(%edx,%ecx,4) + decl %ecx +L(aligned): + + + ALIGN(16) +L(top): + C eax src + C ebx + C ecx counter, limbs + C edx dst + + movq 8(%eax,%ecx,4), %mm0 + movq (%eax,%ecx,4), %mm1 + subl $4, %ecx + movq %mm0, 16+8(%edx,%ecx,4) + movq %mm1, 16(%edx,%ecx,4) + jns L(top) + + + C now %ecx is -4 to -1 representing respectively 0 to 3 limbs remaining + + testb $2, %cl + jz L(finish_not_two) + + movq 8(%eax,%ecx,4), %mm0 + movq %mm0, 8(%edx,%ecx,4) +L(finish_not_two): + + testb $1, %cl + jz L(done) + + movl (%eax), %ebx + movl %ebx, (%edx) + +L(done): + movl SAVE_EBX, %ebx + emms + ret + + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k7/mmx/copyi.asm b/gmp-6.3.0/mpn/x86/k7/mmx/copyi.asm new file mode 100644 index 0000000..9a28f92 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/mmx/copyi.asm @@ -0,0 +1,157 @@ +dnl AMD K7 mpn_copyi -- copy limb vector, incrementing. + +dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C    alignment dst/src, A=0mod8 N=4mod8
+C       A/A   A/N   N/A   N/N
+C K7    0.75  1.0   1.0   0.75
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size.
+C
+C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at
+C 1.33 c/l.
+C
+C The K7 can do a 64-bit load and 64-bit store in one cycle (optimization
+C guide 22007 appendix B), so 0.5 c/l should be possible; however, nothing
+C under 0.7 c/l is known.  Apparently only two 32-bit stores can be done in
+C one cycle, so perhaps some scheduling is needed to ensure it's a
+C load+store in each cycle, not store+store.
+C
+C If both source and destination are unaligned then one limb is processed at
+C the start to make them aligned and so get 0.75 c/l, whereas if they'd been
+C used unaligned it would be 1.5 c/l.
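A rough C rendering of that alignment strategy (illustrative only; copyi_sketch is a hypothetical name, and the real loop below moves two limbs per iteration with movq):

#include <stddef.h>
#include <stdint.h>

/* Copy {src,n} to {dst,n}, peeling one limb when both pointers are
   4mod8 so the bulk of the copy runs 8-byte aligned.  A sketch of the
   idea only; thresholds and unrolling follow the assembly below. */
void
copyi_sketch (uint32_t *dst, const uint32_t *src, size_t n)
{
  if (n >= 5 && ((uintptr_t) src & (uintptr_t) dst & 4))
    {
      *dst++ = *src++;          /* one limb brings both to 0mod8 */
      n--;
    }
  for (size_t i = 0; i < n; i++)
    dst[i] = src[i];            /* the asm does this two limbs per movq */
}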
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl parameter space reused +define(SAVE_EBX,`PARAM_SIZE') + +dnl minimum 5 since the unrolled code can't handle less than 5 +deflit(UNROLL_THRESHOLD, 5) + + TEXT + ALIGN(32) +PROLOGUE(mpn_copyi) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl %ebx, SAVE_EBX + + movl PARAM_SRC, %eax + movl PARAM_DST, %edx + + cmpl $UNROLL_THRESHOLD, %ecx + jae L(unroll) + + orl %ecx, %ecx + jz L(simple_done) + +L(simple): + C eax src, incrementing + C ebx scratch + C ecx counter + C edx dst, incrementing + C + C this loop is 2 cycles/limb + + movl (%eax), %ebx + movl %ebx, (%edx) + decl %ecx + leal 4(%eax), %eax + leal 4(%edx), %edx + jnz L(simple) + +L(simple_done): + movl SAVE_EBX, %ebx + ret + + +L(unroll): + movl %eax, %ebx + leal -12(%eax,%ecx,4), %eax C src end - 12 + subl $3, %ecx C size-3 + + andl %edx, %ebx + leal (%edx,%ecx,4), %edx C dst end - 12 + negl %ecx + + testl $4, %ebx C testl to pad code closer to 16 bytes for L(top) + jz L(aligned) + + C both src and dst unaligned, process one limb to align them + movl (%eax,%ecx,4), %ebx + movl %ebx, (%edx,%ecx,4) + incl %ecx +L(aligned): + + + ALIGN(16) +L(top): + C eax src end - 12 + C ebx + C ecx counter, negative, limbs + C edx dst end - 12 + + movq (%eax,%ecx,4), %mm0 + movq 8(%eax,%ecx,4), %mm1 + addl $4, %ecx + movq %mm0, -16(%edx,%ecx,4) + movq %mm1, -16+8(%edx,%ecx,4) + ja L(top) C jump no carry and not zero + + + C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining + + testb $2, %cl + jnz L(finish_not_two) + + movq (%eax,%ecx,4), %mm0 + movq %mm0, (%edx,%ecx,4) +L(finish_not_two): + + testb $1, %cl + jnz L(done) + + movl 8(%eax), %ebx + movl %ebx, 8(%edx) + +L(done): + movl SAVE_EBX, %ebx + emms + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k7/mmx/divrem_1.asm b/gmp-6.3.0/mpn/x86/k7/mmx/divrem_1.asm new file mode 100644 index 0000000..cf34328 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/mmx/divrem_1.asm @@ -0,0 +1,832 @@ +dnl AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb +dnl division. + +dnl Copyright 1999-2002, 2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part. 
+ + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t inverse, +C unsigned shift); +C +C Algorithm: +C +C The method and nomenclature follow part 8 of "Division by Invariant +C Integers using Multiplication" by Granlund and Montgomery, reference in +C gmp.texi. +C +C The "and"s shown in the paper are done here with "cmov"s. "m" is written +C for m', and "d" for d_norm, which won't cause any confusion since it's +C only the normalized divisor that's of any use in the code. "b" is written +C for 2^N, the size of a limb, N being 32 here. +C +C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as +C "n-(q1+1)*d"; this rearrangement gives the same two-limb answer. If +C q1==0xFFFFFFFF, then q1+1 would overflow. We branch to a special case +C "q1_ff" if this occurs. Since the true quotient is either q1 or q1+1 then +C if q1==0xFFFFFFFF that must be the right value. +C +C For the last and second last steps q1==0xFFFFFFFF is instead handled by an +C sbbl to go back to 0xFFFFFFFF if an overflow occurs when adding 1. This +C then goes through as normal, and finding no addback required. sbbl costs +C an extra cycle over what the main loop code does, but it keeps code size +C and complexity down. +C +C Notes: +C +C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high +C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero +C carry, since in normal circumstances that will be a very rare event. +C +C The test for skipping a division is branch free (once size>=1 is tested). +C The store to the destination high limb is 0 when a divide is skipped, or +C if it's not skipped then a copy of the src high limb is used. The latter +C is in case src==dst. +C +C There's a small bias towards expecting xsize==0, by having code for +C xsize==0 in a straight line and xsize!=0 under forward jumps. +C +C Alternatives: +C +C If the divisor is normalized (high bit set) then a division step can +C always be skipped, since the high destination limb is always 0 or 1 in +C that case. It doesn't seem worth checking for this though, since it +C probably occurs infrequently, in particular note that big_base for a +C decimal mpn_get_str is not normalized in a 32-bit limb. + + +dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by +dnl inverse method is used, rather than plain "divl"s. Minimum value 1. +dnl +dnl The inverse takes about 50 cycles to calculate, but after that the +dnl multiply is 17 c/l versus division at 42 c/l. +dnl +dnl At 3 limbs the mul is a touch faster than div on the integer part, and +dnl even more so on the fractional part. 
+ +deflit(MUL_THRESHOLD, 3) + + +defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1 +defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1 +defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c +defframe(PARAM_DIVISOR,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC, 12) +defframe(PARAM_XSIZE, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +defframe(VAR_NORM, -20) +defframe(VAR_INVERSE, -24) +defframe(VAR_SRC, -28) +defframe(VAR_DST, -32) +defframe(VAR_DST_STOP,-36) + +deflit(STACK_SPACE, 36) + + TEXT + ALIGN(32) + +PROLOGUE(mpn_preinv_divrem_1) +deflit(`FRAME',0) + movl PARAM_XSIZE, %ecx + movl PARAM_DST, %edx + subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebx, SAVE_EBX + movl PARAM_SIZE, %ebx + + leal 8(%edx,%ecx,4), %edx C &dst[xsize+2] + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %edx, VAR_DST_STOP C &dst[xsize+2] + movl %edi, SAVE_EDI + xorl %edi, %edi C carry + + movl -4(%esi,%ebx,4), %eax C src high limb + xor %ecx, %ecx + + C + + C + + cmpl %ebp, %eax C high cmp divisor + + cmovc( %eax, %edi) C high is carry if high n2 + leal (%ebp,%esi), %edx + + movd %mm0, %esi + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + sbbl $0, %ebx C q + cmpl %eax, %ecx + + movl %ebx, (%ecx) + movl %ecx, VAR_DST + jne L(integer_top) + + +L(integer_loop_done): + + +C ----------------------------------------------------------------------------- +C +C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz +C q1_ff special case. This make the code a bit smaller and simpler, and +C costs only 1 cycle (each). + +L(integer_two_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm7 rshift + + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + movl PARAM_SRC, %ecx + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movd (%ecx), %mm0 C src low limb + + movl VAR_DST_STOP, %ecx + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2+1 + movl %ebp, %eax C d + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + + mull %ebx C (q1+1)*d + + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + C + + subl %eax, %esi + + C + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + movd %mm0, %esi + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + sbbl $0, %ebx C q + + movl %ebx, -4(%ecx) + + +C ----------------------------------------------------------------------------- +L(integer_one_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx dst + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm7 rshift + + movl VAR_DST_STOP, %ecx + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2+1 + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx C q1 if q1+1 overflowed + + mull %ebx + + C + + C + + C + + subl %eax, %esi + + C + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> 
n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + sbbl $0, %ebx C q + + movl %ebx, -8(%ecx) + subl $8, %ecx + + + +L(integer_none): + cmpl $0, PARAM_XSIZE + jne L(fraction_some) + + movl %edi, %eax +L(fraction_done): + movl VAR_NORM, %ecx +L(zero_done): + movl SAVE_EBP, %ebp + + movl SAVE_EDI, %edi + movl SAVE_ESI, %esi + + movl SAVE_EBX, %ebx + addl $STACK_SPACE, %esp + + shrl %cl, %eax + emms + + ret + + +C ----------------------------------------------------------------------------- +C +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d + +L(q1_ff): + C eax (divisor) + C ebx (q1+1 == 0) + C ecx + C edx + C esi n10 + C edi n2 + C ebp divisor + + movl VAR_DST, %ecx + movl VAR_DST_STOP, %edx + subl $4, %ecx + + psrlq %mm7, %mm0 + leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 + movl %ecx, VAR_DST + + movd %mm0, %esi C next n10 + + movl $-1, (%ecx) + cmpl %ecx, %edx + jne L(integer_top) + + jmp L(integer_loop_done) + + + +C ----------------------------------------------------------------------------- +C +C Being the fractional part, the "source" limbs are all zero, meaning +C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated. +C +C The loop runs at 15 cycles. The dependent chain is the same as the +C general case above, but without the n2+n1 stage (due to n1==0), so 15 +C would seem to be the lower bound. +C +C A not entirely obvious simplification is that q1+1 never overflows a limb, +C and so there's no need for the sbbl $0 or jz q1_ff from the general case. +C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always. +C rnd() means rounding down to a multiple of d. +C +C m*n2 + b*n2 <= m*(d-1) + b*(d-1) +C = m*d + b*d - m - b +C = floor((b(b-d)-1)/d)*d + b*d - m - b +C = rnd(b(b-d)-1) + b*d - m - b +C = rnd(b(b-d)-1 + b*d) - m - b +C = rnd(b*b-1) - m - b +C <= (b-2)*b +C +C Unchanged from the general case is that the final quotient limb q can be +C either q1 or q1+1, and the q1+1 case occurs often. This can be seen from +C equation 8.4 of the paper which simplifies as follows when n1==0 and +C n0==0. +C +C n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b +C +C As before, the instruction groupings and empty comments show a naive +C in-order view of the code, which is made a nonsense by out of order +C execution. There's 17 cycles shown, but it executes at 15. +C +C Rotating the store q and remainder->n2 instructions up to the top of the +C loop gets the run time down from 16 to 15. 
+ + ALIGN(16) +L(fraction_some): + C eax + C ebx + C ecx + C edx + C esi + C edi carry + C ebp divisor + + movl PARAM_DST, %esi + movl VAR_DST_STOP, %ecx C &dst[xsize+2] + movl %edi, %eax + + subl $8, %ecx C &dst[xsize] + jmp L(fraction_entry) + + + ALIGN(16) +L(fraction_top): + C eax n2 carry, then scratch + C ebx scratch (nadj, q1) + C ecx dst, decrementing + C edx scratch + C esi dst stop point + C edi (will be n2) + C ebp divisor + + movl %ebx, (%ecx) C previous q + movl %eax, %edi C remainder->n2 + +L(fraction_entry): + mull VAR_INVERSE C m*n2 + + movl %ebp, %eax C d + subl $4, %ecx C dst + leal 1(%edi), %ebx + + C + + C + + C + + C + + addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1 + + mull %ebx C (q1+1)*d + + C + + C + + C + + negl %eax C low of n - (q1+1)*d + + C + + sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry + leal (%ebp,%eax), %edx + + cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 + sbbl $0, %ebx C q + cmpl %esi, %ecx + + jne L(fraction_top) + + + movl %ebx, (%ecx) + jmp L(fraction_done) + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k7/mmx/lshift.asm b/gmp-6.3.0/mpn/x86/k7/mmx/lshift.asm new file mode 100644 index 0000000..b3383cf --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/mmx/lshift.asm @@ -0,0 +1,481 @@ +dnl AMD K7 mpn_lshift -- mpn left shift. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K7: 1.21 cycles/limb (at 16 limbs/loop). + + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 4 1.51 +dnl 8 1.26 +dnl 16 1.21 +dnl 32 1.2 +dnl Maximum possible with the current code is 64. + +deflit(UNROLL_COUNT, 16) + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size left by shift many bits and store the result in dst,size. +C Zeros are shifted in at the right. The bits shifted out at the left are +C the return value. +C +C The comments in mpn_rshift apply here too. 
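The reference semantics in illustrative C (GMP requires 1 <= shift < 32 here; processing high to low makes the copy safe when dst >= src):

#include <stddef.h>
#include <stdint.h>

/* Shift {src,n} left by cnt bits into {dst,n}, returning the bits
   shifted out at the top.  Illustrative only; the assembly below does
   the same job considerably faster via MMX qwords and unrolling. */
uint32_t
lshift_ref (uint32_t *dst, const uint32_t *src, size_t n, unsigned cnt)
{
  uint32_t ret = src[n - 1] >> (32 - cnt);  /* return value */
  for (size_t i = n - 1; i > 0; i--)
    dst[i] = (src[i] << cnt) | (src[i - 1] >> (32 - cnt));
  dst[0] = src[0] << cnt;                   /* zeros shifted in */
  return ret;
}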
+ +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 10) +',` +deflit(UNROLL_THRESHOLD, 10) +') + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EDI, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EBX, -12) +deflit(SAVE_SIZE, 12) + + TEXT + ALIGN(32) + +PROLOGUE(mpn_lshift) +deflit(`FRAME',0) + + movl PARAM_SIZE, %eax + movl PARAM_SRC, %edx + subl $SAVE_SIZE, %esp +deflit(`FRAME',SAVE_SIZE) + + movl PARAM_SHIFT, %ecx + movl %edi, SAVE_EDI + + movl PARAM_DST, %edi + decl %eax + jnz L(more_than_one_limb) + + movl (%edx), %edx + + shldl( %cl, %edx, %eax) C eax was decremented to zero + + shll %cl, %edx + + movl %edx, (%edi) + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + + ret + + +C ----------------------------------------------------------------------------- +L(more_than_one_limb): + C eax size-1 + C ebx + C ecx shift + C edx src + C esi + C edi dst + C ebp + + movd PARAM_SHIFT, %mm6 + movd (%edx,%eax,4), %mm5 C src high limb + cmp $UNROLL_THRESHOLD-1, %eax + + jae L(unroll) + negl %ecx + movd (%edx), %mm4 C src low limb + + addl $32, %ecx + + movd %ecx, %mm7 + +L(simple_top): + C eax loop counter, limbs + C ebx + C ecx + C edx src + C esi + C edi dst + C ebp + C + C mm0 scratch + C mm4 src low limb + C mm5 src high limb + C mm6 shift + C mm7 32-shift + + movq -4(%edx,%eax,4), %mm0 + decl %eax + + psrlq %mm7, %mm0 + + movd %mm0, 4(%edi,%eax,4) + jnz L(simple_top) + + + psllq %mm6, %mm5 + psllq %mm6, %mm4 + + psrlq $32, %mm5 + movd %mm4, (%edi) C dst low limb + + movd %mm5, %eax C return value + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax size-1 + C ebx (saved) + C ecx shift + C edx src + C esi + C edi dst + C ebp + C + C mm5 src high limb, for return value + C mm6 lshift + + movl %esi, SAVE_ESI + movl %ebx, SAVE_EBX + leal -4(%edx,%eax,4), %edx C &src[size-2] + + testb $4, %dl + movq (%edx), %mm1 C src high qword + + jz L(start_src_aligned) + + + C src isn't aligned, process high limb (marked xxx) separately to + C make it so + C + C source -4(edx,%eax,4) + C | + C +-------+-------+-------+-- + C | xxx | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + C + C dest -4(edi,%eax,4) + C | + C +-------+-------+-- + C | xxx | | + C +-------+-------+-- + + psllq %mm6, %mm1 + subl $4, %edx + movl %eax, PARAM_SIZE C size-1 + + psrlq $32, %mm1 + decl %eax C size-2 is new size-1 + + movd %mm1, 4(%edi,%eax,4) + movq (%edx), %mm1 C new src high qword +L(start_src_aligned): + + + leal -4(%edi,%eax,4), %edi C &dst[size-2] + psllq %mm6, %mm5 + + testl $4, %edi + psrlq $32, %mm5 C return value + + jz L(start_dst_aligned) + + + C dst isn't aligned, subtract 4 bytes to make it so, and pretend the + C shift is 32 bits extra. High limb of dst (marked xxx) handled + C here separately. 
+ C + C source %edx + C +-------+-------+-- + C | mm1 | + C +-------+-------+-- + C 0mod8 4mod8 + C + C dest %edi + C +-------+-------+-------+-- + C | xxx | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + + movq %mm1, %mm0 + psllq %mm6, %mm1 + addl $32, %ecx C shift+32 + + psrlq $32, %mm1 + + movd %mm1, 4(%edi) + movq %mm0, %mm1 + subl $4, %edi + + movd %ecx, %mm6 C new lshift +L(start_dst_aligned): + + decl %eax C size-2, two last limbs handled at end + movq %mm1, %mm2 C copy of src high qword + negl %ecx + + andl $-2, %eax C round size down to even + addl $64, %ecx + + movl %eax, %ebx + negl %eax + + andl $UNROLL_MASK, %eax + decl %ebx + + shll %eax + + movd %ecx, %mm7 C rshift = 64-lshift + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%eax,%eax,4), %esi +') + shrl $UNROLL_LOG2, %ebx C loop counter + + leal ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx + leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi + movl PARAM_SIZE, %eax C for use at end + jmp *%esi + + +ifdef(`PIC',` +L(pic_calc): + C See mpn/x86/README about old gas bugs + leal (%eax,%eax,4), %esi + addl $L(entry)-L(here), %esi + addl (%esp), %esi + + ret_internal +') + + +C ----------------------------------------------------------------------------- + ALIGN(32) +L(top): + C eax size (for use at end) + C ebx loop counter + C ecx rshift + C edx src + C esi computed jump + C edi dst + C ebp + C + C mm0 scratch + C mm1 \ carry (alternating, mm2 first) + C mm2 / + C mm6 lshift + C mm7 rshift + C + C 10 code bytes/limb + C + C The two chunks differ in whether mm1 or mm2 hold the carry. + C The computed jump puts the initial carry in both mm1 and mm2. + +L(entry): +deflit(CHUNK_COUNT, 4) +forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 - 8)) + +Zdisp( movq, disp0,(%edx), %mm0) + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm2, %mm0 +Zdisp( movq, %mm0, disp0,(%edi)) + + +Zdisp( movq, disp1,(%edx), %mm0) + psllq %mm6, %mm1 + + movq %mm0, %mm2 + psrlq %mm7, %mm0 + + por %mm1, %mm0 +Zdisp( movq, %mm0, disp1,(%edi)) +') + + subl $UNROLL_BYTES, %edx + subl $UNROLL_BYTES, %edi + decl %ebx + + jns L(top) + + + +define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))') + +L(end): + testb $1, %al + movl SAVE_EBX, %ebx + psllq %mm6, %mm2 C wanted left shifted in all cases below + + movd %mm5, %eax + + movl SAVE_ESI, %esi + jz L(end_even) + + +L(end_odd): + + C Size odd, destination was aligned. + C + C source edx+8 edx+4 + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edi + C --+---------------+---------------+-------+ + C | written | | | + C --+---------------+---------------+-------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C Size odd, destination was unaligned. + C + C source edx+8 edx+4 + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edi + C --+---------------+---------------+ + C | written | | + C --+---------------+---------------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword at (%edi), and in the aligned case + C there's an extra limb of dst to be formed from that extra src limb + C left shifted. 
+ + movd disp(4) (%edx), %mm0 + testb $32, %cl + + movq %mm0, %mm1 + psllq $32, %mm0 + + psrlq %mm7, %mm0 + psllq %mm6, %mm1 + + por %mm2, %mm0 + + movq %mm0, disp(0) (%edi) + jz L(end_odd_unaligned) + movd %mm1, disp(-4) (%edi) +L(end_odd_unaligned): + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + + +L(end_even): + + C Size even, destination was aligned. + C + C source edx+8 + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edi + C --+---------------+---------------+ + C | written | | + C --+---------------+---------------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C Size even, destination was unaligned. + C + C source edx+8 + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edi+4 + C --+---------------+-------+ + C | written | | + C --+---------------+-------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C The movq for the aligned case overwrites the movd for the + C unaligned case. + + movq %mm2, %mm0 + psrlq $32, %mm2 + + testb $32, %cl + movd %mm2, disp(4) (%edi) + + jz L(end_even_unaligned) + movq %mm0, disp(0) (%edi) +L(end_even_unaligned): + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k7/mmx/popham.asm b/gmp-6.3.0/mpn/x86/k7/mmx/popham.asm new file mode 100644 index 0000000..95965b7 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/mmx/popham.asm @@ -0,0 +1,213 @@ +dnl AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming +dnl distance. + +dnl Copyright 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C popcount hamdist +C P3 generic 6.5 7 +C P3 model 9 (Banias) 5.7 6.1 +C P3 model 13 (Dothan) 5.75 6 +C K7 5 6 + +C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); +C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size); +C +C The code here is almost certainly not optimal, but is already a 3x speedup +C over the generic C code. The main improvement would be to interleave +C processing of two qwords in the loop so as to fully exploit the available +C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs). +C +C The loop is based on the example "Efficient 64-bit population count using +C MMX instructions" in the Athlon Optimization Guide, AMD document 22007, +C page 158 of rev E (reference in mpn/x86/k7/README). 
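
The mask-and-add reduction from that AMD example is easier to follow in
scalar C. Here is a hypothetical 64-bit analogue of the loop below (not GMP
code; the asm sums the final byte counts with psadbw, while this sketch uses
a multiply for the same purpose):

    #include <stdint.h>

    /* Bit-count one 64-bit word by the same staged reduction as the MMX
       loop: 1-bit -> 2-bit -> 4-bit -> 8-bit counts, then sum the bytes.
       For hamdist, first XOR the two operands, as the HAM code path does. */
    unsigned swar_popcount64 (uint64_t x)
    {
      x -= (x >> 1) & 0x5555555555555555ULL;                 /* 2-bit counts */
      x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
      x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;            /* byte counts */
      return (unsigned) ((x * 0x0101010101010101ULL) >> 56); /* sum bytes */
    }
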
+ +ifdef(`OPERATION_popcount',, +`ifdef(`OPERATION_hamdist',, +`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined +')')') + +define(HAM, +m4_assert_numargs(1) +`ifdef(`OPERATION_hamdist',`$1')') + +define(POP, +m4_assert_numargs(1) +`ifdef(`OPERATION_popcount',`$1')') + +HAM(` +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC2, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_hamdist) +') +POP(` +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_popcount) +') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) + + +ifdef(`PIC',,` + dnl non-PIC + + RODATA + ALIGN(8) + +L(rodata_AAAAAAAAAAAAAAAA): + .long 0xAAAAAAAA + .long 0xAAAAAAAA + +L(rodata_3333333333333333): + .long 0x33333333 + .long 0x33333333 + +L(rodata_0F0F0F0F0F0F0F0F): + .long 0x0F0F0F0F + .long 0x0F0F0F0F +') + + TEXT + ALIGN(32) + +PROLOGUE(M4_function) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + +ifdef(`PIC',` + movl $0xAAAAAAAA, %eax + movl $0x33333333, %edx + + movd %eax, %mm7 + movd %edx, %mm6 + + movl $0x0F0F0F0F, %eax + + punpckldq %mm7, %mm7 + punpckldq %mm6, %mm6 + + movd %eax, %mm5 + movd %edx, %mm4 + + punpckldq %mm5, %mm5 + +',` + movq L(rodata_AAAAAAAAAAAAAAAA), %mm7 + movq L(rodata_3333333333333333), %mm6 + movq L(rodata_0F0F0F0F0F0F0F0F), %mm5 +') + pxor %mm4, %mm4 + +define(REG_AAAAAAAAAAAAAAAA,%mm7) +define(REG_3333333333333333,%mm6) +define(REG_0F0F0F0F0F0F0F0F,%mm5) +define(REG_0000000000000000,%mm4) + + + movl PARAM_SRC, %eax +HAM(` movl PARAM_SRC2, %edx') + + pxor %mm2, %mm2 C total + + shrl %ecx + jnc L(top) + + movd (%eax,%ecx,8), %mm1 + +HAM(` movd (%edx,%ecx,8), %mm0 + pxor %mm0, %mm1 +') + orl %ecx, %ecx + jmp L(loaded) + + + ALIGN(16) +L(top): + C eax src + C ebx + C ecx counter, qwords, decrementing + C edx [hamdist] src2 + C + C mm0 (scratch) + C mm1 (scratch) + C mm2 total (low dword) + C mm3 + C mm4 \ + C mm5 | special constants + C mm6 | + C mm7 / + + movq -8(%eax,%ecx,8), %mm1 + +HAM(` pxor -8(%edx,%ecx,8), %mm1') + decl %ecx + +L(loaded): + movq %mm1, %mm0 + pand REG_AAAAAAAAAAAAAAAA, %mm1 + + psrlq $1, %mm1 + + psubd %mm1, %mm0 C bit pairs + + + movq %mm0, %mm1 + psrlq $2, %mm0 + + pand REG_3333333333333333, %mm0 + pand REG_3333333333333333, %mm1 + + paddd %mm1, %mm0 C nibbles + + + movq %mm0, %mm1 + psrlq $4, %mm0 + + pand REG_0F0F0F0F0F0F0F0F, %mm0 + pand REG_0F0F0F0F0F0F0F0F, %mm1 + + paddd %mm1, %mm0 C bytes + + + psadbw( %mm4, %mm0) + + paddd %mm0, %mm2 C add to total + jnz L(top) + + + movd %mm2, %eax + emms + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k7/mmx/rshift.asm b/gmp-6.3.0/mpn/x86/k7/mmx/rshift.asm new file mode 100644 index 0000000..345d23a --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/mmx/rshift.asm @@ -0,0 +1,480 @@ +dnl AMD K7 mpn_rshift -- mpn right shift. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C K7: 1.21 cycles/limb (at 16 limbs/loop). + + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 4 1.51 +dnl 8 1.26 +dnl 16 1.21 +dnl 32 1.2 +dnl Maximum possible with the current code is 64. + +deflit(UNROLL_COUNT, 16) + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size right by shift many bits and store the result in dst,size. +C Zeros are shifted in at the left. The bits shifted out at the right are +C the return value. +C +C This code uses 64-bit MMX operations, which makes it possible to handle +C two limbs at a time, for a theoretical 1.0 cycles/limb. Plain integer +C code, on the other hand, suffers from shrd being a vector path decode and +C running at 3 cycles back-to-back. +C +C Full speed depends on source and destination being aligned, and some hairy +C setups and finish-ups are done to arrange this for the loop. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 10) +',` +deflit(UNROLL_THRESHOLD, 10) +') + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EDI, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EBX, -12) +deflit(SAVE_SIZE, 12) + + TEXT + ALIGN(32) + +PROLOGUE(mpn_rshift) +deflit(`FRAME',0) + + movl PARAM_SIZE, %eax + movl PARAM_SRC, %edx + subl $SAVE_SIZE, %esp +deflit(`FRAME',SAVE_SIZE) + + movl PARAM_SHIFT, %ecx + movl %edi, SAVE_EDI + + movl PARAM_DST, %edi + decl %eax + jnz L(more_than_one_limb) + + movl (%edx), %edx C src limb + + shrdl( %cl, %edx, %eax) C eax was decremented to zero + + shrl %cl, %edx + + movl %edx, (%edi) C dst limb + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + + ret + + +C ----------------------------------------------------------------------------- +L(more_than_one_limb): + C eax size-1 + C ebx + C ecx shift + C edx src + C esi + C edi dst + C ebp + + movd PARAM_SHIFT, %mm6 C rshift + movd (%edx), %mm5 C src low limb + cmp $UNROLL_THRESHOLD-1, %eax + + jae L(unroll) + leal (%edx,%eax,4), %edx C &src[size-1] + leal -4(%edi,%eax,4), %edi C &dst[size-2] + + movd (%edx), %mm4 C src high limb + negl %eax + + +L(simple_top): + C eax loop counter, limbs, negative + C ebx + C ecx shift + C edx carry + C edx &src[size-1] + C edi &dst[size-2] + C ebp + C + C mm0 scratch + C mm4 src high limb + C mm5 src low limb + C mm6 shift + + movq (%edx,%eax,4), %mm0 + incl %eax + + psrlq %mm6, %mm0 + + movd %mm0, (%edi,%eax,4) + jnz L(simple_top) + + + psllq $32, %mm5 + psrlq %mm6, %mm4 + + psrlq %mm6, %mm5 + movd %mm4, 4(%edi) C dst high limb + + movd %mm5, %eax C return value + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax size-1 + C ebx + C ecx shift + C edx src + C esi + C edi dst + C ebp + C + C mm5 src low limb + C mm6 rshift + + testb $4, %dl + movl %esi, SAVE_ESI + movl %ebx, SAVE_EBX + + psllq $32, %mm5 + jz L(start_src_aligned) + + + C src isn't aligned, process low limb separately (marked xxx) and + C step src and dst by one 
limb, making src aligned. + C + C source edx + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + C + C dest edi + C --+-------+-------+ + C | | xxx | + C --+-------+-------+ + + movq (%edx), %mm0 C src low two limbs + addl $4, %edx + movl %eax, PARAM_SIZE C size-1 + + addl $4, %edi + decl %eax C size-2 is new size-1 + + psrlq %mm6, %mm0 + movl %edi, PARAM_DST C new dst + + movd %mm0, -4(%edi) +L(start_src_aligned): + + + movq (%edx), %mm1 C src low two limbs + decl %eax C size-2, two last limbs handled at end + testl $4, %edi + + psrlq %mm6, %mm5 + jz L(start_dst_aligned) + + + C dst isn't aligned, add 4 to make it so, and pretend the shift is + C 32 bits extra. Low limb of dst (marked xxx) handled here separately. + C + C source edx + C --+-------+-------+ + C | mm1 | + C --+-------+-------+ + C 4mod8 0mod8 + C + C dest edi + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + + movq %mm1, %mm0 + psrlq %mm6, %mm1 + addl $32, %ecx C shift+32 + + movd %mm1, (%edi) + movq %mm0, %mm1 + addl $4, %edi C new dst + + movd %ecx, %mm6 +L(start_dst_aligned): + + + movq %mm1, %mm2 C copy of src low two limbs + negl %ecx + andl $-2, %eax C round size down to even + + movl %eax, %ebx + negl %eax + addl $64, %ecx + + andl $UNROLL_MASK, %eax + decl %ebx + + shll %eax + + movd %ecx, %mm7 C lshift = 64-rshift + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%eax,%eax,4), %esi + negl %eax +') + shrl $UNROLL_LOG2, %ebx C loop counter + + leal ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx + leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi + movl PARAM_SIZE, %eax C for use at end + + jmp *%esi + + +ifdef(`PIC',` +L(pic_calc): + C See mpn/x86/README about old gas bugs + leal (%eax,%eax,4), %esi + addl $L(entry)-L(here), %esi + addl (%esp), %esi + negl %eax + + ret_internal +') + + +C ----------------------------------------------------------------------------- + ALIGN(64) +L(top): + C eax size, for use at end + C ebx loop counter + C ecx lshift + C edx src + C esi was computed jump + C edi dst + C ebp + C + C mm0 scratch + C mm1 \ carry (alternating) + C mm2 / + C mm6 rshift + C mm7 lshift + C + C 10 code bytes/limb + C + C The two chunks differ in whether mm1 or mm2 hold the carry. + C The computed jump puts the initial carry in both mm1 and mm2. + +L(entry): +deflit(CHUNK_COUNT, 4) +forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 8)) + +Zdisp( movq, disp0,(%edx), %mm0) + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + por %mm2, %mm0 +Zdisp( movq, %mm0, disp0,(%edi)) + + +Zdisp( movq, disp1,(%edx), %mm0) + psrlq %mm6, %mm1 + + movq %mm0, %mm2 + psllq %mm7, %mm0 + + por %mm1, %mm0 +Zdisp( movq, %mm0, disp1,(%edi)) +') + + addl $UNROLL_BYTES, %edx + addl $UNROLL_BYTES, %edi + decl %ebx + + jns L(top) + + +deflit(`disp0', ifelse(UNROLL_BYTES,256,-128)) +deflit(`disp1', eval(disp0-0 + 8)) + + testb $1, %al + psrlq %mm6, %mm2 C wanted rshifted in all cases below + movl SAVE_ESI, %esi + + movd %mm5, %eax C return value + + movl SAVE_EBX, %ebx + jz L(end_even) + + + C Size odd, destination was aligned. 
+ C + C source + C edx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edi + C +-------+---------------+---------------+-- + C | | | written | + C +-------+---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C Size odd, destination was unaligned. + C + C source + C edx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edi + C +---------------+---------------+-- + C | | written | + C +---------------+---------------+-- + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword to store, and in the aligned case there's + C a further extra limb of dst to be formed. + + + movd disp0(%edx), %mm0 + movq %mm0, %mm1 + + psllq %mm7, %mm0 + testb $32, %cl + + por %mm2, %mm0 + psrlq %mm6, %mm1 + + movq %mm0, disp0(%edi) + jz L(finish_odd_unaligned) + + movd %mm1, disp1(%edi) +L(finish_odd_unaligned): + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + + +L(end_even): + + C Size even, destination was aligned. + C + C source + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edi + C +---------------+---------------+-- + C | | mm3 | + C +---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C Size even, destination was unaligned. + C + C source + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edi + C +-------+---------------+-- + C | | mm3 | + C +-------+---------------+-- + C + C mm6 = shift+32 + C mm7 = 64-(shift+32) + + + C The movd for the unaligned case is the same data as the movq for + C the aligned case, it's just a choice between whether one or two + C limbs should be written. + + + testb $32, %cl + movd %mm2, disp0(%edi) + + jz L(end_even_unaligned) + + movq %mm2, disp0(%edi) +L(end_even_unaligned): + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k7/mod_1_1.asm b/gmp-6.3.0/mpn/x86/k7/mod_1_1.asm new file mode 100644 index 0000000..1bbe6f9 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/mod_1_1.asm @@ -0,0 +1,221 @@ +dnl x86-32 mpn_mod_1_1p, requiring cmov. + +dnl Contributed to the GNU project by Niels Möller and Torbjorn Granlund. + +dnl Copyright 2010, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C P5 ? +C P6 model 0-8,10-12 ? +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) ? 
+C P4 model 0 (Willamette) ? +C P4 model 1 (?) ? +C P4 model 2 (Northwood) ? +C P4 model 3 (Prescott) ? +C P4 model 4 (Nocona) ? +C AMD K6 ? +C AMD K7 7 +C AMD K8 ? + +define(`B2mb', `%ebx') +define(`r0', `%esi') +define(`r2', `%ebp') +define(`t0', `%edi') +define(`ap', `%ecx') C Also shift count + +C Stack frame +C pre 36(%esp) +C b 32(%esp) +C n 28(%esp) +C ap 24(%esp) +C return 20(%esp) +C %ebp 16(%esp) +C %edi 12(%esp) +C %esi 8(%esp) +C %ebx 4(%esp) +C B2mod (%esp) + +define(`B2modb', `(%esp)') +define(`n', `28(%esp)') +define(`b', `32(%esp)') +define(`pre', `36(%esp)') + +C mp_limb_t +C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4]) +C +C The pre array contains bi, cnt, B1modb, B2modb +C Note: This implementation needs B1modb only when cnt > 0 + +ASM_START() + TEXT + ALIGN(8) +PROLOGUE(mpn_mod_1_1p) + push %ebp + push %edi + push %esi + push %ebx + mov 32(%esp), %ebp C pre[] + + mov 12(%ebp), %eax C B2modb + push %eax C Put it on stack + + mov n, %edx + mov 24(%esp), ap + + lea (ap, %edx, 4), ap + mov -4(ap), %eax + cmp $3, %edx + jnc L(first) + mov -8(ap), r0 + jmp L(reduce_two) + +L(first): + C First iteration, no r2 + mull B2modb + mov -12(ap), r0 + add %eax, r0 + mov -8(ap), %eax + adc %edx, %eax + sbb r2, r2 + subl $3, n + lea -16(ap), ap + jz L(reduce_three) + + mov B2modb, B2mb + sub b, B2mb + lea (B2mb, r0), t0 + jmp L(mid) + + ALIGN(16) +L(top): C Loopmixed to 7 c/l on k7 + add %eax, r0 + lea (B2mb, r0), t0 + mov r2, %eax + adc %edx, %eax + sbb r2, r2 +L(mid): mull B2modb + and B2modb, r2 + add r0, r2 + decl n + mov (ap), r0 + cmovc( t0, r2) + lea -4(ap), ap + jnz L(top) + + add %eax, r0 + mov r2, %eax + adc %edx, %eax + sbb r2, r2 + +L(reduce_three): + C Eliminate r2 + and b, r2 + sub r2, %eax + +L(reduce_two): + mov pre, %ebp + movb 4(%ebp), %cl + test %cl, %cl + jz L(normalized) + + C Unnormalized, use B1modb to reduce to size < B b + mull 8(%ebp) + xor t0, t0 + add %eax, r0 + adc %edx, t0 + mov t0, %eax + + C Left-shift to normalize + shld %cl, r0, %eax C Always use shld? + + shl %cl, r0 + jmp L(udiv) + +L(normalized): + mov %eax, t0 + sub b, t0 + cmovnc( t0, %eax) + +L(udiv): + lea 1(%eax), t0 + mull (%ebp) + mov b, %ebx C Needed in register for lea + add r0, %eax + adc t0, %edx + imul %ebx, %edx + sub %edx, r0 + cmp r0, %eax + lea (%ebx, r0), %eax + cmovnc( r0, %eax) + cmp %ebx, %eax + jnc L(fix) +L(ok): shr %cl, %eax + + add $4, %esp + pop %ebx + pop %esi + pop %edi + pop %ebp + + ret +L(fix): sub %ebx, %eax + jmp L(ok) +EPILOGUE() + +PROLOGUE(mpn_mod_1_1p_cps) + push %ebp + mov 12(%esp), %ebp + push %esi + bsr %ebp, %ecx + push %ebx + xor $31, %ecx + mov 16(%esp), %esi + sal %cl, %ebp + mov %ebp, %edx + not %edx + mov $-1, %eax + div %ebp C On K7, invert_limb would be a few cycles faster. + mov %eax, (%esi) C store bi + mov %ecx, 4(%esi) C store cnt + neg %ebp + mov $1, %edx + shld %cl, %eax, %edx + imul %ebp, %edx + shr %cl, %edx + imul %ebp, %eax + mov %edx, 8(%esi) C store B1modb + mov %eax, 12(%esi) C store B2modb + pop %ebx + pop %esi + pop %ebp + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k7/mod_1_4.asm b/gmp-6.3.0/mpn/x86/k7/mod_1_4.asm new file mode 100644 index 0000000..bb7597e --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/mod_1_4.asm @@ -0,0 +1,260 @@ +dnl x86-32 mpn_mod_1s_4p, requiring cmov. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2009, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C P5 ? +C P6 model 0-8,10-12 ? +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) 6 +C P4 model 0 (Willamette) ? +C P4 model 1 (?) ? +C P4 model 2 (Northwood) 15.5 +C P4 model 3 (Prescott) ? +C P4 model 4 (Nocona) ? +C AMD K6 ? +C AMD K7 4.75 +C AMD K8 ? + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mod_1s_4p) + push %ebp + push %edi + push %esi + push %ebx + sub $28, %esp + mov 60(%esp), %edi C cps[] + mov 8(%edi), %eax + mov 12(%edi), %edx + mov 16(%edi), %ecx + mov 20(%edi), %esi + mov 24(%edi), %edi + mov %eax, 4(%esp) + mov %edx, 8(%esp) + mov %ecx, 12(%esp) + mov %esi, 16(%esp) + mov %edi, 20(%esp) + mov 52(%esp), %eax C n + xor %edi, %edi + mov 48(%esp), %esi C up + lea -12(%esi,%eax,4), %esi + and $3, %eax + je L(b0) + cmp $2, %eax + jc L(b1) + je L(b2) + +L(b3): mov 4(%esi), %eax + mull 4(%esp) + mov (%esi), %ebp + add %eax, %ebp + adc %edx, %edi + mov 8(%esi), %eax + mull 8(%esp) + lea -12(%esi), %esi + jmp L(m0) + +L(b0): mov (%esi), %eax + mull 4(%esp) + mov -4(%esi), %ebp + add %eax, %ebp + adc %edx, %edi + mov 4(%esi), %eax + mull 8(%esp) + add %eax, %ebp + adc %edx, %edi + mov 8(%esi), %eax + mull 12(%esp) + lea -16(%esi), %esi + jmp L(m0) + +L(b1): mov 8(%esi), %ebp + lea -4(%esi), %esi + jmp L(m1) + +L(b2): mov 8(%esi), %edi + mov 4(%esi), %ebp + lea -8(%esi), %esi + jmp L(m1) + + ALIGN(16) +L(top): mov (%esi), %eax + mull 4(%esp) + mov -4(%esi), %ebx + xor %ecx, %ecx + add %eax, %ebx + adc %edx, %ecx + mov 4(%esi), %eax + mull 8(%esp) + add %eax, %ebx + adc %edx, %ecx + mov 8(%esi), %eax + mull 12(%esp) + add %eax, %ebx + adc %edx, %ecx + lea -16(%esi), %esi + mov 16(%esp), %eax + mul %ebp + add %eax, %ebx + adc %edx, %ecx + mov 20(%esp), %eax + mul %edi + mov %ebx, %ebp + mov %ecx, %edi +L(m0): add %eax, %ebp + adc %edx, %edi +L(m1): subl $4, 52(%esp) + ja L(top) + +L(end): mov 4(%esp), %eax + mul %edi + mov 60(%esp), %edi + add %eax, %ebp + adc $0, %edx + mov 4(%edi), %ecx + mov %edx, %esi + mov %ebp, %eax + sal %cl, %esi + mov %ecx, %ebx + neg %ecx + shr %cl, %eax + or %esi, %eax + lea 1(%eax), %esi + mull (%edi) + mov %ebx, %ecx + mov %eax, %ebx + mov %ebp, %eax + mov 56(%esp), %ebp + sal %cl, %eax + add %eax, %ebx + adc %esi, %edx + imul %ebp, %edx + sub %edx, %eax + lea (%eax,%ebp), %edx + cmp %eax, %ebx + cmovc( %edx, %eax) + mov %eax, %edx + sub %ebp, %eax + cmovc( %edx, %eax) + add $28, %esp + pop %ebx + pop %esi + pop %edi + pop %ebp + shr %cl, %eax + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_mod_1s_4p_cps) +C CAUTION: This is the same code as in 
pentium4/sse2/mod_1_4.asm + push %ebp + push %edi + push %esi + push %ebx + mov 20(%esp), %ebp C FIXME: avoid bp for 0-idx + mov 24(%esp), %ebx + bsr %ebx, %ecx + xor $31, %ecx + sal %cl, %ebx C b << cnt + mov %ebx, %edx + not %edx + mov $-1, %eax + div %ebx + xor %edi, %edi + sub %ebx, %edi + mov $1, %esi + mov %eax, (%ebp) C store bi + mov %ecx, 4(%ebp) C store cnt + shld %cl, %eax, %esi + imul %edi, %esi + mov %eax, %edi + mul %esi + + add %esi, %edx + shr %cl, %esi + mov %esi, 8(%ebp) C store B1modb + + not %edx + imul %ebx, %edx + lea (%edx,%ebx), %esi + cmp %edx, %eax + cmovnc( %edx, %esi) + mov %edi, %eax + mul %esi + + add %esi, %edx + shr %cl, %esi + mov %esi, 12(%ebp) C store B2modb + + not %edx + imul %ebx, %edx + lea (%edx,%ebx), %esi + cmp %edx, %eax + cmovnc( %edx, %esi) + mov %edi, %eax + mul %esi + + add %esi, %edx + shr %cl, %esi + mov %esi, 16(%ebp) C store B3modb + + not %edx + imul %ebx, %edx + lea (%edx,%ebx), %esi + cmp %edx, %eax + cmovnc( %edx, %esi) + mov %edi, %eax + mul %esi + + add %esi, %edx + shr %cl, %esi + mov %esi, 20(%ebp) C store B4modb + + not %edx + imul %ebx, %edx + add %edx, %ebx + cmp %edx, %eax + cmovnc( %edx, %ebx) + + shr %cl, %ebx + mov %ebx, 24(%ebp) C store B5modb + + pop %ebx + pop %esi + pop %edi + pop %ebp + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k7/mod_34lsub1.asm b/gmp-6.3.0/mpn/x86/k7/mod_34lsub1.asm new file mode 100644 index 0000000..ee3ad04 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/mod_34lsub1.asm @@ -0,0 +1,188 @@ +dnl AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1. + +dnl Copyright 2000-2002, 2004, 2005, 2008 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C Athlon: 1 +C Hammer: 1 + + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size) +C +C The loop form below and the 64 byte code alignment seem necessary for the +C claimed speed. This is a bit strange, since normally k7 isn't very +C sensitive to such things. Perhaps there has to be 6 instructions in the +C first 16 bytes for the BTB entry or something. 
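
The arithmetic being exploited: with 32-bit limbs, 2^32 == 2^8, 2^64 == 2^16
and 2^96 == 1 (mod 2^24-1), so limbs can be accumulated at three rotating
byte offsets, which is exactly what the three accumulator registers below do.
A hypothetical C reference for the same contract (it returns a value merely
congruent to the operand, as the real routine does; the name and the small
size bound are assumptions):

    #include <stddef.h>
    #include <stdint.h>

    /* Value congruent to {src,size} mod 2^24-1, assuming 32-bit limbs and
       a size small enough that the 64-bit accumulator cannot overflow. */
    uint32_t ref_mod_34lsub1 (const uint32_t *src, size_t size)
    {
      uint64_t acc = 0;
      for (size_t i = 0; i < size; i++)
        acc += (uint64_t) src[i] << (8 * (i % 3)); /* 2^(32i) == 2^(8(i%3)) */
      while (acc >> 24)
        acc = (acc & 0xFFFFFF) + (acc >> 24);      /* fold 24-bit groups */
      return (uint32_t) acc;
    }
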
+ +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + +dnl re-use parameter space +define(SAVE_EDI, `PARAM_SIZE') + + TEXT + ALIGN(64) +PROLOGUE(mpn_mod_34lsub1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %edx + + subl $2, %ecx + ja L(three_or_more) + + movl (%edx), %eax + jb L(one) + + movl 4(%edx), %ecx + movl %eax, %edx + shrl $24, %eax C src[0] low + + andl $0xFFFFFF, %edx C src[0] high + addl %edx, %eax + movl %ecx, %edx + + andl $0xFFFF, %ecx + shrl $16, %edx C src[1] high + addl %edx, %eax + + shll $8, %ecx C src[1] low + addl %ecx, %eax + +L(one): + ret + + +L(three_or_more): + C eax + C ebx + C ecx size-2 + C edx src + C esi + C edi + + pushl %ebx FRAME_pushl() + xorl %eax, %eax + xorl %ebx, %ebx + + movl %edi, SAVE_EDI + pushl %esi FRAME_pushl() + xorl %esi, %esi C and clear carry flag + + + C code offset 0x40 at this point +L(top): + C eax acc 0mod3 + C ebx acc 1mod3 + C ecx counter, limbs + C edx src + C esi acc 2mod3 + C edi + + leal 24(%edx), %edx + leal -2(%ecx), %ecx + adcl -24(%edx), %eax + adcl -20(%edx), %ebx + adcl -16(%edx), %esi + + decl %ecx + jng L(done_loop) + + leal -2(%ecx), %ecx + adcl -12(%edx), %eax + adcl -8(%edx), %ebx + adcl -4(%edx), %esi + + decl %ecx + jg L(top) + + + leal 12(%edx), %edx + + +L(done_loop): + C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively + + incl %ecx + movl $0xFFFFFFFF, %edi + js L(combine) + + adcl -12(%edx), %eax + decl %ecx + movl $0xFFFFFF00, %edi + js L(combine) + + adcl -8(%edx), %ebx + movl $0xFFFF0000, %edi + + +L(combine): + C eax acc 0mod3 + C ebx acc 1mod3 + C ecx + C edx + C esi acc 2mod3 + C edi mask + + sbbl %ecx, %ecx C carry + movl %eax, %edx C 0mod3 + shrl $24, %eax C 0mod3 high + + andl %edi, %ecx C carry masked + andl $0x00FFFFFF, %edx C 0mod3 low + movl %ebx, %edi C 1mod3 + + subl %ecx, %eax C apply carry + shrl $16, %ebx C 1mod3 high + andl $0xFFFF, %edi + + addl %edx, %eax C apply 0mod3 low + movl %esi, %edx C 2mod3 + shll $8, %edi C 1mod3 low + + addl %ebx, %eax C apply 1mod3 high + shrl $8, %esi C 2mod3 high + movzbl %dl, %edx C 2mod3 low + + addl %edi, %eax C apply 1mod3 low + shll $16, %edx C 2mod3 low + + addl %esi, %eax C apply 2mod3 high + popl %esi FRAME_popl() + + movl SAVE_EDI, %edi + addl %edx, %eax C apply 2mod3 low + popl %ebx FRAME_popl() + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k7/mode1o.asm b/gmp-6.3.0/mpn/x86/k7/mode1o.asm new file mode 100644 index 0000000..2394033 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/mode1o.asm @@ -0,0 +1,181 @@ +dnl AMD K7 mpn_modexact_1_odd -- exact division style remainder. + +dnl Copyright 2000-2002, 2004, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C                           cycles/limb
+C Athlon:     11.0
+C Hammer:      7.0
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C                               mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t carry);
+C
+C With the loop running at just 11 cycles it doesn't seem worth bothering to
+C check for high<divisor.
+
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>31),1,
+eval((UNROLL_COUNT-31)*4),
+0))
+
+dnl Because the last chunk of code is generated differently, a label placed
+dnl at the end doesn't work.  Instead calculate the implied end using the
+dnl start and how many chunks of code there are.
+
+deflit(UNROLL_INNER_END,
+`L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')
+
+	C eax
+	C ebx	carry
+	C ecx
+	C edx
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp
+
+	movl	PARAM_SIZE, %ecx
+	movl	%ebx, (%edi)
+
+	subl	$4, %ecx
+	jz	L(corner)
+
+	negl	%ecx
+ifelse(OFFSET,0,,`subl	$OFFSET, %edi')
+ifelse(OFFSET,0,,`subl	$OFFSET, %esi')
+
+	movl	%ecx, %edx
+	shll	$4, %ecx
+
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	leal	UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+
+
+	C The calculated jump mustn't come out to before the start of the
+	C code available.  This is the limit UNROLL_COUNT puts on the src
+	C operand size, but checked here directly using the jump address.
+	ASSERT(ae,
+	`movl_text_address(L(unroll_inner_start), %eax)
+	cmpl	%eax, %ecx')
+
+
+C------------------------------------------------------------------------------
+	ALIGN(16)
+L(unroll_outer_top):
+	C eax
+	C ebx	high limb to store
+	C ecx	VAR_JMP
+	C edx	VAR_COUNTER, limbs, negative
+	C esi	&src[size], constant
+	C edi	dst ptr, high of last addmul
+	C ebp
+
+	movl	-12+OFFSET(%esi,%edx,4), %ebp	C next multiplier
+	movl	-8+OFFSET(%esi,%edx,4), %eax	C first of multiplicand
+
+	movl	%edx, VAR_COUNTER
+
+	mull	%ebp
+
+define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')')
+
+	testb	$1, %cl
+	movl	%edx, %ebx	C high carry
+	movl	%ecx, %edx	C jump
+
+	movl	%eax, %ecx	C low carry
+	cmovX(	%ebx, %ecx)	C high carry reverse
+	cmovX(	%eax, %ebx)	C low carry reverse
+
+	leal	CODE_BYTES_PER_LIMB(%edx), %eax
+	xorl	%edx, %edx
+	leal	4(%edi), %edi
+
+	movl	%eax, VAR_JMP
+
+	jmp	*%eax
+
+
+ifdef(`PIC',`
+L(pic_calc):
+	addl	(%esp), %ecx
+	addl	$UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
+	addl	%edx, %ecx
+	ret_internal
+')
+
+
+	C Must be an even address to preserve the significance of the low
+	C bit of the jump address indicating which way around ecx/ebx should
+	C start.
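
The VAR_JMP dispatch above is the assembly form of entering an unrolled loop
at a computed offset; C programmers know the pattern as Duff's device. A
hypothetical sketch of the control structure only (nothing here is GMP code;
the low-bit/even-address trick noted just above plays the same role as the
switch's case labels here):

    #include <stddef.h>

    /* Enter a 4-way unrolled loop at a computed point, Duff's-device
       style; the jmp *%eax above does the same with code addresses. */
    void unrolled_copy (int *dst, const int *src, size_t n)
    {
      if (n == 0)
        return;
      size_t iters = (n + 3) / 4;
      switch (n % 4)                    /* computed entry point */
        {
        case 0: do { *dst++ = *src++;
        case 3:      *dst++ = *src++;
        case 2:      *dst++ = *src++;
        case 1:      *dst++ = *src++;
                } while (--iters != 0);
        }
    }
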
+ ALIGN(2) + +L(unroll_inner_start): + C eax next limb + C ebx carry high + C ecx carry low + C edx scratch + C esi src + C edi dst + C ebp multiplier + +forloop(`i', UNROLL_COUNT, 1, ` + deflit(`disp_src', eval(-i*4 + OFFSET)) + deflit(`disp_dst', eval(disp_src - 4)) + + m4_assert(`disp_src>=-128 && disp_src<128') + m4_assert(`disp_dst>=-128 && disp_dst<128') + +ifelse(eval(i%2),0,` +Zdisp( movl, disp_src,(%esi), %eax) + adcl %edx, %ebx + + mull %ebp + +Zdisp( addl, %ecx, disp_dst,(%edi)) + movl $0, %ecx + + adcl %eax, %ebx + +',` + dnl this bit comes out last +Zdisp( movl, disp_src,(%esi), %eax) + adcl %edx, %ecx + + mull %ebp + +Zdisp( addl, %ebx, disp_dst,(%edi)) + +ifelse(forloop_last,0, +` movl $0, %ebx') + + adcl %eax, %ecx +') +') + + C eax next limb + C ebx carry high + C ecx carry low + C edx scratch + C esi src + C edi dst + C ebp multiplier + + adcl $0, %edx + addl %ecx, -4+OFFSET(%edi) + movl VAR_JMP, %ecx + + adcl $0, %edx + + movl %edx, m4_empty_if_zero(OFFSET) (%edi) + movl VAR_COUNTER, %edx + + incl %edx + jnz L(unroll_outer_top) + + +ifelse(OFFSET,0,,` + addl $OFFSET, %esi + addl $OFFSET, %edi +') + + +C------------------------------------------------------------------------------ +L(corner): + C esi &src[size] + C edi &dst[2*size-5] + + movl -12(%esi), %ebp + movl -8(%esi), %eax + movl %eax, %ecx + + mull %ebp + + addl %eax, -4(%edi) + movl -4(%esi), %eax + + adcl $0, %edx + movl %edx, %ebx + movl %eax, %esi + + mull %ebp + + addl %ebx, %eax + + adcl $0, %edx + addl %eax, (%edi) + movl %esi, %eax + + adcl $0, %edx + movl %edx, %ebx + + mull %ecx + + addl %ebx, %eax + movl %eax, 4(%edi) + + adcl $0, %edx + movl %edx, 8(%edi) + + + +C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1]. + +L(lshift_start): + movl PARAM_SIZE, %eax + movl PARAM_DST, %edi + xorl %ecx, %ecx C clear carry + + leal (%edi,%eax,8), %edi + notl %eax C -size-1, preserve carry + + leal 2(%eax), %eax C -(size-1) + +L(lshift): + C eax counter, negative + C ebx + C ecx + C edx + C esi + C edi dst, pointing just after last limb + C ebp + + rcll -4(%edi,%eax,8) + rcll (%edi,%eax,8) + incl %eax + jnz L(lshift) + + setc %al + + movl PARAM_SRC, %esi + movl %eax, -4(%edi) C dst most significant limb + + movl PARAM_SIZE, %ecx + + +C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ..., +C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the +C low limb of src[0]^2. + + movl (%esi), %eax C src[0] + + mull %eax + + leal (%esi,%ecx,4), %esi C src point just after last limb + negl %ecx + + movl %eax, (%edi,%ecx,8) C dst[0] + incl %ecx + +L(diag): + C eax scratch + C ebx scratch + C ecx counter, negative + C edx carry + C esi src just after last limb + C edi dst just after last limb + C ebp + + movl (%esi,%ecx,4), %eax + movl %edx, %ebx + + mull %eax + + addl %ebx, -4(%edi,%ecx,8) + adcl %eax, (%edi,%ecx,8) + adcl $0, %edx + + incl %ecx + jnz L(diag) + + + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + + addl %edx, -4(%edi) C dst most significant limb + movl SAVE_EDI, %edi + + movl SAVE_EBP, %ebp + addl $FRAME, %esp + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/k7/sublsh1_n.asm b/gmp-6.3.0/mpn/x86/k7/sublsh1_n.asm new file mode 100644 index 0000000..8851683 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k7/sublsh1_n.asm @@ -0,0 +1,173 @@ +dnl AMD K7 mpn_sublsh1_n_ip1 -- rp[] = rp[] - (up[] << 1) + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. 
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C This is an attempt at a sublsh1_n for x86-32, not relying on sse2 insns.  The
+C inner loop is 2*4-way unrolled, which is the best we can do with the
+C available registers.  It seems tricky to use the same structure for
+C rsblsh1_n, since we cannot feed carry between operations there.
+
+C                           cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom                     6.75
+C AMD K6
+C AMD K7
+C AMD K8
+
+C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32
+C processors.  It uses 2*4-way unrolling, for good reasons.
+C
+C Breaking the carry recurrence might be a good idea.  We would then need
+C separate registers for the shift carry and the add/subtract carry, which in
+C turn would force us to 2*2-way unrolling.
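
The two carry chains just discussed are explicit in a C rendering of the
operation. A hypothetical reference for the contract (the function name and
the 32-bit limb type are assumptions, not GMP API):

    #include <stddef.h>
    #include <stdint.h>

    /* rp[] -= up[] << 1 over n limbs; returns the shift's carry-out plus
       the subtraction's borrow (0..2), matching the adc/sbb chains below. */
    uint32_t ref_sublsh1_n_ip1 (uint32_t *rp, const uint32_t *up, size_t n)
    {
      uint32_t shift_cy = 0, borrow = 0;
      for (size_t i = 0; i < n; i++)
        {
          uint32_t twice = (up[i] << 1) | shift_cy;  /* lsh1, carry in */
          shift_cy = up[i] >> 31;
          uint64_t d = (uint64_t) rp[i] - twice - borrow;
          rp[i] = (uint32_t) d;
          borrow = (uint32_t) (d >> 32) & 1;         /* 1 iff borrow */
        }
      return shift_cy + borrow;
    }
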
+ +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(VAR_COUNT,`PARAM_SIZE') +define(SAVE_EBX,`PARAM_SRC') +define(SAVE_EBP,`PARAM_DST') + +ASM_START() + TEXT + ALIGN(8) +PROLOGUE(mpn_sublsh1_n_ip1) +deflit(`FRAME',0) + +define(`rp', `%edi') +define(`up', `%esi') + + mov PARAM_SIZE, %eax C size + push up FRAME_pushl() + push rp FRAME_pushl() + xor %edx, %edx + mov PARAM_SRC, up + mov PARAM_DST, rp + mov %ebx, SAVE_EBX + mov %eax, %ebx + shr $3, %eax + + not %eax C count = -(size\8)-i + and $7, %ebx C size % 8 + jz L(exact) + +L(oop): +ifdef(`CPU_P6',` + shr %edx ') C restore 2nd saved carry bit + mov (up), %ecx + adc %ecx, %ecx + rcr %edx C restore 1st saved carry bit + lea 4(up), up + sbb %ecx, (rp) + lea 4(rp), rp + adc %edx, %edx C save a carry bit in edx +ifdef(`CPU_P6',` + adc %edx, %edx ') C save another carry bit in edx + dec %ebx + jnz L(oop) +L(exact): + inc %eax + jz L(end) + mov %eax, VAR_COUNT + mov %ebp, SAVE_EBP + + ALIGN(16) +L(top): +ifdef(`CPU_P6',` + shr %edx ') C restore 2nd saved carry bit + mov (up), %eax + adc %eax, %eax + mov 4(up), %ebx + adc %ebx, %ebx + mov 8(up), %ecx + adc %ecx, %ecx + mov 12(up), %ebp + adc %ebp, %ebp + + rcr %edx C restore 1st saved carry bit + + sbb %eax, (rp) + sbb %ebx, 4(rp) + sbb %ecx, 8(rp) + sbb %ebp, 12(rp) + + mov 16(up), %eax + adc %eax, %eax + mov 20(up), %ebx + adc %ebx, %ebx + mov 24(up), %ecx + adc %ecx, %ecx + mov 28(up), %ebp + adc %ebp, %ebp + + lea 32(up), up + adc %edx, %edx C save a carry bit in edx + + sbb %eax, 16(rp) + sbb %ebx, 20(rp) + sbb %ecx, 24(rp) + sbb %ebp, 28(rp) + +ifdef(`CPU_P6',` + adc %edx, %edx ') C save another carry bit in edx + incl VAR_COUNT + lea 32(rp), rp + jne L(top) + + mov SAVE_EBP, %ebp +L(end): + mov SAVE_EBX, %ebx + +ifdef(`CPU_P6',` + xor %eax, %eax + shr $1, %edx + adc %edx, %eax +',` + adc $0, %edx + mov %edx, %eax +') + pop rp FRAME_popl() + pop up FRAME_popl() + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/k8/gmp-mparam.h b/gmp-6.3.0/mpn/x86/k8/gmp-mparam.h new file mode 100644 index 0000000..fa71292 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/k8/gmp-mparam.h @@ -0,0 +1,215 @@ +/* x86/k8 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000-2011, 2014 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 2500 MHz K8 Brisbane */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 11 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 21 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 36.85% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 3 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 44 + +#define DIV_1_VS_MUL_1_PERCENT 251 + +#define MUL_TOOM22_THRESHOLD 26 +#define MUL_TOOM33_THRESHOLD 78 +#define MUL_TOOM44_THRESHOLD 136 +#define MUL_TOOM6H_THRESHOLD 270 +#define MUL_TOOM8H_THRESHOLD 430 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 85 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 96 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 121 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 46 +#define SQR_TOOM3_THRESHOLD 81 +#define SQR_TOOM4_THRESHOLD 202 +#define SQR_TOOM6_THRESHOLD 300 +#define SQR_TOOM8_THRESHOLD 430 + +#define MULMID_TOOM42_THRESHOLD 50 + +#define MULMOD_BNM1_THRESHOLD 18 +#define SQRMOD_BNM1_THRESHOLD 22 + +#define MUL_FFT_MODF_THRESHOLD 606 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 606, 5}, { 27, 6}, { 15, 5}, { 31, 6}, \ + { 25, 7}, { 13, 6}, { 29, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 23, 6}, { 47, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \ + { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 103,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 167,10}, { 95, 9}, { 191,10}, { 111,11}, \ + { 63,10}, { 159,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 335, 9}, { 671,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \ + { 799,11}, { 223,12}, { 127,11}, { 255,10}, \ + { 511, 9}, { 1023,10}, { 543,11}, { 287,10}, \ + { 607,11}, { 319,10}, { 671,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,13}, \ + { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \ + { 543,10}, { 1087,11}, { 607,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,10}, { 1471,12}, \ + { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \ + { 447,11}, { 927,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 831,11}, { 1727,12}, \ + { 895,11}, { 1791,12}, { 959,14}, { 255,13}, \ + { 511,12}, { 1087,11}, { 2239,12}, { 1215,13}, \ + { 639,12}, { 1471,13}, { 767,12}, { 1727,13}, \ + { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \ + { 2239,13}, { 1151,12}, { 2431,13}, { 1279,12}, \ + { 2623,13}, { 1407,12}, { 2943,14}, { 767,13}, \ + { 1663,12}, { 3455,13}, { 1919,15}, { 511,14}, \ + { 1023,13}, { 2175,12}, { 4351,13}, { 2431,14}, \ + { 1279,13}, { 2943,14}, { 1535,13}, { 3455,14}, \ + { 1791,13}, { 3967,15}, { 1023,14}, { 2047,13}, \ + { 4351,14}, { 2303,13}, { 4991,14}, { 2815,15}, 
\ + { 1535,14}, { 3839,16} } +#define MUL_FFT_TABLE3_SIZE 158 +#define MUL_FFT_THRESHOLD 7296 + +#define SQR_FFT_MODF_THRESHOLD 500 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 500, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 25, 7}, { 13, 6}, { 29, 7}, { 15, 6}, \ + { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 95,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 159,10}, { 95, 9}, { 191,10}, \ + { 111,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 143, 9}, { 287, 8}, { 575,10}, { 159,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ + { 287,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 335, 9}, { 671,10}, { 351,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 399, 9}, { 799,10}, \ + { 415,11}, { 223,12}, { 127,11}, { 255,10}, \ + { 511, 9}, { 1023,10}, { 543,11}, { 287,10}, \ + { 607, 9}, { 1215,11}, { 319,10}, { 671,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 831,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \ + { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,10}, { 2431,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 831,11}, { 1727,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1087,11}, \ + { 2239,12}, { 1215,11}, { 2431,13}, { 639,12}, \ + { 1471,13}, { 767,12}, { 1727,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2239,13}, \ + { 1151,12}, { 2431,13}, { 1279,12}, { 2623,13}, \ + { 1407,12}, { 2943,14}, { 767,13}, { 1663,12}, \ + { 3455,13}, { 1919,15}, { 511,14}, { 1023,13}, \ + { 2431,14}, { 1279,13}, { 2943,14}, { 1535,13}, \ + { 3455,14}, { 1791,13}, { 3839,15}, { 1023,14}, \ + { 2047,13}, { 4223,14}, { 2303,13}, { 4863,14}, \ + { 2815,15}, { 1535,14}, { 3839,16} } +#define SQR_FFT_TABLE3_SIZE 167 +#define SQR_FFT_THRESHOLD 5504 + +#define MULLO_BASECASE_THRESHOLD 4 +#define MULLO_DC_THRESHOLD 29 +#define MULLO_MUL_N_THRESHOLD 14281 +#define SQRLO_BASECASE_THRESHOLD 6 +#define SQRLO_DC_THRESHOLD 193 +#define SQRLO_SQR_THRESHOLD 10704 + +#define DC_DIV_QR_THRESHOLD 84 +#define DC_DIVAPPR_Q_THRESHOLD 278 +#define DC_BDIV_QR_THRESHOLD 87 +#define DC_BDIV_Q_THRESHOLD 216 + +#define INV_MULMOD_BNM1_THRESHOLD 50 +#define INV_NEWTON_THRESHOLD 268 +#define INV_APPR_THRESHOLD 268 + +#define BINV_NEWTON_THRESHOLD 276 +#define REDC_1_TO_REDC_N_THRESHOLD 78 + +#define MU_DIV_QR_THRESHOLD 1652 +#define MU_DIVAPPR_Q_THRESHOLD 1528 +#define MUPI_DIV_QR_THRESHOLD 114 +#define MU_BDIV_QR_THRESHOLD 1442 +#define MU_BDIV_Q_THRESHOLD 1466 + +#define POWM_SEC_TABLE 1,22,102,452,1357 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 24 +#define SET_STR_DC_THRESHOLD 270 +#define SET_STR_PRECOMPUTE_THRESHOLD 1149 + +#define FAC_DSC_THRESHOLD 208 +#define FAC_ODD_THRESHOLD 48 + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD2_DIV1_METHOD 3 /* 4.69% faster than 1 */ +#define HGCD_THRESHOLD 139 +#define HGCD_APPR_THRESHOLD 174 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 599 +#define GCDEXT_DC_THRESHOLD 419 +#define JACOBI_BASE_METHOD 1 /* 1.57% faster than 4 */ + +/* Tuneup completed 
successfully, took 83851 seconds */ diff --git a/gmp-6.3.0/mpn/x86/lshift.asm b/gmp-6.3.0/mpn/x86/lshift.asm new file mode 100644 index 0000000..6ee6153 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/lshift.asm @@ -0,0 +1,106 @@ +dnl x86 mpn_lshift -- mpn left shift. + +dnl Copyright 1992, 1994, 1996, 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P54 7.5 +C P55 7.0 +C P6 2.5 +C K6 4.5 +C K7 5.0 +C P4 14.5 + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_lshift) + + pushl %edi + pushl %esi + pushl %ebx +deflit(`FRAME',12) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%edx + movl PARAM_SHIFT,%ecx + + subl $4,%esi C adjust src + + movl (%esi,%edx,4),%ebx C read most significant limb + xorl %eax,%eax + shldl( %cl, %ebx, %eax) C compute carry limb + decl %edx + jz L(end) + pushl %eax C push carry limb onto stack + testb $1,%dl + jnz L(1) C enter loop in the middle + movl %ebx,%eax + + ALIGN(8) +L(oop): movl (%esi,%edx,4),%ebx C load next lower limb + shldl( %cl, %ebx, %eax) C compute result limb + movl %eax,(%edi,%edx,4) C store it + decl %edx +L(1): movl (%esi,%edx,4),%eax + shldl( %cl, %eax, %ebx) + movl %ebx,(%edi,%edx,4) + decl %edx + jnz L(oop) + + shll %cl,%eax C compute least significant limb + movl %eax,(%edi) C store it + + popl %eax C pop carry limb + + popl %ebx + popl %esi + popl %edi + ret + +L(end): shll %cl,%ebx C compute least significant limb + movl %ebx,(%edi) C store it + + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/mmx/sec_tabselect.asm b/gmp-6.3.0/mpn/x86/mmx/sec_tabselect.asm new file mode 100644 index 0000000..aae158a --- /dev/null +++ b/gmp-6.3.0/mpn/x86/mmx/sec_tabselect.asm @@ -0,0 +1,163 @@ +dnl X86 MMX mpn_sec_tabselect. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb +C ali,evn n unal,evn n +C P5 +C P6 model 0-8,10-12 +C P6 model 9 (Banias) +C P6 model 13 (Dothan) 1.33 1.87 +C P4 model 0 (Willamette) +C P4 model 1 (?) +C P4 model 2 (Northwood) 2.1 2.63 +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) 1.7 2.57 +C Intel Atom 1.85 2.7 +C AMD K6 +C AMD K7 1.33 1.33 +C AMD K8 +C AMD K10 + +define(`rp', `%edi') +define(`tp', `%esi') +define(`n', `%edx') +define(`nents', `%ecx') +define(`which', `') + +define(`i', `%ebp') +define(`j', `%ebx') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sec_tabselect) + push %ebx + push %esi + push %edi + push %ebp + + mov 20(%esp), rp + mov 24(%esp), tp + mov 28(%esp), n + mov 32(%esp), nents + + movd 36(%esp), %mm6 + punpckldq %mm6, %mm6 C 2 copies of `which' + + mov $1, %ebx + movd %ebx, %mm7 + punpckldq %mm7, %mm7 C 2 copies of 1 + + mov n, j + add $-4, j + js L(outer_end) + +L(outer_top): + mov nents, i + mov tp, %eax + pxor %mm1, %mm1 + pxor %mm4, %mm4 + pxor %mm5, %mm5 + ALIGN(16) +L(top): movq %mm6, %mm0 + pcmpeqd %mm1, %mm0 + paddd %mm7, %mm1 + movq (tp), %mm2 + movq 8(tp), %mm3 + pand %mm0, %mm2 + pand %mm0, %mm3 + por %mm2, %mm4 + por %mm3, %mm5 + lea (tp,n,4), tp + add $-1, i + jne L(top) + + movq %mm4, (rp) + movq %mm5, 8(rp) + + lea 16(%eax), tp + lea 16(rp), rp + add $-4, j + jns L(outer_top) +L(outer_end): + + test $2, %dl + jz L(b0x) + +L(b1x): mov nents, i + mov tp, %eax + pxor %mm1, %mm1 + pxor %mm4, %mm4 + ALIGN(16) +L(tp2): movq %mm6, %mm0 + pcmpeqd %mm1, %mm0 + paddd %mm7, %mm1 + movq (tp), %mm2 + pand %mm0, %mm2 + por %mm2, %mm4 + lea (tp,n,4), tp + add $-1, i + jne L(tp2) + + movq %mm4, (rp) + + lea 8(%eax), tp + lea 8(rp), rp + +L(b0x): test $1, %dl + jz L(b00) + +L(b01): mov nents, i + pxor %mm1, %mm1 + pxor %mm4, %mm4 + ALIGN(16) +L(tp1): movq %mm6, %mm0 + pcmpeqd %mm1, %mm0 + paddd %mm7, %mm1 + movd (tp), %mm2 + pand %mm0, %mm2 + por %mm2, %mm4 + lea (tp,n,4), tp + add $-1, i + jne L(tp1) + + movd %mm4, (rp) + +L(b00): pop %ebp + pop %edi + pop %esi + pop %ebx + emms + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/mod_34lsub1.asm b/gmp-6.3.0/mpn/x86/mod_34lsub1.asm new file mode 100644 index 0000000..e09e702 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/mod_34lsub1.asm @@ -0,0 +1,183 @@ +dnl Generic x86 mpn_mod_34lsub1 -- mpn remainder modulo 2^24-1. + +dnl Copyright 2000-2002, 2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P5 3.0 +C P6 3.66 +C K6 3.0 +C K7 1.3 +C P4 9 + + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size) +C + +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + +dnl re-use parameter space +define(SAVE_EBX, `PARAM_SRC') + + TEXT + ALIGN(16) +PROLOGUE(mpn_mod_34lsub1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %edx + + subl $2, %ecx + ja L(three_or_more) + + movl (%edx), %eax + jb L(one) + + movl 4(%edx), %ecx + movl %eax, %edx + shrl $24, %eax C src[0] low + + andl $0xFFFFFF, %edx C src[0] high + addl %edx, %eax + movl %ecx, %edx + + andl $0xFFFF, %ecx + shrl $16, %edx C src[1] high + addl %edx, %eax + + shll $8, %ecx C src[1] low + addl %ecx, %eax + +L(one): + ret + + +L(three_or_more): + C eax + C ebx + C ecx size-2 + C edx src + C esi + C edi + C ebp + + movl %ebx, SAVE_EBX C and arrange 16-byte loop alignment + xorl %ebx, %ebx + + pushl %esi FRAME_pushl() + xorl %esi, %esi + + pushl %edi FRAME_pushl() + xorl %eax, %eax C and clear carry flag + + + C offset 0x40 here +L(top): + C eax acc 0mod3 + C ebx acc 1mod3 + C ecx counter, limbs + C edx src + C esi acc 2mod3 + C edi + C ebp + + leal 12(%edx), %edx + leal -2(%ecx), %ecx + + adcl -12(%edx), %eax + adcl -8(%edx), %ebx + adcl -4(%edx), %esi + + decl %ecx + jg L(top) + + + C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively + + movl $0xFFFFFFFF, %edi + incl %ecx + js L(combine) + + adcl (%edx), %eax + movl $0xFFFFFF00, %edi + decl %ecx + js L(combine) + + adcl 4(%edx), %ebx + movl $0xFFFF0000, %edi + + +L(combine): + C eax acc 0mod3 + C ebx acc 1mod3 + C ecx + C edx + C esi acc 2mod3 + C edi mask + C ebp + + sbbl %ecx, %ecx C carry + movl %eax, %edx C 0mod3 + + shrl $24, %eax C 0mod3 high + andl %edi, %ecx C carry masked + + subl %ecx, %eax C apply carry + movl %ebx, %edi C 1mod3 + + shrl $16, %ebx C 1mod3 high + andl $0x00FFFFFF, %edx C 0mod3 low + + addl %edx, %eax C apply 0mod3 low + andl $0xFFFF, %edi + + shll $8, %edi C 1mod3 low + addl %ebx, %eax C apply 1mod3 high + + addl %edi, %eax C apply 1mod3 low + movl %esi, %edx C 2mod3 + + shrl $8, %esi C 2mod3 high + andl $0xFF, %edx C 2mod3 low + + shll $16, %edx C 2mod3 low + addl %esi, %eax C apply 2mod3 high + + addl %edx, %eax C apply 2mod3 low + popl %edi FRAME_popl() + + movl SAVE_EBX, %ebx + popl %esi FRAME_popl() + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/mul_1.asm b/gmp-6.3.0/mpn/x86/mul_1.asm new file mode 100644 index 0000000..421de62 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/mul_1.asm @@ -0,0 +1,140 @@ +dnl x86 mpn_mul_1 (for 386, 486, and Pentium Pro) -- Multiply a limb vector +dnl with a limb and store the result in a second limb vector. + +dnl Copyright 1992, 1994, 1997-2002, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P5 12.5 +C P6 model 0-8,10-12 5.5 +C P6 model 9 (Banias) +C P6 model 13 (Dothan) 5.25 +C P4 model 0 (Willamette) 19.0 +C P4 model 1 (?) 19.0 +C P4 model 2 (Northwood) 19.0 +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C AMD K6 10.5 +C AMD K7 4.5 +C AMD K8 + + +C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier); + +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_mul_1) +deflit(`FRAME',0) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%ecx + + xorl %ebx,%ebx + andl $3,%ecx + jz L(end0) + +L(oop0): + movl (%esi),%eax + mull PARAM_MULTIPLIER + leal 4(%esi),%esi + addl %ebx,%eax + movl $0,%ebx + adcl %ebx,%edx + movl %eax,(%edi) + movl %edx,%ebx C propagate carry into cylimb + + leal 4(%edi),%edi + decl %ecx + jnz L(oop0) + +L(end0): + movl PARAM_SIZE,%ecx + shrl $2,%ecx + jz L(end) + + + ALIGN(8) +L(oop): movl (%esi),%eax + mull PARAM_MULTIPLIER + addl %eax,%ebx + movl $0,%ebp + adcl %edx,%ebp + + movl 4(%esi),%eax + mull PARAM_MULTIPLIER + movl %ebx,(%edi) + addl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + movl 8(%esi),%eax + mull PARAM_MULTIPLIER + movl %ebp,4(%edi) + addl %eax,%ebx C new lo + cylimb + movl $0,%ebp + adcl %edx,%ebp + + movl 12(%esi),%eax + mull PARAM_MULTIPLIER + movl %ebx,8(%edi) + addl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + movl %ebp,12(%edi) + + leal 16(%esi),%esi + leal 16(%edi),%edi + decl %ecx + jnz L(oop) + +L(end): movl %ebx,%eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/mul_basecase.asm b/gmp-6.3.0/mpn/x86/mul_basecase.asm new file mode 100644 index 0000000..8339732 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/mul_basecase.asm @@ -0,0 +1,223 @@ +dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result +dnl in a third limb vector. + +dnl Copyright 1996-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C     cycles/crossproduct
+C P5      15
+C P6       7.5
+C K6      12.5
+C K7       5.5
+C P4      24
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C                        mp_srcptr xp, mp_size_t xsize,
+C                        mp_srcptr yp, mp_size_t ysize);
+C
+C This was written in haste since the Pentium optimized code that was used
+C for all x86 machines was slow for the Pentium II.  This code would benefit
+C from some cleanup.
+C
+C To shave off some percentage of the run-time, one should make 4 variants
+C of the Louter loop, for the four different outcomes of un mod 4.  That
+C would avoid Loop0 altogether.  Code expansion would be > 4-fold for that
+C part of the function, but since it is not very large, that would be
+C acceptable.
+C
+C The mul loop (at L(oopM)) might need some tweaking.  Its current speed is
+C unknown.
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP,   16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP,   8)
+defframe(PARAM_WP,   4)
+
+defframe(VAR_MULTIPLIER, -4)
+defframe(VAR_COUNTER,    -8)
+deflit(VAR_STACK_SPACE,  8)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+	subl	$VAR_STACK_SPACE,%esp
+	pushl	%esi
+	pushl	%ebp
+	pushl	%edi
+deflit(`FRAME',eval(VAR_STACK_SPACE+12))
+
+	movl	PARAM_XP,%esi
+	movl	PARAM_WP,%edi
+	movl	PARAM_YP,%ebp
+
+	movl	(%esi),%eax		C load xp[0]
+	mull	(%ebp)			C multiply by yp[0]
+	movl	%eax,(%edi)		C store to wp[0]
+	movl	PARAM_XSIZE,%ecx	C xsize
+	decl	%ecx			C If xsize = 1, ysize = 1 too
+	jz	L(done)
+
+	pushl	%ebx
+FRAME_pushl()
+	movl	%edx,%ebx
+
+	leal	4(%esi),%esi
+	leal	4(%edi),%edi
+
+L(oopM):
+	movl	(%esi),%eax		C load next limb at xp[j]
+	leal	4(%esi),%esi
+	mull	(%ebp)
+	addl	%ebx,%eax
+	movl	%edx,%ebx
+	adcl	$0,%ebx
+	movl	%eax,(%edi)
+	leal	4(%edi),%edi
+	decl	%ecx
+	jnz	L(oopM)
+
+	movl	%ebx,(%edi)		C most significant limb of product
+	addl	$4,%edi			C increment wp
+	movl	PARAM_XSIZE,%eax
+	shll	$2,%eax
+	subl	%eax,%edi
+	subl	%eax,%esi
+
+	movl	PARAM_YSIZE,%eax	C ysize
+	decl	%eax
+	jz	L(skip)
+	movl	%eax,VAR_COUNTER	C set index i to ysize
+
+L(outer):
+	movl	PARAM_YP,%ebp		C yp
+	addl	$4,%ebp			C make ebp point to next v limb
+	movl	%ebp,PARAM_YP
+	movl	(%ebp),%eax		C copy y limb ...
+	movl	%eax,VAR_MULTIPLIER	C ...
to stack slot + movl PARAM_XSIZE,%ecx + + xorl %ebx,%ebx + andl $3,%ecx + jz L(end0) + +L(oop0): + movl (%esi),%eax + mull VAR_MULTIPLIER + leal 4(%esi),%esi + addl %ebx,%eax + movl $0,%ebx + adcl %ebx,%edx + addl %eax,(%edi) + adcl %edx,%ebx C propagate carry into cylimb + + leal 4(%edi),%edi + decl %ecx + jnz L(oop0) + +L(end0): + movl PARAM_XSIZE,%ecx + shrl $2,%ecx + jz L(endX) + + ALIGN(8) +L(oopX): + movl (%esi),%eax + mull VAR_MULTIPLIER + addl %eax,%ebx + movl $0,%ebp + adcl %edx,%ebp + + movl 4(%esi),%eax + mull VAR_MULTIPLIER + addl %ebx,(%edi) + adcl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + movl 8(%esi),%eax + mull VAR_MULTIPLIER + addl %ebp,4(%edi) + adcl %eax,%ebx C new lo + cylimb + movl $0,%ebp + adcl %edx,%ebp + + movl 12(%esi),%eax + mull VAR_MULTIPLIER + addl %ebx,8(%edi) + adcl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + addl %ebp,12(%edi) + adcl $0,%ebx C propagate carry into cylimb + + leal 16(%esi),%esi + leal 16(%edi),%edi + decl %ecx + jnz L(oopX) + +L(endX): + movl %ebx,(%edi) + addl $4,%edi + + C we incremented wp and xp in the loop above; compensate + movl PARAM_XSIZE,%eax + shll $2,%eax + subl %eax,%edi + subl %eax,%esi + + movl VAR_COUNTER,%eax + decl %eax + movl %eax,VAR_COUNTER + jnz L(outer) + +L(skip): + popl %ebx + popl %edi + popl %ebp + popl %esi + addl $8,%esp + ret + +L(done): + movl %edx,4(%edi) C store to wp[1] + popl %edi + popl %ebp + popl %esi + addl $8,%esp + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/nano/gmp-mparam.h b/gmp-6.3.0/mpn/x86/nano/gmp-mparam.h new file mode 100644 index 0000000..cd8ac4e --- /dev/null +++ b/gmp-6.3.0/mpn/x86/nano/gmp-mparam.h @@ -0,0 +1,162 @@ +/* x86/nano gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */ + +#define MOD_1_1P_METHOD 1 +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 3 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 10 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 9 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 53 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 32 + +#define MUL_TOOM22_THRESHOLD 16 +#define MUL_TOOM33_THRESHOLD 132 +#define MUL_TOOM44_THRESHOLD 195 +#define MUL_TOOM6H_THRESHOLD 270 +#define MUL_TOOM8H_THRESHOLD 478 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 129 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 130 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 135 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 28 +#define SQR_TOOM3_THRESHOLD 194 +#define SQR_TOOM4_THRESHOLD 502 +#define SQR_TOOM6_THRESHOLD 746 +#define SQR_TOOM8_THRESHOLD 1005 + +#define MULMID_TOOM42_THRESHOLD 40 + +#define MULMOD_BNM1_THRESHOLD 14 +#define SQRMOD_BNM1_THRESHOLD 19 + +#define POWM_SEC_TABLE 4,23,258,828,2246 + +#define MUL_FFT_MODF_THRESHOLD 308 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 308, 5}, { 13, 6}, { 7, 5}, { 17, 6}, \ + { 9, 5}, { 19, 6}, { 11, 5}, { 23, 6}, \ + { 13, 7}, { 7, 6}, { 17, 7}, { 9, 6}, \ + { 19, 7}, { 11, 6}, { 24, 7}, { 15, 6}, \ + { 31, 7}, { 19, 8}, { 11, 7}, { 25, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \ + { 23, 7}, { 47, 9}, { 15, 8}, { 31, 7}, \ + { 63, 8}, { 39, 9}, { 23, 8}, { 47,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 47,10}, \ + { 31, 9}, { 71,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \ + { 79, 9}, { 159,10}, { 95, 9}, { 191,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 543, 9}, \ + { 287, 8}, { 575, 7}, { 1215,10}, { 159,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 543, 8}, { 1087,10}, { 287, 9}, \ + { 607, 8}, { 1215,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 351, 9}, { 703, 8}, { 1407, 9}, \ + { 735, 8}, { 1471,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447, 9}, { 895,10}, { 479, 9}, { 959, 8}, \ + { 1919,12}, { 4096,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 89 +#define MUL_FFT_THRESHOLD 1856 + +#define SQR_FFT_MODF_THRESHOLD 396 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 396, 5}, { 13, 6}, { 7, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 25, 7}, { 15, 6}, { 31, 7}, { 19, 6}, \ + { 39, 7}, { 21, 8}, { 11, 7}, { 23, 6}, \ + { 47, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 39, 8}, { 79, 9}, \ + { 47,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 127,10}, \ + { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 543,10}, { 143, 9}, \ + { 287, 8}, { 607, 7}, { 1215, 6}, { 2431,10}, \ + { 159, 8}, { 639,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 543, 8}, \ + { 1087,10}, { 287, 9}, { 607, 8}, { 1215,11}, \ + { 159,10}, { 319, 9}, { 671,10}, { 351, 9}, \ + { 703, 8}, { 1407, 9}, { 735, 8}, { 1471, 7}, \ + { 2943,11}, { 191,10}, { 383, 9}, { 799,10}, \ + { 415, 9}, { 895,10}, { 479,12}, { 
4096,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 87 +#define SQR_FFT_THRESHOLD 2368 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 51 +#define MULLO_MUL_N_THRESHOLD 3369 + +#define DC_DIV_QR_THRESHOLD 56 +#define DC_DIVAPPR_Q_THRESHOLD 183 +#define DC_BDIV_QR_THRESHOLD 55 +#define DC_BDIV_Q_THRESHOLD 118 + +#define INV_MULMOD_BNM1_THRESHOLD 30 +#define INV_NEWTON_THRESHOLD 266 +#define INV_APPR_THRESHOLD 218 + +#define BINV_NEWTON_THRESHOLD 268 +#define REDC_1_TO_REDC_N_THRESHOLD 56 + +#define MU_DIV_QR_THRESHOLD 1308 +#define MU_DIVAPPR_Q_THRESHOLD 1528 +#define MUPI_DIV_QR_THRESHOLD 124 +#define MU_BDIV_QR_THRESHOLD 855 +#define MU_BDIV_Q_THRESHOLD 1334 + +#define MATRIX22_STRASSEN_THRESHOLD 14 +#define HGCD_THRESHOLD 104 +#define HGCD_APPR_THRESHOLD 139 +#define HGCD_REDUCE_THRESHOLD 2121 +#define GCD_DC_THRESHOLD 456 +#define GCDEXT_DC_THRESHOLD 321 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 25 +#define SET_STR_DC_THRESHOLD 542 +#define SET_STR_PRECOMPUTE_THRESHOLD 840 diff --git a/gmp-6.3.0/mpn/x86/p6/README b/gmp-6.3.0/mpn/x86/p6/README new file mode 100644 index 0000000..f19d47b --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/README @@ -0,0 +1,125 @@ +Copyright 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + + INTEL P6 MPN SUBROUTINES + + + +This directory contains code optimized for Intel P6 class CPUs, meaning +PentiumPro, Pentium II and Pentium III. The mmx and p3mmx subdirectories +have routines using MMX instructions. + + + +STATUS + +Times for the loops, with all code and data in L1 cache, are as follows. +Some of these might be able to be improved. + + cycles/limb + + mpn_add_n/sub_n 3.7 + + mpn_copyi 0.75 + mpn_copyd 1.75 (or 0.75 if no overlap) + + mpn_divrem_1 39.0 + mpn_mod_1 21.5 + mpn_divexact_by3 8.5 + + mpn_mul_1 5.5 + mpn_addmul/submul_1 6.35 + + mpn_l/rshift 2.5 + + mpn_mul_basecase 8.2 cycles/crossproduct (approx) + mpn_sqr_basecase 4.0 cycles/crossproduct (approx) + or 7.75 cycles/triangleproduct (approx) + +Pentium II and III have MMX and get the following improvements. + + mpn_divrem_1 25.0 integer part, 17.5 fractional part + + mpn_l/rshift 1.75 + + + + +NOTES + +Write-allocate L1 data cache means prefetching of destinations is unnecessary. + +Mispredicted branches have a penalty of between 9 and 15 cycles, and even up +to 26 cycles depending how far speculative execution has gone. The 9 cycle +minimum penalty comes from the issue pipeline being 9 stages. 
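+
+As an aside, the tuned values in a gmp-mparam.h (like the one preceding
+this README) are consumed by the generic C layer, which picks an
+algorithm by comparing the operand size against each threshold.  The
+sketch below is illustrative only: mpn_mul_basecase, mpn_toom22_mul and
+mpn_toom33_mul are real GMP internals, but this wrapper shape and the
+scratch handling are simplified for exposition.
+
+	static void
+	mul_n_sketch (mp_ptr wp, mp_srcptr ap, mp_srcptr bp,
+	              mp_size_t n, mp_ptr scratch)
+	{
+	  if (n < MUL_TOOM22_THRESHOLD)          /* e.g. 16 above */
+	    mpn_mul_basecase (wp, ap, n, bp, n);
+	  else if (n < MUL_TOOM33_THRESHOLD)     /* e.g. 132 above */
+	    mpn_toom22_mul (wp, ap, n, bp, n, scratch);
+	  else
+	    mpn_toom33_mul (wp, ap, n, bp, n, scratch);
+	}
+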
+ +A copy with rep movs seems to copy 16 bytes at a time, since speeds for 4, +5, 6 or 7 limb operations are all the same. The 0.75 cycles/limb would be 3 +cycles per 16 byte block. + + + + +CODING + +Instructions in general code have been shown grouped if they can execute +together, which means up to three instructions with no successive +dependencies, and with only the first being a multiple micro-op. + +P6 has out-of-order execution, so the groupings are really only showing +dependent paths where some shuffling might allow some latencies to be +hidden. + + + + +REFERENCES + +"Intel Architecture Optimization Reference Manual", 1999, revision 001 dated +02/99, order number 245127 (order number 730795-001 is in the document too). +Available on-line: + + http://download.intel.com/design/PentiumII/manuals/245127.htm + +"Intel Architecture Optimization Manual", 1997, order number 242816. This +is an older document mostly about P5 and not as good as the above. +Available on-line: + + http://download.intel.com/design/PentiumII/manuals/242816.htm + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/gmp-6.3.0/mpn/x86/p6/aors_n.asm b/gmp-6.3.0/mpn/x86/p6/aors_n.asm new file mode 100644 index 0000000..df51c2e --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/aors_n.asm @@ -0,0 +1,156 @@ +dnl Intel P6 mpn_add_n/mpn_sub_n -- mpn add or subtract. + +dnl Copyright 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO: +C * Avoid indexed addressing, it makes us stall on the two-ported register +C file. 
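+C
+C For reference, the following C version (a sketch added for exposition,
+C not GMP's generic code; it assumes 32-bit limbs) gives the semantics
+C of the carry chain that the loop below maintains with adc/sbb:
+C
+C   #include <stdint.h>
+C   typedef uint32_t mp_limb_t;
+C   typedef long mp_size_t;
+C
+C   /* rp[] = up[] + vp[] over n limbs; returns the final carry (0 or 1).
+C      mpn_sub_n is identical with subtraction and borrow. */
+C   mp_limb_t
+C   add_n_sketch (mp_limb_t *rp, const mp_limb_t *up,
+C                 const mp_limb_t *vp, mp_size_t n)
+C   {
+C     mp_limb_t cy = 0;
+C     for (mp_size_t i = 0; i < n; i++)
+C       {
+C         mp_limb_t s = up[i] + cy;
+C         cy = s < cy;        /* carry out of +cy (only if up[i] = ~0) */
+C         s += vp[i];
+C         cy += s < vp[i];    /* carry out of the limb add; cy stays 0 or 1 */
+C         rp[i] = s;
+C       }
+C     return cy;
+C   }
+C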
+ +C cycles/limb +C P6 model 0-8,10-12 3.17 +C P6 model 9 (Banias) 2.15 +C P6 model 13 (Dothan) 2.25 + + +define(`rp', `%edi') +define(`up', `%esi') +define(`vp', `%ebx') +define(`n', `%ecx') + +ifdef(`OPERATION_add_n', ` + define(ADCSBB, adc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ASM_START() + + TEXT + ALIGN(16) + +PROLOGUE(func) + xor %edx, %edx +L(start): + push %edi + push %esi + push %ebx + + mov 16(%esp), rp + mov 20(%esp), up + mov 24(%esp), vp + mov 28(%esp), n + + lea (up,n,4), up + lea (vp,n,4), vp + lea (rp,n,4), rp + + neg n + mov n, %eax + and $-8, n + and $7, %eax + shl $2, %eax C 4x +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + lea L(ent) (%eax,%eax,2), %eax C 12x +') + + shr %edx C set cy flag + jmp *%eax + +ifdef(`PIC',` +L(pic_calc): + C See mpn/x86/README about old gas bugs + lea (%eax,%eax,2), %eax + add $L(ent)-L(here), %eax + add (%esp), %eax + ret_internal +') + +L(end): + sbb %eax, %eax + neg %eax + pop %ebx + pop %esi + pop %edi + ret + + ALIGN(16) +L(top): + jecxz L(end) +L(ent): +Zdisp( mov, 0,(up,n,4), %eax) +Zdisp( ADCSBB, 0,(vp,n,4), %eax) +Zdisp( mov, %eax, 0,(rp,n,4)) + + mov 4(up,n,4), %edx + ADCSBB 4(vp,n,4), %edx + mov %edx, 4(rp,n,4) + + mov 8(up,n,4), %eax + ADCSBB 8(vp,n,4), %eax + mov %eax, 8(rp,n,4) + + mov 12(up,n,4), %edx + ADCSBB 12(vp,n,4), %edx + mov %edx, 12(rp,n,4) + + mov 16(up,n,4), %eax + ADCSBB 16(vp,n,4), %eax + mov %eax, 16(rp,n,4) + + mov 20(up,n,4), %edx + ADCSBB 20(vp,n,4), %edx + mov %edx, 20(rp,n,4) + + mov 24(up,n,4), %eax + ADCSBB 24(vp,n,4), %eax + mov %eax, 24(rp,n,4) + + mov 28(up,n,4), %edx + ADCSBB 28(vp,n,4), %edx + mov %edx, 28(rp,n,4) + + lea 8(n), n + jmp L(top) + +EPILOGUE() + +PROLOGUE(func_nc) + movl 20(%esp), %edx + jmp L(start) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/p6/aorsmul_1.asm b/gmp-6.3.0/mpn/x86/p6/aorsmul_1.asm new file mode 100644 index 0000000..bc8c49c --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/aorsmul_1.asm @@ -0,0 +1,320 @@ +dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. + +dnl Copyright 1999-2002, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P5 +C P6 model 0-8,10-12 6.44 +C P6 model 9 (Banias) 6.15 +C P6 model 13 (Dothan) 6.11 +C P4 model 0 (Willamette) +C P4 model 1 (?) 
+C P4 model 2 (Northwood) +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C AMD K6 +C AMD K7 +C AMD K8 + + +dnl P6 UNROLL_COUNT cycles/limb +dnl 8 6.7 +dnl 16 6.35 +dnl 32 6.3 +dnl 64 6.3 +dnl Maximum possible with the current code is 64. + +deflit(UNROLL_COUNT, 16) + + +ifdef(`OPERATION_addmul_1', ` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + define(M4_function_1c, mpn_addmul_1c) + define(M4_description, add it to) + define(M4_desc_retval, carry) +',`ifdef(`OPERATION_submul_1', ` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + define(M4_function_1c, mpn_submul_1c) + define(M4_description, subtract it from) + define(M4_desc_retval, borrow) +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) + + +C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C +C Calculate src,size multiplied by mult and M4_description dst,size. +C Return the M4_desc_retval limb from the top of the result. +C +C This code is pretty much the same as the K6 code. The unrolled loop is +C the same, but there's just a few scheduling tweaks in the setups and the +C simple loop. +C +C A number of variations have been tried for the unrolled loop, with one or +C two carries, and with loads scheduled earlier, but nothing faster than 6 +C cycles/limb has been found. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 5) +',` +deflit(UNROLL_THRESHOLD, 5) +') + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(32) + +PROLOGUE(M4_function_1c) + pushl %ebx +deflit(`FRAME',4) + movl PARAM_CARRY, %ebx + jmp L(start_nc) +EPILOGUE() + +PROLOGUE(M4_function_1) + push %ebx +deflit(`FRAME',4) + xorl %ebx, %ebx C initial carry + +L(start_nc): + movl PARAM_SIZE, %ecx + pushl %esi +deflit(`FRAME',8) + + movl PARAM_SRC, %esi + pushl %edi +deflit(`FRAME',12) + + movl PARAM_DST, %edi + pushl %ebp +deflit(`FRAME',16) + cmpl $UNROLL_THRESHOLD, %ecx + + movl PARAM_MULTIPLIER, %ebp + jae L(unroll) + + + C simple loop + C this is offset 0x22, so close enough to aligned +L(simple): + C eax scratch + C ebx carry + C ecx counter + C edx scratch + C esi src + C edi dst + C ebp multiplier + + movl (%esi), %eax + addl $4, %edi + + mull %ebp + + addl %ebx, %eax + adcl $0, %edx + + M4_inst %eax, -4(%edi) + movl %edx, %ebx + + adcl $0, %ebx + decl %ecx + + leal 4(%esi), %esi + jnz L(simple) + + + popl %ebp + popl %edi + + popl %esi + movl %ebx, %eax + + popl %ebx + ret + + + +C------------------------------------------------------------------------------ +C VAR_JUMP holds the computed jump temporarily because there's not enough +C registers when doing the mul for the initial two carry limbs. +C +C The add/adc for the initial carry in %ebx is necessary only for the +C mpn_add/submul_1c entry points. Duplicating the startup code to +C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good +C idea. 
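+C
+C As a point of reference, the operation itself is the C loop below (a
+C sketch added for exposition, not GMP source).  The 64-bit intermediate
+C cannot overflow, since (2^32-1)^2 + 2*(2^32-1) = 2^64 - 1.
+C
+C   #include <stdint.h>
+C   typedef uint32_t mp_limb_t;
+C   typedef long mp_size_t;
+C
+C   /* dst[] += src[] * mult over n limbs; returns the carry limb.
+C      mpn_submul_1 subtracts the products and returns the borrow. */
+C   mp_limb_t
+C   addmul_1_sketch (mp_limb_t *dst, const mp_limb_t *src,
+C                    mp_size_t n, mp_limb_t mult)
+C   {
+C     mp_limb_t carry = 0;
+C     for (mp_size_t i = 0; i < n; i++)
+C       {
+C         uint64_t t = (uint64_t) src[i] * mult + dst[i] + carry;
+C         dst[i] = (mp_limb_t) t;
+C         carry = (mp_limb_t) (t >> 32);
+C       }
+C     return carry;
+C   }
+C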
+ +dnl overlapping with parameters already fetched +define(VAR_COUNTER,`PARAM_SIZE') +define(VAR_JUMP, `PARAM_DST') + + C this is offset 0x43, so close enough to aligned +L(unroll): + C eax + C ebx initial carry + C ecx size + C edx + C esi src + C edi dst + C ebp + + movl %ecx, %edx + decl %ecx + + subl $2, %edx + negl %ecx + + shrl $UNROLL_LOG2, %edx + andl $UNROLL_MASK, %ecx + + movl %edx, VAR_COUNTER + movl %ecx, %edx + + C 15 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + shll $4, %edx + negl %ecx + + leal L(entry) (%edx,%ecx,1), %edx +') + movl (%esi), %eax C src low limb + + movl %edx, VAR_JUMP + leal ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi + + mull %ebp + + addl %ebx, %eax C initial carry (from _1c) + adcl $0, %edx + + movl %edx, %ebx C high carry + leal ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi + + movl VAR_JUMP, %edx + testl $1, %ecx + movl %eax, %ecx C low carry + + cmovnz( %ebx, %ecx) C high,low carry other way around + cmovnz( %eax, %ebx) + + jmp *%edx + + +ifdef(`PIC',` +L(pic_calc): + shll $4, %edx + negl %ecx + + C See mpn/x86/README about old gas bugs + leal (%edx,%ecx,1), %edx + addl $L(entry)-L(here), %edx + + addl (%esp), %edx + + ret_internal +') + + +C ----------------------------------------------------------- + ALIGN(32) +L(top): +deflit(`FRAME',16) + C eax scratch + C ebx carry hi + C ecx carry lo + C edx scratch + C esi src + C edi dst + C ebp multiplier + C + C VAR_COUNTER loop counter + C + C 15 code bytes per limb + + addl $UNROLL_BYTES, %edi + +L(entry): +deflit(CHUNK_COUNT,2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%esi), %eax) + mull %ebp +Zdisp( M4_inst,%ecx, disp0,(%edi)) + adcl %eax, %ebx + movl %edx, %ecx + adcl $0, %ecx + + movl disp1(%esi), %eax + mull %ebp + M4_inst %ebx, disp1(%edi) + adcl %eax, %ecx + movl %edx, %ebx + adcl $0, %ebx +') + + decl VAR_COUNTER + leal UNROLL_BYTES(%esi), %esi + + jns L(top) + + +deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))) + + M4_inst %ecx, disp0(%edi) + movl %ebx, %eax + + popl %ebp + popl %edi + + popl %esi + popl %ebx + adcl $0, %eax + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm b/gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm new file mode 100644 index 0000000..a0a9d90 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm @@ -0,0 +1,287 @@ +dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder. + +dnl Rearranged from mpn/x86/p6/dive_1.asm by Marco Bodrato. + +dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C odd even divisor +C P6: 10.0 12.0 cycles/limb + +C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) + +C The odd case is basically the same as mpn_modexact_1_odd, just with an +C extra store, and it runs at the same 10 cycles which is the dependent +C chain. +C +C The shifts for the even case aren't on the dependent chain so in principle +C it could run the same too, but nothing running at 10 has been found. +C Perhaps there's too many uops (an extra 4 over the odd case). + +defframe(PARAM_SHIFT, 24) +defframe(PARAM_INVERSE,20) +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +deflit(STACK_SPACE, 16) + +dnl re-use parameter space +define(VAR_INVERSE,`PARAM_SRC') + + TEXT + +C mp_limb_t +C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t inverse, int shift) + + ALIGN(16) +PROLOGUE(mpn_pi1_bdiv_q_1) +deflit(`FRAME',0) + + subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebx, SAVE_EBX + movl PARAM_SIZE, %ebx + + movl %ebp, SAVE_EBP + movl PARAM_INVERSE, %ebp + + movl PARAM_SHIFT, %ecx C trailing twos + +L(common): + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + leal (%esi,%ebx,4), %esi C src end + + leal (%edi,%ebx,4), %edi C dst end + negl %ebx C -size + + movl (%esi,%ebx,4), %eax C src[0] + + orl %ecx, %ecx + jz L(odd_entry) + + movl %edi, PARAM_DST + movl %ebp, VAR_INVERSE + +L(even): + C eax src[0] + C ebx counter, limbs, negative + C ecx shift + C edx + C esi + C edi + C ebp + + xorl %ebp, %ebp C initial carry bit + xorl %edx, %edx C initial carry limb (for size==1) + + incl %ebx + jz L(even_one) + + movl (%esi,%ebx,4), %edi C src[1] + + shrdl( %cl, %edi, %eax) + + jmp L(even_entry) + + +L(even_top): + C eax scratch + C ebx counter, limbs, negative + C ecx shift + C edx scratch + C esi &src[size] + C edi &dst[size] and scratch + C ebp carry bit + + movl (%esi,%ebx,4), %edi + + mull PARAM_DIVISOR + + movl -4(%esi,%ebx,4), %eax + shrdl( %cl, %edi, %eax) + + subl %ebp, %eax + + sbbl %ebp, %ebp + subl %edx, %eax + + sbbl $0, %ebp + +L(even_entry): + imull VAR_INVERSE, %eax + + movl PARAM_DST, %edi + negl %ebp + + movl %eax, -4(%edi,%ebx,4) + incl %ebx + jnz L(even_top) + + mull PARAM_DIVISOR + + movl -4(%esi), %eax + +L(even_one): + shrl %cl, %eax + movl SAVE_ESI, %esi + + subl %ebp, %eax + movl SAVE_EBP, %ebp + + subl %edx, %eax + movl SAVE_EBX, %ebx + + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi) + movl SAVE_EDI, %edi + addl $STACK_SPACE, %esp + + ret + +C The dependent chain here is +C +C subl %edx, %eax 1 +C imull %ebp, %eax 4 +C mull PARAM_DIVISOR 5 +C ---- +C total 10 +C +C and this is the measured speed. No special scheduling is necessary, out +C of order execution hides the load latency. 
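+C
+C The loop below realizes Hensel-style exact division: since the input is
+C an exact multiple of d, each quotient limb falls out of a mod-2^32
+C multiply by the precomputed inverse.  In C, roughly (a sketch added for
+C exposition, assuming 32-bit limbs and an exact division; the even case
+C first shifts out the trailing zero bits of d, as in the shrdl code
+C above):
+C
+C   #include <stdint.h>
+C   typedef uint32_t mp_limb_t;
+C   typedef long mp_size_t;
+C
+C   /* qp[] = src[] / d exactly, d odd, dinv = inverse of d mod 2^32 */
+C   void
+C   bdiv_q_1_odd_sketch (mp_limb_t *qp, const mp_limb_t *src,
+C                        mp_size_t n, mp_limb_t d, mp_limb_t dinv)
+C   {
+C     mp_limb_t cy = 0;               /* borrow bit */
+C     mp_limb_t hi = 0;               /* high limb of previous q*d */
+C     for (mp_size_t i = 0; i < n; i++)
+C       {
+C         mp_limb_t s = src[i] - cy;
+C         cy = src[i] < cy;
+C         cy += s < hi;               /* combined borrow stays 0 or 1 */
+C         s -= hi;
+C         mp_limb_t q = s * dinv;     /* exact quotient limb */
+C         qp[i] = q;
+C         hi = (mp_limb_t) (((uint64_t) q * d) >> 32);
+C       }
+C   }
+C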
+ +L(odd_top): + C eax scratch (src limb) + C ebx counter, limbs, negative + C ecx carry bit + C edx carry limb, high of last product + C esi &src[size] + C edi &dst[size] + C ebp inverse + + mull PARAM_DIVISOR + + movl (%esi,%ebx,4), %eax + subl %ecx, %eax + + sbbl %ecx, %ecx + subl %edx, %eax + + sbbl $0, %ecx + +L(odd_entry): + imull %ebp, %eax + + movl %eax, (%edi,%ebx,4) + negl %ecx + + incl %ebx + jnz L(odd_top) + + + movl SAVE_ESI, %esi + + movl SAVE_EDI, %edi + + movl SAVE_EBP, %ebp + + movl SAVE_EBX, %ebx + addl $STACK_SPACE, %esp + + ret + +EPILOGUE() + +C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C + + ALIGN(16) +PROLOGUE(mpn_bdiv_q_1) +deflit(`FRAME',0) + + movl PARAM_DIVISOR, %eax + subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebx, SAVE_EBX + movl PARAM_SIZE, %ebx + + bsfl %eax, %ecx C trailing twos + + movl %ebp, SAVE_EBP + + shrl %cl, %eax C d without twos + + movl %eax, %edx + shrl %eax C d/2 without twos + + movl %edx, PARAM_DIVISOR + andl $127, %eax + +ifdef(`PIC',` + LEA( binvert_limb_table, %ebp) + movzbl (%eax,%ebp), %ebp C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %ebp C inv 8 bits +') + + leal (%ebp,%ebp), %eax C 2*inv + + imull %ebp, %ebp C inv*inv + imull %edx, %ebp C inv*inv*d + + subl %ebp, %eax C inv = 2*inv - inv*inv*d + leal (%eax,%eax), %ebp C 2*inv + + imull %eax, %eax C inv*inv + imull %edx, %eax C inv*inv*d + + subl %eax, %ebp C inv = 2*inv - inv*inv*d + + jmp L(common) + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/p6/copyd.asm b/gmp-6.3.0/mpn/x86/p6/copyd.asm new file mode 100644 index 0000000..1be7636 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/copyd.asm @@ -0,0 +1,178 @@ +dnl Intel P6 mpn_copyd -- copy limb vector backwards. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P6: 1.75 cycles/limb, or 0.75 if no overlap + + +C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C An explicit loop is used because a decrementing rep movsl is a bit slow at +C 2.4 c/l. That rep movsl also has about a 40 cycle startup time, and the +C code here stands a chance of being faster if the branches predict well. +C +C The slightly strange loop form seems necessary for the claimed speed. +C Maybe load/store ordering affects it. 
+C +C The source and destination are checked to see if they're actually +C overlapping, since it might be possible to use an incrementing rep movsl +C at 0.75 c/l. (It doesn't suffer the bad startup time of the decrementing +C version.) +C +C Enhancements: +C +C Top speed for an all-integer copy is probably 1.0 c/l, being one load and +C one store each cycle. Unrolling the loop below would approach 1.0, but +C it'd be good to know why something like store/load/subl + store/load/jnz +C doesn't already run at 1.0 c/l. It looks like it should decode in 2 +C cycles, but doesn't run that way. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-using parameter space +define(SAVE_ESI,`PARAM_SIZE') +define(SAVE_EDI,`PARAM_SRC') + + TEXT + ALIGN(16) + +PROLOGUE(mpn_copyd) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + subl $1, %ecx + jb L(zero) + + movl (%esi,%ecx,4), %eax C src[size-1] + jz L(one) + + movl -4(%esi,%ecx,4), %edx C src[size-2] + subl $2, %ecx + jbe L(done_loop) C 2 or 3 limbs only + + + C The usual overlap is + C + C high low + C +------------------+ + C | dst| + C +------------------+ + C +------------------+ + C | src| + C +------------------+ + C + C We can use an incrementing copy in the following circumstances. + C + C src+4*size<=dst, since then the regions are disjoint + C + C src==dst, clearly (though this shouldn't occur normally) + C + C src>dst, since in that case it's a requirement of the + C parameters that src>=dst+size*4, and hence the + C regions are disjoint + C + + leal (%edi,%ecx,4), %edx + cmpl %edi, %esi + jae L(use_movsl) C src >= dst + + cmpl %edi, %edx + movl 4(%esi,%ecx,4), %edx C src[size-2] again + jbe L(use_movsl) C src+4*size <= dst + + +L(top): + C eax prev high limb + C ebx + C ecx counter, size-3 down to 0 or -1, inclusive, by 2s + C edx prev low limb + C esi src + C edi dst + C ebp + + movl %eax, 8(%edi,%ecx,4) + movl (%esi,%ecx,4), %eax + + movl %edx, 4(%edi,%ecx,4) + movl -4(%esi,%ecx,4), %edx + + subl $2, %ecx + jnbe L(top) + + +L(done_loop): + movl %eax, 8(%edi,%ecx,4) + movl %edx, 4(%edi,%ecx,4) + + C copy low limb (needed if size was odd, but will already have been + C done in the loop if size was even) + movl (%esi), %eax +L(one): + movl %eax, (%edi) + movl SAVE_EDI, %edi + movl SAVE_ESI, %esi + + ret + + +L(use_movsl): + C eax + C ebx + C ecx size-3 + C edx + C esi src + C edi dst + C ebp + + addl $3, %ecx + + cld C better safe than sorry, see mpn/x86/README + + rep + movsl + +L(zero): + movl SAVE_ESI, %esi + movl SAVE_EDI, %edi + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/p6/dive_1.asm b/gmp-6.3.0/mpn/x86/p6/dive_1.asm new file mode 100644 index 0000000..7d61a18 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/dive_1.asm @@ -0,0 +1,267 @@ +dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder. + +dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C odd even divisor +C P6: 10.0 12.0 cycles/limb + + +C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C +C The odd case is basically the same as mpn_modexact_1_odd, just with an +C extra store, and it runs at the same 10 cycles which is the dependent +C chain. +C +C The shifts for the even case aren't on the dependent chain so in principle +C it could run the same too, but nothing running at 10 has been found. +C Perhaps there's too many uops (an extra 4 over the odd case). + +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +defframe(VAR_INVERSE, -20) +deflit(STACK_SPACE, 20) + + TEXT + + ALIGN(16) +PROLOGUE(mpn_divexact_1) +deflit(`FRAME',0) + + movl PARAM_DIVISOR, %eax + subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebx, SAVE_EBX + movl PARAM_SIZE, %ebx + + bsfl %eax, %ecx C trailing twos + + movl %ebp, SAVE_EBP + + shrl %cl, %eax C d without twos + + movl %eax, %edx + shrl %eax C d/2 without twos + + movl %edx, PARAM_DIVISOR + andl $127, %eax + +ifdef(`PIC',` + LEA( binvert_limb_table, %ebp) + movzbl (%eax,%ebp), %ebp C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %ebp C inv 8 bits +') + + leal (%ebp,%ebp), %eax C 2*inv + + imull %ebp, %ebp C inv*inv + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + leal (%esi,%ebx,4), %esi C src end + + imull PARAM_DIVISOR, %ebp C inv*inv*d + + subl %ebp, %eax C inv = 2*inv - inv*inv*d + leal (%eax,%eax), %ebp C 2*inv + + imull %eax, %eax C inv*inv + + leal (%edi,%ebx,4), %edi C dst end + negl %ebx C -size + + movl %edi, PARAM_DST + + imull PARAM_DIVISOR, %eax C inv*inv*d + + subl %eax, %ebp C inv = 2*inv - inv*inv*d + + ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS + movl PARAM_DIVISOR, %eax + imull %ebp, %eax + cmpl $1, %eax') + + movl %ebp, VAR_INVERSE + movl (%esi,%ebx,4), %eax C src[0] + + orl %ecx, %ecx + jnz L(even) + + C ecx initial carry is zero + jmp L(odd_entry) + + +C The dependent chain here is +C +C subl %edx, %eax 1 +C imull %ebp, %eax 4 +C mull PARAM_DIVISOR 5 +C ---- +C total 10 +C +C and this is the measured speed. No special scheduling is necessary, out +C of order execution hides the load latency. 
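+C
+C The inverse computed in the prologue above comes from an 8-bit table
+C followed by two Newton steps, each doubling the number of correct low
+C bits.  Below is a self-contained C sketch (added for exposition) that
+C seeds with the well-known x = (3*d) ^ 2 trick (a xor, good to 5 bits)
+C instead of the table and therefore needs three steps; x *= 2 - d*x is
+C the same iteration as inv = 2*inv - inv*inv*d.
+C
+C   #include <stdint.h>
+C
+C   uint32_t
+C   binvert32_sketch (uint32_t d)   /* d must be odd */
+C   {
+C     uint32_t x = (3 * d) ^ 2;     /*  5 correct bits */
+C     x *= 2 - d * x;               /* 10 correct bits */
+C     x *= 2 - d * x;               /* 20 correct bits */
+C     x *= 2 - d * x;               /* 40 >= 32 correct bits */
+C     return x;    /* d * x == 1 (mod 2^32), as ASSERTed above */
+C   }
+C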
+ +L(odd_top): + C eax scratch (src limb) + C ebx counter, limbs, negative + C ecx carry bit + C edx carry limb, high of last product + C esi &src[size] + C edi &dst[size] + C ebp + + mull PARAM_DIVISOR + + movl (%esi,%ebx,4), %eax + subl %ecx, %eax + + sbbl %ecx, %ecx + subl %edx, %eax + + sbbl $0, %ecx + +L(odd_entry): + imull VAR_INVERSE, %eax + + movl %eax, (%edi,%ebx,4) + negl %ecx + + incl %ebx + jnz L(odd_top) + + + movl SAVE_ESI, %esi + + movl SAVE_EDI, %edi + + movl SAVE_EBP, %ebp + + movl SAVE_EBX, %ebx + addl $STACK_SPACE, %esp + + ret + + +L(even): + C eax src[0] + C ebx counter, limbs, negative + C ecx shift + C edx + C esi + C edi + C ebp + + xorl %ebp, %ebp C initial carry bit + xorl %edx, %edx C initial carry limb (for size==1) + + incl %ebx + jz L(even_one) + + movl (%esi,%ebx,4), %edi C src[1] + + shrdl( %cl, %edi, %eax) + + jmp L(even_entry) + + +L(even_top): + C eax scratch + C ebx counter, limbs, negative + C ecx shift + C edx scratch + C esi &src[size] + C edi &dst[size] and scratch + C ebp carry bit + + movl (%esi,%ebx,4), %edi + + mull PARAM_DIVISOR + + movl -4(%esi,%ebx,4), %eax + shrdl( %cl, %edi, %eax) + + subl %ebp, %eax + + sbbl %ebp, %ebp + subl %edx, %eax + + sbbl $0, %ebp + +L(even_entry): + imull VAR_INVERSE, %eax + + movl PARAM_DST, %edi + negl %ebp + + movl %eax, -4(%edi,%ebx,4) + incl %ebx + jnz L(even_top) + + + + mull PARAM_DIVISOR + + movl -4(%esi), %eax + +L(even_one): + shrl %cl, %eax + movl SAVE_ESI, %esi + + subl %ebp, %eax + movl SAVE_EBP, %ebp + + subl %edx, %eax + movl SAVE_EBX, %ebx + + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi) + movl SAVE_EDI, %edi + addl $STACK_SPACE, %esp + + ret + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/p6/gcd_11.asm b/gmp-6.3.0/mpn/x86/p6/gcd_11.asm new file mode 100644 index 0000000..80e055e --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/gcd_11.asm @@ -0,0 +1,83 @@ +dnl x86 mpn_gcd_11 optimised for processors with fast BSF. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked by Torbjorn Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2015 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C AMD K7 7.80 +C AMD K8,K9 7.79 +C AMD K10 4.08 +C AMD bd1 ? +C AMD bobcat 7.82 +C Intel P4-2 14.9 +C Intel P4-3/4 14.0 +C Intel P6/13 5.09 +C Intel core2 4.22 +C Intel NHM 5.00 +C Intel SBR 5.00 +C Intel atom 17.1 +C VIA nano ? 
+C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1 + + +define(`u0', `%eax') +define(`v0', `%edx') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_gcd_11) + push %edi + push %esi + + mov 12(%esp), %eax + mov 16(%esp), %edx + jmp L(odd) + + ALIGN(16) C K10 BD C2 NHM SBR +L(top): cmovc( %esi, %eax) C u = |v - u| 0,3 0,3 0,6 0,5 0,5 + cmovc( %edi, %edx) C v = min(u,v) 0,3 0,3 2,8 1,7 1,7 + shr %cl, %eax C 1,7 1,6 2,8 2,8 2,8 +L(odd): mov %edx, %esi C 1 1 4 3 3 + sub %eax, %esi C 2 2 5 4 4 + bsf %esi, %ecx C 3 3 6 5 5 + mov %eax, %edi C 2 2 3 3 4 + sub %edx, %eax C 2 2 4 3 4 + jnz L(top) C + +L(end): mov %edx, %eax + pop %esi + pop %edi + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/p6/gmp-mparam.h b/gmp-6.3.0/mpn/x86/p6/gmp-mparam.h new file mode 100644 index 0000000..96c96fd --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/gmp-mparam.h @@ -0,0 +1,194 @@ +/* Intel P6 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2003, 2008-2010, 2012 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + + +/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the + value in mpn/x86/p6/gmp-mparam.h. The latter is used as a hard limit in + mpn/x86/p6/sqr_basecase.asm. 
*/ + + +/* 1867 MHz P6 model 13 */ + +#define MOD_1_NORM_THRESHOLD 4 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 21 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 74 +#define MUL_TOOM44_THRESHOLD 181 +#define MUL_TOOM6H_THRESHOLD 252 +#define MUL_TOOM8H_THRESHOLD 363 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 115 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 101 +#define SQR_TOOM4_THRESHOLD 154 +#define SQR_TOOM6_THRESHOLD 222 +#define SQR_TOOM8_THRESHOLD 527 + +#define MULMID_TOOM42_THRESHOLD 58 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define POWM_SEC_TABLE 4,23,258,768,2388 + +#define MUL_FFT_MODF_THRESHOLD 565 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 565, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 25, 7}, { 13, 6}, { 28, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 31, 7}, { 63, 8}, { 39, 9}, { 23, 5}, \ + { 383, 4}, { 991, 5}, { 511, 6}, { 267, 7}, \ + { 157, 8}, { 91, 9}, { 47, 8}, { 111, 9}, \ + { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \ + { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \ + { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \ + { 159,10}, { 335, 9}, { 671,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 399, 9}, { 799,10}, \ + { 415,11}, { 223,12}, { 127,11}, { 255,10}, \ + { 543, 9}, { 1087,11}, { 287,10}, { 607,11}, \ + { 319,10}, { 671,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 831,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \ + { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \ + { 735,10}, { 1471,12}, { 383,11}, { 799,10}, \ + { 1599,11}, { 863,12}, { 447,11}, { 959,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,12}, { 639,11}, { 1343,12}, { 703,11}, \ + { 1471,13}, { 383,12}, { 831,11}, { 1727,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \ + { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \ + { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \ + { 1023,12}, { 2111,13}, { 1151,12}, { 2431,13}, \ + { 1407,12}, { 2815,14}, { 767,13}, { 1663,12}, \ + { 3455,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 132 +#define MUL_FFT_THRESHOLD 6784 + +#define SQR_FFT_MODF_THRESHOLD 472 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 472, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ + { 31, 8}, { 63, 4}, { 1023, 8}, { 67, 9}, \ + { 39, 5}, { 639, 4}, { 1471, 6}, { 383, 7}, \ + { 209, 8}, { 119, 9}, { 63, 7}, { 255, 8}, \ + { 139, 9}, { 71, 8}, { 143, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 
79, 9}, { 159, 8}, { 319, 9}, \ + { 167,10}, { 95,11}, { 63,10}, { 143, 9}, \ + { 287,10}, { 159,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 543, 8}, \ + { 1087,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \ + { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 399, 9}, { 799,10}, { 415, 9}, \ + { 831,11}, { 223,12}, { 127,11}, { 255,10}, \ + { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \ + { 1215,11}, { 319,10}, { 671, 9}, { 1343,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 831,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 735,12}, \ + { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \ + { 447,11}, { 959,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 831,11}, { 1727,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \ + { 639,12}, { 1471,13}, { 767,12}, { 1727,13}, \ + { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \ + { 2111,13}, { 1151,12}, { 2431,13}, { 1407,14}, \ + { 767,13}, { 1663,12}, { 3455,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 146 +#define SQR_FFT_THRESHOLD 5760 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 33 +#define MULLO_MUL_N_THRESHOLD 13463 + +#define DC_DIV_QR_THRESHOLD 20 +#define DC_DIVAPPR_Q_THRESHOLD 56 +#define DC_BDIV_QR_THRESHOLD 60 +#define DC_BDIV_Q_THRESHOLD 134 + +#define INV_MULMOD_BNM1_THRESHOLD 38 +#define INV_NEWTON_THRESHOLD 66 +#define INV_APPR_THRESHOLD 63 + +#define BINV_NEWTON_THRESHOLD 250 +#define REDC_1_TO_REDC_N_THRESHOLD 63 + +#define MU_DIV_QR_THRESHOLD 1164 +#define MU_DIVAPPR_Q_THRESHOLD 979 +#define MUPI_DIV_QR_THRESHOLD 38 +#define MU_BDIV_QR_THRESHOLD 1442 +#define MU_BDIV_Q_THRESHOLD 1470 + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD_THRESHOLD 64 +#define HGCD_APPR_THRESHOLD 105 +#define HGCD_REDUCE_THRESHOLD 3524 +#define GCD_DC_THRESHOLD 386 +#define GCDEXT_DC_THRESHOLD 309 +#define JACOBI_BASE_METHOD 1 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 26 +#define SET_STR_DC_THRESHOLD 587 +#define SET_STR_PRECOMPUTE_THRESHOLD 1104 diff --git a/gmp-6.3.0/mpn/x86/p6/lshsub_n.asm b/gmp-6.3.0/mpn/x86/p6/lshsub_n.asm new file mode 100644 index 0000000..7ada213 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/lshsub_n.asm @@ -0,0 +1,169 @@ +dnl Intel P6 mpn_lshsub_n -- mpn papillion support. + +dnl Copyright 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C P6/13: 3.35 cycles/limb (separate mpn_sub_n + mpn_lshift needs 4.12) + +C (1) The loop is not scheduled in any way, and scheduling attempts have not +C improved speed on P6/13. Presumably, the K7 will want scheduling, if it +C at all wants to use MMX. +C (2) We could save a register by not alternatingly using eax and edx in the +C loop. + +define(`rp', `%edi') +define(`up', `%esi') +define(`vp', `%ebx') +define(`n', `%ecx') +define(`cnt', `%mm7') + +ASM_START() + + TEXT + ALIGN(16) + +PROLOGUE(mpn_lshsub_n) + push %edi + push %esi + push %ebx + + mov 16(%esp), rp + mov 20(%esp), up + mov 24(%esp), vp + mov 28(%esp), n + mov $32, %eax + sub 32(%esp), %eax + movd %eax, cnt + + lea (up,n,4), up + lea (vp,n,4), vp + lea (rp,n,4), rp + + neg n + mov n, %eax + and $-8, n + and $7, %eax + shl %eax C eax = 2x + lea (%eax,%eax,4), %edx C edx = 10x +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + lea L(ent)(%eax,%edx,2), %eax C eax = 22x +') + + pxor %mm1, %mm1 + pxor %mm0, %mm0 + + jmp *%eax + +ifdef(`PIC',` +L(pic_calc): + C See mpn/x86/README about old gas bugs + lea (%eax,%edx,2), %eax + add $L(ent)-L(here), %eax + add (%esp), %eax + ret_internal +') + +L(end): C compute (cy<<cnt) | (edx>>(32-cnt)) + sbb %eax, %eax + neg %eax + mov 32(%esp), %ecx + shld %cl, %edx, %eax + + emms + + pop %ebx + pop %esi + pop %edi + ret + ALIGN(16) +L(top): jecxz L(end) +L(ent): mov 0(up,n,4), %eax + sbb 0(vp,n,4), %eax + movd %eax, %mm0 + punpckldq %mm0, %mm1 + psrlq %mm7, %mm1 + movd %mm1, 0(rp,n,4) + + mov 4(up,n,4), %edx + sbb 4(vp,n,4), %edx + movd %edx, %mm1 + punpckldq %mm1, %mm0 + psrlq %mm7, %mm0 + movd %mm0, 4(rp,n,4) + + mov 8(up,n,4), %eax + sbb 8(vp,n,4), %eax + movd %eax, %mm0 + punpckldq %mm0, %mm1 + psrlq %mm7, %mm1 + movd %mm1, 8(rp,n,4) + + mov 12(up,n,4), %edx + sbb 12(vp,n,4), %edx + movd %edx, %mm1 + punpckldq %mm1, %mm0 + psrlq %mm7, %mm0 + movd %mm0, 12(rp,n,4) + + mov 16(up,n,4), %eax + sbb 16(vp,n,4), %eax + movd %eax, %mm0 + punpckldq %mm0, %mm1 + psrlq %mm7, %mm1 + movd %mm1, 16(rp,n,4) + + mov 20(up,n,4), %edx + sbb 20(vp,n,4), %edx + movd %edx, %mm1 + punpckldq %mm1, %mm0 + psrlq %mm7, %mm0 + movd %mm0, 20(rp,n,4) + + mov 24(up,n,4), %eax + sbb 24(vp,n,4), %eax + movd %eax, %mm0 + punpckldq %mm0, %mm1 + psrlq %mm7, %mm1 + movd %mm1, 24(rp,n,4) + + mov 28(up,n,4), %edx + sbb 28(vp,n,4), %edx + movd %edx, %mm1 + punpckldq %mm1, %mm0 + psrlq %mm7, %mm0 + movd %mm0, 28(rp,n,4) + + lea 8(n), n + jmp L(top) + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/p6/mmx/divrem_1.asm b/gmp-6.3.0/mpn/x86/p6/mmx/divrem_1.asm new file mode 100644 index 0000000..5300616 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/mmx/divrem_1.asm @@ -0,0 +1,767 @@ +dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version.
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part. + + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t inverse, +C unsigned shift); +C +C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm, +C see that file for some comments. It's possible what's here can be improved. + + +dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by +dnl inverse method is used, rather than plain "divl"s. Minimum value 1. +dnl +dnl The different speeds of the integer and fraction parts means that using +dnl xsize+size isn't quite right. The threshold wants to be a bit higher +dnl for the integer part and a bit lower for the fraction part. (Or what's +dnl really wanted is to speed up the integer part!) +dnl +dnl The threshold is set to make the integer part right. At 4 limbs the +dnl div and mul are about the same there, but on the fractional part the +dnl mul is much faster. + +deflit(MUL_THRESHOLD, 4) + + +defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1 +defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1 +defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c +defframe(PARAM_DIVISOR,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC, 12) +defframe(PARAM_XSIZE, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +defframe(VAR_NORM, -20) +defframe(VAR_INVERSE, -24) +defframe(VAR_SRC, -28) +defframe(VAR_DST, -32) +defframe(VAR_DST_STOP,-36) + +deflit(STACK_SPACE, 36) + + TEXT + ALIGN(16) + +PROLOGUE(mpn_preinv_divrem_1) +deflit(`FRAME',0) + movl PARAM_XSIZE, %ecx + subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebx, SAVE_EBX + movl PARAM_SIZE, %ebx + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %edi, SAVE_EDI + movl PARAM_DST, %edx + + movl -4(%esi,%ebx,4), %eax C src high limb + xorl %edi, %edi C initial carry (if can't skip a div) + + C + + leal 8(%edx,%ecx,4), %edx C &dst[xsize+2] + xor %ecx, %ecx + + movl %edx, VAR_DST_STOP C &dst[xsize+2] + cmpl %ebp, %eax C high cmp divisor + + cmovc( %eax, %edi) C high is carry if high n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + + sbbl $0, %ebx C q + subl $4, %ecx + + movl %ebx, (%ecx) + cmpl %eax, %ecx + + movl %ecx, VAR_DST + jne L(integer_top) + + +L(integer_loop_done): + + +C ----------------------------------------------------------------------------- +C +C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz +C q1_ff special case. This make the code a bit smaller and simpler, and +C costs only 2 cycles (each). 
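The loop above and the two tail sections that follow all perform the same quotient step: the multiply-by-inverse division of Granlund and Montgomery that the k7 divrem_1 file referenced earlier documents. As a reading aid, here is a minimal C restatement of one step, assuming a normalized 32-bit divisor d (high bit set), n2 < d, and the precomputed inverse m = floor((2^64-1)/d) - 2^32; the helper name div_step is ours, not GMP's.

    #include <stdint.h>

    /* One quotient limb of (n2*2^32 + n10) / d with remainder, following the
       register usage above: n1 is the top bit of n10, nadj = n10 + (-n1 & d)
       computed mod 2^32, q1 = n2 + high32(m*(n2+n1) + nadj), and the final
       cmovc is the conditional add-back when the trial q1+1 overshoots.  */
    static uint32_t div_step(uint32_t n2, uint32_t n10, uint32_t d,
                             uint32_t m, uint32_t *rem)
    {
        uint32_t n1   = n10 >> 31;
        uint32_t nadj = n10 + (n1 ? d : 0);     /* wraps mod 2^32, as intended */
        uint32_t q1   = n2 + (uint32_t)(((uint64_t)m * (n2 + n1) + nadj) >> 32);
        uint64_t n    = ((uint64_t)n2 << 32) | n10;
        uint64_t p    = ((uint64_t)q1 + 1) * d; /* trial product (q1+1)*d */

        if (n < p) {                            /* overshot by one: add d back */
            *rem = (uint32_t)(n - p + d);
            return q1;
        }
        *rem = (uint32_t)(n - p);
        return q1 + 1;
    }

The method guarantees q1 <= q <= q1+1, so a single conditional add-back suffices; the sbbl $0 discussed above folds the rare q1 = 0xFFFFFFFF wrap into the same path.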
+ +L(integer_two_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm7 rshift + + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + movl PARAM_SRC, %ecx + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movd (%ecx), %mm0 C src low limb + + movl VAR_DST_STOP, %ecx + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2+1 + movl %ebp, %eax C d + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + + mull %ebx C (q1+1)*d + + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + C + + C + + subl %eax, %esi + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + + sbbl $0, %ebx C q + + movl %ebx, -4(%ecx) + + +C ----------------------------------------------------------------------------- +L(integer_one_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm7 rshift + + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + movl VAR_DST_STOP, %ecx + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2+1 + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx C q1 if q1+1 overflowed + + mull %ebx + + C + + C + + C + + C + + subl %eax, %esi + movl PARAM_XSIZE, %eax + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + + sbbl $0, %ebx C q + + movl %ebx, -8(%ecx) + subl $8, %ecx + + + + orl %eax, %eax C xsize + jnz L(fraction_some) + + movl %edi, %eax +L(fraction_done): + movl VAR_NORM, %ecx +L(zero_done): + movl SAVE_EBP, %ebp + + movl SAVE_EDI, %edi + + movl SAVE_ESI, %esi + + movl SAVE_EBX, %ebx + addl $STACK_SPACE, %esp + + shrl %cl, %eax + emms + + ret + + +C ----------------------------------------------------------------------------- +C +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d + +L(q1_ff): + C eax (divisor) + C ebx (q1+1 == 0) + C ecx + C edx + C esi n10 + C edi n2 + C ebp divisor + + movl VAR_DST, %ecx + movl VAR_DST_STOP, %edx + subl $4, %ecx + + movl %ecx, VAR_DST + psrlq %mm7, %mm0 + leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 + + movl $-1, (%ecx) + movd %mm0, %esi C next n10 + + cmpl %ecx, %edx + jne L(integer_top) + + jmp L(integer_loop_done) + + + +C ----------------------------------------------------------------------------- +C +C In the current implementation, the following successively dependent +C micro-ops seem to exist. +C +C uops +C mul 5 +C q1+1 1 (addl) +C mul 5 +C sub 3 (negl/sbbl) +C addback 2 (cmov) +C --- +C 16 +C +C The loop in fact runs at about 17.5 cycles. Using a sarl/andl/addl for +C the addback was found to be a touch slower. 
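For the fraction part handled below, each step's incoming low limb is zero, so n1 and nadj drop out of the formula and the step reduces to the shorter dependency chain tallied above (mul, add, mul, sub, add-back). A hedged C sketch under the same assumptions and naming caveats as the earlier one:

    #include <stdint.h>

    /* Fraction-part step: one quotient limb of (n2 * 2^32) / d, with
       d normalized, n2 < d, and m the same precomputed inverse.  */
    static uint32_t frac_step(uint32_t n2, uint32_t d, uint32_t m,
                              uint32_t *rem)
    {
        uint32_t q1 = n2 + (uint32_t)(((uint64_t)m * n2) >> 32);
        uint64_t n  = (uint64_t)n2 << 32;
        uint64_t p  = ((uint64_t)q1 + 1) * d;   /* trial (q1+1)*d */

        if (n < p) {                            /* add-back, as cmovc below */
            *rem = (uint32_t)(n - p + d);
            return q1;
        }
        *rem = (uint32_t)(n - p);
        return q1 + 1;
    }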
+ + + ALIGN(16) +L(fraction_some): + C eax + C ebx + C ecx + C edx + C esi + C edi carry + C ebp divisor + + movl PARAM_DST, %esi + movl VAR_DST_STOP, %ecx C &dst[xsize+2] + movl %edi, %eax + + subl $8, %ecx C &dst[xsize] + + + ALIGN(16) +L(fraction_top): + C eax n2, then scratch + C ebx scratch (nadj, q1) + C ecx dst, decrementing + C edx scratch + C esi dst stop point + C edi n2 + C ebp divisor + + mull VAR_INVERSE C m*n2 + + movl %ebp, %eax C d + subl $4, %ecx C dst + leal 1(%edi), %ebx + + C + + C + + C + + addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1 + + mull %ebx C (q1+1)*d + + C + + C + + C + + C + + negl %eax C low of n - (q1+1)*d + + sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry + leal (%ebp,%eax), %edx + + cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 + + sbbl $0, %ebx C q + movl %eax, %edi C remainder->n2 + cmpl %esi, %ecx + + movl %ebx, (%ecx) C previous q + jne L(fraction_top) + + + jmp L(fraction_done) + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/p6/mmx/gmp-mparam.h b/gmp-6.3.0/mpn/x86/p6/mmx/gmp-mparam.h new file mode 100644 index 0000000..ef29061 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/mmx/gmp-mparam.h @@ -0,0 +1,218 @@ +/* Intel P6/mmx gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991-2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + + +/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the + value in mpn/x86/p6/gmp-mparam.h. The latter is used as a hard limit in + mpn/x86/p6/sqr_basecase.asm. 
*/ + + +/* 800 MHz P6 model 8 */ +/* Generated by tuneup.c, 2017-02-03, gcc 4.8 */ + +#define MOD_1_1P_METHOD 2 +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 30 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 14 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD 4 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 62 + +#define DIV_1_VS_MUL_1_PERCENT 168 + +#define MUL_TOOM22_THRESHOLD 22 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 195 +#define MUL_TOOM6H_THRESHOLD 254 +#define MUL_TOOM8H_THRESHOLD 381 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 100 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 30 /* WRONG value, see comment above */ +#define SQR_TOOM3_THRESHOLD 83 +#define SQR_TOOM4_THRESHOLD 196 +#define SQR_TOOM6_THRESHOLD 214 +#define SQR_TOOM8_THRESHOLD 381 + +#define MULMID_TOOM42_THRESHOLD 56 + +#define MULMOD_BNM1_THRESHOLD 16 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 476, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \ + { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 167,10}, { 95, 9}, { 199,10}, \ + { 111,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511,10}, { 143, 9}, { 287, 8}, { 575,10}, \ + { 159,11}, { 95,10}, { 191, 9}, { 383,10}, \ + { 207,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543, 8}, { 1087,10}, \ + { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 351, 9}, { 703,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 415, 9}, { 831,11}, \ + { 223,10}, { 447,12}, { 127,11}, { 255,10}, \ + { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \ + { 1215,11}, { 319,10}, { 671,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,10}, { 831,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \ + { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \ + { 703,10}, { 1407,11}, { 735,12}, { 383,11}, \ + { 831,12}, { 447,11}, { 959,10}, { 1919,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \ + { 1535,12}, { 831,11}, { 1727,12}, { 959,11}, \ + { 1919,14}, { 255,13}, { 511,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \ + { 767,12}, { 1727,13}, { 895,12}, { 1919,11}, \ + { 3839,14}, { 511,13}, { 1023,12}, { 2111,13}, \ + { 1151,12}, { 2431,13}, { 1279,12}, { 2559,13}, \ + { 1407,12}, { 2943,14}, { 767,13}, { 1663,12}, \ + { 3327,13}, { 1919,12}, { 3839,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 160 +#define MUL_FFT_THRESHOLD 7040 + 
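A note on reading the FFT tables above: each { size, k } pair is a step point, and for a given operand size the last entry not exceeding it supplies the FFT splitting order k (the operands are split into 2^k pieces). The sketch below is illustrative only; the struct and helper names are hypothetical, not GMP's internal API.

    #include <stddef.h>

    struct fft_step { unsigned long size; int k; };

    /* Pick the FFT order for an n-limb operand: the k of the last table
       entry whose size does not exceed n.  */
    static int fft_k_for(const struct fft_step *tab, size_t nent,
                         unsigned long n)
    {
        int k = tab[0].k;
        for (size_t i = 0; i < nent && tab[i].size <= n; i++)
            k = tab[i].k;
        return k;
    }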
+#define SQR_FFT_MODF_THRESHOLD 376 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 376, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 135,10}, { 79, 9}, { 167,10}, \ + { 95, 9}, { 191, 8}, { 383,10}, { 111,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511, 9}, \ + { 271,10}, { 143, 9}, { 287, 8}, { 575, 9}, \ + { 303, 8}, { 607,10}, { 159, 9}, { 319,11}, \ + { 95,10}, { 191, 9}, { 383,10}, { 207,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287, 9}, { 575,10}, \ + { 303,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 479,12}, { 127,11}, { 255,10}, { 543, 9}, \ + { 1087,11}, { 287,10}, { 607, 9}, { 1215,11}, \ + { 319,10}, { 671,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,11}, { 479,13}, { 127,12}, { 255,11}, \ + { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 703,10}, \ + { 1407,11}, { 735,12}, { 383,11}, { 831,12}, \ + { 447,11}, { 959,10}, { 1919,13}, { 255,12}, \ + { 511,11}, { 1087,12}, { 575,11}, { 1215,10}, \ + { 2431,12}, { 639,11}, { 1343,12}, { 703,11}, \ + { 1407,13}, { 383,12}, { 831,11}, { 1727,12}, \ + { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1471,11}, \ + { 2943,13}, { 767,12}, { 1727,13}, { 895,12}, \ + { 1919,11}, { 3839,14}, { 511,13}, { 1023,12}, \ + { 2111,13}, { 1151,12}, { 2431,13}, { 1407,12}, \ + { 2943,14}, { 767,13}, { 1535,12}, { 3071,13}, \ + { 1663,12}, { 3455,13}, { 1919,12}, { 3839,15}, \ + { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 161 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 62 +#define MULLO_MUL_N_THRESHOLD 13463 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 177 +#define SQRLO_SQR_THRESHOLD 8937 + +#define DC_DIV_QR_THRESHOLD 80 +#define DC_DIVAPPR_Q_THRESHOLD 240 +#define DC_BDIV_QR_THRESHOLD 76 +#define DC_BDIV_Q_THRESHOLD 166 + +#define INV_MULMOD_BNM1_THRESHOLD 42 +#define INV_NEWTON_THRESHOLD 262 +#define INV_APPR_THRESHOLD 250 + +#define BINV_NEWTON_THRESHOLD 272 +#define REDC_1_TO_REDC_N_THRESHOLD 72 + +#define MU_DIV_QR_THRESHOLD 1499 +#define MU_DIVAPPR_Q_THRESHOLD 1470 +#define MUPI_DIV_QR_THRESHOLD 124 +#define MU_BDIV_QR_THRESHOLD 1142 +#define MU_BDIV_Q_THRESHOLD 1341 + +#define POWM_SEC_TABLE 1,16,96,416,1259 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 27 +#define SET_STR_DC_THRESHOLD 270 +#define SET_STR_PRECOMPUTE_THRESHOLD 1084 + +#define FAC_DSC_THRESHOLD 194 +#define FAC_ODD_THRESHOLD 25 + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD_THRESHOLD 124 +#define HGCD_APPR_THRESHOLD 152 +#define HGCD_REDUCE_THRESHOLD 3014 +#define GCD_DC_THRESHOLD 474 +#define GCDEXT_DC_THRESHOLD 321 +#define JACOBI_BASE_METHOD 1 diff --git a/gmp-6.3.0/mpn/x86/p6/mmx/lshift.asm b/gmp-6.3.0/mpn/x86/p6/mmx/lshift.asm new file mode 100644 index 0000000..febd1c0 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/mmx/lshift.asm @@ -0,0 +1,38 @@ +dnl Intel Pentium-II mpn_lshift -- mpn left shift. 
+ +dnl Copyright 2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl The P55 code runs well on P-II/III, but could stand some minor tweaks +dnl at some stage probably. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_lshift) +include_mpn(`x86/pentium/mmx/lshift.asm') diff --git a/gmp-6.3.0/mpn/x86/p6/mmx/popham.asm b/gmp-6.3.0/mpn/x86/p6/mmx/popham.asm new file mode 100644 index 0000000..fd340e4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/mmx/popham.asm @@ -0,0 +1,39 @@ +dnl Intel Pentium-II mpn_popcount, mpn_hamdist -- population count and +dnl hamming distance. + +dnl Copyright 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P6MMX: popcount 11 cycles/limb (approx), hamdist 11.5 cycles/limb (approx) + + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) +include_mpn(`x86/k6/mmx/popham.asm') diff --git a/gmp-6.3.0/mpn/x86/p6/mmx/rshift.asm b/gmp-6.3.0/mpn/x86/p6/mmx/rshift.asm new file mode 100644 index 0000000..77aa190 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/mmx/rshift.asm @@ -0,0 +1,38 @@ +dnl Intel Pentium-II mpn_rshift -- mpn right shift. + +dnl Copyright 2001 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl The P55 code runs well on P-II/III, but could stand some minor tweaks +dnl at some stage probably. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_rshift) +include_mpn(`x86/pentium/mmx/rshift.asm') diff --git a/gmp-6.3.0/mpn/x86/p6/mod_34lsub1.asm b/gmp-6.3.0/mpn/x86/p6/mod_34lsub1.asm new file mode 100644 index 0000000..b88ab5d --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/mod_34lsub1.asm @@ -0,0 +1,190 @@ +dnl Intel P6 mpn_mod_34lsub1 -- remainder modulo 2^24-1. + +dnl Copyright 2000-2002, 2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P6: 2.0 cycles/limb + +C TODO +C Experiments with more unrolling indicate that 1.5 c/l is possible on P6-13 +C with the current carry handling scheme. + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size) +C +C Groups of three limbs are handled, with carry bits from 0mod3 into 1mod3 +C into 2mod3, but at that point going into a separate carries total so we +C don't keep the carry flag live across the loop control. Avoiding decl +C lets us get to 2.0 c/l, as compared to the generic x86 code at 3.66. 
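The arithmetic the comment describes rests on 2^32 == 2^8 (mod 2^24-1), so limb i carries weight 2^(8*(i mod 3)). Here is a plain C model of that congruence, assuming 32-bit limbs; it captures the math rather than the three-accumulator register scheme used below, and the function name is ours.

    #include <stdint.h>
    #include <stddef.h>

    /* Returns a value congruent to {src,size} mod 2^24-1 (like the mpn
       routine, possibly 0xFFFFFF rather than fully reduced to 0).  */
    uint32_t mod_34lsub1_model(const uint32_t *src, size_t size)
    {
        uint64_t acc = 0;
        for (size_t i = 0; i < size; i++) {
            uint64_t v = (uint64_t)src[i] << (8 * (i % 3));  /* <= 48 bits */
            acc += (v & 0xFFFFFF) + (v >> 24);   /* fold at the 24-bit line */
        }
        while (acc >> 24)                        /* final folds below 2^24 */
            acc = (acc & 0xFFFFFF) + (acc >> 24);
        return (uint32_t)acc;
    }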
+C + +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + +dnl re-use parameter space +define(SAVE_EBX, `PARAM_SIZE') +define(SAVE_ESI, `PARAM_SRC') + + TEXT + ALIGN(16) +PROLOGUE(mpn_mod_34lsub1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %edx + + subl $2, %ecx C size-2 + movl (%edx), %eax C src[0] + ja L(three_or_more) + jb L(one) + + C size==2 + + movl 4(%edx), %ecx C src[1] + + movl %eax, %edx C src[0] + shrl $24, %eax C src[0] high + + andl $0xFFFFFF, %edx C src[0] low + + addl %edx, %eax + movl %ecx, %edx C src[1] + shrl $16, %ecx C src[1] high + + andl $0xFFFF, %edx + addl %ecx, %eax + + shll $8, %edx C src[1] low + + addl %edx, %eax +L(one): + ret + + +L(three_or_more): + C eax src[0], initial acc 0mod3 + C ebx + C ecx size-2 + C edx src + C esi + C edi + C ebp + + movl %ebx, SAVE_EBX + movl 4(%edx), %ebx C src[1], initial 1mod3 + subl $3, %ecx C size-5 + + movl %esi, SAVE_ESI + movl 8(%edx), %esi C src[2], initial 2mod3 + + pushl %edi FRAME_pushl() + movl $0, %edi C initial carries 0mod3 + jng L(done) C if size < 6 + + +L(top): + C eax acc 0mod3 + C ebx acc 1mod3 + C ecx counter, limbs + C edx src + C esi acc 2mod3 + C edi carrys into 0mod3 + C ebp + + addl 12(%edx), %eax + adcl 16(%edx), %ebx + adcl 20(%edx), %esi + leal 12(%edx), %edx + adcl $0, %edi + + subl $3, %ecx + jg L(top) C at least 3 more to process + + +L(done): + C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs respectively + cmpl $-1, %ecx + jl L(done_0) C if -2, meaning 0 more limbs + + C 1 or 2 more limbs + movl $0, %ecx + je L(done_1) C if -1, meaning 1 more limb only + movl 16(%edx), %ecx +L(done_1): + addl 12(%edx), %eax C 0mod3 + adcl %ecx, %ebx C 1mod3 + adcl $0, %esi C 2mod3 + adcl $0, %edi C carries 0mod3 + +L(done_0): + C eax acc 0mod3 + C ebx acc 1mod3 + C ecx + C edx + C esi acc 2mod3 + C edi carries 0mod3 + C ebp + + movl %eax, %ecx C 0mod3 + shrl $24, %eax C 0mod3 high initial total + + andl $0xFFFFFF, %ecx C 0mod3 low + movl %edi, %edx C carries + shrl $24, %edi C carries high + + addl %ecx, %eax C add 0mod3 low + andl $0xFFFFFF, %edx C carries 0mod3 low + movl %ebx, %ecx C 1mod3 + + shrl $16, %ebx C 1mod3 high + addl %edi, %eax C add carries high + addl %edx, %eax C add carries 0mod3 low + + andl $0xFFFF, %ecx C 1mod3 low mask + addl %ebx, %eax C add 1mod3 high + movl SAVE_EBX, %ebx + + shll $8, %ecx C 1mod3 low + movl %esi, %edx C 2mod3 + popl %edi FRAME_popl() + + shrl $8, %esi C 2mod3 high + andl $0xFF, %edx C 2mod3 low mask + addl %ecx, %eax C add 1mod3 low + + shll $16, %edx C 2mod3 low + addl %esi, %eax C add 2mod3 high + movl SAVE_ESI, %esi + + addl %edx, %eax C add 2mod3 low + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/p6/mode1o.asm b/gmp-6.3.0/mpn/x86/p6/mode1o.asm new file mode 100644 index 0000000..7083195 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/mode1o.asm @@ -0,0 +1,170 @@ +dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder. + +dnl Copyright 2000-2002, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P6: 10.0 cycles/limb + + +C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C +C It's not worth skipping a step at the end when high32),1, +eval((UNROLL_COUNT-32)*4), +0)) + + C eax + C ebx carry + C ecx + C edx + C esi &src[size] + C edi &dst[size-1] + C ebp + + movl PARAM_SIZE, %ecx + + subl $4, %ecx + jz L(corner) + + movl %ecx, %edx + negl %ecx + + shll $4, %ecx +ifelse(OFFSET,0,,`subl $OFFSET, %esi') + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx +') + negl %edx + +ifelse(OFFSET,0,,`subl $OFFSET, %edi') + + C The calculated jump mustn't be before the start of the available + C code. This is the limit that UNROLL_COUNT puts on the src operand + C size, but checked here using the jump address directly. + + ASSERT(ae, + `movl_text_address( L(unroll_inner_start), %eax) + cmpl %eax, %ecx') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll_outer_top): + C eax + C ebx high limb to store + C ecx VAR_JMP + C edx VAR_COUNTER, limbs, negative + C esi &src[size], constant + C edi dst ptr, second highest limb of last addmul + C ebp + + movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier + movl %edx, VAR_COUNTER + + movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand + + mull %ebp + +define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')') + + testb $1, %cl + + movl %edx, %ebx C high carry + leal 4(%edi), %edi + + movl %ecx, %edx C jump + + movl %eax, %ecx C low carry + leal CODE_BYTES_PER_LIMB(%edx), %edx + + cmovX( %ebx, %ecx) C high carry reverse + cmovX( %eax, %ebx) C low carry reverse + movl %edx, VAR_JMP + jmp *%edx + + + C Must be on an even address here so the low bit of the jump address + C will indicate which way around ecx/ebx should start. 
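The computed jump above enters the unrolled block below part-way through, so the first pass handles size % UNROLL_COUNT limbs and every later pass handles a full chunk. The C analogue of this enter-in-the-middle trick is Duff's device; the following toy 4-way unrolled addmul step is our own illustration, not GMP code.

    #include <stdint.h>
    #include <stddef.h>

    /* dst[] += src[] * mult over n limbs, returning the carry-out limb.
       The switch enters the unrolled body mid-way, like jmp *%edx above.  */
    static uint32_t addmul_1_duff(uint32_t *dst, const uint32_t *src,
                                  size_t n, uint32_t mult)
    {
        uint64_t cy = 0;
        size_t i = 0;
        if (n == 0)
            return 0;
        switch (n % 4) {
            do {
            case 0: cy += (uint64_t)src[i] * mult + dst[i];
                    dst[i++] = (uint32_t)cy;  cy >>= 32;
            case 3: cy += (uint64_t)src[i] * mult + dst[i];
                    dst[i++] = (uint32_t)cy;  cy >>= 32;
            case 2: cy += (uint64_t)src[i] * mult + dst[i];
                    dst[i++] = (uint32_t)cy;  cy >>= 32;
            case 1: cy += (uint64_t)src[i] * mult + dst[i];
                    dst[i++] = (uint32_t)cy;  cy >>= 32;
            } while (i < n);
        }
        return (uint32_t)cy;
    }

The even-address constraint noted above is the asm refinement of the same idea: the entry point's low bit doubles as the flag saying which of the two alternating carry registers is live first.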
+ + ALIGN(2) + +L(unroll_inner_start): + C eax scratch + C ebx carry high + C ecx carry low + C edx scratch + C esi src pointer + C edi dst pointer + C ebp multiplier + C + C 15 code bytes each limb + C ecx/ebx reversed on each chunk + +forloop(`i', UNROLL_COUNT, 1, ` + deflit(`disp_src', eval(-i*4 + OFFSET)) + deflit(`disp_dst', eval(disp_src)) + + m4_assert(`disp_src>=-128 && disp_src<128') + m4_assert(`disp_dst>=-128 && disp_dst<128') + +ifelse(eval(i%2),0,` +Zdisp( movl, disp_src,(%esi), %eax) + mull %ebp +Zdisp( addl, %ebx, disp_dst,(%edi)) + adcl %eax, %ecx + movl %edx, %ebx + adcl $0, %ebx +',` + dnl this one comes out last +Zdisp( movl, disp_src,(%esi), %eax) + mull %ebp +Zdisp( addl, %ecx, disp_dst,(%edi)) + adcl %eax, %ebx + movl %edx, %ecx + adcl $0, %ecx +') +') +L(unroll_inner_end): + + addl %ebx, m4_empty_if_zero(OFFSET)(%edi) + + movl VAR_COUNTER, %edx + adcl $0, %ecx + + movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi) + movl VAR_JMP, %ecx + + incl %edx + jnz L(unroll_outer_top) + + +ifelse(OFFSET,0,,` + addl $OFFSET, %esi + addl $OFFSET, %edi +') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(corner): + C eax + C ebx + C ecx + C edx + C esi &src[size] + C edi &dst[2*size-5] + C ebp + + movl -12(%esi), %eax + + mull -8(%esi) + + addl %eax, (%edi) + movl -12(%esi), %eax + movl $0, %ebx + + adcl %edx, %ebx + + mull -4(%esi) + + addl %eax, %ebx + movl -8(%esi), %eax + + adcl $0, %edx + + addl %ebx, 4(%edi) + movl $0, %ebx + + adcl %edx, %ebx + + mull -4(%esi) + + movl PARAM_SIZE, %ecx + addl %ebx, %eax + + adcl $0, %edx + + movl %eax, 8(%edi) + + movl %edx, 12(%edi) + movl PARAM_DST, %edi + + +C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1]. + + subl $1, %ecx C size-1 + xorl %eax, %eax C ready for final adcl, and clear carry + + movl %ecx, %edx + movl PARAM_SRC, %esi + + +L(lshift): + C eax + C ebx + C ecx counter, size-1 to 1 + C edx size-1 (for later use) + C esi src (for later use) + C edi dst, incrementing + C ebp + + rcll 4(%edi) + rcll 8(%edi) + + leal 8(%edi), %edi + decl %ecx + jnz L(lshift) + + + adcl %eax, %eax + + movl %eax, 4(%edi) C dst most significant limb + movl (%esi), %eax C src[0] + + leal 4(%esi,%edx,4), %esi C &src[size] + subl %edx, %ecx C -(size-1) + + +C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ..., +C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the +C low limb of src[0]^2. + + + mull %eax + + movl %eax, (%edi,%ecx,8) C dst[0] + + +L(diag): + C eax scratch + C ebx scratch + C ecx counter, negative + C edx carry + C esi &src[size] + C edi dst[2*size-2] + C ebp + + movl (%esi,%ecx,4), %eax + movl %edx, %ebx + + mull %eax + + addl %ebx, 4(%edi,%ecx,8) + adcl %eax, 8(%edi,%ecx,8) + adcl $0, %edx + + incl %ecx + jnz L(diag) + + + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + + addl %edx, 4(%edi) C dst most significant limb + + movl SAVE_EDI, %edi + movl SAVE_EBP, %ebp + addl $FRAME, %esp + ret + + + +C ----------------------------------------------------------------------------- +ifdef(`PIC',` +L(pic_calc): + addl (%esp), %ecx + addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx + addl %edx, %ecx + ret_internal +') + + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/p6/sse2/addmul_1.asm b/gmp-6.3.0/mpn/x86/p6/sse2/addmul_1.asm new file mode 100644 index 0000000..144b627 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/sse2/addmul_1.asm @@ -0,0 +1,37 @@ +dnl Intel P6/SSE2 mpn_addmul_1. + +dnl Copyright 2008 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO +C * Write P6 specific SSE2 code. + +MULFUNC_PROLOGUE(mpn_addmul_1) +include_mpn(`x86/pentium4/sse2/addmul_1.asm') diff --git a/gmp-6.3.0/mpn/x86/p6/sse2/gmp-mparam.h b/gmp-6.3.0/mpn/x86/p6/sse2/gmp-mparam.h new file mode 100644 index 0000000..a1e261b --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/sse2/gmp-mparam.h @@ -0,0 +1,200 @@ +/* Intel P6/sse2 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2003, 2008-2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + + +/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the + value in mpn/x86/p6/gmp-mparam.h. The latter is used as a hard limit in + mpn/x86/p6/sqr_basecase.asm. 
*/ + + +/* 1867 MHz P6 model 13 */ + +#define MOD_1_NORM_THRESHOLD 4 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 21 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 77 +#define MUL_TOOM44_THRESHOLD 169 +#define MUL_TOOM6H_THRESHOLD 246 +#define MUL_TOOM8H_THRESHOLD 381 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 101 +#define SQR_TOOM4_THRESHOLD 154 +#define SQR_TOOM6_THRESHOLD 222 +#define SQR_TOOM8_THRESHOLD 527 + +#define MULMID_TOOM42_THRESHOLD 58 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 690 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 565, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 25, 7}, { 13, 6}, { 28, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 31, 7}, { 63, 8}, { 39, 9}, { 23, 5}, \ + { 383, 4}, { 991, 5}, { 511, 6}, { 267, 7}, \ + { 157, 8}, { 91, 9}, { 47, 8}, { 111, 9}, \ + { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \ + { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \ + { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \ + { 159,10}, { 335, 9}, { 671,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 399, 9}, { 799,10}, \ + { 415,11}, { 223,12}, { 127,11}, { 255,10}, \ + { 543, 9}, { 1087,11}, { 287,10}, { 607,11}, \ + { 319,10}, { 671,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 831,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \ + { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \ + { 735,10}, { 1471,12}, { 383,11}, { 799,10}, \ + { 1599,11}, { 863,12}, { 447,11}, { 959,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,12}, { 639,11}, { 1343,12}, { 703,11}, \ + { 1471,13}, { 383,12}, { 831,11}, { 1727,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \ + { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \ + { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \ + { 1023,12}, { 2111,13}, { 1151,12}, { 2431,13}, \ + { 1407,12}, { 2815,14}, { 767,13}, { 1663,12}, \ + { 3455,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 132 +#define MUL_FFT_THRESHOLD 7424 + +#define SQR_FFT_MODF_THRESHOLD 565 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 472, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ + { 31, 8}, { 63, 4}, { 1023, 8}, { 67, 9}, \ + { 39, 5}, { 639, 4}, { 1471, 6}, { 383, 7}, \ + { 209, 8}, { 119, 9}, { 63, 7}, { 255, 8}, \ + { 139, 9}, { 71, 8}, { 143, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 
79, 9}, { 159, 8}, { 319, 9}, \ + { 167,10}, { 95,11}, { 63,10}, { 143, 9}, \ + { 287,10}, { 159,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 543, 8}, \ + { 1087,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \ + { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 399, 9}, { 799,10}, { 415, 9}, \ + { 831,11}, { 223,12}, { 127,11}, { 255,10}, \ + { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \ + { 1215,11}, { 319,10}, { 671, 9}, { 1343,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 831,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 735,12}, \ + { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \ + { 447,11}, { 959,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 831,11}, { 1727,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \ + { 639,12}, { 1471,13}, { 767,12}, { 1727,13}, \ + { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \ + { 2111,13}, { 1151,12}, { 2431,13}, { 1407,14}, \ + { 767,13}, { 1663,12}, { 3455,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 146 +#define SQR_FFT_THRESHOLD 5760 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 31 +#define MULLO_MUL_N_THRESHOLD 13463 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 100 +#define SQRLO_SQR_THRESHOLD 9236 + +#define DC_DIV_QR_THRESHOLD 25 +#define DC_DIVAPPR_Q_THRESHOLD 55 +#define DC_BDIV_QR_THRESHOLD 60 +#define DC_BDIV_Q_THRESHOLD 132 + +#define INV_MULMOD_BNM1_THRESHOLD 38 +#define INV_NEWTON_THRESHOLD 65 +#define INV_APPR_THRESHOLD 65 + +#define BINV_NEWTON_THRESHOLD 252 +#define REDC_1_TO_REDC_N_THRESHOLD 62 + +#define MU_DIV_QR_THRESHOLD 1164 +#define MU_DIVAPPR_Q_THRESHOLD 748 +#define MUPI_DIV_QR_THRESHOLD 38 +#define MU_BDIV_QR_THRESHOLD 1360 +#define MU_BDIV_Q_THRESHOLD 1470 + +#define POWM_SEC_TABLE 2,23,258,879,2246 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 25 +#define SET_STR_DC_THRESHOLD 582 +#define SET_STR_PRECOMPUTE_THRESHOLD 1118 + +#define FAC_DSC_THRESHOLD 178 +#define FAC_ODD_THRESHOLD 34 + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD_THRESHOLD 69 +#define HGCD_APPR_THRESHOLD 112 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 386 +#define GCDEXT_DC_THRESHOLD 303 +#define JACOBI_BASE_METHOD 1 diff --git a/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_1.asm b/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_1.asm new file mode 100644 index 0000000..8b7b7ad --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_1.asm @@ -0,0 +1,34 @@ +dnl Intel P6/SSE2 mpn_mod_1_1. + +dnl Copyright 2009, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_mod_1_1p) +include_mpn(`x86/pentium4/sse2/mod_1_1.asm') diff --git a/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_4.asm b/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_4.asm new file mode 100644 index 0000000..49c96c6 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_4.asm @@ -0,0 +1,34 @@ +dnl Intel P6/SSE2 mpn_mod_1_4. + +dnl Copyright 2009, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_mod_1s_4p) +include_mpn(`x86/pentium4/sse2/mod_1_4.asm') diff --git a/gmp-6.3.0/mpn/x86/p6/sse2/mul_1.asm b/gmp-6.3.0/mpn/x86/p6/sse2/mul_1.asm new file mode 100644 index 0000000..50e5b69 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/sse2/mul_1.asm @@ -0,0 +1,38 @@ +dnl Intel P6/SSE2 mpn_mul_1. + +dnl Copyright 2008 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO +C * Write P6 specific SSE2 code. It should reach 3 c/l. +C The Pentium4 code runs at 4.2 c/l. 
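For reference, the operation being tuned computes rp[] = up[] * v over n limbs and returns the final high limb. A portable C model fixing the semantics behind the cycle counts discussed above (the name mul_1_model is ours):

    #include <stdint.h>
    #include <stddef.h>

    uint32_t mul_1_model(uint32_t *rp, const uint32_t *up, size_t n,
                         uint32_t v)
    {
        uint64_t cy = 0;
        for (size_t i = 0; i < n; i++) {
            cy += (uint64_t)up[i] * v;  /* 32x32->64 product plus carry in */
            rp[i] = (uint32_t)cy;       /* low limb out */
            cy >>= 32;                  /* high limb becomes next carry */
        }
        return (uint32_t)cy;
    }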
+ +MULFUNC_PROLOGUE(mpn_mul_1) +include_mpn(`x86/pentium4/sse2/mul_1.asm') diff --git a/gmp-6.3.0/mpn/x86/p6/sse2/mul_basecase.asm b/gmp-6.3.0/mpn/x86/p6/sse2/mul_basecase.asm new file mode 100644 index 0000000..4687625 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/sse2/mul_basecase.asm @@ -0,0 +1,35 @@ +dnl Intel P6/SSE2 mpn_mul_basecase. + +dnl Copyright 2008 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +MULFUNC_PROLOGUE(mpn_mul_basecase) +include_mpn(`x86/pentium4/sse2/mul_basecase.asm') diff --git a/gmp-6.3.0/mpn/x86/p6/sse2/popcount.asm b/gmp-6.3.0/mpn/x86/p6/sse2/popcount.asm new file mode 100644 index 0000000..4c02b93 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/sse2/popcount.asm @@ -0,0 +1,35 @@ +dnl Intel P6/SSE2 mpn_popcount -- population count. + +dnl Copyright 2008 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +MULFUNC_PROLOGUE(mpn_popcount) +include_mpn(`x86/pentium4/sse2/popcount.asm') diff --git a/gmp-6.3.0/mpn/x86/p6/sse2/sqr_basecase.asm b/gmp-6.3.0/mpn/x86/p6/sse2/sqr_basecase.asm new file mode 100644 index 0000000..76b574b --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/sse2/sqr_basecase.asm @@ -0,0 +1,35 @@ +dnl Intel P6/SSE2 mpn_sqr_basecase. + +dnl Copyright 2008 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +MULFUNC_PROLOGUE(mpn_sqr_basecase) +include_mpn(`x86/pentium4/sse2/sqr_basecase.asm') diff --git a/gmp-6.3.0/mpn/x86/p6/sse2/submul_1.asm b/gmp-6.3.0/mpn/x86/p6/sse2/submul_1.asm new file mode 100644 index 0000000..69d940d --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/sse2/submul_1.asm @@ -0,0 +1,35 @@ +dnl Intel P6/SSE2 mpn_submul_1. + +dnl Copyright 2008 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +MULFUNC_PROLOGUE(mpn_submul_1) +include_mpn(`x86/pentium4/sse2/submul_1.asm') diff --git a/gmp-6.3.0/mpn/x86/pentium/README b/gmp-6.3.0/mpn/x86/pentium/README new file mode 100644 index 0000000..305936b --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/README @@ -0,0 +1,181 @@ +Copyright 1996, 1999-2001, 2003 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + + INTEL PENTIUM P5 MPN SUBROUTINES + + +This directory contains mpn functions optimized for Intel Pentium (P5,P54) +processors. The mmx subdirectory has additional code for Pentium with MMX +(P55). + + +STATUS + + cycles/limb + + mpn_add_n/sub_n 2.375 + + mpn_mul_1 12.0 + mpn_add/submul_1 14.0 + + mpn_mul_basecase 14.2 cycles/crossproduct (approx) + + mpn_sqr_basecase 8 cycles/crossproduct (approx) + or 15.5 cycles/triangleproduct (approx) + + mpn_l/rshift 5.375 normal (6.0 on P54) + 1.875 special shift by 1 bit + + mpn_divrem_1 44.0 + mpn_mod_1 28.0 + mpn_divexact_by3 15.0 + + mpn_copyi/copyd 1.0 + +Pentium MMX gets the following improvements + + mpn_l/rshift 1.75 + + mpn_mul_1 12.0 normal, 7.0 for 16-bit multiplier + + +mpn_add_n and mpn_sub_n run at asymptotically 2 cycles/limb. Due to loop +overhead and other delays (cache refill?), they run at or near 2.5 +cycles/limb. + +mpn_mul_1, mpn_addmul_1, mpn_submul_1 all run 1 cycle faster than they +should. Intel documentation says a mul instruction is 10 cycles, but it +measures 9 and the routines using it run as 9. + + + +P55 MMX AND X87 + +The cost of switching between MMX and x87 floating point on P55 is about 100 +cycles (fld1/por/emms for instance). In order to avoid that the two aren't +mixed and currently that means using MMX and not x87. + +MMX offers a big speedup for lshift and rshift, and a nice speedup for +16-bit multipliers in mpn_mul_1. If fast code using x87 is found then +perhaps the preference for MMX will be reversed. + + + + +P54 SHLDL + +mpn_lshift and mpn_rshift run at about 6 cycles/limb on P5 and P54, but the +documentation indicates that they should take only 43/8 = 5.375 cycles/limb, +or 5 cycles/limb asymptotically. The P55 runs them at the expected speed. + +It seems that on P54 a shldl or shrdl allows pairing in one following cycle, +but not two. For example, back to back repetitions of the following + + shldl( %cl, %eax, %ebx) + xorl %edx, %edx + xorl %esi, %esi + +run at 5 cycles, as expected, but repetitions of the following run at 7 +cycles, whereas 6 would be expected (and is achieved on P55), + + shldl( %cl, %eax, %ebx) + xorl %edx, %edx + xorl %esi, %esi + xorl %edi, %edi + xorl %ebp, %ebp + +Three xorls run at 7 cycles too, so it doesn't seem to be just that pairing +inhibited is only in the second following cycle (or something like that). + +Avoiding this problem would bring P54 shifts down from 6.0 c/l to 5.5 with a +pattern of shift, 2 loads, shift, 2 stores, shift, etc. A start has been +made on something like that, but it's not yet complete. + + + + +OTHER NOTES + +Prefetching Destinations + + Pentium doesn't allocate cache lines on writes, unlike most other modern + processors. Since the functions in the mpn class do array writes, we + have to handle allocating the destination cache lines by reading a word + from it in the loops, to achieve the best performance. + +Prefetching Sources + + Prefetching of sources is pointless since there's no out-of-order loads. + Any load instruction blocks until the line is brought to L1, so it may + as well be the load that wants the data which blocks. + +Data Cache Bank Clashes + + Pairing of memory operations requires that the two issued operations + refer to different cache banks (ie. 
different addresses modulo 32 + bytes). The simplest way to ensure this is to read/write two words from + the same object. If we make operations on different objects, they might + or might not be to the same cache bank. + +PIC %eip Fetching + + A simple call $+5 and popl can be used to get %eip, there's no need to + balance calls and returns since P5 doesn't have any return stack branch + prediction. + +Float Multiplies + + fmul is pairable and can be issued every 2 cycles (with a 4 cycle + latency for data ready to use). This is a lot better than integer mull + or imull at 9 cycles non-pairing. Unfortunately the advantage is + quickly eaten away by needing to throw data through memory back to the + integer registers to adjust for fild and fist being signed, and to do + things like propagating carry bits. + + + + + +REFERENCES + +"Intel Architecture Optimization Manual", 1997, order number 242816. This +is mostly about P5, the parts about P6 aren't relevant. Available on-line: + + http://download.intel.com/design/PentiumII/manuals/242816.htm + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/gmp-6.3.0/mpn/x86/pentium/aors_n.asm b/gmp-6.3.0/mpn/x86/pentium/aors_n.asm new file mode 100644 index 0000000..01ebfb9 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/aors_n.asm @@ -0,0 +1,203 @@ +dnl Intel Pentium mpn_add_n/mpn_sub_n -- mpn addition and subtraction. + +dnl Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
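+dnl
+dnl  For orientation, here is a portable C sketch of what mpn_add_n computes
+dnl  (an illustrative ref_add_n, not part of the build; mpn_sub_n is the
+dnl  same with subtraction and borrow).  Types are those of gmp.h, and limbs
+dnl  are assumed to be 32 bits as throughout this directory.
+dnl
+dnl	mp_limb_t
+dnl	ref_add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+dnl	{
+dnl	  mp_limb_t cy = 0;
+dnl	  for (mp_size_t i = 0; i < n; i++)
+dnl	    {
+dnl	      mp_limb_t s = up[i] + vp[i];    /* may wrap */
+dnl	      mp_limb_t c1 = s < up[i];       /* carry out of u + v */
+dnl	      mp_limb_t s2 = s + cy;
+dnl	      rp[i] = s2;
+dnl	      cy = c1 | (s2 < s);             /* at most one carry is set */
+dnl	    }
+dnl	  return cy;
+dnl	}
+dnl
+dnl  The code below unrolls this 8 limbs at a time with adcl, which is how it
+dnl  approaches the asymptotic 2 cycles/limb noted in the README.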
+ +include(`../config.m4') + + +C P5: 2.375 cycles/limb + + +ifdef(`OPERATION_add_n',` + define(M4_inst, adcl) + define(M4_function_n, mpn_add_n) + define(M4_function_nc, mpn_add_nc) + +',`ifdef(`OPERATION_sub_n',` + define(M4_inst, sbbl) + define(M4_function_n, mpn_sub_n) + define(M4_function_nc, mpn_sub_nc) + +',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + + +C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) +PROLOGUE(M4_function_nc) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC1,%esi + movl PARAM_SRC2,%ebp + movl PARAM_SIZE,%ecx + + movl (%ebp),%ebx + + decl %ecx + movl %ecx,%edx + shrl $3,%ecx + andl $7,%edx + testl %ecx,%ecx C zero carry flag + jz L(endgo) + + pushl %edx +FRAME_pushl() + movl PARAM_CARRY,%eax + shrl %eax C shift bit 0 into carry + jmp L(oop) + +L(endgo): +deflit(`FRAME',16) + movl PARAM_CARRY,%eax + shrl %eax C shift bit 0 into carry + jmp L(end) + +EPILOGUE() + + + ALIGN(8) +PROLOGUE(M4_function_n) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC1,%esi + movl PARAM_SRC2,%ebp + movl PARAM_SIZE,%ecx + + movl (%ebp),%ebx + + decl %ecx + movl %ecx,%edx + shrl $3,%ecx + andl $7,%edx + testl %ecx,%ecx C zero carry flag + jz L(end) + pushl %edx +FRAME_pushl() + + ALIGN(8) +L(oop): movl 28(%edi),%eax C fetch destination cache line + leal 32(%edi),%edi + +L(1): movl (%esi),%eax + movl 4(%esi),%edx + M4_inst %ebx,%eax + movl 4(%ebp),%ebx + M4_inst %ebx,%edx + movl 8(%ebp),%ebx + movl %eax,-32(%edi) + movl %edx,-28(%edi) + +L(2): movl 8(%esi),%eax + movl 12(%esi),%edx + M4_inst %ebx,%eax + movl 12(%ebp),%ebx + M4_inst %ebx,%edx + movl 16(%ebp),%ebx + movl %eax,-24(%edi) + movl %edx,-20(%edi) + +L(3): movl 16(%esi),%eax + movl 20(%esi),%edx + M4_inst %ebx,%eax + movl 20(%ebp),%ebx + M4_inst %ebx,%edx + movl 24(%ebp),%ebx + movl %eax,-16(%edi) + movl %edx,-12(%edi) + +L(4): movl 24(%esi),%eax + movl 28(%esi),%edx + M4_inst %ebx,%eax + movl 28(%ebp),%ebx + M4_inst %ebx,%edx + movl 32(%ebp),%ebx + movl %eax,-8(%edi) + movl %edx,-4(%edi) + + leal 32(%esi),%esi + leal 32(%ebp),%ebp + decl %ecx + jnz L(oop) + + popl %edx +FRAME_popl() +L(end): + decl %edx C test %edx w/o clobbering carry + js L(end2) + incl %edx +L(oop2): + leal 4(%edi),%edi + movl (%esi),%eax + M4_inst %ebx,%eax + movl 4(%ebp),%ebx + movl %eax,-4(%edi) + leal 4(%esi),%esi + leal 4(%ebp),%ebp + decl %edx + jnz L(oop2) +L(end2): + movl (%esi),%eax + M4_inst %ebx,%eax + movl %eax,(%edi) + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium/aorsmul_1.asm b/gmp-6.3.0/mpn/x86/pentium/aorsmul_1.asm new file mode 100644 index 0000000..d83cc45 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/aorsmul_1.asm @@ -0,0 +1,144 @@ +dnl Intel Pentium mpn_addmul_1 -- mpn by limb multiplication. + +dnl Copyright 1992, 1994, 1996, 1999, 2000, 2002 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. 
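+dnl
+dnl  For reference, a C model of the operation (an illustrative ref_addmul_1,
+dnl  not part of the build; assumes 32-bit limbs and uint64_t from stdint.h).
+dnl  mpn_submul_1 instead subtracts the product and returns a borrow.
+dnl
+dnl	mp_limb_t
+dnl	ref_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v)
+dnl	{
+dnl	  mp_limb_t cy = 0;
+dnl	  for (mp_size_t i = 0; i < n; i++)
+dnl	    {
+dnl	      uint64_t t = (uint64_t) up[i] * v + rp[i] + cy;  /* cannot wrap */
+dnl	      rp[i] = (mp_limb_t) t;                 /* low limb of the sum */
+dnl	      cy = (mp_limb_t) (t >> 32);            /* high limb carries on */
+dnl	    }
+dnl	  return cy;
+dnl	}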
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 14.0 cycles/limb + + +ifdef(`OPERATION_addmul_1', ` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + define(M4_function_1c, mpn_addmul_1c) + +',`ifdef(`OPERATION_submul_1', ` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + define(M4_function_1c, mpn_submul_1c) + +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) + + +C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C +C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + + ALIGN(8) +PROLOGUE(M4_function_1c) +deflit(`FRAME',0) + + movl PARAM_CARRY, %ecx + pushl %esi FRAME_pushl() + + jmp L(start_1c) + +EPILOGUE() + + + ALIGN(8) +PROLOGUE(M4_function_1) +deflit(`FRAME',0) + + xorl %ecx, %ecx + pushl %esi FRAME_pushl() + +L(start_1c): + movl PARAM_SRC, %esi + movl PARAM_SIZE, %eax + + pushl %edi FRAME_pushl() + pushl %ebx FRAME_pushl() + + movl PARAM_DST, %edi + leal -1(%eax), %ebx C size-1 + + leal (%esi,%eax,4), %esi + xorl $-1, %ebx C -size, and clear carry + + leal (%edi,%eax,4), %edi + +L(top): + C eax + C ebx counter, negative + C ecx carry + C edx + C esi src end + C edi dst end + C ebp + + adcl $0, %ecx + movl (%esi,%ebx,4), %eax + + mull PARAM_MULTIPLIER + + addl %ecx, %eax + movl (%edi,%ebx,4), %ecx + + adcl $0, %edx + M4_inst %eax, %ecx + + movl %ecx, (%edi,%ebx,4) + incl %ebx + + movl %edx, %ecx + jnz L(top) + + + adcl $0, %ecx + popl %ebx + + movl %ecx, %eax + popl %edi + + popl %esi + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium/bdiv_q_1.asm b/gmp-6.3.0/mpn/x86/pentium/bdiv_q_1.asm new file mode 100644 index 0000000..c2c4f58 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/bdiv_q_1.asm @@ -0,0 +1,266 @@ +dnl Intel Pentium mpn_divexact_1 -- mpn by limb exact division. + +dnl Rearranged from mpn/x86/pentium/dive_1.asm by Marco Bodrato. + +dnl Copyright 2001, 2002, 2011, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
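+dnl
+dnl  The inverse setup in this file (a binvert_limb_table lookup plus two
+dnl  Newton steps) amounts to the following C, sketched for orientation only.
+dnl  binvert_limb_table is the 128-entry table of 8-bit inverses declared in
+dnl  gmp-impl.h; d must be odd.
+dnl
+dnl	uint32_t
+dnl	binvert_u32 (uint32_t d)
+dnl	{
+dnl	  uint32_t inv = binvert_limb_table[(d >> 1) & 0x7F]; /* 8 good bits */
+dnl	  inv = 2 * inv - inv * inv * d;                     /* 16 good bits */
+dnl	  inv = 2 * inv - inv * inv * d;                     /* 32 good bits */
+dnl	  return inv;                        /* inv * d == 1 (mod 2^32) */
+dnl	}
+dnl
+dnl  Each step doubles the number of correct low bits: if inv*d == 1 + e with
+dnl  e divisible by 2^k, then (2*inv - inv*inv*d)*d == 1 - e^2, and e^2 is
+dnl  divisible by 2^(2k).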
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C divisor +C odd even +C P54: 24.5 30.5 cycles/limb +C P55: 23.0 28.0 + +MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) + +C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as +C expected. On P54 in the even case the shrdl pairing nonsense (see +C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a +C further 1.5 slowdown for both odd and even. + +defframe(PARAM_SHIFT, 24) +defframe(PARAM_INVERSE,20) +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(VAR_INVERSE,`PARAM_DST') + + TEXT + + ALIGN(32) +C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C +PROLOGUE(mpn_bdiv_q_1) +deflit(`FRAME',0) + + movl $-1, %ecx + movl PARAM_DIVISOR, %eax + +L(strip_twos): + ASSERT(nz, `orl %eax, %eax') + shrl %eax + incl %ecx C shift count + + jnc L(strip_twos) + + leal 1(%eax,%eax), %edx C d + andl $127, %eax C d/2, 7 bits + + pushl %ebx FRAME_pushl() + pushl %ebp FRAME_pushl() + +ifdef(`PIC',` +ifdef(`DARWIN',` + LEA( binvert_limb_table, %ebp) + movzbl (%eax,%ebp), %eax +',` + call L(here) +L(here): + popl %ebp C eip + + addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp + C AGI + movl binvert_limb_table@GOT(%ebp), %ebp + C AGI + movzbl (%eax,%ebp), %eax +') +',` + +dnl non-PIC + movzbl binvert_limb_table(%eax), %eax C inv 8 bits +') + + movl %eax, %ebp C inv + addl %eax, %eax C 2*inv + + imull %ebp, %ebp C inv*inv + + imull %edx, %ebp C inv*inv*d + + subl %ebp, %eax C inv = 2*inv - inv*inv*d + movl PARAM_SIZE, %ebx + + movl %eax, %ebp + addl %eax, %eax C 2*inv + + imull %ebp, %ebp C inv*inv + + imull %edx, %ebp C inv*inv*d + + subl %ebp, %eax C inv = 2*inv - inv*inv*d + movl %edx, PARAM_DIVISOR C d without twos + + ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + pushl %eax FRAME_pushl() + imull PARAM_DIVISOR, %eax + cmpl $1, %eax + popl %eax FRAME_popl()') + + jmp L(common) +EPILOGUE() + +C mp_limb_t +C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t inverse, int shift) + ALIGN(32) +PROLOGUE(mpn_pi1_bdiv_q_1) +deflit(`FRAME',0) + + movl PARAM_SHIFT, %ecx + + pushl %ebx FRAME_pushl() + pushl %ebp FRAME_pushl() + + movl PARAM_SIZE, %ebx + movl PARAM_INVERSE, %eax + +L(common): + pushl %esi FRAME_pushl() + push %edi FRAME_pushl() + + movl PARAM_SRC, %esi + movl PARAM_DST, %edi + movl %eax, VAR_INVERSE + + leal (%esi,%ebx,4), %esi C src end + leal (%edi,%ebx,4), %edi C dst end + + negl %ebx C 
-size + + xorl %ebp, %ebp C initial carry bit + + orl %ecx, %ecx C shift + movl (%esi,%ebx,4), %eax C src low limb + jz L(odd_entry) + + xorl %edx, %edx C initial carry limb (for even, if one) + incl %ebx + jz L(one) + + movl (%esi,%ebx,4), %edx C src second limb (for even) + shrdl( %cl, %edx, %eax) + + jmp L(even_entry) + + + ALIGN(8) +L(odd_top): + C eax scratch + C ebx counter, limbs, negative + C ecx + C edx + C esi src end + C edi dst end + C ebp carry bit, 0 or -1 + + mull PARAM_DIVISOR + + movl (%esi,%ebx,4), %eax + subl %ebp, %edx + + subl %edx, %eax + + sbbl %ebp, %ebp + +L(odd_entry): + imull VAR_INVERSE, %eax + + movl %eax, (%edi,%ebx,4) + + incl %ebx + jnz L(odd_top) + + popl %edi + popl %esi + + popl %ebp + popl %ebx + + ret + +L(even_top): + C eax scratch + C ebx counter, limbs, negative + C ecx twos + C edx + C esi src end + C edi dst end + C ebp carry bit, 0 or -1 + + mull PARAM_DIVISOR + + subl %ebp, %edx C carry bit + movl -4(%esi,%ebx,4), %eax C src limb + + movl (%esi,%ebx,4), %ebp C and one above it + + shrdl( %cl, %ebp, %eax) + + subl %edx, %eax C carry limb + + sbbl %ebp, %ebp + +L(even_entry): + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi,%ebx,4) + incl %ebx + + jnz L(even_top) + + mull PARAM_DIVISOR + + movl -4(%esi), %eax C src high limb + subl %ebp, %edx + +L(one): + shrl %cl, %eax + + subl %edx, %eax C no carry if division is exact + + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi) C dst high limb + nop C protect against cache bank clash + + popl %edi + popl %esi + + popl %ebp + popl %ebx + + ret + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/pentium/com.asm b/gmp-6.3.0/mpn/x86/pentium/com.asm new file mode 100644 index 0000000..b080545 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/com.asm @@ -0,0 +1,181 @@ +dnl Intel Pentium mpn_com -- mpn ones complement. + +dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 1.75 cycles/limb + + +NAILS_SUPPORT(0-31) + + +C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C This code is similar to mpn_copyi, basically there's just some "xorl +C $GMP_NUMB_MASK"s inserted. +C +C Alternatives: +C +C On P55 some MMX code could be 1.25 c/l (8 limb unrolled) if src and dst +C are the same alignment mod 8, but it doesn't seem worth the trouble for +C just that case (there'd need to be some plain integer available too for +C the unaligned case). 
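+
+C  As a behavioural reference (illustrative only, not part of the build),
+C  the whole function is simply:
+C
+C	void
+C	ref_com (mp_ptr rp, mp_srcptr up, mp_size_t n)
+C	{
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    rp[i] = ~up[i] & GMP_NUMB_MASK;
+C	}
+C
+C  GMP_NUMB_MASK is all ones in a normal build and only strips nail bits
+C  when nails are enabled, as permitted by NAILS_SUPPORT above.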
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_com) +deflit(`FRAME',0) + + movl PARAM_SRC, %eax + movl PARAM_SIZE, %ecx + + pushl %esi FRAME_pushl() + pushl %edi FRAME_pushl() + + leal (%eax,%ecx,4), %eax + xorl $-1, %ecx C -size-1 + + movl PARAM_DST, %edx + addl $8, %ecx C -size+7 + + jns L(end) + + movl (%edx), %esi C fetch destination cache line + nop + +L(top): + C eax &src[size] + C ebx + C ecx counter, limbs, negative + C edx dst, incrementing + C esi scratch + C edi scratch + C ebp + + movl 28(%edx), %esi C destination prefetch + addl $32, %edx + + movl -28(%eax,%ecx,4), %esi + movl -24(%eax,%ecx,4), %edi + xorl $GMP_NUMB_MASK, %esi + xorl $GMP_NUMB_MASK, %edi + movl %esi, -32(%edx) + movl %edi, -28(%edx) + + movl -20(%eax,%ecx,4), %esi + movl -16(%eax,%ecx,4), %edi + xorl $GMP_NUMB_MASK, %esi + xorl $GMP_NUMB_MASK, %edi + movl %esi, -24(%edx) + movl %edi, -20(%edx) + + movl -12(%eax,%ecx,4), %esi + movl -8(%eax,%ecx,4), %edi + xorl $GMP_NUMB_MASK, %esi + xorl $GMP_NUMB_MASK, %edi + movl %esi, -16(%edx) + movl %edi, -12(%edx) + + movl -4(%eax,%ecx,4), %esi + movl (%eax,%ecx,4), %edi + xorl $GMP_NUMB_MASK, %esi + xorl $GMP_NUMB_MASK, %edi + movl %esi, -8(%edx) + movl %edi, -4(%edx) + + addl $8, %ecx + js L(top) + + +L(end): + C eax &src[size] + C ecx 0 to 7, representing respectively 7 to 0 limbs remaining + C edx dst, next location to store + + subl $4, %ecx + nop + + jns L(no4) + + movl -12(%eax,%ecx,4), %esi + movl -8(%eax,%ecx,4), %edi + xorl $GMP_NUMB_MASK, %esi + xorl $GMP_NUMB_MASK, %edi + movl %esi, (%edx) + movl %edi, 4(%edx) + + movl -4(%eax,%ecx,4), %esi + movl (%eax,%ecx,4), %edi + xorl $GMP_NUMB_MASK, %esi + xorl $GMP_NUMB_MASK, %edi + movl %esi, 8(%edx) + movl %edi, 12(%edx) + + addl $16, %edx + addl $4, %ecx +L(no4): + + subl $2, %ecx + nop + + jns L(no2) + + movl -4(%eax,%ecx,4), %esi + movl (%eax,%ecx,4), %edi + xorl $GMP_NUMB_MASK, %esi + xorl $GMP_NUMB_MASK, %edi + movl %esi, (%edx) + movl %edi, 4(%edx) + + addl $8, %edx + addl $2, %ecx +L(no2): + + popl %edi + jnz L(done) + + movl -4(%eax), %ecx + + xorl $GMP_NUMB_MASK, %ecx + popl %esi + + movl %ecx, (%edx) + ret + +L(done): + popl %esi + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium/copyd.asm b/gmp-6.3.0/mpn/x86/pentium/copyd.asm new file mode 100644 index 0000000..72a543b --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/copyd.asm @@ -0,0 +1,146 @@ +dnl Intel Pentium mpn_copyd -- copy limb vector, decrementing. + +dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 1.25 cycles/limb + + +C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C See comments in copyi.asm. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_copyd) +deflit(`FRAME',0) + + movl PARAM_SRC, %eax + movl PARAM_SIZE, %ecx + + pushl %esi FRAME_pushl() + pushl %edi FRAME_pushl() + + leal -4(%eax,%ecx,4), %eax C &src[size-1] + movl PARAM_DST, %edx + + subl $7, %ecx C size-7 + jle L(end) + + movl 28-4(%edx,%ecx,4), %esi C prefetch cache, dst[size-1] + nop + +L(top): + C eax src, decrementing + C ebx + C ecx counter, limbs + C edx dst + C esi scratch + C edi scratch + C ebp + + movl 28-32(%edx,%ecx,4), %esi C prefetch dst cache line + subl $8, %ecx + + movl (%eax), %esi C read words pairwise + movl -4(%eax), %edi + movl %esi, 56(%edx,%ecx,4) C store words pairwise + movl %edi, 52(%edx,%ecx,4) + + movl -8(%eax), %esi + movl -12(%eax), %edi + movl %esi, 48(%edx,%ecx,4) + movl %edi, 44(%edx,%ecx,4) + + movl -16(%eax), %esi + movl -20(%eax), %edi + movl %esi, 40(%edx,%ecx,4) + movl %edi, 36(%edx,%ecx,4) + + movl -24(%eax), %esi + movl -28(%eax), %edi + movl %esi, 32(%edx,%ecx,4) + movl %edi, 28(%edx,%ecx,4) + + leal -32(%eax), %eax + jg L(top) + + +L(end): + C ecx -7 to 0, representing respectively 0 to 7 limbs remaining + C eax src end + C edx dst, next location to store + + addl $4, %ecx + jle L(no4) + + movl (%eax), %esi + movl -4(%eax), %edi + movl %esi, 8(%edx,%ecx,4) + movl %edi, 4(%edx,%ecx,4) + + movl -8(%eax), %esi + movl -12(%eax), %edi + movl %esi, (%edx,%ecx,4) + movl %edi, -4(%edx,%ecx,4) + + subl $16, %eax + subl $4, %ecx +L(no4): + + addl $2, %ecx + jle L(no2) + + movl (%eax), %esi + movl -4(%eax), %edi + movl %esi, (%edx,%ecx,4) + movl %edi, -4(%edx,%ecx,4) + + subl $8, %eax + subl $2, %ecx +L(no2): + + jnz L(done) + + movl (%eax), %ecx + movl %ecx, (%edx) C risk of cache bank clash here + +L(done): + popl %edi + popl %esi + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium/copyi.asm b/gmp-6.3.0/mpn/x86/pentium/copyi.asm new file mode 100644 index 0000000..d983d6b --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/copyi.asm @@ -0,0 +1,164 @@ +dnl Intel Pentium mpn_copyi -- copy limb vector, incrementing. + +dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 1.25 cycles/limb + + +C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Destination prefetching is done to avoid repeated write-throughs on lines +C not already in L1. +C +C At least one of the src or dst pointer needs to be incremented rather than +C using indexing, so that there's somewhere to put the loop control without +C an AGI. Incrementing one and not two lets us keep loop overhead to 2 +C cycles. Making it the src pointer incremented avoids an AGI on the %ecx +C subtracts in the finishup code. +C +C The block of finishup code is almost as big as the main loop itself, which +C is unfortunate, but it's faster that way than with say rep movsl, by about +C 10 cycles for instance on P55. +C +C There's nothing to be gained from MMX on P55, since it can do only one +C movq load (or store) per cycle, so the throughput would be the same as the +C code here (and even then only if src and dst have the same alignment mod +C 8). + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_copyi) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_DST, %edx + + pushl %ebx FRAME_pushl() + pushl %esi FRAME_pushl() + + leal (%edx,%ecx,4), %edx C &dst[size-1] + xorl $-1, %ecx C -size-1 + + movl PARAM_SRC, %esi + addl $8, %ecx C -size+7 + + jns L(end) + + movl -28(%edx,%ecx,4), %eax C fetch destination cache line, dst[0] + nop + +L(top): + C eax scratch + C ebx scratch + C ecx counter, limbs, negative + C edx &dst[size-1] + C esi src, incrementing + C edi + C ebp + + movl (%edx,%ecx,4), %eax C fetch destination cache line + addl $8, %ecx + + movl (%esi), %eax C read words pairwise + movl 4(%esi), %ebx + movl %eax, -60(%edx,%ecx,4) C store words pairwise + movl %ebx, -56(%edx,%ecx,4) + + movl 8(%esi), %eax + movl 12(%esi), %ebx + movl %eax, -52(%edx,%ecx,4) + movl %ebx, -48(%edx,%ecx,4) + + movl 16(%esi), %eax + movl 20(%esi), %ebx + movl %eax, -44(%edx,%ecx,4) + movl %ebx, -40(%edx,%ecx,4) + + movl 24(%esi), %eax + movl 28(%esi), %ebx + movl %eax, -36(%edx,%ecx,4) + movl %ebx, -32(%edx,%ecx,4) + + leal 32(%esi), %esi + js L(top) + + +L(end): + C ecx 0 to 7, representing respectively 7 to 0 limbs remaining + C esi src end + C edx dst, next location to store + + subl $4, %ecx + jns L(no4) + + movl (%esi), %eax + movl 4(%esi), %ebx + movl %eax, -12(%edx,%ecx,4) + movl %ebx, -8(%edx,%ecx,4) + + movl 8(%esi), %eax + movl 12(%esi), %ebx + movl %eax, -4(%edx,%ecx,4) + movl %ebx, (%edx,%ecx,4) + + addl $16, %esi + addl $4, %ecx +L(no4): + + subl $2, %ecx + jns L(no2) + + movl (%esi), %eax + movl 4(%esi), %ebx + movl %eax, -4(%edx,%ecx,4) + movl %ebx, (%edx,%ecx,4) + + addl $8, %esi + addl $2, %ecx +L(no2): + + jnz L(done) + + movl (%esi), %eax + movl %eax, -4(%edx,%ecx,4) C risk of cache bank clash here + +L(done): + popl %esi + popl %ebx + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium/dive_1.asm b/gmp-6.3.0/mpn/x86/pentium/dive_1.asm new file mode 100644 index 0000000..21b5287 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/dive_1.asm @@ -0,0 +1,264 @@ +dnl Intel Pentium mpn_divexact_1 -- mpn by limb exact division. + +dnl Copyright 2001, 2002, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
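+dnl
+dnl  The multiply-by-inverse path in this file does exact division one limb
+dnl  at a time: q[i] = (u[i] - borrow) * inv mod 2^32, with the next borrow
+dnl  taken from the high half of q[i]*d.  A C sketch of the odd-divisor case
+dnl  (illustrative names only; binvert_u32 stands for the inverse setup shown
+dnl  with bdiv_q_1.asm above):
+dnl
+dnl	void
+dnl	ref_divexact_1 (mp_ptr qp, mp_srcptr up, mp_size_t n, uint32_t d)
+dnl	{
+dnl	  uint32_t inv = binvert_u32 (d);            /* d odd */
+dnl	  uint32_t b = 0;                            /* running borrow */
+dnl	  for (mp_size_t i = 0; i < n; i++)
+dnl	    {
+dnl	      uint32_t s = up[i] - b;                /* may wrap */
+dnl	      b = up[i] < b;
+dnl	      uint32_t q = s * inv;                  /* q*d == s mod 2^32 */
+dnl	      qp[i] = q;
+dnl	      b += (uint32_t) (((uint64_t) q * d) >> 32);
+dnl	    }
+dnl	}
+dnl
+dnl  Even divisors are handled by stripping the factors of 2 from d and
+dnl  shifting the dividend right by the same amount on the fly with shrdl.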
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C          divisor
+C       odd   even
+C P54:  24.5  30.5   cycles/limb
+C P55:  23.0  28.0
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t divisor);
+C
+C Plain divl is used for small sizes, since the inverse takes a while to
+C set up.  Multiplying works out faster for size>=3 when the divisor is odd,
+C or size>=4 when the divisor is even.  Actually on P55, size==2 odd and
+C size==3 even are about the same speed with either divl or mul, but divl
+C is used since it takes up less code cache.
+C
+C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
+C expected.  On P54 in the even case the shrdl pairing nonsense (see
+C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
+C further 1.5 slowdown for both odd and even.
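+C
+C  The small-size divl path is plain schoolbook division from the most
+C  significant limb downwards, as in this C sketch (illustrative only; the
+C  final remainder is zero because the division is known to be exact):
+C
+C	void
+C	ref_divexact_small (mp_ptr qp, mp_srcptr up, mp_size_t n, uint32_t d)
+C	{
+C	  uint32_t r = 0;
+C	  for (mp_size_t i = n; i-- > 0; )
+C	    {
+C	      uint64_t t = ((uint64_t) r << 32) | up[i];
+C	      qp[i] = (uint32_t) (t / d);            /* one divl per limb */
+C	      r = (uint32_t) (t % d);
+C	    }
+C	}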
+ +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(VAR_INVERSE,`PARAM_DST') + + TEXT + + ALIGN(32) +PROLOGUE(mpn_divexact_1) +deflit(`FRAME',0) + + movl PARAM_DIVISOR, %eax + movl PARAM_SIZE, %ecx + + pushl %esi FRAME_pushl() + push %edi FRAME_pushl() + + movl PARAM_SRC, %esi + andl $1, %eax + + movl PARAM_DST, %edi + addl %ecx, %eax C size if even, size+1 if odd + + cmpl $4, %eax + jae L(mul_by_inverse) + + + xorl %edx, %edx +L(div_top): + movl -4(%esi,%ecx,4), %eax + + divl PARAM_DIVISOR + + movl %eax, -4(%edi,%ecx,4) + decl %ecx + + jnz L(div_top) + + popl %edi + popl %esi + + ret + + + +L(mul_by_inverse): + movl PARAM_DIVISOR, %eax + movl $-1, %ecx + +L(strip_twos): + ASSERT(nz, `orl %eax, %eax') + shrl %eax + incl %ecx C shift count + + jnc L(strip_twos) + + leal 1(%eax,%eax), %edx C d + andl $127, %eax C d/2, 7 bits + + pushl %ebx FRAME_pushl() + pushl %ebp FRAME_pushl() + +ifdef(`PIC',`dnl + LEA( binvert_limb_table, %ebp) + movzbl (%eax,%ebp), %eax C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %eax C inv 8 bits +') + + movl %eax, %ebp C inv + addl %eax, %eax C 2*inv + + imull %ebp, %ebp C inv*inv + + imull %edx, %ebp C inv*inv*d + + subl %ebp, %eax C inv = 2*inv - inv*inv*d + movl PARAM_SIZE, %ebx + + movl %eax, %ebp + addl %eax, %eax C 2*inv + + imull %ebp, %ebp C inv*inv + + imull %edx, %ebp C inv*inv*d + + subl %ebp, %eax C inv = 2*inv - inv*inv*d + movl %edx, PARAM_DIVISOR C d without twos + + leal (%esi,%ebx,4), %esi C src end + leal (%edi,%ebx,4), %edi C dst end + + negl %ebx C -size + + ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + pushl %eax FRAME_pushl() + imull PARAM_DIVISOR, %eax + cmpl $1, %eax + popl %eax FRAME_popl()') + + movl %eax, VAR_INVERSE + xorl %ebp, %ebp C initial carry bit + + movl (%esi,%ebx,4), %eax C src low limb + orl %ecx, %ecx C shift + + movl 4(%esi,%ebx,4), %edx C src second limb (for even) + jz L(odd_entry) + + shrdl( %cl, %edx, %eax) + + incl %ebx + jmp L(even_entry) + + + ALIGN(8) +L(odd_top): + C eax scratch + C ebx counter, limbs, negative + C ecx + C edx + C esi src end + C edi dst end + C ebp carry bit, 0 or -1 + + mull PARAM_DIVISOR + + movl (%esi,%ebx,4), %eax + subl %ebp, %edx + + subl %edx, %eax + + sbbl %ebp, %ebp + +L(odd_entry): + imull VAR_INVERSE, %eax + + movl %eax, (%edi,%ebx,4) + + incl %ebx + jnz L(odd_top) + + + popl %ebp + popl %ebx + + popl %edi + popl %esi + + ret + + +L(even_top): + C eax scratch + C ebx counter, limbs, negative + C ecx twos + C edx + C esi src end + C edi dst end + C ebp carry bit, 0 or -1 + + mull PARAM_DIVISOR + + subl %ebp, %edx C carry bit + movl -4(%esi,%ebx,4), %eax C src limb + + movl (%esi,%ebx,4), %ebp C and one above it + + shrdl( %cl, %ebp, %eax) + + subl %edx, %eax C carry limb + + sbbl %ebp, %ebp + +L(even_entry): + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi,%ebx,4) + incl %ebx + + jnz L(even_top) + + + + mull PARAM_DIVISOR + + movl -4(%esi), %eax C src high limb + subl %ebp, %edx + + shrl %cl, %eax + + subl %edx, %eax C no carry if division is exact + + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi) C dst high limb + nop C protect against cache bank clash + + popl %ebp + popl %ebx + + popl %edi + popl %esi + + ret + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/pentium/gmp-mparam.h b/gmp-6.3.0/mpn/x86/pentium/gmp-mparam.h new file mode 100644 index 0000000..befa6e2 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/gmp-mparam.h @@ -0,0 +1,76 @@ +/* Intel P54 gmp-mparam.h -- 
Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2002, 2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + + +/* For mpn/x86/pentium/mod_1.asm */ +#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB + + +/* 166MHz P54 */ + +/* Generated by tuneup.c, 2004-02-10, gcc 2.95 */ + +#define MUL_TOOM22_THRESHOLD 16 +#define MUL_TOOM33_THRESHOLD 90 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 22 +#define SQR_TOOM3_THRESHOLD 122 + +#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_DC_THRESHOLD 52 +#define POWM_THRESHOLD 77 + +#define HGCD_THRESHOLD 121 +#define GCD_ACCEL_THRESHOLD 3 +#define GCD_DC_THRESHOLD 615 +#define JACOBI_BASE_METHOD 2 + +#define USE_PREINV_DIVREM_1 0 +#define USE_PREINV_MOD_1 1 /* native */ +#define DIVREM_2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */ + +#define GET_STR_DC_THRESHOLD 23 +#define GET_STR_PRECOMPUTE_THRESHOLD 33 +#define SET_STR_THRESHOLD 2788 + +#define MUL_FFT_TABLE { 432, 928, 1664, 3584, 10240, 40960, 0 } +#define MUL_FFT_MODF_THRESHOLD 448 +#define MUL_FFT_THRESHOLD 3328 + +#define SQR_FFT_TABLE { 496, 928, 1920, 4608, 10240, 40960, 0 } +#define SQR_FFT_MODF_THRESHOLD 512 +#define SQR_FFT_THRESHOLD 3328 diff --git a/gmp-6.3.0/mpn/x86/pentium/hamdist.asm b/gmp-6.3.0/mpn/x86/pentium/hamdist.asm new file mode 100644 index 0000000..6c6c1a1 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/hamdist.asm @@ -0,0 +1,154 @@ +dnl Intel P5 mpn_hamdist -- mpn hamming distance. + +dnl Copyright 2001, 2002, 2014, 2015 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
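+dnl
+dnl  For orientation: the loop below xors the operands a byte at a time and
+dnl  accumulates bit counts through the byte-indexed mpn_popcount table.
+dnl  An illustrative C model, with popcount_table standing for that shared
+dnl  table:
+dnl
+dnl	unsigned long
+dnl	ref_hamdist (mp_srcptr up, mp_srcptr vp, mp_size_t n)
+dnl	{
+dnl	  unsigned long count = 0;
+dnl	  for (mp_size_t i = 0; i < n; i++)
+dnl	    {
+dnl	      uint32_t x = up[i] ^ vp[i];
+dnl	      count += popcount_table[x & 0xFF]
+dnl	             + popcount_table[(x >> 8) & 0xFF]
+dnl	             + popcount_table[(x >> 16) & 0xFF]
+dnl	             + popcount_table[x >> 24];
+dnl	    }
+dnl	  return count;
+dnl	}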
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 14.0 cycles/limb + + +C unsigned long mpn_hamdist (mp_srcptr src1, mp_srcptr src2, mp_size_t size); +C +C It might be possible to shave 1 cycle from the loop, and hence 2 +C cycles/limb. The xorb is taking 2 cycles, but a separate load and xor +C would be 1, if the right schedule could be found (not found so far). +C Wanting to avoid potential cache bank clashes makes it tricky. + +C The slightly strange quoting here helps the renaming done by tune/many.pl. +deflit(TABLE_NAME, +m4_assert_defined(`GSYM_PREFIX') +GSYM_PREFIX`'mpn_popcount``'_table') + +C FIXME: referencing popcount.asm's table is incorrect as it hurt incremental +C linking. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC2, 8) +defframe(PARAM_SRC1, 4) + + TEXT + ALIGN(8) + +PROLOGUE(mpn_hamdist) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + pushl %esi FRAME_pushl() + + shll %ecx C size in byte pairs + pushl %edi FRAME_pushl() + +ifdef(`PIC',` + pushl %ebx FRAME_pushl() + pushl %ebp FRAME_pushl() +ifdef(`DARWIN',` + movl PARAM_SRC1, %esi + movl PARAM_SRC2, %edi + LEA( TABLE_NAME, %ebp) + xorl %ebx, %ebx C byte + xorl %edx, %edx C byte + xorl %eax, %eax C total +',` + call L(here) FRAME_pushl() +L(here): + movl PARAM_SRC1, %esi + popl %ebp FRAME_popl() + + movl PARAM_SRC2, %edi + addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp + + xorl %ebx, %ebx C byte + xorl %edx, %edx C byte + + movl TABLE_NAME@GOT(%ebp), %ebp + xorl %eax, %eax C total +') +define(TABLE,`(%ebp,$1)') +',` +dnl non-PIC + movl PARAM_SRC1, %esi + movl PARAM_SRC2, %edi + + xorl %eax, %eax C total + pushl %ebx FRAME_pushl() + + xorl %edx, %edx C byte + xorl %ebx, %ebx C byte + +define(TABLE,`TABLE_NAME($1)') +') + + + C The nop after the xorb seems necessary. Although a movb might be + C expected to go down the V pipe in the second cycle of the xorb, it + C doesn't and costs an extra 2 cycles. +L(top): + C eax total + C ebx byte + C ecx counter, 2*size to 2 + C edx byte + C esi src1 + C edi src2 + C ebp [PIC] table + + addl %ebx, %eax + movb -1(%esi,%ecx,2), %bl + + addl %edx, %eax + movb -1(%edi,%ecx,2), %dl + + xorb %dl, %bl + movb -2(%esi,%ecx,2), %dl + + xorb -2(%edi,%ecx,2), %dl + nop + + movb TABLE(%ebx), %bl + decl %ecx + + movb TABLE(%edx), %dl + jnz L(top) + + +ifdef(`PIC',` + popl %ebp +') + addl %ebx, %eax + popl %ebx + + addl %edx, %eax + popl %edi + + popl %esi + + ret + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/pentium/logops_n.asm b/gmp-6.3.0/mpn/x86/pentium/logops_n.asm new file mode 100644 index 0000000..1877317 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/logops_n.asm @@ -0,0 +1,176 @@ +dnl Intel Pentium mpn_and_n,...,mpn_xnor_n -- bitwise logical operations. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 3.0 c/l and, ior, xor +C 3.5 c/l andn, iorn, nand, nior, xnor + + +define(M4_choose_op, +`ifdef(`OPERATION_$1',` +define(`M4_function', `mpn_$1') +define(`M4_want_pre', `$4') +define(`M4op', `$3') +define(`M4_want_post',`$2') +')') +define(M4pre, `ifelse(M4_want_pre, yes,`$1')') +define(M4post,`ifelse(M4_want_post,yes,`$1')') + +M4_choose_op( and_n, , andl, ) +M4_choose_op( andn_n, , andl, yes) +M4_choose_op( nand_n, yes, andl, ) +M4_choose_op( ior_n, , orl, ) +M4_choose_op( iorn_n, , orl, yes) +M4_choose_op( nior_n, yes, orl, ) +M4_choose_op( xor_n, , xorl, ) +M4_choose_op( xnor_n, yes, xorl, ) + +ifdef(`M4_function',, +`m4_error(`Unrecognised or undefined OPERATION symbol +')') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +NAILS_SUPPORT(0-31) + + +C void M4_function (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size); +C +C Nothing complicated here, just some care to avoid data cache bank clashes +C and AGIs. +C +C We're one register short of being able to do a simple 4 loads, 2 ops, 2 +C stores. Instead %ebp is juggled a bit and nops are introduced to keep the +C pairings as intended. An in-place operation would free up a register, for +C an 0.5 c/l speedup, if that's worth bothering with. +C +C This code seems best for P55 too. Data alignment is a big problem for MMX +C and the pairing restrictions on movq and integer instructions make life +C difficult. 
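+
+C  Behaviourally (illustrative C only, not part of the build) each variant is
+C  one bitwise operation with an optional operand complement before or result
+C  complement after, for example:
+C
+C	void
+C	ref_andn_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
+C	{
+C	  for (mp_size_t i = 0; i < n; i++)      /* M4pre case */
+C	    rp[i] = xp[i] & (~yp[i] & GMP_NUMB_MASK);
+C	}
+C
+C	void
+C	ref_nand_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
+C	{
+C	  for (mp_size_t i = 0; i < n; i++)      /* M4post case */
+C	    rp[i] = ~(xp[i] & yp[i]) & GMP_NUMB_MASK;
+C	}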
+ +defframe(PARAM_SIZE,16) +defframe(PARAM_YP, 12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + + TEXT + ALIGN(8) + +PROLOGUE(M4_function) +deflit(`FRAME',0) + + pushl %ebx FRAME_pushl() + pushl %esi FRAME_pushl() + + pushl %edi FRAME_pushl() + pushl %ebp FRAME_pushl() + + movl PARAM_SIZE, %ecx + movl PARAM_XP, %ebx + + movl PARAM_YP, %esi + movl PARAM_WP, %edi + + shrl %ecx + jnc L(entry) + + movl (%ebx,%ecx,8), %eax C risk of data cache bank clash here + movl (%esi,%ecx,8), %edx + +M4pre(` notl_or_xorl_GMP_NUMB_MASK(%edx)') + + M4op %edx, %eax + +M4post(`xorl $GMP_NUMB_MASK, %eax') + orl %ecx, %ecx + + movl %eax, (%edi,%ecx,8) + jz L(done) + + jmp L(entry) + + +L(top): + C eax + C ebx xp + C ecx counter, limb pairs, decrementing + C edx + C esi yp + C edi wp + C ebp + + M4op %ebp, %edx + nop + +M4post(`xorl $GMP_NUMB_MASK, %eax') +M4post(`xorl $GMP_NUMB_MASK, %edx') + + movl %eax, 4(%edi,%ecx,8) + movl %edx, (%edi,%ecx,8) + +L(entry): + movl -4(%ebx,%ecx,8), %ebp + nop + + movl -4(%esi,%ecx,8), %eax + movl -8(%esi,%ecx,8), %edx + +M4pre(` xorl $GMP_NUMB_MASK, %eax') +M4pre(` xorl $GMP_NUMB_MASK, %edx') + + M4op %ebp, %eax + movl -8(%ebx,%ecx,8), %ebp + + decl %ecx + jnz L(top) + + + M4op %ebp, %edx + nop + +M4post(`xorl $GMP_NUMB_MASK, %eax') +M4post(`xorl $GMP_NUMB_MASK, %edx') + + movl %eax, 4(%edi,%ecx,8) + movl %edx, (%edi,%ecx,8) + + +L(done): + popl %ebp + popl %edi + + popl %esi + popl %ebx + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium/lshift.asm b/gmp-6.3.0/mpn/x86/pentium/lshift.asm new file mode 100644 index 0000000..2a31f36 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/lshift.asm @@ -0,0 +1,243 @@ +dnl Intel Pentium mpn_lshift -- mpn left shift. + +dnl Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P5,P54: 6.0 +C P55: 5.375 + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does, +C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_lshift) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%ebp + movl PARAM_SHIFT,%ecx + +C We can use faster code for shift-by-1 under certain conditions. 
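+C
+C  The shift-by-1 case is computed as a self-addition, since addl/adcl pair
+C  better on P5 than shldl.  In C terms (an illustrative sketch; shift==1):
+C
+C	mp_limb_t
+C	ref_lshift1 (mp_ptr rp, mp_srcptr up, mp_size_t n)
+C	{
+C	  mp_limb_t cy = 0;
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      mp_limb_t u = up[i];
+C	      rp[i] = (u << 1) | cy;     /* the adcl reg,reg in L(special) */
+C	      cy = u >> 31;              /* bit shifted out */
+C	    }
+C	  return cy;
+C	}
+C
+C  This runs low to high, so it is only safe when the operands do not
+C  overlap in that direction, hence the checks below.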
+ cmp $1,%ecx + jne L(normal) + leal 4(%esi),%eax + cmpl %edi,%eax + jnc L(special) C jump if s_ptr + 1 >= res_ptr + leal (%esi,%ebp,4),%eax + cmpl %eax,%edi + jnc L(special) C jump if res_ptr >= s_ptr + size + +L(normal): + leal -4(%edi,%ebp,4),%edi + leal -4(%esi,%ebp,4),%esi + + movl (%esi),%edx + subl $4,%esi + xorl %eax,%eax + shldl( %cl, %edx, %eax) C compute carry limb + pushl %eax C push carry limb onto stack + + decl %ebp + pushl %ebp + shrl $3,%ebp + jz L(end) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(oop): movl -28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl -4(%esi),%edx + shldl( %cl, %eax, %ebx) + shldl( %cl, %edx, %eax) + movl %ebx,(%edi) + movl %eax,-4(%edi) + + movl -8(%esi),%ebx + movl -12(%esi),%eax + shldl( %cl, %ebx, %edx) + shldl( %cl, %eax, %ebx) + movl %edx,-8(%edi) + movl %ebx,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebx + shldl( %cl, %edx, %eax) + shldl( %cl, %ebx, %edx) + movl %eax,-16(%edi) + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + shldl( %cl, %eax, %ebx) + shldl( %cl, %edx, %eax) + movl %ebx,-24(%edi) + movl %eax,-28(%edi) + + subl $32,%esi + subl $32,%edi + decl %ebp + jnz L(oop) + +L(end): popl %ebp + andl $7,%ebp + jz L(end2) +L(oop2): + movl (%esi),%eax + shldl( %cl,%eax,%edx) + movl %edx,(%edi) + movl %eax,%edx + subl $4,%esi + subl $4,%edi + decl %ebp + jnz L(oop2) + +L(end2): + shll %cl,%edx C compute least significant limb + movl %edx,(%edi) C store it + + popl %eax C pop carry limb + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + +C We loop from least significant end of the arrays, which is only +C permissable if the source and destination don't overlap, since the +C function is documented to work for overlapping source and destination. + +L(special): + movl (%esi),%edx + addl $4,%esi + + decl %ebp + pushl %ebp + shrl $3,%ebp + + addl %edx,%edx + incl %ebp + decl %ebp + jz L(Lend) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(Loop): + movl 28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl 4(%esi),%edx + adcl %eax,%eax + movl %ebx,(%edi) + adcl %edx,%edx + movl %eax,4(%edi) + + movl 8(%esi),%ebx + movl 12(%esi),%eax + adcl %ebx,%ebx + movl %edx,8(%edi) + adcl %eax,%eax + movl %ebx,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebx + adcl %edx,%edx + movl %eax,16(%edi) + adcl %ebx,%ebx + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + adcl %eax,%eax + movl %ebx,24(%edi) + adcl %edx,%edx + movl %eax,28(%edi) + + leal 32(%esi),%esi C use leal not to clobber carry + leal 32(%edi),%edi + decl %ebp + jnz L(Loop) + +L(Lend): + popl %ebp + sbbl %eax,%eax C save carry in %eax + andl $7,%ebp + jz L(Lend2) + addl %eax,%eax C restore carry from eax +L(Loop2): + movl %edx,%ebx + movl (%esi),%edx + adcl %edx,%edx + movl %ebx,(%edi) + + leal 4(%esi),%esi C use leal not to clobber carry + leal 4(%edi),%edi + decl %ebp + jnz L(Loop2) + + jmp L(L1) +L(Lend2): + addl %eax,%eax C restore carry from eax +L(L1): movl %edx,(%edi) C store last limb + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h b/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h new file mode 100644 index 0000000..02a0def --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h @@ -0,0 +1,163 @@ +/* Intel P55 gmp-mparam.h -- Compiler/machine parameter header file. 
+ +Copyright 1991, 1993, 1994, 1999-2002, 2004, 2009, 2010 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + + +/* For mpn/x86/pentium/mod_1.asm */ +#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB + + +/* 233MHz P55 */ + +#define MOD_1_NORM_THRESHOLD 5 +#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1U_TO_MOD_1_1_THRESHOLD 12 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 11 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 63 +#define USE_PREINV_DIVREM_1 0 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 51 + +#define MUL_TOOM22_THRESHOLD 16 +#define MUL_TOOM33_THRESHOLD 53 +#define MUL_TOOM44_THRESHOLD 128 +#define MUL_TOOM6H_THRESHOLD 189 +#define MUL_TOOM8H_THRESHOLD 260 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 90 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 20 +#define SQR_TOOM3_THRESHOLD 73 +#define SQR_TOOM4_THRESHOLD 178 +#define SQR_TOOM6_THRESHOLD 210 +#define SQR_TOOM8_THRESHOLD 375 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 12 + +#define MUL_FFT_MODF_THRESHOLD 364 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 364, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ + { 9, 5}, { 19, 6}, { 17, 7}, { 9, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 15, 6}, \ + { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \ + { 47,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 159, 8}, { 319, 9}, { 167,10}, \ + { 95, 9}, { 191, 8}, { 383,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383,12}, { 63,11}, { 127,10}, { 271, 9}, \ + { 543,10}, { 287,11}, { 159,10}, { 351,11}, \ + { 191,10}, { 415,11}, { 223,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 287,10}, { 575,11}, \ + { 351,12}, { 191,11}, { 415,13}, { 127,12}, \ + { 255,11}, { 575,12}, { 319,11}, { 703,12}, \ + { 383,11}, { 831,12}, { 447,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 90 +#define MUL_FFT_THRESHOLD 3520 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 
17, 6}, { 9, 5}, { 19, 6}, \ + { 17, 7}, { 9, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ + { 11, 7}, { 29, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 27, 7}, { 55, 9}, \ + { 15, 8}, { 31, 7}, { 65, 8}, { 43, 9}, \ + { 23, 8}, { 47,10}, { 15, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \ + { 95,10}, { 31, 9}, { 63, 8}, { 127, 9}, \ + { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 63, 9}, { 127, 8}, { 255, 9}, { 135,10}, \ + { 79, 9}, { 159, 8}, { 319,10}, { 95, 9}, \ + { 191,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \ + { 575, 9}, { 303,10}, { 159, 9}, { 319,11}, \ + { 95,10}, { 191, 9}, { 383,10}, { 207,12}, \ + { 63,11}, { 127,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \ + { 351,11}, { 191,10}, { 415,11}, { 223,10}, \ + { 447,12}, { 127,11}, { 255,10}, { 543,11}, \ + { 287,10}, { 607,11}, { 351,12}, { 191,11}, \ + { 479,13}, { 127,12}, { 255,11}, { 575,12}, \ + { 319,11}, { 703,12}, { 383,11}, { 767,12}, \ + { 447,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 96 +#define SQR_FFT_THRESHOLD 5504 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 48 +#define MULLO_MUL_N_THRESHOLD 6633 + +#define DC_DIV_QR_THRESHOLD 43 +#define DC_DIVAPPR_Q_THRESHOLD 170 +#define DC_BDIV_QR_THRESHOLD 43 +#define DC_BDIV_Q_THRESHOLD 110 + +#define INV_MULMOD_BNM1_THRESHOLD 30 +#define INV_NEWTON_THRESHOLD 177 +#define INV_APPR_THRESHOLD 171 + +#define BINV_NEWTON_THRESHOLD 194 +#define REDC_1_TO_REDC_N_THRESHOLD 50 + +#define MU_DIV_QR_THRESHOLD 1142 +#define MU_DIVAPPR_Q_THRESHOLD 1142 +#define MUPI_DIV_QR_THRESHOLD 90 +#define MU_BDIV_QR_THRESHOLD 942 +#define MU_BDIV_Q_THRESHOLD 1017 + +#define MATRIX22_STRASSEN_THRESHOLD 13 +#define HGCD_THRESHOLD 92 +#define GCD_DC_THRESHOLD 283 +#define GCDEXT_DC_THRESHOLD 221 +#define JACOBI_BASE_METHOD 2 + +#define GET_STR_DC_THRESHOLD 18 +#define GET_STR_PRECOMPUTE_THRESHOLD 31 +#define SET_STR_DC_THRESHOLD 490 +#define SET_STR_PRECOMPUTE_THRESHOLD 994 diff --git a/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm b/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm new file mode 100644 index 0000000..72e3196 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm @@ -0,0 +1,40 @@ +dnl Intel P55 mpn_hamdist -- mpn hamming distance. + +dnl Copyright 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C P55: hamdist 12.0 cycles/limb + +C For reference, this code runs at 11.5 cycles/limb for popcount, which is +C slower than the plain integer mpn/x86/pentium/popcount.asm. + +MULFUNC_PROLOGUE(mpn_hamdist) +include_mpn(`x86/k6/mmx/popham.asm') diff --git a/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm b/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm new file mode 100644 index 0000000..04b0ddc --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm @@ -0,0 +1,463 @@ +dnl Intel P5 mpn_lshift -- mpn left shift. + +dnl Copyright 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 1.75 cycles/limb. + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size left by shift many bits and store the result in dst,size. +C Zeros are shifted in at the right. Return the bits shifted out at the +C left. +C +C The comments in mpn_rshift apply here too. 
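+dnl
+dnl  For reference, the contract of mpn_lshift in C (illustrative only;
+dnl  valid for 1 <= shift <= 31, and run from the high limb down, the
+dnl  direction needed to permit dst >= src overlap):
+dnl
+dnl	mp_limb_t
+dnl	ref_lshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned shift)
+dnl	{
+dnl	  mp_limb_t retval = up[n - 1] >> (32 - shift);
+dnl	  for (mp_size_t i = n - 1; i > 0; i--)
+dnl	    rp[i] = (up[i] << shift) | (up[i - 1] >> (32 - shift));
+dnl	  rp[0] = up[0] << shift;
+dnl	  return retval;
+dnl	}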
+ +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl minimum 5, because the unrolled loop can't handle less +deflit(UNROLL_THRESHOLD, 5) + + TEXT + ALIGN(8) + +PROLOGUE(mpn_lshift) + + pushl %ebx + pushl %edi +deflit(`FRAME',8) + + movl PARAM_SIZE, %eax + movl PARAM_DST, %edx + + movl PARAM_SRC, %ebx + movl PARAM_SHIFT, %ecx + + cmp $UNROLL_THRESHOLD, %eax + jae L(unroll) + + movl -4(%ebx,%eax,4), %edi C src high limb + decl %eax + + jnz L(simple) + + shldl( %cl, %edi, %eax) C eax was decremented to zero + + shll %cl, %edi + + movl %edi, (%edx) C dst low limb + popl %edi C risk of data cache bank clash + + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- +L(simple): + C eax size-1 + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd (%ebx,%eax,4), %mm5 C src high limb + + movd %ecx, %mm6 C lshift + negl %ecx + + psllq %mm6, %mm5 + addl $32, %ecx + + movd %ecx, %mm7 + psrlq $32, %mm5 C retval + + +L(simple_top): + C eax counter, limbs, negative + C ebx src + C ecx + C edx dst + C esi + C edi + C + C mm0 scratch + C mm5 return value + C mm6 shift + C mm7 32-shift + + movq -4(%ebx,%eax,4), %mm0 + decl %eax + + psrlq %mm7, %mm0 + + C + + movd %mm0, 4(%edx,%eax,4) + jnz L(simple_top) + + + movd (%ebx), %mm0 + + movd %mm5, %eax + psllq %mm6, %mm0 + + popl %edi + popl %ebx + + movd %mm0, (%edx) + + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(unroll): + C eax size + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd -4(%ebx,%eax,4), %mm5 C src high limb + leal (%ebx,%eax,4), %edi + + movd %ecx, %mm6 C lshift + andl $4, %edi + + psllq %mm6, %mm5 + jz L(start_src_aligned) + + + C src isn't aligned, process high limb separately (marked xxx) to + C make it so. + C + C source -8(ebx,%eax,4) + C | + C +-------+-------+-------+-- + C | | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + C + C dest + C -4(edx,%eax,4) + C | + C +-------+-------+-- + C | xxx | | + C +-------+-------+-- + + movq -8(%ebx,%eax,4), %mm0 C unaligned load + + psllq %mm6, %mm0 + decl %eax + + psrlq $32, %mm0 + + C + + movd %mm0, (%edx,%eax,4) +L(start_src_aligned): + + movq -8(%ebx,%eax,4), %mm1 C src high qword + leal (%edx,%eax,4), %edi + + andl $4, %edi + psrlq $32, %mm5 C return value + + movq -16(%ebx,%eax,4), %mm3 C src second highest qword + jz L(start_dst_aligned) + + C dst isn't aligned, subtract 4 to make it so, and pretend the shift + C is 32 bits extra. High limb of dst (marked xxx) handled here + C separately. + C + C source -8(ebx,%eax,4) + C | + C +-------+-------+-- + C | mm1 | + C +-------+-------+-- + C 0mod8 4mod8 + C + C dest + C -4(edx,%eax,4) + C | + C +-------+-------+-------+-- + C | xxx | | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + + movq %mm1, %mm0 + addl $32, %ecx C new shift + + psllq %mm6, %mm0 + + movd %ecx, %mm6 + psrlq $32, %mm0 + + C wasted cycle here waiting for %mm0 + + movd %mm0, -4(%edx,%eax,4) + subl $4, %edx +L(start_dst_aligned): + + + psllq %mm6, %mm1 + negl %ecx C -shift + + addl $64, %ecx C 64-shift + movq %mm3, %mm2 + + movd %ecx, %mm7 + subl $8, %eax C size-8 + + psrlq %mm7, %mm3 + + por %mm1, %mm3 C mm3 ready to store + jc L(finish) + + + C The comments in mpn_rshift apply here too. 
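+
+	C In outline (an illustrative sketch, not from the sources), with
+	C q[i] denoting aligned 64-bit src qwords, each dst qword formed
+	C below is
+	C
+	C	dst_q[i] = (q[i] << shift) | (q[i-1] >> (64 - shift));
+	C
+	C one psllq, one psrlq and a por, giving two limbs per qword and
+	C two qwords per loop iteration.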
+ + ALIGN(8) +L(unroll_loop): + C eax counter, limbs + C ebx src + C ecx + C edx dst + C esi + C edi + C + C mm0 + C mm1 + C mm2 src qword from 16(%ebx,%eax,4) + C mm3 dst qword ready to store to 24(%edx,%eax,4) + C + C mm5 return value + C mm6 lshift + C mm7 rshift + + movq 8(%ebx,%eax,4), %mm0 + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + movq %mm3, 24(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq (%ebx,%eax,4), %mm3 C + psllq %mm6, %mm1 C + + movq %mm0, 16(%edx,%eax,4) + movq %mm3, %mm2 C + + psrlq %mm7, %mm3 C + subl $4, %eax + + por %mm1, %mm3 C + jnc L(unroll_loop) + + + +L(finish): + C eax -4 to -1 representing respectively 0 to 3 limbs remaining + + testb $2, %al + + jz L(finish_no_two) + + movq 8(%ebx,%eax,4), %mm0 + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + movq %mm3, 24(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq %mm1, %mm2 + movq %mm0, %mm3 + + subl $2, %eax +L(finish_no_two): + + + C eax -4 or -3 representing respectively 0 or 1 limbs remaining + C + C mm2 src prev qword, from 16(%ebx,%eax,4) + C mm3 dst qword, for 24(%edx,%eax,4) + + testb $1, %al + movd %mm5, %eax C retval + + popl %edi + jz L(finish_zero) + + + C One extra src limb, destination was aligned. + C + C source ebx + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edx+12 edx+4 edx + C --+---------------+---------------+-------+ + C | mm3 | | | + C --+---------------+---------------+-------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C One extra src limb, destination was unaligned. + C + C source ebx + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edx+12 edx+4 + C --+---------------+---------------+ + C | mm3 | | + C --+---------------+---------------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword at 4(%edx), and in the aligned case + C there's an extra limb of dst to be formed from that extra src limb + C left shifted. + + + movd (%ebx), %mm0 + psllq %mm6, %mm2 + + movq %mm3, 12(%edx) + psllq $32, %mm0 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm2, %mm0 + psllq %mm6, %mm1 + + movq %mm0, 4(%edx) + psrlq $32, %mm1 + + andl $32, %ecx + popl %ebx + + jz L(finish_one_unaligned) + + movd %mm1, (%edx) +L(finish_one_unaligned): + + emms + + ret + + +L(finish_zero): + + C No extra src limbs, destination was aligned. + C + C source ebx + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edx+8 edx + C --+---------------+---------------+ + C | mm3 | | + C --+---------------+---------------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C No extra src limbs, destination was unaligned. + C + C source ebx + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edx+8 edx+4 + C --+---------------+-------+ + C | mm3 | | + C --+---------------+-------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C The movd for the unaligned case writes the same data to 4(%edx) + C that the movq does for the aligned case. 
+ + + movq %mm3, 8(%edx) + andl $32, %ecx + + psllq %mm6, %mm2 + jz L(finish_zero_unaligned) + + movq %mm2, (%edx) +L(finish_zero_unaligned): + + psrlq $32, %mm2 + popl %ebx + + movd %mm5, %eax C retval + + movd %mm2, 4(%edx) + + emms + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm b/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm new file mode 100644 index 0000000..4ced577 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm @@ -0,0 +1,371 @@ +dnl Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication. + +dnl Copyright 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P5: 12.0 for 32-bit multiplier +C 7.0 for 16-bit multiplier + + +C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier); +C +C When the multiplier is 16 bits some special case MMX code is used. Small +C multipliers might arise reasonably often from mpz_mul_ui etc. If the size +C is odd there's roughly a 5 cycle penalty, so times for say size==7 and +C size==8 end up being quite close. If src isn't aligned to an 8 byte +C boundary then one limb is processed separately with roughly a 5 cycle +C penalty, so in that case it's say size==8 and size==9 which are close. +C +C Alternatives: +C +C MMX is not believed to be of any use for 32-bit multipliers, since for +C instance the current method would just have to be more or less duplicated +C for the high and low halves of the multiplier, and would probably +C therefore run at about 14 cycles, which is slower than the plain integer +C at 12. +C +C Adding the high and low MMX products using integer code seems best. An +C attempt at using paddd and carry bit propagation with pcmpgtd didn't give +C any joy. Perhaps something could be done keeping the values signed and +C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or +C perhaps not. +C +C Future: +C +C An mpn_mul_1c entrypoint would need a double carry out of the low result +C limb in the 16-bit code, unless it could be assumed the carry fits in 16 +C bits, possibly as carry=a + + divl PARAM_DIVISOR + + movl %edx, %eax + + ret + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/pentium/mul_1.asm b/gmp-6.3.0/mpn/x86/pentium/mul_1.asm new file mode 100644 index 0000000..a0858af --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/mul_1.asm @@ -0,0 +1,177 @@ +dnl Intel Pentium mpn_mul_1 -- mpn by limb multiplication. 
+ +dnl Copyright 1992, 1994, 1996, 1999, 2000, 2002 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 12.0 cycles/limb + + +C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier); +C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier, mp_limb_t carry); +C + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_mul_1c) +deflit(`FRAME',0) + + movl PARAM_CARRY, %ecx + pushl %esi FRAME_pushl() + + jmp L(start_1c) + +EPILOGUE() + + + ALIGN(8) +PROLOGUE(mpn_mul_1) +deflit(`FRAME',0) + + xorl %ecx, %ecx + pushl %esi FRAME_pushl() + +L(start_1c): + movl PARAM_SRC, %esi + movl PARAM_SIZE, %eax + + shrl %eax + jnz L(two_or_more) + + + C one limb only + + movl (%esi), %eax + + mull PARAM_MULTIPLIER + + addl %eax, %ecx + movl PARAM_DST, %eax + + adcl $0, %edx + popl %esi + + movl %ecx, (%eax) + movl %edx, %eax + + ret + + +L(two_or_more): + C eax size/2 + C ebx + C ecx carry + C edx + C esi src + C edi + C ebp + + pushl %edi FRAME_pushl() + pushl %ebx FRAME_pushl() + + movl PARAM_DST, %edi + leal -1(%eax), %ebx C size/2-1 + + notl %ebx C -size, preserve carry + + leal (%esi,%eax,8), %esi C src end + leal (%edi,%eax,8), %edi C dst end + + pushl %ebp FRAME_pushl() + jnc L(top) + + + C size was odd, process one limb separately + + movl (%esi,%ebx,8), %eax + addl $4, %esi + + mull PARAM_MULTIPLIER + + addl %ecx, %eax + movl %edx, %ecx + + movl %eax, (%edi,%ebx,8) + leal 4(%edi), %edi + + +L(top): + C eax + C ebx counter, negative + C ecx carry + C edx + C esi src end + C edi dst end + C ebp + + adcl $0, %ecx + movl (%esi,%ebx,8), %eax + + mull PARAM_MULTIPLIER + + movl %edx, %ebp + addl %eax, %ecx + + adcl $0, %ebp + movl 4(%esi,%ebx,8), %eax + + mull PARAM_MULTIPLIER + + movl %ecx, (%edi,%ebx,8) + addl %ebp, %eax + + movl %eax, 4(%edi,%ebx,8) + incl %ebx + + movl %edx, %ecx + jnz L(top) + + + adcl $0, %ecx + popl %ebp + + movl %ecx, %eax + popl %ebx + + popl %edi + popl %esi + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium/mul_2.asm b/gmp-6.3.0/mpn/x86/pentium/mul_2.asm new file mode 100644 index 0000000..4c7beb5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/mul_2.asm @@ -0,0 +1,150 @@ +dnl Intel Pentium mpn_mul_2 -- mpn by 2-limb multiplication. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 24.0 cycles/limb + + +C mp_limb_t mpn_mul_2 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_srcptr mult); +C +C At 24 c/l this is only 2 cycles faster than a separate mul_1 and addmul_1, +C but has the advantage of making just one pass over the operands. +C +C There's not enough registers to use PARAM_MULT directly, so the multiplier +C limbs are transferred to local variables on the stack. + +defframe(PARAM_MULT, 16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(VAR_MULT_LOW, `PARAM_SRC') +define(VAR_MULT_HIGH,`PARAM_DST') + + TEXT + ALIGN(8) +PROLOGUE(mpn_mul_2) +deflit(`FRAME',0) + + pushl %esi FRAME_pushl() + pushl %edi FRAME_pushl() + + movl PARAM_SRC, %esi + movl PARAM_DST, %edi + + movl PARAM_MULT, %eax + movl PARAM_SIZE, %ecx + + movl 4(%eax), %edx C mult high + movl (%eax), %eax C mult low + + movl %eax, VAR_MULT_LOW + movl %edx, VAR_MULT_HIGH + + pushl %ebx FRAME_pushl() + pushl %ebp FRAME_pushl() + + mull (%esi) C src[0] * mult[0] + + movl %eax, %ebp C in case src==dst + movl (%esi), %eax C src[0] + + movl %ebp, (%edi) C dst[0] + movl %edx, %ebx C initial low carry + + xorl %ebp, %ebp C initial high carry + leal (%edi,%ecx,4), %edi C dst end + + mull VAR_MULT_HIGH C src[0] * mult[1] + + subl $2, %ecx C size-2 + js L(done) + + leal 8(%esi,%ecx,4), %esi C &src[size] + xorl $-1, %ecx C -(size-1) + + + +L(top): + C eax low prod + C ebx low carry + C ecx counter, negative + C edx high prod + C esi src end + C edi dst end + C ebp high carry (0 or -1) + + andl $1, %ebp C 1 or 0 + addl %eax, %ebx + + adcl %edx, %ebp + ASSERT(nc) + movl (%esi,%ecx,4), %eax + + mull VAR_MULT_LOW + + addl %eax, %ebx C low carry + movl (%esi,%ecx,4), %eax + + adcl %ebp, %edx C high carry + movl %ebx, (%edi,%ecx,4) + + sbbl %ebp, %ebp C new high carry, -1 or 0 + movl %edx, %ebx C new low carry + + mull VAR_MULT_HIGH + + incl %ecx + jnz L(top) + + +L(done): + andl $1, %ebp C 1 or 0 + addl %ebx, %eax + + adcl %ebp, %edx + ASSERT(nc) + movl %eax, (%edi) C store carry low + + movl %edx, %eax C return carry high + + popl %ebp + popl %ebx + + popl %edi + popl %esi + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium/mul_basecase.asm b/gmp-6.3.0/mpn/x86/pentium/mul_basecase.asm new file mode 100644 index 0000000..e1d0f05 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/mul_basecase.asm @@ -0,0 +1,142 @@ +dnl Intel Pentium mpn_mul_basecase -- mpn by mpn multiplication. + +dnl Copyright 1996, 1998-2000, 2002 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 14.2 cycles/crossproduct (approx) + + +C void mpn_mul_basecase (mp_ptr wp, +C mp_srcptr xp, mp_size_t xsize, +C mp_srcptr yp, mp_size_t ysize); + +defframe(PARAM_YSIZE, 20) +defframe(PARAM_YP, 16) +defframe(PARAM_XSIZE, 12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + +defframe(VAR_COUNTER, -4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_mul_basecase) + + pushl %eax C dummy push for allocating stack slot + pushl %esi + pushl %ebp + pushl %edi +deflit(`FRAME',16) + + movl PARAM_XP,%esi + movl PARAM_WP,%edi + movl PARAM_YP,%ebp + + movl (%esi),%eax C load xp[0] + mull (%ebp) C multiply by yp[0] + movl %eax,(%edi) C store to wp[0] + movl PARAM_XSIZE,%ecx C xsize + decl %ecx C If xsize = 1, ysize = 1 too + jz L(done) + + movl PARAM_XSIZE,%eax + pushl %ebx +FRAME_pushl() + movl %edx,%ebx + leal (%esi,%eax,4),%esi C make xp point at end + leal (%edi,%eax,4),%edi C offset wp by xsize + negl %ecx C negate j size/index for inner loop + xorl %eax,%eax C clear carry + + ALIGN(8) +L(oop1): adcl $0,%ebx + movl (%esi,%ecx,4),%eax C load next limb at xp[j] + mull (%ebp) + addl %ebx,%eax + movl %eax,(%edi,%ecx,4) + incl %ecx + movl %edx,%ebx + jnz L(oop1) + + adcl $0,%ebx + movl PARAM_YSIZE,%eax + movl %ebx,(%edi) C most significant limb of product + addl $4,%edi C increment wp + decl %eax + jz L(skip) + movl %eax,VAR_COUNTER C set index i to ysize + +L(outer): + addl $4,%ebp C make ebp point to next y limb + movl PARAM_XSIZE,%ecx + negl %ecx + xorl %ebx,%ebx + + C code at 0x61 here, close enough to aligned +L(oop2): + adcl $0,%ebx + movl (%esi,%ecx,4),%eax + mull (%ebp) + addl %ebx,%eax + movl (%edi,%ecx,4),%ebx + adcl $0,%edx + addl %eax,%ebx + movl %ebx,(%edi,%ecx,4) + incl %ecx + movl %edx,%ebx + jnz L(oop2) + + adcl $0,%ebx + + movl %ebx,(%edi) + addl $4,%edi + movl VAR_COUNTER,%eax + decl %eax + movl %eax,VAR_COUNTER + jnz L(outer) + +L(skip): + popl %ebx + popl %edi + popl %ebp + popl %esi + addl $4,%esp + ret + +L(done): + movl %edx,4(%edi) C store to wp[1] + popl %edi + popl %ebp + popl %esi + popl %eax C dummy pop for deallocating stack slot + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium/popcount.asm b/gmp-6.3.0/mpn/x86/pentium/popcount.asm new file mode 100644 index 0000000..0e82144 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/popcount.asm @@ -0,0 +1,146 @@ +dnl Intel P5 mpn_popcount -- mpn bit population count. + +dnl Copyright 2001, 2002, 2014, 2015 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: 8.0 cycles/limb + + +C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); +C +C An arithmetic approach has been found to be slower than the table lookup, +C due to needing too many instructions. + +C The slightly strange quoting here helps the renaming done by tune/many.pl. +deflit(TABLE_NAME, +m4_assert_defined(`GSYM_PREFIX') +GSYM_PREFIX`'mpn_popcount``'_table') + +C FIXME: exporting the table to hamdist is incorrect as it hurt incremental +C linking. + + RODATA + ALIGN(8) + GLOBL TABLE_NAME +TABLE_NAME: +forloop(i,0,255, +` .byte m4_popcount(i) +') + +defframe(PARAM_SIZE,8) +defframe(PARAM_SRC, 4) + + TEXT + ALIGN(8) + +PROLOGUE(mpn_popcount) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + pushl %esi FRAME_pushl() + +ifdef(`PIC',` + pushl %ebx FRAME_pushl() + pushl %ebp FRAME_pushl() +ifdef(`DARWIN',` + shll %ecx C size in byte pairs + LEA( TABLE_NAME, %ebp) + movl PARAM_SRC, %esi + xorl %eax, %eax C total + xorl %ebx, %ebx C byte + xorl %edx, %edx C byte +',` + call L(here) +L(here): + popl %ebp + shll %ecx C size in byte pairs + + addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp + movl PARAM_SRC, %esi + + xorl %eax, %eax C total + xorl %ebx, %ebx C byte + + movl TABLE_NAME@GOT(%ebp), %ebp + xorl %edx, %edx C byte +') +define(TABLE,`(%ebp,$1)') +',` +dnl non-PIC + shll %ecx C size in byte pairs + movl PARAM_SRC, %esi + + pushl %ebx FRAME_pushl() + xorl %eax, %eax C total + + xorl %ebx, %ebx C byte + xorl %edx, %edx C byte + +define(TABLE,`TABLE_NAME`'($1)') +') + + + ALIGN(8) C necessary on P55 for claimed speed +L(top): + C eax total + C ebx byte + C ecx counter, 2*size to 2 + C edx byte + C esi src + C edi + C ebp [PIC] table + + addl %ebx, %eax + movb -1(%esi,%ecx,2), %bl + + addl %edx, %eax + movb -2(%esi,%ecx,2), %dl + + movb TABLE(%ebx), %bl + decl %ecx + + movb TABLE(%edx), %dl + jnz L(top) + + +ifdef(`PIC',` + popl %ebp +') + addl %ebx, %eax + popl %ebx + + addl %edx, %eax + popl %esi + + ret + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/pentium/rshift.asm b/gmp-6.3.0/mpn/x86/pentium/rshift.asm new file mode 100644 index 0000000..2105c4c --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/rshift.asm @@ -0,0 +1,243 @@ +dnl Intel Pentium mpn_rshift -- mpn right shift. + +dnl Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C          cycles/limb
+C P5,P54:     6.0
+C P55:        5.375
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C The main shift-by-N loop should run at 5.375 c/l, and that's what P55 does,
+C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_rshift)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+deflit(`FRAME',16)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC,%esi
+	movl	PARAM_SIZE,%ebp
+	movl	PARAM_SHIFT,%ecx
+
+C We can use faster code for shift-by-1 under certain conditions.
+	cmp	$1,%ecx
+	jne	L(normal)
+	leal	4(%edi),%eax
+	cmpl	%esi,%eax
+	jnc	L(special)		C jump if res_ptr + 1 >= s_ptr
+	leal	(%edi,%ebp,4),%eax
+	cmpl	%eax,%esi
+	jnc	L(special)		C jump if s_ptr >= res_ptr + size
+
+L(normal):
+	movl	(%esi),%edx
+	addl	$4,%esi
+	xorl	%eax,%eax
+	shrdl(	%cl, %edx, %eax)	C compute carry limb
+	pushl	%eax			C push carry limb onto stack
+
+	decl	%ebp
+	pushl	%ebp
+	shrl	$3,%ebp
+	jz	L(end)
+
+	movl	(%edi),%eax		C fetch destination cache line
+
+	ALIGN(4)
+L(oop):	movl	28(%edi),%eax		C fetch destination cache line
+	movl	%edx,%ebx
+
+	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	shrdl(	%cl, %eax, %ebx)
+	shrdl(	%cl, %edx, %eax)
+	movl	%ebx,(%edi)
+	movl	%eax,4(%edi)
+
+	movl	8(%esi),%ebx
+	movl	12(%esi),%eax
+	shrdl(	%cl, %ebx, %edx)
+	shrdl(	%cl, %eax, %ebx)
+	movl	%edx,8(%edi)
+	movl	%ebx,12(%edi)
+
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebx
+	shrdl(	%cl, %edx, %eax)
+	shrdl(	%cl, %ebx, %edx)
+	movl	%eax,16(%edi)
+	movl	%edx,20(%edi)
+
+	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	shrdl(	%cl, %eax, %ebx)
+	shrdl(	%cl, %edx, %eax)
+	movl	%ebx,24(%edi)
+	movl	%eax,28(%edi)
+
+	addl	$32,%esi
+	addl	$32,%edi
+	decl	%ebp
+	jnz	L(oop)
+
+L(end):	popl	%ebp
+	andl	$7,%ebp
+	jz	L(end2)
+L(oop2):
+	movl	(%esi),%eax
+	shrdl(	%cl,%eax,%edx)		C compute result limb
+	movl	%edx,(%edi)
+	movl	%eax,%edx
+	addl	$4,%esi
+	addl	$4,%edi
+	decl	%ebp
+	jnz	L(oop2)
+
+L(end2):
+	shrl	%cl,%edx		C compute most significant limb
+	movl	%edx,(%edi)		C store it
+
+	popl	%eax			C pop carry limb
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+
+C The faster shift-by-1 code below loops from the most significant end of
+C the arrays, which is only permissible for the operand overlaps tested at
+C the top of the function, since mpn_rshift is documented to work for
+C overlapping source and destination.
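+C
+C (For contrast, in illustrative C terms -- a sketch, not GMP source code --
+C the normal loop above computes, for increasing i,
+C
+C	dst[i] = (src[i] >> shift) | (src[i+1] << (32 - shift));
+C
+C reading one limb ahead of each store, hence its different overlap
+C requirements.)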
+ +L(special): + leal -4(%edi,%ebp,4),%edi + leal -4(%esi,%ebp,4),%esi + + movl (%esi),%edx + subl $4,%esi + + decl %ebp + pushl %ebp + shrl $3,%ebp + + shrl %edx + incl %ebp + decl %ebp + jz L(Lend) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(Loop): + movl -28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl -4(%esi),%edx + rcrl %eax + movl %ebx,(%edi) + rcrl %edx + movl %eax,-4(%edi) + + movl -8(%esi),%ebx + movl -12(%esi),%eax + rcrl %ebx + movl %edx,-8(%edi) + rcrl %eax + movl %ebx,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebx + rcrl %edx + movl %eax,-16(%edi) + rcrl %ebx + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + rcrl %eax + movl %ebx,-24(%edi) + rcrl %edx + movl %eax,-28(%edi) + + leal -32(%esi),%esi C use leal not to clobber carry + leal -32(%edi),%edi + decl %ebp + jnz L(Loop) + +L(Lend): + popl %ebp + sbbl %eax,%eax C save carry in %eax + andl $7,%ebp + jz L(Lend2) + addl %eax,%eax C restore carry from eax +L(Loop2): + movl %edx,%ebx + movl (%esi),%edx + rcrl %edx + movl %ebx,(%edi) + + leal -4(%esi),%esi C use leal not to clobber carry + leal -4(%edi),%edi + decl %ebp + jnz L(Loop2) + + jmp L(L1) +L(Lend2): + addl %eax,%eax C restore carry from eax +L(L1): movl %edx,(%edi) C store last limb + + movl $0,%eax + rcrl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium/sqr_basecase.asm b/gmp-6.3.0/mpn/x86/pentium/sqr_basecase.asm new file mode 100644 index 0000000..b11d767 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium/sqr_basecase.asm @@ -0,0 +1,528 @@ +dnl Intel P5 mpn_sqr_basecase -- square an mpn number. + +dnl Copyright 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular +C product at around 20x20 limbs. + + +C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Calculate src,size squared, storing the result in dst,2*size. +C +C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a +C lot of function call overheads are avoided, especially when the size is +C small. 
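+C
+C In mpn terms the scheme is, for size n >= 2 (an illustrative sketch, not
+C code from the GMP sources; i is an mp_size_t):
+C
+C	/* cross products src[i]*src[j], i<j, into dst[1..2*n-2] */
+C	dst[n] = mpn_mul_1 (dst+1, src+1, n-1, src[0]);
+C	for (i = 1; i < n-1; i++)
+C	  dst[n+i] = mpn_addmul_1 (dst+2*i+1, src+i+1, n-1-i, src[i]);
+C
+C	/* double them, since each src[i]*src[j] counts twice in the square */
+C	dst[2*n-1] = mpn_lshift (dst+1, dst+1, 2*n-2, 1);
+C
+C	/* then add src[i]^2 into dst[2*i],dst[2*i+1] with carry propagation;
+C	   dst[0] just receives the low limb of src[0]^2 */
+C
+C The code below inlines those loops, handling the last two cross products
+C as a special "corner" case.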
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_sqr_basecase) +deflit(`FRAME',0) + + movl PARAM_SIZE, %edx + movl PARAM_SRC, %eax + + cmpl $2, %edx + movl PARAM_DST, %ecx + + je L(two_limbs) + + movl (%eax), %eax + ja L(three_or_more) + +C ----------------------------------------------------------------------------- +C one limb only + C eax src + C ebx + C ecx dst + C edx + + mull %eax + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + + ret + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(two_limbs): + C eax src + C ebx + C ecx dst + C edx size + + pushl %ebp + pushl %edi + + pushl %esi + pushl %ebx + + movl %eax, %ebx + movl (%eax), %eax + + mull %eax C src[0]^2 + + movl %eax, (%ecx) C dst[0] + movl %edx, %esi C dst[1] + + movl 4(%ebx), %eax + + mull %eax C src[1]^2 + + movl %eax, %edi C dst[2] + movl %edx, %ebp C dst[3] + + movl (%ebx), %eax + + mull 4(%ebx) C src[0]*src[1] + + addl %eax, %esi + popl %ebx + + adcl %edx, %edi + + adcl $0, %ebp + addl %esi, %eax + + adcl %edi, %edx + movl %eax, 4(%ecx) + + adcl $0, %ebp + popl %esi + + movl %edx, 8(%ecx) + movl %ebp, 12(%ecx) + + popl %edi + popl %ebp + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(three_or_more): + C eax src low limb + C ebx + C ecx dst + C edx size + + cmpl $4, %edx + pushl %ebx +deflit(`FRAME',4) + + movl PARAM_SRC, %ebx + jae L(four_or_more) + + +C ----------------------------------------------------------------------------- +C three limbs + C eax src low limb + C ebx src + C ecx dst + C edx size + + pushl %ebp + pushl %edi + + mull %eax C src[0] ^ 2 + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + + movl 4(%ebx), %eax + xorl %ebp, %ebp + + mull %eax C src[1] ^ 2 + + movl %eax, 8(%ecx) + movl %edx, 12(%ecx) + + movl 8(%ebx), %eax + pushl %esi C risk of cache bank clash + + mull %eax C src[2] ^ 2 + + movl %eax, 16(%ecx) + movl %edx, 20(%ecx) + + movl (%ebx), %eax + + mull 4(%ebx) C src[0] * src[1] + + movl %eax, %esi + movl %edx, %edi + + movl (%ebx), %eax + + mull 8(%ebx) C src[0] * src[2] + + addl %eax, %edi + movl %edx, %ebp + + adcl $0, %ebp + movl 4(%ebx), %eax + + mull 8(%ebx) C src[1] * src[2] + + xorl %ebx, %ebx + addl %eax, %ebp + + C eax + C ebx zero, will be dst[5] + C ecx dst + C edx dst[4] + C esi dst[1] + C edi dst[2] + C ebp dst[3] + + adcl $0, %edx + addl %esi, %esi + + adcl %edi, %edi + + adcl %ebp, %ebp + + adcl %edx, %edx + movl 4(%ecx), %eax + + adcl $0, %ebx + addl %esi, %eax + + movl %eax, 4(%ecx) + movl 8(%ecx), %eax + + adcl %edi, %eax + movl 12(%ecx), %esi + + adcl %ebp, %esi + movl 16(%ecx), %edi + + movl %eax, 8(%ecx) + movl %esi, 12(%ecx) + + adcl %edx, %edi + popl %esi + + movl 20(%ecx), %eax + movl %edi, 16(%ecx) + + popl %edi + popl %ebp + + adcl %ebx, %eax C no carry out of this + popl %ebx + + movl %eax, 20(%ecx) + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(four_or_more): + C eax src low limb + C ebx src + C ecx dst + C edx size + C esi + C edi + C ebp + C + C First multiply src[0]*src[1..size-1] and store at dst[1..size]. 
+ +deflit(`FRAME',4) + + pushl %edi +FRAME_pushl() + pushl %esi +FRAME_pushl() + + pushl %ebp +FRAME_pushl() + leal (%ecx,%edx,4), %edi C dst end of this mul1 + + leal (%ebx,%edx,4), %esi C src end + movl %ebx, %ebp C src + + negl %edx C -size + xorl %ebx, %ebx C clear carry limb and carry flag + + leal 1(%edx), %ecx C -(size-1) + +L(mul1): + C eax scratch + C ebx carry + C ecx counter, negative + C edx scratch + C esi &src[size] + C edi &dst[size] + C ebp src + + adcl $0, %ebx + movl (%esi,%ecx,4), %eax + + mull (%ebp) + + addl %eax, %ebx + + movl %ebx, (%edi,%ecx,4) + incl %ecx + + movl %edx, %ebx + jnz L(mul1) + + + C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for + C n=1..size-2. + C + C The last two products, which are the end corner of the product + C triangle, are handled separately to save looping overhead. These + C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1]. + C If size is 4 then it's only these that need to be done. + C + C In the outer loop %esi is a constant, and %edi just advances by 1 + C limb each time. The size of the operation decreases by 1 limb + C each time. + + C eax + C ebx carry (needing carry flag added) + C ecx + C edx + C esi &src[size] + C edi &dst[size] + C ebp + + adcl $0, %ebx + movl PARAM_SIZE, %edx + + movl %ebx, (%edi) + subl $4, %edx + + negl %edx + jz L(corner) + + +L(outer): + C ebx previous carry limb to store + C edx outer loop counter (negative) + C esi &src[size] + C edi dst, pointing at stored carry limb of previous loop + + pushl %edx C new outer loop counter + leal -2(%edx), %ecx + + movl %ebx, (%edi) + addl $4, %edi + + addl $4, %ebp + xorl %ebx, %ebx C initial carry limb, clear carry flag + +L(inner): + C eax scratch + C ebx carry (needing carry flag added) + C ecx counter, negative + C edx scratch + C esi &src[size] + C edi dst end of this addmul + C ebp &src[j] + + adcl $0, %ebx + movl (%esi,%ecx,4), %eax + + mull (%ebp) + + addl %ebx, %eax + movl (%edi,%ecx,4), %ebx + + adcl $0, %edx + addl %eax, %ebx + + movl %ebx, (%edi,%ecx,4) + incl %ecx + + movl %edx, %ebx + jnz L(inner) + + + adcl $0, %ebx + popl %edx C outer loop counter + + incl %edx + jnz L(outer) + + + movl %ebx, (%edi) + +L(corner): + C esi &src[size] + C edi &dst[2*size-4] + + movl -8(%esi), %eax + movl -4(%edi), %ebx C risk of data cache bank clash here + + mull -12(%esi) C src[size-2]*src[size-3] + + addl %eax, %ebx + movl %edx, %ecx + + adcl $0, %ecx + movl -4(%esi), %eax + + mull -12(%esi) C src[size-1]*src[size-3] + + addl %ecx, %eax + movl (%edi), %ecx + + adcl $0, %edx + movl %ebx, -4(%edi) + + addl %eax, %ecx + movl %edx, %ebx + + adcl $0, %ebx + movl -4(%esi), %eax + + mull -8(%esi) C src[size-1]*src[size-2] + + movl %ecx, (%edi) + addl %eax, %ebx + + adcl $0, %edx + movl PARAM_SIZE, %eax + + negl %eax + movl %ebx, 4(%edi) + + addl $1, %eax C -(size-1) and clear carry + movl %edx, 8(%edi) + + +C ----------------------------------------------------------------------------- +C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1]. 
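+C (The doubling is needed because each cross product src[i]*src[j], i<j,
+C appears twice in the square.)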
+
+L(lshift):
+	C eax	counter, negative
+	C ebx	next limb
+	C ecx
+	C edx
+	C esi
+	C edi	&dst[2*size-4]
+	C ebp
+
+	movl	12(%edi,%eax,8), %ebx
+
+	rcll	%ebx
+	movl	16(%edi,%eax,8), %ecx
+
+	rcll	%ecx
+	movl	%ebx, 12(%edi,%eax,8)
+
+	movl	%ecx, 16(%edi,%eax,8)
+	incl	%eax
+
+	jnz	L(lshift)
+
+
+	adcl	%eax, %eax		C high bit out
+	movl	PARAM_SRC, %esi
+
+	movl	PARAM_SIZE, %ecx	C risk of cache bank clash
+	movl	%eax, 12(%edi)		C dst most significant limb
+
+
+C -----------------------------------------------------------------------------
+C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
+C src[size-1]^2.  dst[0] hasn't been set yet, and just gets the low limb of
+C src[0]^2.
+
+	movl	(%esi), %eax		C src[0]
+	leal	(%esi,%ecx,4), %esi	C src end
+
+	negl	%ecx
+
+	mull	%eax
+
+	movl	%eax, 16(%edi,%ecx,8)	C dst[0]
+	movl	%edx, %ebx
+
+	addl	$1, %ecx		C size-1 and clear carry
+
+L(diag):
+	C eax	scratch (low product)
+	C ebx	carry limb
+	C ecx	counter, negative
+	C edx	scratch (high product)
+	C esi	&src[size]
+	C edi	&dst[2*size-4]
+	C ebp	scratch (fetched dst limbs)
+
+	movl	(%esi,%ecx,4), %eax
+	adcl	$0, %ebx
+
+	mull	%eax
+
+	movl	16-4(%edi,%ecx,8), %ebp
+
+	addl	%ebp, %ebx
+	movl	16(%edi,%ecx,8), %ebp
+
+	adcl	%eax, %ebp
+	movl	%ebx, 16-4(%edi,%ecx,8)
+
+	movl	%ebp, 16(%edi,%ecx,8)
+	incl	%ecx
+
+	movl	%edx, %ebx
+	jnz	L(diag)
+
+
+	adcl	$0, %edx
+	movl	16-4(%edi), %eax	C dst most significant limb
+
+	addl	%eax, %edx
+	popl	%ebp
+
+	movl	%edx, 16-4(%edi)
+	popl	%esi			C risk of cache bank clash
+
+	popl	%edi
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/pentium4/README b/gmp-6.3.0/mpn/x86/pentium4/README
new file mode 100644
index 0000000..90f752e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/README
@@ -0,0 +1,124 @@
+Copyright 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+                   INTEL PENTIUM-4 MPN SUBROUTINES
+
+
+This directory contains mpn functions optimized for Intel Pentium-4.
+
+The mmx subdirectory has routines using MMX instructions, and the sse2
+subdirectory has routines using SSE2 instructions.  All P4s have these; the
+separate directories are just so configure can omit that code if the
+assembler doesn't support it.
+
+
+STATUS
+
+                                cycles/limb
+
+	mpn_add_n/sub_n      4 normal, 6 in-place
+
+	mpn_mul_1            4 normal, 6 in-place
+	mpn_addmul_1         6
+	mpn_submul_1         7
+
+	mpn_mul_basecase     6 cycles/crossproduct (approx)
+
+	mpn_sqr_basecase     3.5 cycles/crossproduct (approx)
+	                     or 7.0 cycles/triangleproduct (approx)
+
+	mpn_l/rshift         1.75
+
+
+
+The shifts ought to be able to go at 1.5 c/l, but not much effort has been
+applied to them yet.
+
+In-place operations, and all addmul, submul, mul_basecase and sqr_basecase
+calls, suffer from pipeline anomalies associated with write combining and
+movd reads and writes to the same or nearby locations.  The movq
+instructions do not trigger the same hardware problems.  Unfortunately,
+using movq and splitting/combining seems to require too many extra
+instructions to help.  Perhaps future chip steppings will be better.
+
+
+
+NOTES
+
+The Pentium-4 "Netburst" pipeline provides quite a number of surprises.
+Many traditional x86 instructions run very slowly, requiring use of
+alternative instructions for acceptable performance.
+
+adcl and sbbl are quite slow at 8 cycles for reg->reg.  paddq of 32-bit
+values within a 64-bit mmx register seems better, though the combination
+paddq/psrlq when propagating a carry is still a 4 cycle latency.
+
+incl and decl should be avoided; instead use add $1 and sub $1.  Apparently
+the carry flag is not separately renamed, so incl and decl depend on all
+previous flags-setting instructions.
+
+shll and shrl have a 4 cycle latency, or 8 times the latency of the fastest
+integer instructions (addl, subl, orl, andl, and some more).  shldl and
+shrdl seem to have 13 and 15 cycle latencies, respectively.  Bizarre.
+
+movq mmx -> mmx does have a 6 cycle latency, as noted in the documentation.
+A pxor/por or similar combination at 2 cycles latency can be used instead.
+The movq, however, executes in the float unit, thereby saving MMX execution
+resources.  With the right juggling, data moves shouldn't be on a dependent
+chain.
+
+L1 is write-through, but the write-combining sounds like it does enough to
+not require explicit destination prefetching.
+
+xmm registers so far haven't found a use, but not much effort has been
+expended.  A configure test for whether the operating system knows
+fxsave/fxrstor will be needed if they're used.
+
+
+
+REFERENCES
+
+Intel Pentium-4 processor manuals,
+
+	http://developer.intel.com/design/pentium4/manuals
+
+"Intel Pentium 4 Processor Optimization Reference Manual", Intel, 2001,
+order number 248966.  Available on-line:
+
+	http://developer.intel.com/design/pentium4/manuals/248966.htm
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/gmp-6.3.0/mpn/x86/pentium4/copyd.asm b/gmp-6.3.0/mpn/x86/pentium4/copyd.asm
new file mode 100644
index 0000000..82af81c
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/copyd.asm
@@ -0,0 +1,71 @@
+dnl Pentium-4 mpn_copyd -- copy limb vector, decrementing.
+
+dnl Copyright 1999-2001 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl The std/rep/movsl/cld is very slow for small blocks on pentium4.  Its
+dnl startup time seems to be about 165 cycles.  It then needs 2.6 c/l.
+dnl We therefore use an open-coded 2 c/l copying loop.
+
+dnl Ultimately, we may want to use 64-bit movq or 128-bit movdqu in some
+dnl nifty unrolled arrangement.  Clearly, that could reach much higher
+dnl speeds, at least for large blocks.
+
+include(`../config.m4')
+
+
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_copyd)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_DST, %edx
+	movl	%ebx, PARAM_SIZE	C save ebx in the spare parameter space
+	addl	$-1, %ecx
+	js	L(end)
+
+L(loop):
+	movl	(%eax,%ecx,4), %ebx
+	movl	%ebx, (%edx,%ecx,4)
+	addl	$-1, %ecx
+
+	jns	L(loop)
+L(end):
+	movl	PARAM_SIZE, %ebx	C restore ebx
+	ret
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/pentium4/copyi.asm b/gmp-6.3.0/mpn/x86/pentium4/copyi.asm
new file mode 100644
index 0000000..b614887
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/copyi.asm
@@ -0,0 +1,93 @@
+dnl Pentium-4 mpn_copyi -- copy limb vector, incrementing.
+
+dnl Copyright 1999-2001 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl The rep/movsl is very slow for small blocks on pentium4.  Its startup
+dnl time seems to be about 110 cycles.  It then copies at a rate of one
+dnl limb per cycle.  We therefore fall back to an open-coded 2 c/l copying
+dnl loop for smaller sizes.
+
+dnl Ultimately, we may want to use 64-bit movq or 128-bit movdqu in some
+dnl nifty unrolled arrangement.  Clearly, that could reach much higher
+dnl speeds, at least for large blocks.
+ +include(`../config.m4') + + +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) + +PROLOGUE(mpn_copyi) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + cmpl $150, %ecx + jg L(replmovs) + + movl PARAM_SRC, %eax + movl PARAM_DST, %edx + movl %ebx, PARAM_SIZE + testl %ecx, %ecx + jz L(end) + +L(loop): + movl (%eax), %ebx + leal 4(%eax), %eax + addl $-1, %ecx + movl %ebx, (%edx) + leal 4(%edx), %edx + + jnz L(loop) + +L(end): + movl PARAM_SIZE, %ebx + ret + +L(replmovs): + cld C better safe than sorry, see mpn/x86/README + + movl %esi, %eax + movl PARAM_SRC, %esi + movl %edi, %edx + movl PARAM_DST, %edi + + rep + movsl + + movl %eax, %esi + movl %edx, %edi + + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium4/mmx/lshift.asm b/gmp-6.3.0/mpn/x86/pentium4/mmx/lshift.asm new file mode 100644 index 0000000..b5eca66 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/mmx/lshift.asm @@ -0,0 +1,39 @@ +dnl Intel Pentium-4 mpn_lshift -- left shift. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P4 Willamette, Northwood: 1.75 cycles/limb +C P4 Prescott: 2.0 cycles/limb + + +MULFUNC_PROLOGUE(mpn_lshift) +include_mpn(`x86/pentium/mmx/lshift.asm') diff --git a/gmp-6.3.0/mpn/x86/pentium4/mmx/popham.asm b/gmp-6.3.0/mpn/x86/pentium4/mmx/popham.asm new file mode 100644 index 0000000..9563cb5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/mmx/popham.asm @@ -0,0 +1,203 @@ +dnl Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and +dnl hamming distance. + +dnl Copyright 2000-2002, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C popcount hamdist +C P3 model 9 (Banias) ? ? +C P3 model 13 (Dothan) 6 6 +C P4 model 0 (Willamette) +C P4 model 1 (?) +C P4 model 2 (Northwood) 8 9 +C P4 model 3 (Prescott) 8 9 +C P4 model 4 (Nocona) + +C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); +C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size); +C +C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided. +C Two movd's and a punpckldq seems to be the same speed as an aligned movq, +C and using them saves fiddling about with alignment testing on entry. +C +C For popcount there's 13 mmx instructions in the loop, so perhaps 6.5 c/l +C might be possible, but 8 c/l relying on out-of-order execution is already +C quite reasonable. + +ifdef(`OPERATION_popcount',, +`ifdef(`OPERATION_hamdist',, +`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined +')')') + +define(HAM, +m4_assert_numargs(1) +`ifdef(`OPERATION_hamdist',`$1')') + +define(POP, +m4_assert_numargs(1) +`ifdef(`OPERATION_popcount',`$1')') + +HAM(` +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC2, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_hamdist) +') +POP(` +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_popcount) +') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) + + +ifdef(`PIC',,` + dnl non-PIC + RODATA + ALIGN(8) +L(rodata_AAAAAAAAAAAAAAAA): + .long 0xAAAAAAAA + .long 0xAAAAAAAA +L(rodata_3333333333333333): + .long 0x33333333 + .long 0x33333333 +L(rodata_0F0F0F0F0F0F0F0F): + .long 0x0F0F0F0F + .long 0x0F0F0F0F +') + + TEXT + ALIGN(16) + +PROLOGUE(M4_function) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %eax + +ifdef(`PIC',` + movl $0xAAAAAAAA, %edx + movd %edx, %mm7 + punpckldq %mm7, %mm7 + + movl $0x33333333, %edx + movd %edx, %mm6 + punpckldq %mm6, %mm6 + + movl $0x0F0F0F0F, %edx + movd %edx, %mm5 + punpckldq %mm5, %mm5 + +HAM(` movl PARAM_SRC2, %edx') + +',` + dnl non-PIC +HAM(` movl PARAM_SRC2, %edx') + movq L(rodata_AAAAAAAAAAAAAAAA), %mm7 + movq L(rodata_3333333333333333), %mm6 + movq L(rodata_0F0F0F0F0F0F0F0F), %mm5 +') + + pxor %mm4, %mm4 C zero + pxor %mm0, %mm0 C total + + subl $1, %ecx + ja L(top) + +L(last): + movd (%eax,%ecx,4), %mm1 C src high limb +HAM(` movd (%edx,%ecx,4), %mm2 + pxor %mm2, %mm1 +') + jmp L(loaded) + + +L(top): + C eax src + C ebx + C ecx counter, size-1 to 2 or 1, inclusive + C edx [hamdist] src2 + C + C mm0 total (low dword) + C mm1 (scratch) + C mm2 (scratch) + C mm3 + C mm4 0x0000000000000000 + C mm5 0x0F0F0F0F0F0F0F0F + C mm6 0x3333333333333333 + C mm7 0xAAAAAAAAAAAAAAAA + + movd (%eax), %mm1 + movd 4(%eax), %mm2 + punpckldq %mm2, %mm1 + addl $8, %eax + +HAM(` movd (%edx), %mm2 + movd 4(%edx), %mm3 + punpckldq %mm3, %mm2 + pxor %mm2, %mm1 + addl $8, %edx +') + +L(loaded): + movq %mm7, %mm2 + pand %mm1, %mm2 + psrlq $1, %mm2 + psubd %mm2, %mm1 C bit pairs + + movq %mm6, %mm2 + pand %mm1, %mm2 + psrlq $2, %mm1 + pand %mm6, %mm1 + paddd %mm2, %mm1 C nibbles + + movq %mm5, %mm2 + pand %mm1, %mm2 + psrlq $4, %mm1 + pand %mm5, %mm1 + paddd %mm2, %mm1 C bytes + + psadbw( %mm4, %mm1) + paddd %mm1, %mm0 C to total + + subl $2, %ecx + jg L(top) + + C ecx is 0 or -1 representing respectively 1 or 0 further limbs + jz L(last) + + + movd %mm0, %eax + emms + ret + +EPILOGUE() diff --git 
a/gmp-6.3.0/mpn/x86/pentium4/mmx/rshift.asm b/gmp-6.3.0/mpn/x86/pentium4/mmx/rshift.asm new file mode 100644 index 0000000..3ac0094 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/mmx/rshift.asm @@ -0,0 +1,39 @@ +dnl Intel Pentium-4 mpn_rshift -- right shift. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P4 Willamette, Northwood: 1.75 cycles/limb +C P4 Prescott: 2.0 cycles/limb + + +MULFUNC_PROLOGUE(mpn_rshift) +include_mpn(`x86/pentium/mmx/rshift.asm') diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/add_n.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/add_n.asm new file mode 100644 index 0000000..8e2380e --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/add_n.asm @@ -0,0 +1,101 @@ +dnl Intel Pentium-4 mpn_add_n -- mpn addition. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C dst!=src1,2 dst==src1 dst==src2 +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) ? +C P4 model 0-1 (Willamette) ? 
+C P4 model 2 (Northwood) 4 6 6 +C P4 model 3-4 (Prescott) 4.25 7.5 7.5 + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(SAVE_EBX,`PARAM_SRC1') + + TEXT + ALIGN(8) + +PROLOGUE(mpn_add_nc) +deflit(`FRAME',0) + movd PARAM_CARRY, %mm0 + jmp L(start_nc) +EPILOGUE() + + ALIGN(8) +PROLOGUE(mpn_add_n) +deflit(`FRAME',0) + pxor %mm0, %mm0 +L(start_nc): + mov PARAM_SRC1, %eax + mov %ebx, SAVE_EBX + mov PARAM_SRC2, %ebx + mov PARAM_DST, %edx + mov PARAM_SIZE, %ecx + + lea (%eax,%ecx,4), %eax C src1 end + lea (%ebx,%ecx,4), %ebx C src2 end + lea (%edx,%ecx,4), %edx C dst end + neg %ecx C -size + +L(top): + C eax src1 end + C ebx src2 end + C ecx counter, limbs, negative + C edx dst end + C mm0 carry bit + + movd (%eax,%ecx,4), %mm1 + movd (%ebx,%ecx,4), %mm2 + paddq %mm2, %mm1 + + paddq %mm1, %mm0 + movd %mm0, (%edx,%ecx,4) + + psrlq $32, %mm0 + + add $1, %ecx + jnz L(top) + + movd %mm0, %eax + mov SAVE_EBX, %ebx + emms + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/addlsh1_n.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/addlsh1_n.asm new file mode 100644 index 0000000..93b63b2 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/addlsh1_n.asm @@ -0,0 +1,108 @@ +dnl Intel Pentium-4 mpn_addlsh1_n -- mpn x+2*y. + +dnl Copyright 2001-2004, 2006 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C dst!=src1,2 dst==src1 dst==src2 +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) ? +C P4 model 0-1 (Willamette) ? +C P4 model 2 (Northwood) 4.25 6 6 +C P4 model 3-4 (Prescott) 5 8.5 8.5 + +C The slightly strange combination of indexing and pointer incrementing +C that's used seems to work best. Not sure why, but %ecx,4 with src1 and/or +C src2 is a slowdown. +C +C The dependent chain is simply the paddq of x+2*y to the previous carry, +C then psrlq to get the new carry. That makes 4 c/l the target speed, which +C is almost achieved for separate src/dst but when src==dst the write +C combining anomalies slow it down. 
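The carry recurrence described above is easier to check in scalar form. The following is a minimal C model of the loop, assuming 32-bit limbs, with a 64-bit accumulator standing in for %mm0; the helper name is illustrative, not GMP's API.

#include <stdint.h>

/* Model of the mpn_addlsh1_n loop: fold x + 2*y and the incoming carry
   into a 64-bit accumulator, store the low 32 bits, and keep the high
   part as the next carry (the paddq/psrlq pair on %mm0 in the asm). */
static uint32_t
addlsh1_model (uint32_t *dst, const uint32_t *src1, const uint32_t *src2,
               long n)
{
  uint64_t acc = 0;                   /* carry, always 0..2 */
  for (long i = 0; i < n; i++)
    {
      acc += (uint64_t) src1[i] + ((uint64_t) src2[i] << 1);
      dst[i] = (uint32_t) acc;        /* movd %mm0, (%edx,%ecx,4) */
      acc >>= 32;                     /* psrlq $32, %mm0 */
    }
  return (uint32_t) acc;              /* final carry, 0, 1 or 2 */
}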
+ +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(SAVE_EBX,`PARAM_SRC1') + + TEXT + ALIGN(8) + +PROLOGUE(mpn_addlsh1_n) +deflit(`FRAME',0) + + mov PARAM_SRC1, %eax + mov %ebx, SAVE_EBX + + mov PARAM_SRC2, %ebx + pxor %mm0, %mm0 C initial carry + + mov PARAM_DST, %edx + + mov PARAM_SIZE, %ecx + + lea (%edx,%ecx,4), %edx C dst end + neg %ecx C -size + +L(top): + C eax src1 end + C ebx src2 end + C ecx counter, limbs, negative + C edx dst end + C mm0 carry + + movd (%ebx), %mm2 + movd (%eax), %mm1 + psrlq $32, %mm0 + lea 4(%eax), %eax + lea 4(%ebx), %ebx + + psllq $1, %mm2 + paddq %mm2, %mm1 + + paddq %mm1, %mm0 + + movd %mm0, (%edx,%ecx,4) + add $1, %ecx + jnz L(top) + + + psrlq $32, %mm0 + mov SAVE_EBX, %ebx + movd %mm0, %eax + emms + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/addmul_1.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/addmul_1.asm new file mode 100644 index 0000000..7810207 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/addmul_1.asm @@ -0,0 +1,189 @@ +dnl mpn_addmul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). + +dnl Copyright 2005, 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
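Before the scheduled loop that follows, it helps to state what mpn_addmul_1 computes. A plain C reference over 32-bit limbs, given as a model rather than the implementation:

#include <stdint.h>

/* Reference for mpn_addmul_1: {rp,n} += {up,n} * v0, returning the
   carry-out limb.  The 64-bit sum cannot overflow, since
   (2^32-1)^2 + 2*(2^32-1) = 2^64 - 1. */
static uint32_t
addmul_1_ref (uint32_t *rp, const uint32_t *up, long n, uint32_t v0)
{
  uint64_t t, carry = 0;
  for (long i = 0; i < n; i++)
    {
      t = (uint64_t) up[i] * v0 + rp[i] + carry;
      rp[i] = (uint32_t) t;
      carry = t >> 32;
    }
  return (uint32_t) carry;
}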
+ + +include(`../config.m4') + +C cycles/limb +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) 5.24 +C P6 model 13 (Dothan) 5.24 +C P4 model 0-1 (Willamette) 5 +C P4 model 2 (Northwood) 5 +C P4 model 3-4 (Prescott) 5 + +C TODO: +C * Tweak eax/edx offsets in loop as to save some lea's +C * Perhaps software pipeline small-case code + +C INPUT PARAMETERS +C rp sp + 4 +C up sp + 8 +C n sp + 12 +C v0 sp + 16 + + TEXT + ALIGN(16) +PROLOGUE(mpn_addmul_1) + pxor %mm6, %mm6 +L(ent): mov 4(%esp), %edx + mov 8(%esp), %eax + mov 12(%esp), %ecx + movd 16(%esp), %mm7 + cmp $4, %ecx + jnc L(big) + +L(lp0): movd (%eax), %mm0 + lea 4(%eax), %eax + movd (%edx), %mm4 + lea 4(%edx), %edx + pmuludq %mm7, %mm0 + paddq %mm0, %mm4 + paddq %mm4, %mm6 + movd %mm6, -4(%edx) + psrlq $32, %mm6 + dec %ecx + jnz L(lp0) + movd %mm6, %eax + emms + ret + +L(big): and $3, %ecx + je L(0) + cmp $2, %ecx + jc L(1) + je L(2) + jmp L(3) C FIXME: one case should fall through + +L(0): movd (%eax), %mm3 + sub 12(%esp), %ecx C loop count + lea -16(%eax), %eax + lea -12(%edx), %edx + pmuludq %mm7, %mm3 + movd 20(%eax), %mm0 + movd 12(%edx), %mm5 + pmuludq %mm7, %mm0 + movd 24(%eax), %mm1 + paddq %mm3, %mm5 + movd 16(%edx), %mm4 + jmp L(00) + +L(1): movd (%eax), %mm2 + sub 12(%esp), %ecx + lea -12(%eax), %eax + lea -8(%edx), %edx + movd 8(%edx), %mm4 + pmuludq %mm7, %mm2 + movd 16(%eax), %mm3 + pmuludq %mm7, %mm3 + movd 20(%eax), %mm0 + paddq %mm2, %mm4 + movd 12(%edx), %mm5 + jmp L(01) + +L(2): movd (%eax), %mm1 + sub 12(%esp), %ecx + lea -8(%eax), %eax + lea -4(%edx), %edx + pmuludq %mm7, %mm1 + movd 12(%eax), %mm2 + movd 4(%edx), %mm5 + pmuludq %mm7, %mm2 + movd 16(%eax), %mm3 + paddq %mm1, %mm5 + movd 8(%edx), %mm4 + jmp L(10) + +L(3): movd (%eax), %mm0 + sub 12(%esp), %ecx + lea -4(%eax), %eax + pmuludq %mm7, %mm0 + movd 8(%eax), %mm1 + movd (%edx), %mm4 + pmuludq %mm7, %mm1 + movd 12(%eax), %mm2 + paddq %mm0, %mm4 + movd 4(%edx), %mm5 + + ALIGN(16) +L(top): pmuludq %mm7, %mm2 + paddq %mm4, %mm6 + movd 16(%eax), %mm3 + paddq %mm1, %mm5 + movd 8(%edx), %mm4 + movd %mm6, 0(%edx) + psrlq $32, %mm6 +L(10): pmuludq %mm7, %mm3 + paddq %mm5, %mm6 + movd 20(%eax), %mm0 + paddq %mm2, %mm4 + movd 12(%edx), %mm5 + movd %mm6, 4(%edx) + psrlq $32, %mm6 +L(01): pmuludq %mm7, %mm0 + paddq %mm4, %mm6 + movd 24(%eax), %mm1 + paddq %mm3, %mm5 + movd 16(%edx), %mm4 + movd %mm6, 8(%edx) + psrlq $32, %mm6 +L(00): pmuludq %mm7, %mm1 + paddq %mm5, %mm6 + movd 28(%eax), %mm2 + paddq %mm0, %mm4 + movd 20(%edx), %mm5 + movd %mm6, 12(%edx) + psrlq $32, %mm6 + lea 16(%eax), %eax + lea 16(%edx), %edx + add $4, %ecx + jnz L(top) + +L(end): pmuludq %mm7, %mm2 + paddq %mm4, %mm6 + paddq %mm1, %mm5 + movd 8(%edx), %mm4 + movd %mm6, 0(%edx) + psrlq $32, %mm6 + paddq %mm5, %mm6 + paddq %mm2, %mm4 + movd %mm6, 4(%edx) + psrlq $32, %mm6 + paddq %mm4, %mm6 + movd %mm6, 8(%edx) + psrlq $32, %mm6 + movd %mm6, %eax + emms + ret +EPILOGUE() +PROLOGUE(mpn_addmul_1c) + movd 20(%esp), %mm6 + jmp L(ent) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm new file mode 100644 index 0000000..354300e --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm @@ -0,0 +1,141 @@ +dnl Intel Atom mpn_bdiv_dbm1. + +dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C cycles/limb +C P5 - +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) 9.75 +C P6 model 13 (Dothan) +C P4 model 0 (Willamette) +C P4 model 1 (?) +C P4 model 2 (Northwood) 8.25 +C P4 model 3 (Prescott) +C P4 model 4 (Nocona) +C Intel Atom 8 +C AMD K6 - +C AMD K7 - +C AMD K8 +C AMD K10 + +C TODO: This code was optimised for atom-32, consider moving it back to atom +C dir(atom currently grabs this code), and write a 4-way version(7c/l). + +defframe(PARAM_CARRY,20) +defframe(PARAM_MUL, 16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(SAVE_RP,`PARAM_MUL') +define(SAVE_UP,`PARAM_SIZE') + +define(`rp', `%edi') +define(`up', `%esi') +define(`n', `%ecx') +define(`reg', `%edx') +define(`cy', `%eax') C contains the return value + +ASM_START() + TEXT + ALIGN(16) +deflit(`FRAME',0) + +PROLOGUE(mpn_bdiv_dbm1c) + mov PARAM_SIZE, n C size + mov up, SAVE_UP + mov PARAM_SRC, up + movd PARAM_MUL, %mm7 + mov rp, SAVE_RP + mov PARAM_DST, rp + + movd (up), %mm0 + pmuludq %mm7, %mm0 + shr n + mov PARAM_CARRY, cy + jz L(eq1) + + movd 4(up), %mm1 + jc L(odd) + + lea 4(up), up + pmuludq %mm7, %mm1 + movd %mm0, reg + psrlq $32, %mm0 + sub reg, cy + movd %mm0, reg + movq %mm1, %mm0 + dec n + mov cy, (rp) + lea 4(rp), rp + jz L(end) + +C ALIGN(16) +L(top): movd 4(up), %mm1 + sbb reg, cy +L(odd): movd %mm0, reg + psrlq $32, %mm0 + pmuludq %mm7, %mm1 + sub reg, cy + lea 8(up), up + movd %mm0, reg + movd (up), %mm0 + mov cy, (rp) + sbb reg, cy + movd %mm1, reg + psrlq $32, %mm1 + sub reg, cy + movd %mm1, reg + pmuludq %mm7, %mm0 + dec n + mov cy, 4(rp) + lea 8(rp), rp + jnz L(top) + +L(end): sbb reg, cy + +L(eq1): movd %mm0, reg + psrlq $32, %mm0 + mov SAVE_UP, up + sub reg, cy + movd %mm0, reg + emms + mov cy, (rp) + sbb reg, cy + + mov SAVE_RP, rp + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_q_1.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_q_1.asm new file mode 100644 index 0000000..d5008f4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_q_1.asm @@ -0,0 +1,234 @@ +dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division. + +dnl Rearranged from mpn/x86/pentium4/sse2/dive_1.asm by Marco Bodrato. + +dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P4: 19.0 cycles/limb + +C Pairs of movd's are used to avoid unaligned loads. Despite the loads not +C being on the dependent chain and there being plenty of cycles available, +C using an unaligned movq on every second iteration measured about 23 c/l. +C + +defframe(PARAM_SHIFT, 24) +defframe(PARAM_INVERSE,20) +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + +C mp_limb_t +C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t inverse, int shift) + ALIGN(32) +PROLOGUE(mpn_pi1_bdiv_q_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %edx + + movl PARAM_SRC, %eax + + movl PARAM_DIVISOR, %ecx + + movd %ecx, %mm6 + movl PARAM_SHIFT, %ecx + + movd %ecx, %mm7 C shift + + C + + movl PARAM_INVERSE, %ecx + movd %ecx, %mm5 C inv + + movl PARAM_DST, %ecx + pxor %mm1, %mm1 C initial carry limb + pxor %mm0, %mm0 C initial carry bit + + subl $1, %edx + jz L(done) + + pcmpeqd %mm4, %mm4 + psrlq $32, %mm4 C 0x00000000FFFFFFFF + +C The dependent chain here is as follows. +C +C latency +C psubq s = (src-cbit) - climb 2 +C pmuludq q = s*inverse 8 +C pmuludq prod = q*divisor 8 +C psrlq climb = high(prod) 2 +C -- +C 20 +C +C Yet the loop measures 19.0 c/l, so obviously there's something gained +C there over a straight reading of the chip documentation. 
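In scalar terms that chain is the classical exact-division recurrence. A C rendering for the shift==0 case (odd divisor d with d*inv == 1 mod 2^32) follows; cbit and climb mirror %mm0 and %mm1, and the helper name is illustrative only.

#include <stdint.h>

/* One pass of the loop below: subtract the carry bit and carry limb
   from the source limb, multiply the low 32 bits by the inverse to get
   the quotient limb, and take the high half of q*d as the next carry
   limb.  When d divides {sp,n}, qp receives the exact quotient. */
static void
bdiv_q_1_model (uint32_t *qp, const uint32_t *sp, long n,
                uint32_t d, uint32_t inv)
{
  uint32_t cbit = 0, climb = 0;
  for (long i = 0; i < n; i++)
    {
      uint64_t s = (uint64_t) sp[i] - cbit - climb;  /* psubq, psubq */
      cbit = (uint32_t) (s >> 63);                   /* borrow out */
      uint32_t q = (uint32_t) s * inv;               /* pmuludq %mm5 */
      qp[i] = q;
      climb = (uint32_t) (((uint64_t) q * d) >> 32); /* high(q*divisor) */
    }
}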
+ +L(top): + C eax src, incrementing + C ebx + C ecx dst, incrementing + C edx counter, size-1 iterations + C + C mm0 carry bit + C mm1 carry limb + C mm4 0x00000000FFFFFFFF + C mm5 inverse + C mm6 divisor + C mm7 shift + + movd (%eax), %mm2 + movd 4(%eax), %mm3 + addl $4, %eax + punpckldq %mm3, %mm2 + + psrlq %mm7, %mm2 + pand %mm4, %mm2 C src + psubq %mm0, %mm2 C src - cbit + + psubq %mm1, %mm2 C src - cbit - climb + movq %mm2, %mm0 + psrlq $63, %mm0 C new cbit + + pmuludq %mm5, %mm2 C s*inverse + movd %mm2, (%ecx) C q + addl $4, %ecx + + movq %mm6, %mm1 + pmuludq %mm2, %mm1 C q*divisor + psrlq $32, %mm1 C new climb + +L(entry): + subl $1, %edx + jnz L(top) + +L(done): + movd (%eax), %mm2 + psrlq %mm7, %mm2 C src + psubq %mm0, %mm2 C src - cbit + + psubq %mm1, %mm2 C src - cbit - climb + + pmuludq %mm5, %mm2 C s*inverse + movd %mm2, (%ecx) C q + + emms + ret + +EPILOGUE() + + ALIGN(16) +C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C +PROLOGUE(mpn_bdiv_q_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %edx + + movl PARAM_DIVISOR, %ecx + + C eax src + C ebx + C ecx divisor + C edx size-1 + + movl %ecx, %eax + bsfl %ecx, %ecx C trailing twos + + shrl %cl, %eax C d = divisor without twos + movd %eax, %mm6 + movd %ecx, %mm7 C shift + + shrl %eax C d/2 + + andl $127, %eax C d/2, 7 bits + +ifdef(`PIC',` + LEA( binvert_limb_table, %ecx) + movzbl (%eax,%ecx), %eax C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %eax C inv 8 bits +') + + C + + movd %eax, %mm5 C inv + + movd %eax, %mm0 C inv + + pmuludq %mm5, %mm5 C inv*inv + + C + + pmuludq %mm6, %mm5 C inv*inv*d + paddd %mm0, %mm0 C 2*inv + + C + + psubd %mm5, %mm0 C inv = 2*inv - inv*inv*d + pxor %mm5, %mm5 + + paddd %mm0, %mm5 + pmuludq %mm0, %mm0 C inv*inv + + pcmpeqd %mm4, %mm4 + psrlq $32, %mm4 C 0x00000000FFFFFFFF + + C + + pmuludq %mm6, %mm0 C inv*inv*d + paddd %mm5, %mm5 C 2*inv + + movl PARAM_SRC, %eax + movl PARAM_DST, %ecx + pxor %mm1, %mm1 C initial carry limb + + C + + psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d + + ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + pushl %eax FRAME_pushl() + movq %mm6, %mm0 + pmuludq %mm5, %mm0 + movd %mm0, %eax + cmpl $1, %eax + popl %eax FRAME_popl()') + + pxor %mm0, %mm0 C initial carry bit + jmp L(entry) + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_add_n.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_add_n.asm new file mode 100644 index 0000000..b3f3474 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_add_n.asm @@ -0,0 +1,95 @@ +dnl Intel Pentium-4 mpn_cnd_add_n -- mpn addition. + +dnl Copyright 2001, 2002, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) 4.67 +C P4 model 0-1 (Willamette) ? +C P4 model 2 (Northwood) 5 +C P4 model 3-4 (Prescott) 5.25 + +defframe(PARAM_SIZE, 20) +defframe(PARAM_SRC2, 16) +defframe(PARAM_SRC1, 12) +defframe(PARAM_DST, 8) +defframe(PARAM_CND, 4) + +dnl re-use parameter space +define(SAVE_EBX,`PARAM_SRC1') + +define(`cnd', `%mm3') + + TEXT + ALIGN(8) + + ALIGN(8) +PROLOGUE(mpn_cnd_add_n) +deflit(`FRAME',0) + pxor %mm0, %mm0 + + mov PARAM_CND, %eax + neg %eax + sbb %eax, %eax + movd %eax, cnd + + mov PARAM_SRC1, %eax + mov %ebx, SAVE_EBX + mov PARAM_SRC2, %ebx + mov PARAM_DST, %edx + mov PARAM_SIZE, %ecx + + lea (%eax,%ecx,4), %eax C src1 end + lea (%ebx,%ecx,4), %ebx C src2 end + lea (%edx,%ecx,4), %edx C dst end + neg %ecx C -size + +L(top): movd (%ebx,%ecx,4), %mm2 + movd (%eax,%ecx,4), %mm1 + pand cnd, %mm2 + paddq %mm2, %mm1 + + paddq %mm1, %mm0 + movd %mm0, (%edx,%ecx,4) + + psrlq $32, %mm0 + + add $1, %ecx + jnz L(top) + + movd %mm0, %eax + mov SAVE_EBX, %ebx + emms + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_sub_n.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_sub_n.asm new file mode 100644 index 0000000..339a23e --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_sub_n.asm @@ -0,0 +1,114 @@ +dnl Intel Pentium-4 mpn_cnd_sub_n -- mpn subtraction. + +dnl Copyright 2001, 2002, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) 4.67 +C P4 model 0-1 (Willamette) ? 
+C P4 model 2 (Northwood) 5 +C P4 model 3-4 (Prescott) 5.25 + +defframe(PARAM_SIZE, 20) +defframe(PARAM_SRC2, 16) +defframe(PARAM_SRC1, 12) +defframe(PARAM_DST, 8) +defframe(PARAM_CND, 4) + +dnl re-use parameter space +define(SAVE_EBX,`PARAM_SRC1') + +define(`cnd', `%mm3') + + TEXT + ALIGN(8) + + ALIGN(8) +PROLOGUE(mpn_cnd_sub_n) +deflit(`FRAME',0) + pxor %mm0, %mm0 + + mov PARAM_CND, %eax + neg %eax + sbb %eax, %eax + movd %eax, cnd + + mov PARAM_SRC1, %eax + mov %ebx, SAVE_EBX + mov PARAM_SRC2, %ebx + mov PARAM_DST, %edx + mov PARAM_SIZE, %ecx + + lea (%eax,%ecx,4), %eax C src1 end + lea (%ebx,%ecx,4), %ebx C src2 end + lea (%edx,%ecx,4), %edx C dst end + neg %ecx C -size + +L(top): movd (%ebx,%ecx,4), %mm2 + movd (%eax,%ecx,4), %mm1 + pand cnd, %mm2 + psubq %mm2, %mm1 + + psubq %mm0, %mm1 + movd %mm1, (%edx,%ecx,4) + + psrlq $63, %mm1 + + add $1, %ecx + jz L(done_mm1) + + movd (%ebx,%ecx,4), %mm2 + movd (%eax,%ecx,4), %mm0 + pand cnd, %mm2 + psubq %mm2, %mm0 + + psubq %mm1, %mm0 + movd %mm0, (%edx,%ecx,4) + + psrlq $63, %mm0 + + add $1, %ecx + jnz L(top) + + movd %mm0, %eax + mov SAVE_EBX, %ebx + emms + ret + +L(done_mm1): + movd %mm1, %eax + mov SAVE_EBX, %ebx + emms + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/dive_1.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/dive_1.asm new file mode 100644 index 0000000..0ceef5b --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/dive_1.asm @@ -0,0 +1,216 @@ +dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division. + +dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P4: 19.0 cycles/limb + + +C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C +C Pairs of movd's are used to avoid unaligned loads. Despite the loads not +C being on the dependent chain and there being plenty of cycles available, +C using an unaligned movq on every second iteration measured about 23 c/l. +C +C Using divl for size==1 seems a touch quicker than mul-by-inverse. The mul +C will be about 9+2*4+2*2+10*4+19+12 = 92 cycles latency, though some of +C that might be hidden by out-of-order execution, whereas divl is around 60. +C At size==2 an extra 19 for the mul versus 60 for the divl will see the mul +C faster. 
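The inverse used by the loop is built by Newton's iteration on the 8-bit entry from GMP's binvert_limb_table, which the code below performs with pmuludq/paddd/psubd. The same computation in C, for an odd 32-bit divisor; the function name is illustrative.

#include <stdint.h>

/* Each inv = 2*inv - inv*inv*d step doubles the number of correct low
   bits of a 2-adic inverse, so two steps lift the 8-bit table value to
   32 bits (8 -> 16 -> 32).  All arithmetic is mod 2^32 via uint32_t
   wraparound.  table stands for binvert_limb_table, indexed by
   (d/2) & 127 as in the asm. */
static uint32_t
binvert32 (uint32_t d, const unsigned char table[128])   /* d odd */
{
  uint32_t inv = table[(d >> 1) & 127];   /* 8 correct bits  */
  inv = 2 * inv - inv * inv * d;          /* 16 correct bits */
  inv = 2 * inv - inv * inv * d;          /* 32 correct bits */
  return inv;                             /* d * inv == 1 (mod 2^32) */
}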
+ +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + + ALIGN(16) +PROLOGUE(mpn_divexact_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %edx + + movl PARAM_SRC, %eax + + movl PARAM_DIVISOR, %ecx + subl $1, %edx + jnz L(two_or_more) + + movl (%eax), %eax + xorl %edx, %edx + + divl %ecx + movl PARAM_DST, %ecx + + movl %eax, (%ecx) + ret + + +L(two_or_more): + C eax src + C ebx + C ecx divisor + C edx size-1 + + movl %ecx, %eax + bsfl %ecx, %ecx C trailing twos + + shrl %cl, %eax C d = divisor without twos + movd %eax, %mm6 + movd %ecx, %mm7 C shift + + shrl %eax C d/2 + + andl $127, %eax C d/2, 7 bits + +ifdef(`PIC',` + LEA( binvert_limb_table, %ecx) + movzbl (%eax,%ecx), %eax C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %eax C inv 8 bits +') + + C + + movd %eax, %mm5 C inv + + movd %eax, %mm0 C inv + + pmuludq %mm5, %mm5 C inv*inv + + C + + pmuludq %mm6, %mm5 C inv*inv*d + paddd %mm0, %mm0 C 2*inv + + C + + psubd %mm5, %mm0 C inv = 2*inv - inv*inv*d + pxor %mm5, %mm5 + + paddd %mm0, %mm5 + pmuludq %mm0, %mm0 C inv*inv + + pcmpeqd %mm4, %mm4 + psrlq $32, %mm4 C 0x00000000FFFFFFFF + + C + + pmuludq %mm6, %mm0 C inv*inv*d + paddd %mm5, %mm5 C 2*inv + + movl PARAM_SRC, %eax + movl PARAM_DST, %ecx + pxor %mm1, %mm1 C initial carry limb + + C + + psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d + + ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + pushl %eax FRAME_pushl() + movq %mm6, %mm0 + pmuludq %mm5, %mm0 + movd %mm0, %eax + cmpl $1, %eax + popl %eax FRAME_popl()') + + pxor %mm0, %mm0 C initial carry bit + + +C The dependent chain here is as follows. +C +C latency +C psubq s = (src-cbit) - climb 2 +C pmuludq q = s*inverse 8 +C pmuludq prod = q*divisor 8 +C psrlq climb = high(prod) 2 +C -- +C 20 +C +C Yet the loop measures 19.0 c/l, so obviously there's something gained +C there over a straight reading of the chip documentation. + +L(top): + C eax src, incrementing + C ebx + C ecx dst, incrementing + C edx counter, size-1 iterations + C + C mm0 carry bit + C mm1 carry limb + C mm4 0x00000000FFFFFFFF + C mm5 inverse + C mm6 divisor + C mm7 shift + + movd (%eax), %mm2 + movd 4(%eax), %mm3 + addl $4, %eax + punpckldq %mm3, %mm2 + + psrlq %mm7, %mm2 + pand %mm4, %mm2 C src + psubq %mm0, %mm2 C src - cbit + + psubq %mm1, %mm2 C src - cbit - climb + movq %mm2, %mm0 + psrlq $63, %mm0 C new cbit + + pmuludq %mm5, %mm2 C s*inverse + movd %mm2, (%ecx) C q + addl $4, %ecx + + movq %mm6, %mm1 + pmuludq %mm2, %mm1 C q*divisor + psrlq $32, %mm1 C new climb + + subl $1, %edx + jnz L(top) + + +L(done): + movd (%eax), %mm2 + psrlq %mm7, %mm2 C src + psubq %mm0, %mm2 C src - cbit + + psubq %mm1, %mm2 C src - cbit - climb + + pmuludq %mm5, %mm2 C s*inverse + movd %mm2, (%ecx) C q + + emms + ret + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm new file mode 100644 index 0000000..0146fab --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm @@ -0,0 +1,645 @@ +dnl Intel Pentium-4 mpn_divrem_1 -- mpn by limb division. + +dnl Copyright 1999-2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part. + + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t inverse, +C unsigned shift); +C +C Algorithm: +C +C The method and nomenclature follow part 8 of "Division by Invariant +C Integers using Multiplication" by Granlund and Montgomery, reference in +C gmp.texi. +C +C "m" is written for what is m' in the paper, and "d" for d_norm, which +C won't cause any confusion since it's only the normalized divisor that's of +C any use in the code. "b" is written for 2^N, the size of a limb, N being +C 32 here. +C +C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as +C "n-d - q1*d". This rearrangement gives the same two-limb answer but lets +C us have just a psubq on the dependent chain. +C +C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here, +C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much. +C +C Notes: +C +C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high +C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero +C carry, since in normal circumstances that will be a very rare event. +C +C The test for skipping a division is branch free (once size>=1 is tested). +C The store to the destination high limb is 0 when a divide is skipped, or +C if it's not skipped then a copy of the src high limb is stored. The +C latter is in case src==dst. +C +C There's a small bias towards expecting xsize==0, by having code for +C xsize==0 in a straight line and xsize!=0 under forward jumps. +C +C Enhancements: +C +C The loop measures 32 cycles, but the dependent chain would suggest it +C could be done with 30. Not sure where to start looking for the extras. +C +C Alternatives: +C +C If the divisor is normalized (high bit set) then a division step can +C always be skipped, since the high destination limb is always 0 or 1 in +C that case. It doesn't seem worth checking for this though, since it +C probably occurs infrequently. + + +dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by +dnl inverse method is used, rather than plain "divl"s. Minimum value 1. +dnl +dnl The inverse takes about 80-90 cycles to calculate, but after that the +dnl multiply is 32 c/l versus division at about 58 c/l. +dnl +dnl At 4 limbs the div is a touch faster than the mul (and of course +dnl simpler), so start the mul from 5 limbs. 
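The "n-d - q1*d" rearrangement discussed above comes down to a two-limb fixup, shown here in C for a normalized d and a quotient estimate q1 satisfying q1 <= q <= q1+1 (the property the multiply-by-inverse step provides). A sketch of the selection and addback only, not the asm's schedule:

#include <stdint.h>

/* Compute dr = n - (q1+1)*d as a signed two-limb value; its high word
   (0 or -1) selects between q1+1 and q1, and masks an addback of d to
   recover the remainder, as the paddd/pand pair in the loop does. */
static uint32_t
divrem_fixup (uint32_t *r, uint64_t n, uint32_t d, uint32_t q1)
{
  uint64_t dr = n - d - (uint64_t) q1 * d;
  uint32_t mask = (uint32_t) (dr >> 32);  /* 0, or 0xFFFFFFFF if negative */
  *r = (uint32_t) dr + (d & mask);        /* addback if necessary */
  return q1 + 1 + mask;                   /* q1+1, or q1 on underflow */
}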
+ +deflit(MUL_THRESHOLD, 5) + + +defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1 +defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1 +defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c +defframe(PARAM_DIVISOR,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC, 12) +defframe(PARAM_XSIZE, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(SAVE_ESI,`PARAM_SIZE') +define(SAVE_EBP,`PARAM_SRC') +define(SAVE_EDI,`PARAM_DIVISOR') +define(SAVE_EBX,`PARAM_DST') + + TEXT + + ALIGN(16) +PROLOGUE(mpn_preinv_divrem_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + xorl %edx, %edx C carry if can't skip a div + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl -4(%esi,%ecx,4), %eax C src high limb + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movd PARAM_PREINV_INVERSE, %mm4 + + movd PARAM_PREINV_SHIFT, %mm7 C l + cmpl %ebp, %eax C high cmp divisor + + cmovc( %eax, %edx) C high is carry if high new n2 + psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1 + + ASSERT(be,`C 0 or -1 + movd %mm3, %eax + addl $1, %eax + cmpl $1, %eax') + + paddd %mm3, %mm2 C q + pand %mm5, %mm3 C mask & d + + paddd %mm3, %mm0 C addback if necessary + movd %mm2, (%edi) + leal -4(%edi), %edi + + subl $1, %ecx + ja L(integer_top) + + +L(integer_last): + C eax + C ebx xsize + C ecx + C edx + C esi &src[0] + C edi &dst[xsize] + C + C mm0 n2 + C mm4 m + C mm5 d + C mm6 + C mm7 l + + ASSERT(b,`C n2 n2 + psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1 + + ASSERT(be,`C 0 or -1 + movd %mm3, %eax + addl $1, %eax + cmpl $1, %eax') + + paddd %mm3, %mm2 C q + pand %mm5, %mm3 C mask & d + + paddd %mm3, %mm0 C addback if necessary + movd %mm2, (%edi) + leal -4(%edi), %edi + + +L(integer_none): + C eax + C ebx xsize + + orl %ebx, %ebx + jnz L(fraction_some) C if xsize!=0 + + +L(fraction_done): + movl SAVE_EBP, %ebp + psrld %mm7, %mm0 C remainder + + movl SAVE_EDI, %edi + movd %mm0, %eax + + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + emms + ret + + + +C ----------------------------------------------------------------------------- +C + +L(fraction_some): + C eax + C ebx xsize + C ecx + C edx + C esi + C edi &dst[xsize-1] + C ebp + + +L(fraction_top): + C eax + C ebx counter, xsize iterations + C ecx + C edx + C esi src, decrementing + C edi dst, decrementing + C + C mm0 n2 + C mm4 m + C mm5 d + C mm6 32-l + C mm7 l + + ASSERT(b,`C n2 n2 + psrlq $32, %mm1 C high n - (q1+1)*d, 0 or -1 + + ASSERT(be,`C 0 or -1 + movd %mm1, %eax + addl $1, %eax + cmpl $1, %eax') + + paddd %mm1, %mm2 C q + pand %mm5, %mm1 C mask & d + + paddd %mm1, %mm0 C addback if necessary + movd %mm2, (%edi) + leal -4(%edi), %edi + + subl $1, %ebx + jne L(fraction_top) + + + jmp L(fraction_done) + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/gmp-mparam.h b/gmp-6.3.0/mpn/x86/pentium4/sse2/gmp-mparam.h new file mode 100644 index 0000000..a047a51 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/gmp-mparam.h @@ -0,0 +1,213 @@ +/* Intel Pentium-4 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 2600 MHz P4 Northwood */ +/* FFT tuning limit = 23,700,309 */ +/* Generated by tuneup.c, 2019-11-09, gcc 8.2 */ + +#define MOD_1_NORM_THRESHOLD 5 +#define MOD_1_UNNORM_THRESHOLD 14 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 2 /* 4.36% faster than 1 */ +#define DIV_QR_1_NORM_THRESHOLD 16 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 21 + +#define DIV_1_VS_MUL_1_PERCENT 358 + +#define MUL_TOOM22_THRESHOLD 26 +#define MUL_TOOM33_THRESHOLD 101 +#define MUL_TOOM44_THRESHOLD 284 +#define MUL_TOOM6H_THRESHOLD 406 +#define MUL_TOOM8H_THRESHOLD 592 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 101 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 191 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 189 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 195 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 151 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 51 +#define SQR_TOOM3_THRESHOLD 163 +#define SQR_TOOM4_THRESHOLD 254 +#define SQR_TOOM6_THRESHOLD 614 +#define SQR_TOOM8_THRESHOLD 842 + +#define MULMID_TOOM42_THRESHOLD 58 + +#define MULMOD_BNM1_THRESHOLD 19 +#define SQRMOD_BNM1_THRESHOLD 23 + +#define MUL_FFT_MODF_THRESHOLD 824 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 824, 5}, { 29, 6}, { 15, 5}, { 33, 6}, \ + { 17, 5}, { 36, 6}, { 19, 5}, { 39, 6}, \ + { 29, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \ + { 36, 7}, { 19, 6}, { 39, 7}, { 21, 6}, \ + { 43, 7}, { 23, 6}, { 48, 7}, { 29, 8}, \ + { 15, 7}, { 37, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 49, 8}, { 27, 7}, { 55, 8}, \ + { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \ + { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 99, 9}, { 55,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 103,11}, \ + { 31,10}, { 63, 9}, { 143,10}, { 79, 9}, \ + { 167,10}, { 95, 9}, { 191,10}, { 111,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 159,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 271,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 335,11}, { 191,10}, { 383, 9}, { 799,10}, \ + { 415,11}, { 223,12}, { 127,11}, { 255,10}, \ + { 527,11}, { 287,10}, { 607, 9}, { 1215,11}, \ + { 319,10}, { 671,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 863,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1119, 9}, { 2239,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 735,10}, { 1471, 9}, { 2943,12}, \ + { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \ + { 447,11}, { 927,10}, { 1855,11}, { 959,13}, \ + { 255,12}, { 511,11}, { 1119,12}, { 575,11}, \ + { 
1215,10}, { 2431,11}, { 1247,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,10}, { 2943,13}, \ + { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \ + { 1727,10}, { 3455,12}, { 895,14}, { 255,13}, \ + { 511,12}, { 1087,11}, { 2239,10}, { 4479,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1343,11}, \ + { 2687,12}, { 1471,11}, { 2943,13}, { 767,12}, \ + { 1727,11}, { 3455,13}, { 895,12}, { 1983,14}, \ + { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \ + { 2495,11}, { 4991,13}, { 1407,12}, { 2943,14}, \ + { 767,13}, { 1535,12}, { 3135,13}, { 1663,12}, \ + { 3455,13}, { 1919,12}, { 3967,15}, { 511,14}, \ + { 1023,13}, { 2175,12}, { 4479,13}, { 2431,12}, \ + { 4991,14}, { 1279,13}, { 2687,12}, { 5503,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 167 +#define MUL_FFT_THRESHOLD 7808 + +#define SQR_FFT_MODF_THRESHOLD 560 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 560, 5}, { 33, 6}, { 17, 5}, { 35, 6}, \ + { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \ + { 39, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 47, 8}, { 27, 7}, { 55, 8}, \ + { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \ + { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 111,11}, { 63,10}, { 143, 9}, \ + { 287,10}, { 159,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127, 9}, { 511, 8}, { 1023, 9}, \ + { 527,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 351,11}, { 191,10}, { 431,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 543,11}, { 287,10}, \ + { 607, 9}, { 1215,11}, { 319,10}, { 639,11}, \ + { 351,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,10}, { 831,13}, { 127,12}, { 255,11}, \ + { 543,10}, { 1119,11}, { 607,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,12}, { 383,11}, \ + { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \ + { 927,10}, { 1855,11}, { 991,13}, { 255,12}, \ + { 511,11}, { 1055,10}, { 2111,11}, { 1087,12}, \ + { 575,11}, { 1215,10}, { 2431,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1407,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 831,11}, { 1727,10}, \ + { 3455,12}, { 895,11}, { 1855,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,11}, { 2239,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1471,11}, \ + { 2943,13}, { 767,12}, { 1727,11}, { 3455,13}, \ + { 895,12}, { 1983,14}, { 511,13}, { 1023,12}, \ + { 2239,13}, { 1151,12}, { 2495,11}, { 4991,13}, \ + { 1279,12}, { 2623,13}, { 1407,12}, { 2943,14}, \ + { 767,13}, { 1663,12}, { 3455,13}, { 1919,12}, \ + { 3839,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4479,13}, { 2431,12}, { 4991,14}, { 1279,13}, \ + { 2687,12}, { 5503,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 149 +#define SQR_FFT_THRESHOLD 4800 + +#define MULLO_BASECASE_THRESHOLD 12 +#define MULLO_DC_THRESHOLD 44 +#define MULLO_MUL_N_THRESHOLD 14281 +#define SQRLO_BASECASE_THRESHOLD 13 +#define SQRLO_DC_THRESHOLD 42 +#define SQRLO_SQR_THRESHOLD 9449 + +#define DC_DIV_QR_THRESHOLD 38 +#define DC_DIVAPPR_Q_THRESHOLD 105 +#define DC_BDIV_QR_THRESHOLD 52 +#define DC_BDIV_Q_THRESHOLD 83 + +#define INV_MULMOD_BNM1_THRESHOLD 50 +#define INV_NEWTON_THRESHOLD 158 +#define INV_APPR_THRESHOLD 118 + +#define BINV_NEWTON_THRESHOLD 342 +#define REDC_1_TO_REDC_N_THRESHOLD 67 + +#define MU_DIV_QR_THRESHOLD 2130 +#define MU_DIVAPPR_Q_THRESHOLD 1895 +#define MUPI_DIV_QR_THRESHOLD 60 +#define MU_BDIV_QR_THRESHOLD 1652 +#define MU_BDIV_Q_THRESHOLD 2089 + +#define POWM_SEC_TABLE 1,22,96,446,723,1378 + +#define GET_STR_DC_THRESHOLD 13 
+#define GET_STR_PRECOMPUTE_THRESHOLD 20 +#define SET_STR_DC_THRESHOLD 298 +#define SET_STR_PRECOMPUTE_THRESHOLD 960 + +#define FAC_DSC_THRESHOLD 212 +#define FAC_ODD_THRESHOLD 71 + +#define MATRIX22_STRASSEN_THRESHOLD 26 +#define HGCD2_DIV1_METHOD 3 /* 0.68% faster than 1 */ +#define HGCD_THRESHOLD 80 +#define HGCD_APPR_THRESHOLD 138 +#define HGCD_REDUCE_THRESHOLD 4455 +#define GCD_DC_THRESHOLD 365 +#define GCDEXT_DC_THRESHOLD 245 +#define JACOBI_BASE_METHOD 4 /* 23.41% faster than 1 */ + +/* Tuneup completed successfully, took 63807 seconds */ diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm new file mode 100644 index 0000000..ee88bab --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm @@ -0,0 +1,166 @@ +dnl x86-32 mpn_mod_1_1p for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2009, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO: +C * Optimize. The present code was written quite straightforwardly. +C * Optimize post-loop reduction code; it is from mod_1s_4p, thus overkill. +C * Write a cps function that uses sse2 insns. + +C cycles/limb +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) ? +C P4 model 0-1 (Willamette) ? +C P4 model 2 (Northwood) 16 +C P4 model 3-4 (Prescott) 18 + +C INPUT PARAMETERS +C ap sp + 4 +C n sp + 8 +C b sp + 12 +C cps sp + 16 + +define(`B1modb', `%mm1') +define(`B2modb', `%mm2') +define(`ap', `%edx') +define(`n', `%eax') + + TEXT + ALIGN(16) +PROLOGUE(mpn_mod_1_1p) + push %ebx + mov 8(%esp), ap + mov 12(%esp), n + mov 20(%esp), %ecx + movd 8(%ecx), B1modb + movd 12(%ecx), B2modb + + lea -4(ap,n,4), ap + +C FIXME: See comment in generic/mod_1_1.c. 
+ movd (ap), %mm7 + movd -4(ap), %mm4 + pmuludq B1modb, %mm7 + paddq %mm4, %mm7 + add $-2, n + jz L(end) + + ALIGN(8) +L(top): movq %mm7, %mm6 + psrlq $32, %mm7 C rh + movd -8(ap), %mm0 + add $-4, ap + pmuludq B2modb, %mm7 + pmuludq B1modb, %mm6 + add $-1, n + paddq %mm0, %mm7 + paddq %mm6, %mm7 + jnz L(top) + +L(end): pcmpeqd %mm4, %mm4 + psrlq $32, %mm4 C 0x00000000FFFFFFFF + pand %mm7, %mm4 C rl + psrlq $32, %mm7 C rh + pmuludq B1modb, %mm7 C rh,cl + paddq %mm4, %mm7 C rh,rl + movd 4(%ecx), %mm4 C cnt + psllq %mm4, %mm7 C rh,rl normalized + movq %mm7, %mm2 C rl in low half + psrlq $32, %mm7 C rh + movd (%ecx), %mm1 C bi + pmuludq %mm7, %mm1 C qh,ql + paddq %mm2, %mm1 C qh-1,ql + movd %mm1, %ecx C ql + psrlq $32, %mm1 C qh-1 + movd 16(%esp), %mm3 C b + pmuludq %mm1, %mm3 C (qh-1) * b + psubq %mm3, %mm2 C r in low half (could use psubd) + movd %mm2, %eax C r + mov 16(%esp), %ebx + sub %ebx, %eax C r + cmp %eax, %ecx + lea (%eax,%ebx), %edx + cmovc( %edx, %eax) + movd %mm4, %ecx C cnt + cmp %ebx, %eax + jae L(fix) + emms + pop %ebx + shr %cl, %eax + ret + +L(fix): sub %ebx, %eax + emms + pop %ebx + shr %cl, %eax + ret +EPILOGUE() + +PROLOGUE(mpn_mod_1_1p_cps) +C CAUTION: This is the same code as in k7/mod_1_1.asm + push %ebp + mov 12(%esp), %ebp + push %esi + bsr %ebp, %ecx + push %ebx + xor $31, %ecx + mov 16(%esp), %esi + sal %cl, %ebp + mov %ebp, %edx + not %edx + mov $-1, %eax + div %ebp + mov %eax, (%esi) C store bi + mov %ecx, 4(%esi) C store cnt + xor %ebx, %ebx + sub %ebp, %ebx + mov $1, %edx + shld %cl, %eax, %edx + imul %edx, %ebx + mul %ebx + add %ebx, %edx + not %edx + imul %ebp, %edx + add %edx, %ebp + cmp %edx, %eax + cmovc( %ebp, %edx) + shr %cl, %ebx + mov %ebx, 8(%esi) C store B1modb + shr %cl, %edx + mov %edx, 12(%esi) C store B2modb + pop %ebx + pop %esi + pop %ebp + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_4.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_4.asm new file mode 100644 index 0000000..eb2edb6 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_4.asm @@ -0,0 +1,269 @@ +dnl x86-32 mpn_mod_1s_4p for Pentium 4 and P6 models with SSE2 (i.e. 9,D,E,F). + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2009, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO: +C * Optimize. The present code was written quite straightforwardly. +C * Optimize post-loop reduction code. +C * Write a cps function that uses sse2 insns. 
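The main loop below folds four limbs per iteration using the residues B1modb..B5modb = B^1..B^5 mod b (B = 2^32) that the cps routine precomputes. The congruence it relies on, written in C with eager reduction so every intermediate is obviously in range (the asm instead carries a 64-bit residue and reduces once after the loop); names mirror the asm's defines and are illustrative:

#include <stdint.h>

/* One 4-limb step: the new residue is congruent to
   ap[0] + ap[1]*B + ap[2]*B^2 + ap[3]*B^3 + r*B^4 (mod b), where
   r = rh*2^32 + rl is the previous residue; rl picks up B4 and rh
   picks up B5, matching the pmuludq pair on %mm6/%mm7 below. */
static uint32_t
fold4 (uint64_t r, const uint32_t *ap, uint32_t b,
       uint32_t B1, uint32_t B2, uint32_t B3, uint32_t B4, uint32_t B5)
{
  uint64_t t = ap[0] % b;
  t = (t + (uint64_t) ap[1] * B1) % b;
  t = (t + (uint64_t) ap[2] * B2) % b;
  t = (t + (uint64_t) ap[3] * B3) % b;
  t = (t + (uint64_t) (uint32_t) r * B4) % b;  /* rl * (B^4 mod b) */
  t = (t + (r >> 32) % b * B5) % b;            /* rh * (B^5 mod b) */
  return (uint32_t) t;
}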
+ +C cycles/limb +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) 3.4 +C P4 model 0-1 (Willamette) ? +C P4 model 2 (Northwood) 4 +C P4 model 3-4 (Prescott) 4.5 + +C INPUT PARAMETERS +C ap sp + 4 +C n sp + 8 +C b sp + 12 +C cps sp + 16 + +define(`B1modb', `%mm1') +define(`B2modb', `%mm2') +define(`B3modb', `%mm3') +define(`B4modb', `%mm4') +define(`B5modb', `%mm5') +define(`ap', `%edx') +define(`n', `%eax') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mod_1s_4p) + push %ebx + mov 8(%esp), ap + mov 12(%esp), n + mov 20(%esp), %ecx + + movd 8(%ecx), B1modb + movd 12(%ecx), B2modb + movd 16(%ecx), B3modb + movd 20(%ecx), B4modb + movd 24(%ecx), B5modb + + mov n, %ebx + lea -4(ap,n,4), ap + and $3, %ebx + je L(b0) + cmp $2, %ebx + jc L(b1) + je L(b2) + +L(b3): movd -4(ap), %mm7 + pmuludq B1modb, %mm7 + movd -8(ap), %mm6 + paddq %mm6, %mm7 + movd (ap), %mm6 + pmuludq B2modb, %mm6 + paddq %mm6, %mm7 + lea -24(ap), ap + add $-3, n + jz L(end) + jmp L(top) + +L(b0): movd -8(ap), %mm7 + pmuludq B1modb, %mm7 + movd -12(ap), %mm6 + paddq %mm6, %mm7 + movd -4(ap), %mm6 + pmuludq B2modb, %mm6 + paddq %mm6, %mm7 + movd (ap), %mm6 + pmuludq B3modb, %mm6 + paddq %mm6, %mm7 + lea -28(ap), ap + add $-4, n + jz L(end) + jmp L(top) + +L(b1): movd (ap), %mm7 + lea -16(ap), ap + dec n + jz L(x) + jmp L(top) + +L(b2): movd -4(ap), %mm7 C rl + punpckldq (ap), %mm7 C rh + lea -20(ap), ap + add $-2, n + jz L(end) + + ALIGN(8) +L(top): movd 4(ap), %mm0 + pmuludq B1modb, %mm0 + movd 0(ap), %mm6 + paddq %mm6, %mm0 + + movd 8(ap), %mm6 + pmuludq B2modb, %mm6 + paddq %mm6, %mm0 + + movd 12(ap), %mm6 + pmuludq B3modb, %mm6 + paddq %mm6, %mm0 + + movq %mm7, %mm6 + psrlq $32, %mm7 C rh + pmuludq B5modb, %mm7 + pmuludq B4modb, %mm6 + + paddq %mm0, %mm7 + paddq %mm6, %mm7 + + add $-16, ap + add $-4, n + jnz L(top) + +L(end): pcmpeqd %mm4, %mm4 + psrlq $32, %mm4 C 0x00000000FFFFFFFF + pand %mm7, %mm4 C rl + psrlq $32, %mm7 C rh + pmuludq B1modb, %mm7 C rh,cl + paddq %mm4, %mm7 C rh,rl +L(x): movd 4(%ecx), %mm4 C cnt + psllq %mm4, %mm7 C rh,rl normalized + movq %mm7, %mm2 C rl in low half + psrlq $32, %mm7 C rh + movd (%ecx), %mm1 C bi + pmuludq %mm7, %mm1 C qh,ql + paddq %mm2, %mm1 C qh-1,ql + movd %mm1, %ecx C ql + psrlq $32, %mm1 C qh-1 + movd 16(%esp), %mm3 C b + pmuludq %mm1, %mm3 C (qh-1) * b + psubq %mm3, %mm2 C r in low half (could use psubd) + movd %mm2, %eax C r + mov 16(%esp), %ebx + sub %ebx, %eax C r + cmp %eax, %ecx + lea (%eax,%ebx), %edx + cmovc( %edx, %eax) + movd %mm4, %ecx C cnt + cmp %ebx, %eax + jae L(fix) + emms + pop %ebx + shr %cl, %eax + ret + +L(fix): sub %ebx, %eax + emms + pop %ebx + shr %cl, %eax + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_mod_1s_4p_cps) +C CAUTION: This is the same code as in k7/mod_1_4.asm + push %ebp + push %edi + push %esi + push %ebx + mov 20(%esp), %ebp C FIXME: avoid bp for 0-idx + mov 24(%esp), %ebx + bsr %ebx, %ecx + xor $31, %ecx + sal %cl, %ebx C b << cnt + mov %ebx, %edx + not %edx + mov $-1, %eax + div %ebx + xor %edi, %edi + sub %ebx, %edi + mov $1, %esi + mov %eax, (%ebp) C store bi + mov %ecx, 4(%ebp) C store cnt + shld %cl, %eax, %esi + imul %edi, %esi + mov %eax, %edi + mul %esi + + add %esi, %edx + shr %cl, %esi + mov %esi, 8(%ebp) C store B1modb + + not %edx + imul %ebx, %edx + lea (%edx,%ebx), %esi + cmp %edx, %eax + cmovnc( %edx, %esi) + mov %edi, %eax + mul %esi + + add %esi, %edx + shr %cl, %esi + mov %esi, 12(%ebp) C store B2modb + + not %edx + imul %ebx, %edx + lea (%edx,%ebx), %esi + cmp %edx, %eax + cmovnc( %edx, %esi) + mov 
%edi, %eax + mul %esi + + add %esi, %edx + shr %cl, %esi + mov %esi, 16(%ebp) C store B3modb + + not %edx + imul %ebx, %edx + lea (%edx,%ebx), %esi + cmp %edx, %eax + cmovnc( %edx, %esi) + mov %edi, %eax + mul %esi + + add %esi, %edx + shr %cl, %esi + mov %esi, 20(%ebp) C store B4modb + + not %edx + imul %ebx, %edx + add %edx, %ebx + cmp %edx, %eax + cmovnc( %edx, %ebx) + + shr %cl, %ebx + mov %ebx, 24(%ebp) C store B5modb + + pop %ebx + pop %esi + pop %edi + pop %ebp + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_34lsub1.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_34lsub1.asm new file mode 100644 index 0000000..31e25b7 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_34lsub1.asm @@ -0,0 +1,175 @@ +dnl Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1. + +dnl Copyright 2000-2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C Pentium4: 1.0 cycles/limb + + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size) +C +C Enhancements: +C +C There might be a couple of cycles to save by using plain integer code for +C more small sizes. 2 limbs measures about 20 cycles, but 3 limbs jumps to +C about 46 (inclusive of some function call overheads).
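The 1 c/l loop below works because 2^32 == 2^8 (mod 2^24-1), so limb i carries weight 2^(8*(i mod 3)) and the limbs can be summed in three residue classes before any folding. A C model of the whole function, reducing fully for clarity (the asm returns any value congruent mod 2^24-1 and leaves the final reduction to the caller); the helper name is illustrative:

#include <stdint.h>

/* Sum limbs into three class accumulators (the asm's mm0/mm1/mm2),
   weight the sums by 2^0, 2^8, 2^16, then fold at 24 bits.  Returns a
   value congruent to {src,n} modulo 2^24-1. */
static uint32_t
mod_34lsub1_model (const uint32_t *src, long n)
{
  uint64_t s[3] = { 0, 0, 0 };
  for (long i = 0; i < n; i++)
    s[i % 3] += src[i];        /* 2^(32*i) == 2^(8*(i%3)) (mod 2^24-1) */

  uint64_t t = s[0] % 0xFFFFFF
             + ((s[1] % 0xFFFFFF) << 8)
             + ((s[2] % 0xFFFFFF) << 16);
  while (t > 0xFFFFFF)         /* low 24 bits + rest preserves the value */
    t = (t & 0xFFFFFF) + (t >> 24);
  return (uint32_t) t;
}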
+ +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + +dnl re-use parameter space +define(SAVE_EBX, `PARAM_SRC') +define(SAVE_ESI, `PARAM_SIZE') + + TEXT + ALIGN(16) +PROLOGUE(mpn_mod_34lsub1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %edx + movl (%edx), %eax + + subl $2, %ecx + ja L(three_or_more) + jne L(one) + + movl 4(%edx), %edx + movl %eax, %ecx + shrl $24, %eax C src[0] high + + andl $0x00FFFFFF, %ecx C src[0] low + addl %ecx, %eax + + movl %edx, %ecx + shll $8, %edx + + shrl $16, %ecx C src[1] low + addl %ecx, %eax + + andl $0x00FFFF00, %edx C src[1] high + addl %edx, %eax + +L(one): + ret + + +L(three_or_more): + pxor %mm0, %mm0 + pxor %mm1, %mm1 + pxor %mm2, %mm2 + + pcmpeqd %mm7, %mm7 + psrlq $32, %mm7 C 0x00000000FFFFFFFF, low 32 bits + + pcmpeqd %mm6, %mm6 + psrlq $40, %mm6 C 0x0000000000FFFFFF, low 24 bits + +L(top): + C eax + C ebx + C ecx counter, size-2 to 0, -1 or -2 + C edx src, incrementing + C + C mm0 sum 0mod3 + C mm1 sum 1mod3 + C mm2 sum 2mod3 + C mm3 + C mm4 + C mm5 + C mm6 0x0000000000FFFFFF + C mm7 0x00000000FFFFFFFF + + movd (%edx), %mm3 + paddq %mm3, %mm0 + + movd 4(%edx), %mm3 + paddq %mm3, %mm1 + + movd 8(%edx), %mm3 + paddq %mm3, %mm2 + + addl $12, %edx + subl $3, %ecx + ja L(top) + + + C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively + + addl $1, %ecx + js L(combine) C 0 more + + movd (%edx), %mm3 + paddq %mm3, %mm0 + + jz L(combine) C 1 more + + movd 4(%edx), %mm3 + paddq %mm3, %mm1 + +L(combine): + movq %mm7, %mm3 C low halves + pand %mm0, %mm3 + + movq %mm7, %mm4 + pand %mm1, %mm4 + + movq %mm7, %mm5 + pand %mm2, %mm5 + + psrlq $32, %mm0 C high halves + psrlq $32, %mm1 + psrlq $32, %mm2 + + paddq %mm0, %mm4 C fold high halves to give 33 bits each + paddq %mm1, %mm5 + paddq %mm2, %mm3 + + psllq $8, %mm4 C combine at respective offsets + psllq $16, %mm5 + paddq %mm4, %mm3 + paddq %mm5, %mm3 C 0x000cxxxxxxxxxxxx, 50 bits + + pand %mm3, %mm6 C fold at 24 bits + psrlq $24, %mm3 + + paddq %mm6, %mm3 + movd %mm3, %eax + + ASSERT(z, C nothing left in high dword + `psrlq $32, %mm3 + movd %mm3, %ecx + orl %ecx, %ecx') + + emms + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/mode1o.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/mode1o.asm new file mode 100644 index 0000000..aa9ef31 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/mode1o.asm @@ -0,0 +1,175 @@ +dnl Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder. + +dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C P4: 19.0 cycles/limb + + +C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C + +defframe(PARAM_CARRY, 16) +defframe(PARAM_DIVISOR,12) +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + + TEXT + + ALIGN(16) +PROLOGUE(mpn_modexact_1c_odd) +deflit(`FRAME',0) + + movd PARAM_CARRY, %mm1 + jmp L(start_1c) + +EPILOGUE() + + + ALIGN(16) +PROLOGUE(mpn_modexact_1_odd) +deflit(`FRAME',0) + + pxor %mm1, %mm1 C carry limb +L(start_1c): + movl PARAM_DIVISOR, %eax + + movd PARAM_DIVISOR, %mm7 + + shrl %eax + + andl $127, %eax C d/2, 7 bits + +ifdef(`PIC',` + LEA( binvert_limb_table, %edx) + movzbl (%eax,%edx), %eax C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %eax C inv 8 bits +') + + C + + movd %eax, %mm6 C inv + + movd %eax, %mm0 C inv + + pmuludq %mm6, %mm6 C inv*inv + + C + + pmuludq %mm7, %mm6 C inv*inv*d + paddd %mm0, %mm0 C 2*inv + + C + + psubd %mm6, %mm0 C inv = 2*inv - inv*inv*d + pxor %mm6, %mm6 + + paddd %mm0, %mm6 + pmuludq %mm0, %mm0 C inv*inv + + C + + pmuludq %mm7, %mm0 C inv*inv*d + paddd %mm6, %mm6 C 2*inv + + + movl PARAM_SRC, %eax + movl PARAM_SIZE, %ecx + + C + + psubd %mm0, %mm6 C inv = 2*inv - inv*inv*d + + ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS + pushl %eax FRAME_pushl() + movd %mm6, %eax + imul PARAM_DIVISOR, %eax + cmpl $1, %eax + popl %eax FRAME_popl()') + + pxor %mm0, %mm0 C carry bit + + +C The dependent chain here is as follows. +C +C latency +C psubq s = (src-cbit) - climb 2 +C pmuludq q = s*inverse 8 +C pmuludq prod = q*divisor 8 +C psrlq climb = high(prod) 2 +C -- +C 20 +C +C Yet the loop measures 19.0 c/l, so obviously there's something gained +C there over a straight reading of the chip documentation. + +L(top): + C eax src, incrementing + C ebx + C ecx counter, limbs + C edx + C + C mm0 carry bit + C mm1 carry limb + C mm6 inverse + C mm7 divisor + + movd (%eax), %mm2 + addl $4, %eax + + psubq %mm0, %mm2 C src - cbit + + psubq %mm1, %mm2 C src - cbit - climb + movq %mm2, %mm0 + psrlq $63, %mm0 C new cbit + + pmuludq %mm6, %mm2 C s*inverse + + movq %mm7, %mm1 + pmuludq %mm2, %mm1 C q*divisor + psrlq $32, %mm1 C new climb + + subl $1, %ecx + jnz L(top) + + +L(done): + paddq %mm1, %mm0 + movd %mm0, %eax + emms + ret + +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_1.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_1.asm new file mode 100644 index 0000000..6347b8b --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_1.asm @@ -0,0 +1,164 @@ +dnl mpn_mul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). + +dnl Copyright 2005, 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) 4.17 +C P6 model 13 (Dothan) 4.17 +C P4 model 0-1 (Willamette) 4 +C P4 model 2 (Northwood) 4 +C P4 model 3-4 (Prescott) 4.55 + +C TODO: +C * Tweak eax/edx offsets in loop so as to save some lea's +C * Perhaps software pipeline small-case code + +C INPUT PARAMETERS +C rp sp + 4 +C up sp + 8 +C n sp + 12 +C v0 sp + 16 + + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_1) + pxor %mm6, %mm6 +L(ent): mov 4(%esp), %edx + mov 8(%esp), %eax + mov 12(%esp), %ecx + movd 16(%esp), %mm7 + cmp $4, %ecx + jnc L(big) + +L(lp0): movd (%eax), %mm0 + lea 4(%eax), %eax + lea 4(%edx), %edx + pmuludq %mm7, %mm0 + paddq %mm0, %mm6 + movd %mm6, -4(%edx) + psrlq $32, %mm6 + dec %ecx + jnz L(lp0) + movd %mm6, %eax + emms + ret + +L(big): and $3, %ecx + je L(0) + cmp $2, %ecx + jc L(1) + je L(2) + jmp L(3) C FIXME: one case should fall through + +L(0): movd (%eax), %mm3 + sub 12(%esp), %ecx C loop count + lea -16(%eax), %eax + lea -12(%edx), %edx + pmuludq %mm7, %mm3 + movd 20(%eax), %mm0 + pmuludq %mm7, %mm0 + movd 24(%eax), %mm1 + jmp L(00) + +L(1): movd (%eax), %mm2 + sub 12(%esp), %ecx + lea -12(%eax), %eax + lea -8(%edx), %edx + pmuludq %mm7, %mm2 + movd 16(%eax), %mm3 + pmuludq %mm7, %mm3 + movd 20(%eax), %mm0 + jmp L(01) + +L(2): movd (%eax), %mm1 + sub 12(%esp), %ecx + lea -8(%eax), %eax + lea -4(%edx), %edx + pmuludq %mm7, %mm1 + movd 12(%eax), %mm2 + pmuludq %mm7, %mm2 + movd 16(%eax), %mm3 + jmp L(10) + +L(3): movd (%eax), %mm0 + sub 12(%esp), %ecx + lea -4(%eax), %eax + pmuludq %mm7, %mm0 + movd 8(%eax), %mm1 + pmuludq %mm7, %mm1 + movd 12(%eax), %mm2 + + ALIGN(16) +L(top): pmuludq %mm7, %mm2 + paddq %mm0, %mm6 + movd 16(%eax), %mm3 + movd %mm6, 0(%edx) + psrlq $32, %mm6 +L(10): pmuludq %mm7, %mm3 + paddq %mm1, %mm6 + movd 20(%eax), %mm0 + movd %mm6, 4(%edx) + psrlq $32, %mm6 +L(01): pmuludq %mm7, %mm0 + paddq %mm2, %mm6 + movd 24(%eax), %mm1 + movd %mm6, 8(%edx) + psrlq $32, %mm6 +L(00): pmuludq %mm7, %mm1 + paddq %mm3, %mm6 + movd 28(%eax), %mm2 + movd %mm6, 12(%edx) + psrlq $32, %mm6 + lea 16(%eax), %eax + lea 16(%edx), %edx + add $4, %ecx + ja L(top) + +L(end): pmuludq %mm7, %mm2 + paddq %mm0, %mm6 + movd %mm6, 0(%edx) + psrlq $32, %mm6 + paddq %mm1, %mm6 + movd %mm6, 4(%edx) + psrlq $32, %mm6 + paddq %mm2, %mm6 + movd %mm6, 8(%edx) + psrlq $32, %mm6 + movd %mm6, %eax + emms + ret +EPILOGUE() +PROLOGUE(mpn_mul_1c) + movd 20(%esp), %mm6 + jmp L(ent) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_basecase.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_basecase.asm new file mode 100644 index 0000000..6e3775a --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_basecase.asm @@ -0,0 +1,662 @@ +dnl mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). + +dnl Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version.
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO: +C * Improve ad-hoc outer loop code and register handling. Some feed-in +C scheduling could improve things by several cycles per outer iteration. +C * In code for un <= 3, try keeping accumulation operands in registers, +C without storing intermediates to rp. +C * We might want to keep 32 in a free mm register, since the register form is +C 3 bytes and the immediate form is 4 bytes. About 70 bytes to save. +C * Look into different loop alignment, we now expand the code about 50 bytes +C with possibly needless alignment. +C * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry. +C * Use OSP, should solve feed-in latency problems. +C * Save a few tens of bytes by doing cross-jumping for Loel0, etc. +C * Save around 120 bytes by remapping "m 0", "m 1", "m 2" and "m 3" registers +C so that they can share feed-in code, and changing the branch targets from +C L to Lm. + +C cycles/limb +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) 5.24 +C P6 model 14 (Yonah) ? +C P4 model 0-1 (Willamette): 5 +C P4 model 2 (Northwood): 4.60 at 32 limbs +C P4 model 3-4 (Prescott): 4.94 at 32 limbs + +C INPUT PARAMETERS +C rp sp + 4 +C up sp + 8 +C un sp + 12 +C vp sp + 16 +C vn sp + 20 + + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + push %esi + push %ebx + mov 12(%esp), %edx C rp + mov 16(%esp), %eax C up + mov 20(%esp), %ecx C un + mov 24(%esp), %esi C vp + mov 28(%esp), %ebx C vn + movd (%esi), %mm7 C +L(ent): cmp $3, %ecx + ja L(big) + movd (%eax), %mm6 + pmuludq %mm7, %mm6 + jz L(un3) + cmp $2, %ecx + jz L(un2) + +L(un1): movd %mm6, (%edx) C un=1 + psrlq $32, %mm6 C un=1 + movd %mm6, 4(%edx) C un=1 + jmp L(rtr) C un=1 + +L(un2): movd 4(%eax), %mm1 C un=2 + pmuludq %mm7, %mm1 C un=2 + movd %mm6, (%edx) C un=2 + psrlq $32, %mm6 C un=2 + paddq %mm1, %mm6 C un=2 + movd %mm6, 4(%edx) C un=2 + psrlq $32, %mm6 C un=2 + movd %mm6, 8(%edx) C un=2 + dec %ebx C un=2 + jz L(rtr) C un=2 + movd 4(%esi), %mm7 C un=2 + movd (%eax), %mm6 C un=2 + pmuludq %mm7, %mm6 C un=2 + movd 4(%eax), %mm1 C un=2 + movd 4(%edx), %mm4 C un=2 + pmuludq %mm7, %mm1 C un=2 + movd 8(%edx), %mm5 C un=2 + paddq %mm4, %mm6 C un=2 + paddq %mm1, %mm5 C un=2 + movd %mm6, 4(%edx) C un=2 + psrlq $32, %mm6 C un=2 + paddq %mm5, %mm6 C un=2 + movd %mm6, 8(%edx) C un=2 + psrlq $32, %mm6 C un=2 + movd %mm6, 12(%edx) C un=2 +L(rtr): emms + pop %ebx + pop %esi + ret + +L(un3): movd 4(%eax), %mm1 C un=3 + pmuludq %mm7, %mm1 C un=3 + movd 8(%eax), %mm2 C un=3 + pmuludq %mm7, %mm2 C un=3 + movd %mm6, (%edx) C un=3 + psrlq $32, %mm6 C un=3 + paddq %mm1, %mm6 C un=3 + movd %mm6, 4(%edx) C un=3 + psrlq $32, %mm6 C un=3 + paddq %mm2, %mm6 C un=3 + movd %mm6, 8(%edx) C un=3 + psrlq $32, %mm6 C un=3 + movd %mm6, 12(%edx) C un=3 + dec %ebx C un=3 + jz L(rtr) C un=3 + movd 4(%esi), %mm7 C un=3 + movd (%eax), %mm6 C un=3 + pmuludq %mm7, 
%mm6 C un=3 + movd 4(%eax), %mm1 C un=3 + movd 4(%edx), %mm4 C un=3 + pmuludq %mm7, %mm1 C un=3 + movd 8(%eax), %mm2 C un=3 + movd 8(%edx), %mm5 C un=3 + pmuludq %mm7, %mm2 C un=3 + paddq %mm4, %mm6 C un=3 + paddq %mm1, %mm5 C un=3 + movd 12(%edx), %mm4 C un=3 + movd %mm6, 4(%edx) C un=3 + psrlq $32, %mm6 C un=3 + paddq %mm5, %mm6 C un=3 + paddq %mm2, %mm4 C un=3 + movd %mm6, 8(%edx) C un=3 + psrlq $32, %mm6 C un=3 + paddq %mm4, %mm6 C un=3 + movd %mm6, 12(%edx) C un=3 + psrlq $32, %mm6 C un=3 + movd %mm6, 16(%edx) C un=3 + dec %ebx C un=3 + jz L(rtr) C un=3 + movd 8(%esi), %mm7 C un=3 + movd (%eax), %mm6 C un=3 + pmuludq %mm7, %mm6 C un=3 + movd 4(%eax), %mm1 C un=3 + movd 8(%edx), %mm4 C un=3 + pmuludq %mm7, %mm1 C un=3 + movd 8(%eax), %mm2 C un=3 + movd 12(%edx), %mm5 C un=3 + pmuludq %mm7, %mm2 C un=3 + paddq %mm4, %mm6 C un=3 + paddq %mm1, %mm5 C un=3 + movd 16(%edx), %mm4 C un=3 + movd %mm6, 8(%edx) C un=3 + psrlq $32, %mm6 C un=3 + paddq %mm5, %mm6 C un=3 + paddq %mm2, %mm4 C un=3 + movd %mm6, 12(%edx) C un=3 + psrlq $32, %mm6 C un=3 + paddq %mm4, %mm6 C un=3 + movd %mm6, 16(%edx) C un=3 + psrlq $32, %mm6 C un=3 + movd %mm6, 20(%edx) C un=3 + jmp L(rtr) + + +L(big): push %edi + pxor %mm6, %mm6 + lea 4(%esi), %esi + and $3, %ecx + jz L(0) + cmp $2, %ecx + jc L(1) + jz L(2) + jmp L(3) C FIXME: one case should fall through + + +L(0): movd (%eax), %mm3 C m 0 + sub 24(%esp), %ecx C inner loop count m 0 + mov %ecx, 24(%esp) C update loop count for later m 0 + pmuludq %mm7, %mm3 C m 0 + movd 4(%eax), %mm0 C m 0 + pmuludq %mm7, %mm0 C m 0 + movd 8(%eax), %mm1 C m 0 + jmp L(m00) C m 0 + ALIGN(16) C m 0 +L(lpm0): + pmuludq %mm7, %mm4 C m 0 + paddq %mm0, %mm6 C m 0 + movd (%eax), %mm3 C m 0 + movd %mm6, -12(%edx) C m 0 + psrlq $32, %mm6 C m 0 + pmuludq %mm7, %mm3 C m 0 + paddq %mm1, %mm6 C m 0 + movd 4(%eax), %mm0 C m 0 + movd %mm6, -8(%edx) C m 0 + psrlq $32, %mm6 C m 0 + pmuludq %mm7, %mm0 C m 0 + paddq %mm4, %mm6 C m 0 + movd 8(%eax), %mm1 C m 0 + movd %mm6, -4(%edx) C m 0 + psrlq $32, %mm6 C m 0 +L(m00): pmuludq %mm7, %mm1 C m 0 + paddq %mm3, %mm6 C m 0 + movd 12(%eax), %mm4 C m 0 + movd %mm6, (%edx) C m 0 + psrlq $32, %mm6 C m 0 + lea 16(%eax), %eax C m 0 + lea 16(%edx), %edx C m 0 + add $4, %ecx C m 0 + ja L(lpm0) C m 0 + pmuludq %mm7, %mm4 C m 0 + paddq %mm0, %mm6 C m 0 + movd %mm6, -12(%edx) C m 0 + psrlq $32, %mm6 C m 0 + paddq %mm1, %mm6 C m 0 + mov 16(%esp), %edi C rp 0 + jmp L(x0) + +L(olp0): + lea 4(%edi), %edi C am 0 + movd (%esi), %mm7 C am 0 + lea 4(%esi), %esi C am 0 + mov %edi, %edx C rp am 0 + mov 20(%esp), %eax C up am 0 + movd (%eax), %mm3 C am 0 + mov 24(%esp), %ecx C inner loop count am 0 + pxor %mm6, %mm6 C am 0 + pmuludq %mm7, %mm3 C am 0 + movd 4(%eax), %mm0 C am 0 + movd (%edx), %mm5 C am 0 + pmuludq %mm7, %mm0 C am 0 + movd 8(%eax), %mm1 C am 0 + paddq %mm3, %mm5 C am 0 + movd 4(%edx), %mm4 C am 0 + jmp L(am00) C am 0 + ALIGN(16) C mm 0 +L(lam0): + pmuludq %mm7, %mm2 C am 0 + paddq %mm4, %mm6 C am 0 + movd (%eax), %mm3 C am 0 + paddq %mm1, %mm5 C am 0 + movd -4(%edx), %mm4 C am 0 + movd %mm6, -12(%edx) C am 0 + psrlq $32, %mm6 C am 0 + pmuludq %mm7, %mm3 C am 0 + paddq %mm5, %mm6 C am 0 + movd 4(%eax), %mm0 C am 0 + paddq %mm2, %mm4 C am 0 + movd (%edx), %mm5 C am 0 + movd %mm6, -8(%edx) C am 0 + psrlq $32, %mm6 C am 0 + pmuludq %mm7, %mm0 C am 0 + paddq %mm4, %mm6 C am 0 + movd 8(%eax), %mm1 C am 0 + paddq %mm3, %mm5 C am 0 + movd 4(%edx), %mm4 C am 0 + movd %mm6, -4(%edx) C am 0 + psrlq $32, %mm6 C am 0 +L(am00): + pmuludq %mm7, %mm1 C am 0 + paddq %mm5, %mm6 C am 
0 + movd 12(%eax), %mm2 C am 0 + paddq %mm0, %mm4 C am 0 + movd 8(%edx), %mm5 C am 0 + movd %mm6, (%edx) C am 0 + psrlq $32, %mm6 C am 0 + lea 16(%eax), %eax C am 0 + lea 16(%edx), %edx C am 0 + add $4, %ecx C am 0 + jnz L(lam0) C am 0 + pmuludq %mm7, %mm2 C am 0 + paddq %mm4, %mm6 C am 0 + paddq %mm1, %mm5 C am 0 + movd -4(%edx), %mm4 C am 0 + movd %mm6, -12(%edx) C am 0 + psrlq $32, %mm6 C am 0 + paddq %mm5, %mm6 C am 0 + paddq %mm2, %mm4 C am 0 +L(x0): movd %mm6, -8(%edx) C am 0 + psrlq $32, %mm6 C am 0 + paddq %mm4, %mm6 C am 0 + movd %mm6, -4(%edx) C am 0 + psrlq $32, %mm6 C am 0 + movd %mm6, (%edx) C am 0 + dec %ebx C am 0 + jnz L(olp0) C am 0 +L(oel0): + emms C 0 + pop %edi C 0 + pop %ebx C 0 + pop %esi C 0 + ret C 0 + + +L(1): movd (%eax), %mm4 C m 1 + sub 24(%esp), %ecx C m 1 + mov %ecx, 24(%esp) C update loop count for later m 1 + pmuludq %mm7, %mm4 C m 1 + movd 4(%eax), %mm3 C m 1 + pmuludq %mm7, %mm3 C m 1 + movd 8(%eax), %mm0 C m 1 + jmp L(m01) C m 1 + ALIGN(16) C m 1 +L(lpm1): + pmuludq %mm7, %mm4 C m 1 + paddq %mm0, %mm6 C m 1 + movd 4(%eax), %mm3 C m 1 + movd %mm6, -8(%edx) C m 1 + psrlq $32, %mm6 C m 1 + pmuludq %mm7, %mm3 C m 1 + paddq %mm1, %mm6 C m 1 + movd 8(%eax), %mm0 C m 1 + movd %mm6, -4(%edx) C m 1 + psrlq $32, %mm6 C m 1 +L(m01): pmuludq %mm7, %mm0 C m 1 + paddq %mm4, %mm6 C m 1 + movd 12(%eax), %mm1 C m 1 + movd %mm6, (%edx) C m 1 + psrlq $32, %mm6 C m 1 + pmuludq %mm7, %mm1 C m 1 + paddq %mm3, %mm6 C m 1 + movd 16(%eax), %mm4 C m 1 + movd %mm6, 4(%edx) C m 1 + psrlq $32, %mm6 C m 1 + lea 16(%eax), %eax C m 1 + lea 16(%edx), %edx C m 1 + add $4, %ecx C m 1 + ja L(lpm1) C m 1 + pmuludq %mm7, %mm4 C m 1 + paddq %mm0, %mm6 C m 1 + movd %mm6, -8(%edx) C m 1 + psrlq $32, %mm6 C m 1 + paddq %mm1, %mm6 C m 1 + mov 16(%esp), %edi C rp 1 + jmp L(x1) + +L(olp1): + lea 4(%edi), %edi C am 1 + movd (%esi), %mm7 C am 1 + lea 4(%esi), %esi C am 1 + mov %edi, %edx C rp am 1 + mov 20(%esp), %eax C up am 1 + movd (%eax), %mm2 C am 1 + mov 24(%esp), %ecx C inner loop count am 1 + pxor %mm6, %mm6 C am 1 + pmuludq %mm7, %mm2 C am 1 + movd 4(%eax), %mm3 C am 1 + movd (%edx), %mm4 C am 1 + pmuludq %mm7, %mm3 C am 1 + movd 8(%eax), %mm0 C am 1 + paddq %mm2, %mm4 C am 1 + movd 4(%edx), %mm5 C am 1 + jmp L(am01) C am 1 + ALIGN(16) C am 1 +L(lam1): + pmuludq %mm7, %mm2 C am 1 + paddq %mm4, %mm6 C am 1 + movd 4(%eax), %mm3 C am 1 + paddq %mm1, %mm5 C am 1 + movd (%edx), %mm4 C am 1 + movd %mm6, -8(%edx) C am 1 + psrlq $32, %mm6 C am 1 + pmuludq %mm7, %mm3 C am 1 + paddq %mm5, %mm6 C am 1 + movd 8(%eax), %mm0 C am 1 + paddq %mm2, %mm4 C am 1 + movd 4(%edx), %mm5 C am 1 + movd %mm6, -4(%edx) C am 1 + psrlq $32, %mm6 C am 1 +L(am01): + pmuludq %mm7, %mm0 C am 1 + paddq %mm4, %mm6 C am 1 + movd 12(%eax), %mm1 C am 1 + paddq %mm3, %mm5 C am 1 + movd 8(%edx), %mm4 C am 1 + movd %mm6, (%edx) C am 1 + psrlq $32, %mm6 C am 1 + pmuludq %mm7, %mm1 C am 1 + paddq %mm5, %mm6 C am 1 + movd 16(%eax), %mm2 C am 1 + paddq %mm0, %mm4 C am 1 + movd 12(%edx), %mm5 C am 1 + movd %mm6, 4(%edx) C am 1 + psrlq $32, %mm6 C am 1 + lea 16(%eax), %eax C am 1 + lea 16(%edx), %edx C am 1 + add $4, %ecx C am 1 + jnz L(lam1) C am 1 + pmuludq %mm7, %mm2 C am 1 + paddq %mm4, %mm6 C am 1 + paddq %mm1, %mm5 C am 1 + movd (%edx), %mm4 C am 1 + movd %mm6, -8(%edx) C am 1 + psrlq $32, %mm6 C am 1 + paddq %mm5, %mm6 C am 1 + paddq %mm2, %mm4 C am 1 +L(x1): movd %mm6, -4(%edx) C am 1 + psrlq $32, %mm6 C am 1 + paddq %mm4, %mm6 C am 1 + movd %mm6, (%edx) C am 1 + psrlq $32, %mm6 C am 1 + movd %mm6, 4(%edx) C am 1 + dec %ebx C am 1 
+ jnz L(olp1) C am 1 +L(oel1): + emms C 1 + pop %edi C 1 + pop %ebx C 1 + pop %esi C 1 + ret C 1 + + +L(2): movd (%eax), %mm1 C m 2 + sub 24(%esp), %ecx C m 2 + mov %ecx, 24(%esp) C update loop count for later m 2 + pmuludq %mm7, %mm1 C m 2 + movd 4(%eax), %mm4 C m 2 + pmuludq %mm7, %mm4 C m 2 + movd 8(%eax), %mm3 C m 2 + jmp L(m10) C m 2 + ALIGN(16) C m 2 +L(lpm2): + pmuludq %mm7, %mm4 C m 2 + paddq %mm0, %mm6 C m 2 + movd 8(%eax), %mm3 C m 2 + movd %mm6, -4(%edx) C m 2 + psrlq $32, %mm6 C m 2 +L(m10): pmuludq %mm7, %mm3 C m 2 + paddq %mm1, %mm6 C m 2 + movd 12(%eax), %mm0 C m 2 + movd %mm6, (%edx) C m 2 + psrlq $32, %mm6 C m 2 + pmuludq %mm7, %mm0 C m 2 + paddq %mm4, %mm6 C m 2 + movd 16(%eax), %mm1 C m 2 + movd %mm6, 4(%edx) C m 2 + psrlq $32, %mm6 C m 2 + pmuludq %mm7, %mm1 C m 2 + paddq %mm3, %mm6 C m 2 + movd 20(%eax), %mm4 C m 2 + movd %mm6, 8(%edx) C m 2 + psrlq $32, %mm6 C m 2 + lea 16(%eax), %eax C m 2 + lea 16(%edx), %edx C m 2 + add $4, %ecx C m 2 + ja L(lpm2) C m 2 + pmuludq %mm7, %mm4 C m 2 + paddq %mm0, %mm6 C m 2 + movd %mm6, -4(%edx) C m 2 + psrlq $32, %mm6 C m 2 + paddq %mm1, %mm6 C m 2 + mov 16(%esp), %edi C rp 2 + jmp L(x2) + +L(olp2): + lea 4(%edi), %edi C am 2 + movd (%esi), %mm7 C am 2 + lea 4(%esi), %esi C am 2 + mov %edi, %edx C rp am 2 + mov 20(%esp), %eax C up am 2 + movd (%eax), %mm1 C am 2 + mov 24(%esp), %ecx C inner loop count am 2 + pxor %mm6, %mm6 C am 2 + pmuludq %mm7, %mm1 C am 2 + movd 4(%eax), %mm2 C am 2 + movd (%edx), %mm5 C am 2 + pmuludq %mm7, %mm2 C am 2 + movd 8(%eax), %mm3 C am 2 + paddq %mm1, %mm5 C am 2 + movd 4(%edx), %mm4 C am 2 + jmp L(am10) C am 2 + ALIGN(16) C am 2 +L(lam2): + pmuludq %mm7, %mm2 C am 2 + paddq %mm4, %mm6 C am 2 + movd 8(%eax), %mm3 C am 2 + paddq %mm1, %mm5 C am 2 + movd 4(%edx), %mm4 C am 2 + movd %mm6, -4(%edx) C am 2 + psrlq $32, %mm6 C am 2 +L(am10): + pmuludq %mm7, %mm3 C am 2 + paddq %mm5, %mm6 C am 2 + movd 12(%eax), %mm0 C am 2 + paddq %mm2, %mm4 C am 2 + movd 8(%edx), %mm5 C am 2 + movd %mm6, (%edx) C am 2 + psrlq $32, %mm6 C am 2 + pmuludq %mm7, %mm0 C am 2 + paddq %mm4, %mm6 C am 2 + movd 16(%eax), %mm1 C am 2 + paddq %mm3, %mm5 C am 2 + movd 12(%edx), %mm4 C am 2 + movd %mm6, 4(%edx) C am 2 + psrlq $32, %mm6 C am 2 + pmuludq %mm7, %mm1 C am 2 + paddq %mm5, %mm6 C am 2 + movd 20(%eax), %mm2 C am 2 + paddq %mm0, %mm4 C am 2 + movd 16(%edx), %mm5 C am 2 + movd %mm6, 8(%edx) C am 2 + psrlq $32, %mm6 C am 2 + lea 16(%eax), %eax C am 2 + lea 16(%edx), %edx C am 2 + add $4, %ecx C am 2 + jnz L(lam2) C am 2 + pmuludq %mm7, %mm2 C am 2 + paddq %mm4, %mm6 C am 2 + paddq %mm1, %mm5 C am 2 + movd 4(%edx), %mm4 C am 2 + movd %mm6, -4(%edx) C am 2 + psrlq $32, %mm6 C am 2 + paddq %mm5, %mm6 C am 2 + paddq %mm2, %mm4 C am 2 +L(x2): movd %mm6, (%edx) C am 2 + psrlq $32, %mm6 C am 2 + paddq %mm4, %mm6 C am 2 + movd %mm6, 4(%edx) C am 2 + psrlq $32, %mm6 C am 2 + movd %mm6, 8(%edx) C am 2 + dec %ebx C am 2 + jnz L(olp2) C am 2 +L(oel2): + emms C 2 + pop %edi C 2 + pop %ebx C 2 + pop %esi C 2 + ret C 2 + + +L(3): movd (%eax), %mm0 C m 3 + sub 24(%esp), %ecx C m 3 + mov %ecx, 24(%esp) C update loop count for later m 3 + pmuludq %mm7, %mm0 C m 3 + movd 4(%eax), %mm1 C m 3 + pmuludq %mm7, %mm1 C m 3 + movd 8(%eax), %mm4 C m 3 + jmp L(lpm3) C m 3 + ALIGN(16) C m 3 +L(lpm3): + pmuludq %mm7, %mm4 C m 3 + paddq %mm0, %mm6 C m 3 + movd 12(%eax), %mm3 C m 3 + movd %mm6, (%edx) C m 3 + psrlq $32, %mm6 C m 3 + pmuludq %mm7, %mm3 C m 3 + paddq %mm1, %mm6 C m 3 + movd 16(%eax), %mm0 C m 3 + movd %mm6, 4(%edx) C m 3 + psrlq $32, %mm6 C m 3 + 
pmuludq %mm7, %mm0 C m 3 + paddq %mm4, %mm6 C m 3 + movd 20(%eax), %mm1 C m 3 + movd %mm6, 8(%edx) C m 3 + psrlq $32, %mm6 C m 3 + pmuludq %mm7, %mm1 C m 3 + paddq %mm3, %mm6 C m 3 + movd 24(%eax), %mm4 C m 3 + movd %mm6, 12(%edx) C m 3 + psrlq $32, %mm6 C m 3 + lea 16(%eax), %eax C m 3 + lea 16(%edx), %edx C m 3 + add $4, %ecx C m 3 + ja L(lpm3) C m 3 + pmuludq %mm7, %mm4 C m 3 + paddq %mm0, %mm6 C m 3 + movd %mm6, (%edx) C m 3 + psrlq $32, %mm6 C m 3 + paddq %mm1, %mm6 C m 3 + mov 16(%esp), %edi C rp 3 + jmp L(x3) + +L(olp3): + lea 4(%edi), %edi C am 3 + movd (%esi), %mm7 C am 3 + lea 4(%esi), %esi C am 3 + mov %edi, %edx C rp am 3 + mov 20(%esp), %eax C up am 3 + movd (%eax), %mm0 C am 3 + mov 24(%esp), %ecx C inner loop count am 3 + pxor %mm6, %mm6 C am 3 + pmuludq %mm7, %mm0 C am 3 + movd 4(%eax), %mm1 C am 3 + movd (%edx), %mm4 C am 3 + pmuludq %mm7, %mm1 C am 3 + movd 8(%eax), %mm2 C am 3 + paddq %mm0, %mm4 C am 3 + movd 4(%edx), %mm5 C am 3 + jmp L(lam3) C am 3 + ALIGN(16) C am 3 +L(lam3): + pmuludq %mm7, %mm2 C am 3 + paddq %mm4, %mm6 C am 3 + movd 12(%eax), %mm3 C am 3 + paddq %mm1, %mm5 C am 3 + movd 8(%edx), %mm4 C am 3 + movd %mm6, (%edx) C am 3 + psrlq $32, %mm6 C am 3 + pmuludq %mm7, %mm3 C am 3 + paddq %mm5, %mm6 C am 3 + movd 16(%eax), %mm0 C am 3 + paddq %mm2, %mm4 C am 3 + movd 12(%edx), %mm5 C am 3 + movd %mm6, 4(%edx) C am 3 + psrlq $32, %mm6 C am 3 + pmuludq %mm7, %mm0 C am 3 + paddq %mm4, %mm6 C am 3 + movd 20(%eax), %mm1 C am 3 + paddq %mm3, %mm5 C am 3 + movd 16(%edx), %mm4 C am 3 + movd %mm6, 8(%edx) C am 3 + psrlq $32, %mm6 C am 3 + pmuludq %mm7, %mm1 C am 3 + paddq %mm5, %mm6 C am 3 + movd 24(%eax), %mm2 C am 3 + paddq %mm0, %mm4 C am 3 + movd 20(%edx), %mm5 C am 3 + movd %mm6, 12(%edx) C am 3 + psrlq $32, %mm6 C am 3 + lea 16(%eax), %eax C am 3 + lea 16(%edx), %edx C am 3 + add $4, %ecx C am 3 + jnz L(lam3) C am 3 + pmuludq %mm7, %mm2 C am 3 + paddq %mm4, %mm6 C am 3 + paddq %mm1, %mm5 C am 3 + movd 8(%edx), %mm4 C am 3 + movd %mm6, (%edx) C am 3 + psrlq $32, %mm6 C am 3 + paddq %mm5, %mm6 C am 3 + paddq %mm2, %mm4 C am 3 +L(x3): movd %mm6, 4(%edx) C am 3 + psrlq $32, %mm6 C am 3 + paddq %mm4, %mm6 C am 3 + movd %mm6, 8(%edx) C am 3 + psrlq $32, %mm6 C am 3 + movd %mm6, 12(%edx) C am 3 + dec %ebx C am 3 + jnz L(olp3) C am 3 +L(oel3): + emms C 3 + pop %edi C 3 + pop %ebx C 3 + pop %esi C 3 + ret C 3 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm new file mode 100644 index 0000000..c7f4426 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm @@ -0,0 +1,281 @@ +dnl X86-32 and X86-64 mpn_popcount using SSE2. + +dnl Copyright 2006, 2007, 2011, 2015, 2020 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + + +C 32-bit popcount hamdist +C cycles/limb cycles/limb +C P5 - +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) 4 +C P4 model 0 (Willamette) ? +C P4 model 1 (?) ? +C P4 model 2 (Northwood) 3.9 +C P4 model 3 (Prescott) ? +C P4 model 4 (Nocona) ? +C AMD K6 - +C AMD K7 - +C AMD K8 ? + +C 64-bit popcount hamdist +C cycles/limb cycles/limb +C P4 model 4 (Nocona): 8 +C AMD K8,K9 7.5 +C AMD K10 3.5 +C Intel core2 3.68 +C Intel corei 3.15 +C Intel atom 10.8 +C VIA nano 6.5 + +C TODO +C * Make an mpn_hamdist based on this. Alignment could either be handled by +C using movdqu for one operand and movdqa for the other, or by painfully +C shifting as we go. Unfortunately, there seems to be no usable shift +C instruction, except for one that takes an immediate count. +C * It would probably be possible to cut a few cycles/limb using software +C pipelining. +C * There are 35 decode slots unused by the SSE2 instructions. Loop control +C needs just 2 or 3 slots, leaving around 32 slots. This allows a parallel +C integer based popcount. Such a combined loop would handle 6 limbs in +C about 30 cycles on K8. +C * We could save a byte or two by using 32-bit operations on areg. +C * Check if using movdqa to a temp and then a register-based pand is faster. + +ifelse(GMP_LIMB_BITS,`32', +` define(`up', `%edx') + define(`n', `%ecx') + define(`areg',`%eax') + define(`breg',`%ebx') + define(`zero',`%xmm4') + define(`LIMB32',` $1') + define(`LIMB64',`dnl') +',` + define(`up', `%rdi') + define(`n', `%rsi') + define(`areg',`%rax') + define(`breg',`%rdx') + define(`zero',`%xmm8') + define(`LIMB32',`dnl') + define(`LIMB64',` $1') +') + +define(`mm01010101',`%xmm6') +define(`mm00110011',`%xmm7') +define(`mm00001111',`%xmm2') + +define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8)) +define(`LIMBS_PER_XMM', eval(16/GMP_LIMB_BYTES)) +define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES)) + +undefine(`psadbw') C override inherited m4 version + +C This file is shared between 32-bit and 64-bit builds. Only the former has +C LEAL. Default LEAL as an alias of LEA. +ifdef(`LEAL',,`define(`LEAL', `LEA($1,$2)')') + +ASM_START() + +C Make cnsts global to work around Apple relocation bug.
+ifdef(`DARWIN',` + define(`cnsts', MPN(popccnsts)) + GLOBL cnsts') + + TEXT + ALIGN(32) +PROLOGUE(mpn_popcount) + +LIMB32(`mov 4(%esp), up ') +LIMB32(`mov 8(%esp), n ') +LIMB32(`push %ebx ') + + pxor %xmm3, %xmm3 C zero grand total count +LIMB64(`pxor zero, zero ') + + LEAL( cnsts, breg) + + movdqa -48(breg), mm01010101 + movdqa -32(breg), mm00110011 + movdqa -16(breg), mm00001111 + + mov up, areg + and $-16, up C round `up' down to 128-bit boundary + and $12, areg C 32:areg = 0, 4, 8, 12 + C 64:areg = 0, 8 + movdqa (up), %xmm0 + pand 64(breg,areg,4), %xmm0 + shr $m4_log2(GMP_LIMB_BYTES), %eax + add areg, n C compensate n for rounded down `up' + + pxor %xmm4, %xmm4 + sub $LIMBS_PER_XMM, n + jbe L(sum) + + sub $LIMBS_PER_XMM, n + ja L(ent) + jmp L(lsum) + + ALIGN(16) +L(top): movdqa (up), %xmm0 +L(ent): movdqa 16(up), %xmm4 + + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + psrld $1, %xmm0 + psrld $1, %xmm4 + pand mm01010101, %xmm0 + pand mm01010101, %xmm4 + psubd %xmm0, %xmm1 + psubd %xmm4, %xmm5 + + movdqa %xmm1, %xmm0 + movdqa %xmm5, %xmm4 + psrlq $2, %xmm1 + psrlq $2, %xmm5 + pand mm00110011, %xmm0 + pand mm00110011, %xmm4 + pand mm00110011, %xmm1 + pand mm00110011, %xmm5 + paddq %xmm0, %xmm1 + paddq %xmm4, %xmm5 + +LIMB32(`pxor zero, zero ') + + add $32, up + sub $LIMBS_PER_2XMM, n + + paddq %xmm5, %xmm1 + movdqa %xmm1, %xmm0 + psrlq $4, %xmm1 + pand mm00001111, %xmm0 + pand mm00001111, %xmm1 + paddq %xmm0, %xmm1 + + psadbw zero, %xmm1 + paddq %xmm1, %xmm3 C add to grand total + + jnc L(top) +L(end): + add $LIMBS_PER_2XMM, n + jz L(rt) + movdqa (up), %xmm0 + pxor %xmm4, %xmm4 + sub $LIMBS_PER_XMM, n + jbe L(sum) +L(lsum): + movdqa %xmm0, %xmm4 + movdqa 16(up), %xmm0 +L(sum): + shl $m4_log2(GMP_LIMB_BYTES), n + and $12, n + pand (breg,n,4), %xmm0 + + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + psrld $1, %xmm0 + psrld $1, %xmm4 + pand mm01010101, %xmm0 + pand mm01010101, %xmm4 + psubd %xmm0, %xmm1 + psubd %xmm4, %xmm5 + + movdqa %xmm1, %xmm0 + movdqa %xmm5, %xmm4 + psrlq $2, %xmm1 + psrlq $2, %xmm5 + pand mm00110011, %xmm0 + pand mm00110011, %xmm4 + pand mm00110011, %xmm1 + pand mm00110011, %xmm5 + paddq %xmm0, %xmm1 + paddq %xmm4, %xmm5 + +LIMB32(`pxor zero, zero ') + + paddq %xmm5, %xmm1 + movdqa %xmm1, %xmm0 + psrlq $4, %xmm1 + pand mm00001111, %xmm0 + pand mm00001111, %xmm1 + paddq %xmm0, %xmm1 + + psadbw zero, %xmm1 + paddq %xmm1, %xmm3 C add to grand total + + +C Add the two 64-bit halves of the grand total counter +L(rt): movdqa %xmm3, %xmm0 + psrldq $8, %xmm3 + paddq %xmm3, %xmm0 + movd %xmm0, areg C movq avoided due to gas bug + +LIMB32(`pop %ebx ') + ret + +EPILOGUE() +DEF_OBJECT(dummy,16) +C Three magic constants used for masking out bits + .byte 0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55 + .byte 0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55 + + .byte 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33 + .byte 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33 + + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f +cnsts: +C Masks for high end of number + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + + .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 + .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 +C Masks for low end of number + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + + 
.byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + + .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + + .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff +END_OBJECT(dummy) +ASM_END() diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/rsh1add_n.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/rsh1add_n.asm new file mode 100644 index 0000000..f421d13 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/rsh1add_n.asm @@ -0,0 +1,126 @@ +dnl Intel Pentium-4 mpn_rsh1add_n -- mpn (x+y)/2 + +dnl Copyright 2001-2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb (approx) +C dst!=src1,2 dst==src1 dst==src2 +C P4: 4.5 6.5 6.5 + + +C mp_limb_t mpn_rsh1add_n (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, +C mp_size_t size); +C +C The slightly strange combination of indexing and pointer incrementing +C that's used seems to work best. Not sure why, but for instance leal +C incrementing on %esi is a 1 or 2 cycle slowdown. +C +C The dependent chain is paddq combining the carry and next (shifted) part, +C plus psrlq to move the new carry down. That, and just 4 mmx instructions +C in total, makes 4 c/l the target speed, which is almost achieved for +C separate src/dst but when src==dst the write combining anomalies slow it +C down. 
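+
+C For reference, the operation itself is plain (an illustrative C sketch,
+C not GMP code; ref_rsh1add_n is a hypothetical name and 32-bit limbs are
+C assumed):
+C
+C    #include <stdint.h>
+C    /* wp[] = (xp[] + yp[]) >> 1 over n limbs, returning the bit
+C       shifted out at the bottom.  */
+C    uint32_t ref_rsh1add_n (uint32_t *wp, const uint32_t *xp,
+C                            const uint32_t *yp, long n)
+C    {
+C      uint64_t acc = (uint64_t) xp[0] + yp[0];    /* bit 32 = carry out */
+C      uint32_t ret = (uint32_t) acc & 1;
+C      for (long i = 1; i < n; i++)
+C        {
+C          uint64_t next = (uint64_t) xp[i] + yp[i] + (acc >> 32);
+C          wp[i-1] = ((uint32_t) acc >> 1) | ((uint32_t) next << 31);
+C          acc = next;
+C        }
+C      wp[n-1] = (uint32_t) (acc >> 1);  /* final carry becomes bit 31 */
+C      return ret;
+C    }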
+ +defframe(PARAM_SIZE, 16) +defframe(PARAM_YP, 12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + +dnl re-use parameter space +define(SAVE_EBX,`PARAM_XP') +define(SAVE_ESI,`PARAM_YP') + + TEXT + ALIGN(8) + +PROLOGUE(mpn_rsh1add_n) +deflit(`FRAME',0) + + movl PARAM_XP, %edx + movl %ebx, SAVE_EBX + + movl PARAM_YP, %ebx + movl %esi, SAVE_ESI + + movl PARAM_WP, %esi + + movd (%edx), %mm0 C xp[0] + + movd (%ebx), %mm1 C yp[0] + movl PARAM_SIZE, %ecx + + movl (%edx), %eax C xp[0] + + addl (%ebx), %eax C xp[0]+yp[0] + + paddq %mm1, %mm0 C xp[0]+yp[0] + leal (%esi,%ecx,4), %esi C wp end + negl %ecx C -size + + psrlq $1, %mm0 C (xp[0]+yp[0])/2 + and $1, %eax C return value, rsh1 bit of xp[0]+yp[0] + addl $1, %ecx C -(size-1) + jz L(done) + + +L(top): + C eax return value + C ebx yp end + C ecx counter, limbs, -(size-1) to -1 inclusive + C edx xp end + C esi wp end + C mm0 carry (32 bits) + + movd 4(%edx), %mm1 C xp[i+1] + movd 4(%ebx), %mm2 C yp[i+1] + leal 4(%edx), %edx + leal 4(%ebx), %ebx + paddq %mm2, %mm1 C xp[i+1]+yp[i+1] + psllq $31, %mm1 C low bit at 31, further 32 above + + paddq %mm1, %mm0 C 31 and carry from prev add + movd %mm0, -4(%esi,%ecx,4) C low ready to store dst[i] + + psrlq $32, %mm0 C high becomes new carry + + addl $1, %ecx + jnz L(top) + + +L(done): + movd %mm0, -4(%esi) C dst[size-1] + movl SAVE_EBX, %ebx + + movl SAVE_ESI, %esi + emms + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/sqr_basecase.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/sqr_basecase.asm new file mode 100644 index 0000000..0d548e0 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/sqr_basecase.asm @@ -0,0 +1,705 @@ +dnl mpn_sqr_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). + +dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO: +C * Improve ad-hoc outer loop code and register handling. Some feed-in +C scheduling could improve things by several cycles per outer iteration. +C * In Lam3...Lam1 code, keep accumulation operands in registers, without +C storing intermediates to rp. +C * We might want to keep 32 in a free mm register, since the register form is +C 3 bytes and the immediate form is 4 bytes. About 80 bytes to save. +C * Look into different loop alignment, we now expand the code about 50 bytes +C with possibly needless alignment. +C * Use OSP, should solve feed-in latency problems. +C * Address relative slowness for un<=3 for Pentium M. The old code is
(1:20/14, 2:34/32, 3:66/57) + +C INPUT PARAMETERS +C rp sp + 4 +C up sp + 8 +C un sp + 12 + + TEXT + ALIGN(16) +PROLOGUE(mpn_sqr_basecase) + mov 4(%esp), %edx C rp + mov 8(%esp), %eax C up + mov 12(%esp), %ecx C un + + cmp $2, %ecx + jc L(un1) + jz L(un2) + cmp $4, %ecx + jc L(un3) + jz L(un4) + jmp L(big) + +L(un1): mov (%eax), %eax + mov %edx, %ecx + mul %eax + mov %eax, (%ecx) + mov %edx, 4(%ecx) + ret +L(un2): movd (%eax), %mm0 C un=2 + movd (%eax), %mm2 C un=2 + movd 4(%eax), %mm1 C un=2 + pmuludq %mm0, %mm0 C 64b weight 0 un=2 + pmuludq %mm1, %mm2 C 64b weight 32 un=2 + pmuludq %mm1, %mm1 C 64b weight 64 un=2 + movd %mm0, (%edx) C un=2 + psrlq $32, %mm0 C 32b weight 32 un=2 + pcmpeqd %mm7, %mm7 C un=2 + psrlq $33, %mm7 C 0x000000007FFFFFFF un=2 + pand %mm2, %mm7 C 31b weight 32 un=2 + psrlq $31, %mm2 C 33b weight 65 un=2 + psllq $1, %mm7 C 31b weight 33 un=2 + paddq %mm7, %mm0 C un=2 + movd %mm0, 4(%edx) C un=2 + psrlq $32, %mm0 C un=2 + paddq %mm2, %mm1 C un=2 + paddq %mm0, %mm1 C un=2 + movd %mm1, 8(%edx) C un=2 + psrlq $32, %mm1 C un=2 + movd %mm1, 12(%edx) C un=2 + emms + ret +L(un3): movd (%eax), %mm7 C un=3 + movd 4(%eax), %mm6 C un=3 + pmuludq %mm7, %mm6 C un=3 + movd 8(%eax), %mm2 C un=3 + pmuludq %mm7, %mm2 C un=3 + movd %mm6, 4(%edx) C un=3 + psrlq $32, %mm6 C un=3 + paddq %mm2, %mm6 C un=3 + movd %mm6, 8(%edx) C un=3 + psrlq $32, %mm6 C un=3 + movd %mm6, 12(%edx) C un=3 + lea 4(%edx), %edx C un=3 + lea 4(%eax), %eax C un=3 + jmp L(am1) +L(un4): movd (%eax), %mm7 C un=4 + movd 4(%eax), %mm6 C un=4 + pmuludq %mm7, %mm6 C un=4 + movd 8(%eax), %mm0 C un=4 + pmuludq %mm7, %mm0 C un=4 + movd 12(%eax), %mm1 C un=4 + pmuludq %mm7, %mm1 C un=4 + movd %mm6, 4(%edx) C un=4 + psrlq $32, %mm6 C un=4 + paddq %mm0, %mm6 C un=4 + movd %mm6, 8(%edx) C un=4 + psrlq $32, %mm6 C un=4 + paddq %mm1, %mm6 C un=4 + movd %mm6, 12(%edx) C un=4 + psrlq $32, %mm6 C un=4 + movd %mm6, 16(%edx) C un=4 + lea 4(%edx), %edx C un=4 + lea 4(%eax), %eax C un=4 + jmp L(am2) + +L(big): push %esi + push %ebx + push %edi + pxor %mm6, %mm6 + movd (%eax), %mm7 C + lea 4(%eax), %esi C init up, up++ + lea 4(%eax), %eax C up2++ FIXME: should fix offsets + lea 4(%edx), %edi C init rp, rp++ + lea 4(%edx), %edx C rp2++ + lea -4(%ecx), %ebx C loop count + and $3, %ecx + jz L(3m) + cmp $2, %ecx + ja L(2m) + jb L(0m) + +L(1m): + movd (%eax), %mm4 C m 1 + lea (%ebx), %ecx C inner loop count m 1 + pmuludq %mm7, %mm4 C m 1 + movd 4(%eax), %mm3 C m 1 + pmuludq %mm7, %mm3 C m 1 + movd 8(%eax), %mm0 C m 1 + jmp L(m01) C m 1 + ALIGN(16) C m 1 +L(lpm1): + pmuludq %mm7, %mm4 C m 1 + paddq %mm0, %mm6 C m 1 + movd 4(%eax), %mm3 C m 1 + movd %mm6, -8(%edx) C m 1 + psrlq $32, %mm6 C m 1 + pmuludq %mm7, %mm3 C m 1 + paddq %mm1, %mm6 C m 1 + movd 8(%eax), %mm0 C m 1 + movd %mm6, -4(%edx) C m 1 + psrlq $32, %mm6 C m 1 +L(m01): pmuludq %mm7, %mm0 C m 1 + paddq %mm4, %mm6 C m 1 + movd 12(%eax), %mm1 C m 1 + movd %mm6, (%edx) C m 1 + psrlq $32, %mm6 C m 1 + pmuludq %mm7, %mm1 C m 1 + paddq %mm3, %mm6 C m 1 + movd 16(%eax), %mm4 C m 1 + movd %mm6, 4(%edx) C m 1 + psrlq $32, %mm6 C m 1 + lea 16(%eax), %eax C m 1 + lea 16(%edx), %edx C m 1 + sub $4, %ecx C m 1 + ja L(lpm1) C m 1 + pmuludq %mm7, %mm4 C m 1 + paddq %mm0, %mm6 C m 1 + movd %mm6, -8(%edx) C m 1 + psrlq $32, %mm6 C m 1 + paddq %mm1, %mm6 C m 1 + jmp L(0) + +L(2m): + movd (%eax), %mm1 C m 2 + lea (%ebx), %ecx C inner loop count m 2 + pmuludq %mm7, %mm1 C m 2 + movd 4(%eax), %mm4 C m 2 + pmuludq %mm7, %mm4 C m 2 + movd 8(%eax), %mm3 C m 2 + jmp L(m10) C m 2 + ALIGN(16) C m 2 +L(lpm2): 
+ pmuludq %mm7, %mm4 C m 2 + paddq %mm0, %mm6 C m 2 + movd 8(%eax), %mm3 C m 2 + movd %mm6, -4(%edx) C m 2 + psrlq $32, %mm6 C m 2 +L(m10): pmuludq %mm7, %mm3 C m 2 + paddq %mm1, %mm6 C m 2 + movd 12(%eax), %mm0 C m 2 + movd %mm6, (%edx) C m 2 + psrlq $32, %mm6 C m 2 + pmuludq %mm7, %mm0 C m 2 + paddq %mm4, %mm6 C m 2 + movd 16(%eax), %mm1 C m 2 + movd %mm6, 4(%edx) C m 2 + psrlq $32, %mm6 C m 2 + pmuludq %mm7, %mm1 C m 2 + paddq %mm3, %mm6 C m 2 + movd 20(%eax), %mm4 C m 2 + movd %mm6, 8(%edx) C m 2 + psrlq $32, %mm6 C m 2 + lea 16(%eax), %eax C m 2 + lea 16(%edx), %edx C m 2 + sub $4, %ecx C m 2 + ja L(lpm2) C m 2 + pmuludq %mm7, %mm4 C m 2 + paddq %mm0, %mm6 C m 2 + movd %mm6, -4(%edx) C m 2 + psrlq $32, %mm6 C m 2 + paddq %mm1, %mm6 C m 2 + jmp L(1) + +L(3m): + movd (%eax), %mm0 C m 3 + lea (%ebx), %ecx C inner loop count m 3 + pmuludq %mm7, %mm0 C m 3 + movd 4(%eax), %mm1 C m 3 + pmuludq %mm7, %mm1 C m 3 + movd 8(%eax), %mm4 C m 3 + jmp L(lpm3) C m 3 + ALIGN(16) C m 3 +L(lpm3): + pmuludq %mm7, %mm4 C m 3 + paddq %mm0, %mm6 C m 3 + movd 12(%eax), %mm3 C m 3 + movd %mm6, (%edx) C m 3 + psrlq $32, %mm6 C m 3 + pmuludq %mm7, %mm3 C m 3 + paddq %mm1, %mm6 C m 3 + movd 16(%eax), %mm0 C m 3 + movd %mm6, 4(%edx) C m 3 + psrlq $32, %mm6 C m 3 + pmuludq %mm7, %mm0 C m 3 + paddq %mm4, %mm6 C m 3 + movd 20(%eax), %mm1 C m 3 + movd %mm6, 8(%edx) C m 3 + psrlq $32, %mm6 C m 3 + pmuludq %mm7, %mm1 C m 3 + paddq %mm3, %mm6 C m 3 + movd 24(%eax), %mm4 C m 3 + movd %mm6, 12(%edx) C m 3 + psrlq $32, %mm6 C m 3 + lea 16(%eax), %eax C m 3 + lea 16(%edx), %edx C m 3 + sub $4, %ecx C m 3 + ja L(lpm3) C m 3 + pmuludq %mm7, %mm4 C m 3 + paddq %mm0, %mm6 C m 3 + movd %mm6, (%edx) C m 3 + psrlq $32, %mm6 C m 3 + paddq %mm1, %mm6 C m 3 + jmp L(2) + +L(0m): + movd (%eax), %mm3 C m 0 + lea (%ebx), %ecx C inner loop count m 0 + pmuludq %mm7, %mm3 C m 0 + movd 4(%eax), %mm0 C m 0 + pmuludq %mm7, %mm0 C m 0 + movd 8(%eax), %mm1 C m 0 + jmp L(m00) C m 0 + ALIGN(16) C m 0 +L(lpm0): + pmuludq %mm7, %mm4 C m 0 + paddq %mm0, %mm6 C m 0 + movd (%eax), %mm3 C m 0 + movd %mm6, -12(%edx) C m 0 + psrlq $32, %mm6 C m 0 + pmuludq %mm7, %mm3 C m 0 + paddq %mm1, %mm6 C m 0 + movd 4(%eax), %mm0 C m 0 + movd %mm6, -8(%edx) C m 0 + psrlq $32, %mm6 C m 0 + pmuludq %mm7, %mm0 C m 0 + paddq %mm4, %mm6 C m 0 + movd 8(%eax), %mm1 C m 0 + movd %mm6, -4(%edx) C m 0 + psrlq $32, %mm6 C m 0 +L(m00): pmuludq %mm7, %mm1 C m 0 + paddq %mm3, %mm6 C m 0 + movd 12(%eax), %mm4 C m 0 + movd %mm6, (%edx) C m 0 + psrlq $32, %mm6 C m 0 + lea 16(%eax), %eax C m 0 + lea 16(%edx), %edx C m 0 + sub $4, %ecx C m 0 + ja L(lpm0) C m 0 + pmuludq %mm7, %mm4 C m 0 + paddq %mm0, %mm6 C m 0 + movd %mm6, -12(%edx) C m 0 + psrlq $32, %mm6 C m 0 + paddq %mm1, %mm6 C m 0 + jmp L(3) + +L(outer): + lea 8(%edi), %edi C rp += 2 + movd (%esi), %mm7 C am 3 + mov %edi, %edx C rp2 = rp am 3 + lea 4(%esi), %esi C up++ am 3 + lea (%esi), %eax C up2 = up am 3 + movd (%eax), %mm0 C am 3 + lea (%ebx), %ecx C inner loop count am 3 + pxor %mm6, %mm6 C am 3 + pmuludq %mm7, %mm0 C am 3 + movd 4(%eax), %mm1 C am 3 + movd (%edx), %mm4 C am 3 + pmuludq %mm7, %mm1 C am 3 + movd 8(%eax), %mm2 C am 3 + paddq %mm0, %mm4 C am 3 + movd 4(%edx), %mm5 C am 3 + jmp L(lam3) C am 3 + ALIGN(16) C am 3 +L(lam3): + pmuludq %mm7, %mm2 C am 3 + paddq %mm4, %mm6 C am 3 + movd 12(%eax), %mm3 C am 3 + paddq %mm1, %mm5 C am 3 + movd 8(%edx), %mm4 C am 3 + movd %mm6, (%edx) C am 3 + psrlq $32, %mm6 C am 3 + pmuludq %mm7, %mm3 C am 3 + paddq %mm5, %mm6 C am 3 + movd 16(%eax), %mm0 C am 3 + paddq %mm2, %mm4 C 
am 3 + movd 12(%edx), %mm5 C am 3 + movd %mm6, 4(%edx) C am 3 + psrlq $32, %mm6 C am 3 + pmuludq %mm7, %mm0 C am 3 + paddq %mm4, %mm6 C am 3 + movd 20(%eax), %mm1 C am 3 + paddq %mm3, %mm5 C am 3 + movd 16(%edx), %mm4 C am 3 + movd %mm6, 8(%edx) C am 3 + psrlq $32, %mm6 C am 3 + pmuludq %mm7, %mm1 C am 3 + paddq %mm5, %mm6 C am 3 + movd 24(%eax), %mm2 C am 3 + paddq %mm0, %mm4 C am 3 + movd 20(%edx), %mm5 C am 3 + movd %mm6, 12(%edx) C am 3 + psrlq $32, %mm6 C am 3 + lea 16(%eax), %eax C am 3 + lea 16(%edx), %edx C am 3 + sub $4, %ecx C am 3 + ja L(lam3) C am 3 + pmuludq %mm7, %mm2 C am 3 + paddq %mm4, %mm6 C am 3 + paddq %mm1, %mm5 C am 3 + movd 8(%edx), %mm4 C am 3 + movd %mm6, (%edx) C am 3 + psrlq $32, %mm6 C am 3 + paddq %mm5, %mm6 C am 3 + paddq %mm2, %mm4 C am 3 +L(2): movd %mm6, 4(%edx) C am 3 + psrlq $32, %mm6 C am 3 + paddq %mm4, %mm6 C am 3 + movd %mm6, 8(%edx) C am 3 + psrlq $32, %mm6 C am 3 + movd %mm6, 12(%edx) C am 3 + + lea 8(%edi), %edi C rp += 2 + movd (%esi), %mm7 C am 2 + mov %edi, %edx C rp2 = rp am 2 + lea 4(%esi), %esi C up++ am 2 + lea (%esi), %eax C up2 = up am 2 + movd (%eax), %mm1 C am 2 + lea (%ebx), %ecx C inner loop count am 2 + pxor %mm6, %mm6 C am 2 + pmuludq %mm7, %mm1 C am 2 + movd 4(%eax), %mm2 C am 2 + movd (%edx), %mm5 C am 2 + pmuludq %mm7, %mm2 C am 2 + movd 8(%eax), %mm3 C am 2 + paddq %mm1, %mm5 C am 2 + movd 4(%edx), %mm4 C am 2 + jmp L(am10) C am 2 + ALIGN(16) C am 2 +L(lam2): + pmuludq %mm7, %mm2 C am 2 + paddq %mm4, %mm6 C am 2 + movd 8(%eax), %mm3 C am 2 + paddq %mm1, %mm5 C am 2 + movd 4(%edx), %mm4 C am 2 + movd %mm6, -4(%edx) C am 2 + psrlq $32, %mm6 C am 2 +L(am10): + pmuludq %mm7, %mm3 C am 2 + paddq %mm5, %mm6 C am 2 + movd 12(%eax), %mm0 C am 2 + paddq %mm2, %mm4 C am 2 + movd 8(%edx), %mm5 C am 2 + movd %mm6, (%edx) C am 2 + psrlq $32, %mm6 C am 2 + pmuludq %mm7, %mm0 C am 2 + paddq %mm4, %mm6 C am 2 + movd 16(%eax), %mm1 C am 2 + paddq %mm3, %mm5 C am 2 + movd 12(%edx), %mm4 C am 2 + movd %mm6, 4(%edx) C am 2 + psrlq $32, %mm6 C am 2 + pmuludq %mm7, %mm1 C am 2 + paddq %mm5, %mm6 C am 2 + movd 20(%eax), %mm2 C am 2 + paddq %mm0, %mm4 C am 2 + movd 16(%edx), %mm5 C am 2 + movd %mm6, 8(%edx) C am 2 + psrlq $32, %mm6 C am 2 + lea 16(%eax), %eax C am 2 + lea 16(%edx), %edx C am 2 + sub $4, %ecx C am 2 + ja L(lam2) C am 2 + pmuludq %mm7, %mm2 C am 2 + paddq %mm4, %mm6 C am 2 + paddq %mm1, %mm5 C am 2 + movd 4(%edx), %mm4 C am 2 + movd %mm6, -4(%edx) C am 2 + psrlq $32, %mm6 C am 2 + paddq %mm5, %mm6 C am 2 + paddq %mm2, %mm4 C am 2 +L(1): movd %mm6, (%edx) C am 2 + psrlq $32, %mm6 C am 2 + paddq %mm4, %mm6 C am 2 + movd %mm6, 4(%edx) C am 2 + psrlq $32, %mm6 C am 2 + movd %mm6, 8(%edx) C am 2 + + lea 8(%edi), %edi C rp += 2 + movd (%esi), %mm7 C am 1 + mov %edi, %edx C rp2 = rp am 1 + lea 4(%esi), %esi C up++ am 1 + lea (%esi), %eax C up2 = up am 1 + movd (%eax), %mm2 C am 1 + lea (%ebx), %ecx C inner loop count am 1 + pxor %mm6, %mm6 C am 1 + pmuludq %mm7, %mm2 C am 1 + movd 4(%eax), %mm3 C am 1 + movd (%edx), %mm4 C am 1 + pmuludq %mm7, %mm3 C am 1 + movd 8(%eax), %mm0 C am 1 + paddq %mm2, %mm4 C am 1 + movd 4(%edx), %mm5 C am 1 + jmp L(am01) C am 1 + ALIGN(16) C am 1 +L(lam1): + pmuludq %mm7, %mm2 C am 1 + paddq %mm4, %mm6 C am 1 + movd 4(%eax), %mm3 C am 1 + paddq %mm1, %mm5 C am 1 + movd (%edx), %mm4 C am 1 + movd %mm6, -8(%edx) C am 1 + psrlq $32, %mm6 C am 1 + pmuludq %mm7, %mm3 C am 1 + paddq %mm5, %mm6 C am 1 + movd 8(%eax), %mm0 C am 1 + paddq %mm2, %mm4 C am 1 + movd 4(%edx), %mm5 C am 1 + movd %mm6, -4(%edx) C am 1 + psrlq $32, 
%mm6 C am 1 +L(am01): + pmuludq %mm7, %mm0 C am 1 + paddq %mm4, %mm6 C am 1 + movd 12(%eax), %mm1 C am 1 + paddq %mm3, %mm5 C am 1 + movd 8(%edx), %mm4 C am 1 + movd %mm6, (%edx) C am 1 + psrlq $32, %mm6 C am 1 + pmuludq %mm7, %mm1 C am 1 + paddq %mm5, %mm6 C am 1 + movd 16(%eax), %mm2 C am 1 + paddq %mm0, %mm4 C am 1 + movd 12(%edx), %mm5 C am 1 + movd %mm6, 4(%edx) C am 1 + psrlq $32, %mm6 C am 1 + lea 16(%eax), %eax C am 1 + lea 16(%edx), %edx C am 1 + sub $4, %ecx C am 1 + ja L(lam1) C am 1 + pmuludq %mm7, %mm2 C am 1 + paddq %mm4, %mm6 C am 1 + paddq %mm1, %mm5 C am 1 + movd (%edx), %mm4 C am 1 + movd %mm6, -8(%edx) C am 1 + psrlq $32, %mm6 C am 1 + paddq %mm5, %mm6 C am 1 + paddq %mm2, %mm4 C am 1 +L(0): movd %mm6, -4(%edx) C am 1 + psrlq $32, %mm6 C am 1 + paddq %mm4, %mm6 C am 1 + movd %mm6, (%edx) C am 1 + psrlq $32, %mm6 C am 1 + movd %mm6, 4(%edx) C am 1 + + lea 8(%edi), %edi C rp += 2 + movd (%esi), %mm7 C am 0 + mov %edi, %edx C rp2 = rp am 0 + lea 4(%esi), %esi C up++ am 0 + lea (%esi), %eax C up2 = up am 0 + movd (%eax), %mm3 C am 0 + lea (%ebx), %ecx C inner loop count am 0 + pxor %mm6, %mm6 C am 0 + pmuludq %mm7, %mm3 C am 0 + movd 4(%eax), %mm0 C am 0 + movd (%edx), %mm5 C am 0 + pmuludq %mm7, %mm0 C am 0 + movd 8(%eax), %mm1 C am 0 + paddq %mm3, %mm5 C am 0 + movd 4(%edx), %mm4 C am 0 + jmp L(am00) C am 0 + ALIGN(16) C am 0 +L(lam0): + pmuludq %mm7, %mm2 C am 0 + paddq %mm4, %mm6 C am 0 + movd (%eax), %mm3 C am 0 + paddq %mm1, %mm5 C am 0 + movd -4(%edx), %mm4 C am 0 + movd %mm6, -12(%edx) C am 0 + psrlq $32, %mm6 C am 0 + pmuludq %mm7, %mm3 C am 0 + paddq %mm5, %mm6 C am 0 + movd 4(%eax), %mm0 C am 0 + paddq %mm2, %mm4 C am 0 + movd (%edx), %mm5 C am 0 + movd %mm6, -8(%edx) C am 0 + psrlq $32, %mm6 C am 0 + pmuludq %mm7, %mm0 C am 0 + paddq %mm4, %mm6 C am 0 + movd 8(%eax), %mm1 C am 0 + paddq %mm3, %mm5 C am 0 + movd 4(%edx), %mm4 C am 0 + movd %mm6, -4(%edx) C am 0 + psrlq $32, %mm6 C am 0 +L(am00): + pmuludq %mm7, %mm1 C am 0 + paddq %mm5, %mm6 C am 0 + movd 12(%eax), %mm2 C am 0 + paddq %mm0, %mm4 C am 0 + movd 8(%edx), %mm5 C am 0 + movd %mm6, (%edx) C am 0 + psrlq $32, %mm6 C am 0 + lea 16(%eax), %eax C am 0 + lea 16(%edx), %edx C am 0 + sub $4, %ecx C am 0 + ja L(lam0) C am 0 + pmuludq %mm7, %mm2 C am 0 + paddq %mm4, %mm6 C am 0 + paddq %mm1, %mm5 C am 0 + movd -4(%edx), %mm4 C am 0 + movd %mm6, -12(%edx) C am 0 + psrlq $32, %mm6 C am 0 + paddq %mm5, %mm6 C am 0 + paddq %mm2, %mm4 C am 0 +L(3): movd %mm6, -8(%edx) C am 0 + psrlq $32, %mm6 C am 0 + paddq %mm4, %mm6 C am 0 + movd %mm6, -4(%edx) C am 0 + psrlq $32, %mm6 C am 0 + movd %mm6, (%edx) C am 0 + sub $4, %ebx C am 0 + ja L(outer) C am 0 + + mov %edi, %edx + mov %esi, %eax + pop %edi + pop %ebx + pop %esi + +L(am3): C up[un-1..un-3] x up[un-4] + lea 8(%edx), %edx C rp2 += 2 + movd (%eax), %mm7 + movd 4(%eax), %mm1 + movd 8(%eax), %mm2 + movd 12(%eax), %mm3 + movd (%edx), %mm4 + pmuludq %mm7, %mm1 + movd 4(%edx), %mm5 + pmuludq %mm7, %mm2 + movd 8(%edx), %mm6 + pmuludq %mm7, %mm3 + paddq %mm1, %mm4 + paddq %mm2, %mm5 + paddq %mm3, %mm6 + movd %mm4, (%edx) + psrlq $32, %mm4 + paddq %mm5, %mm4 + movd %mm4, 4(%edx) + psrlq $32, %mm4 + paddq %mm6, %mm4 + movd %mm4, 8(%edx) + psrlq $32, %mm4 + movd %mm4, 12(%edx) C FIXME feed through! 
+ lea 4(%eax), %eax + +L(am2): C up[un-1..un-2] x up[un-3] + lea 8(%edx), %edx C rp2 += 2 + movd (%eax), %mm7 + movd 4(%eax), %mm1 + movd 8(%eax), %mm2 + movd (%edx), %mm4 + movd 4(%edx), %mm5 + pmuludq %mm7, %mm1 + pmuludq %mm7, %mm2 + paddq %mm1, %mm4 + paddq %mm2, %mm5 + movd %mm4, (%edx) + psrlq $32, %mm4 + paddq %mm5, %mm4 + movd %mm4, 4(%edx) + psrlq $32, %mm4 + movd %mm4, 8(%edx) C FIXME feed through! + lea 4(%eax), %eax + +L(am1): C up[un-1] x up[un-2] + lea 8(%edx), %edx C rp2 += 2 + movd (%eax), %mm7 + movd 4(%eax), %mm2 + movd (%edx), %mm4 + pmuludq %mm7, %mm2 + paddq %mm2, %mm4 + movd %mm4, (%edx) + psrlq $32, %mm4 + movd %mm4, 4(%edx) + +C *** diag stuff, use elementary code for now + + mov 4(%esp), %edx C rp + mov 8(%esp), %eax C up + mov 12(%esp), %ecx C un + + movd (%eax), %mm2 + pmuludq %mm2, %mm2 C src[0]^2 + + pcmpeqd %mm7, %mm7 + psrlq $32, %mm7 + + movd 4(%edx), %mm3 C dst[1] + + movd %mm2, (%edx) + psrlq $32, %mm2 + + psllq $1, %mm3 C 2*dst[1] + paddq %mm3, %mm2 + movd %mm2, 4(%edx) + psrlq $32, %mm2 + + sub $2, %ecx + +L(diag): + movd 4(%eax), %mm0 C src limb + add $4, %eax + pmuludq %mm0, %mm0 + movq %mm7, %mm1 + pand %mm0, %mm1 C diagonal low + psrlq $32, %mm0 C diagonal high + + movd 8(%edx), %mm3 + psllq $1, %mm3 C 2*dst[i] + paddq %mm3, %mm1 + paddq %mm1, %mm2 + movd %mm2, 8(%edx) + psrlq $32, %mm2 + + movd 12(%edx), %mm3 + psllq $1, %mm3 C 2*dst[i+1] + paddq %mm3, %mm0 + paddq %mm0, %mm2 + movd %mm2, 12(%edx) + add $8, %edx + psrlq $32, %mm2 + + sub $1, %ecx + jnz L(diag) + + movd 4(%eax), %mm0 C src[size-1] + pmuludq %mm0, %mm0 + pand %mm0, %mm7 C diagonal low + psrlq $32, %mm0 C diagonal high + + movd 8(%edx), %mm3 C dst[2*size-2] + psllq $1, %mm3 + paddq %mm3, %mm7 + paddq %mm7, %mm2 + movd %mm2, 8(%edx) + psrlq $32, %mm2 + + paddq %mm0, %mm2 + movd %mm2, 12(%edx) C dst[2*size-1] + + emms + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/sub_n.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/sub_n.asm new file mode 100644 index 0000000..5ba1c01 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/sub_n.asm @@ -0,0 +1,119 @@ +dnl Intel Pentium-4 mpn_sub_n -- mpn subtraction. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C dst!=src1,2 dst==src1 dst==src2 +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) ? +C P4 model 0-1 (Willamette) ? 
+C P4 model 2 (Northwood) 4 6 6 +C P4 model 3-4 (Prescott) 4.25 7.5 7.5 + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(SAVE_EBX,`PARAM_SRC1') + + TEXT + ALIGN(8) + +PROLOGUE(mpn_sub_nc) +deflit(`FRAME',0) + movd PARAM_CARRY, %mm0 + jmp L(start_nc) +EPILOGUE() + + ALIGN(8) +PROLOGUE(mpn_sub_n) +deflit(`FRAME',0) + pxor %mm0, %mm0 +L(start_nc): + mov PARAM_SRC1, %eax + mov %ebx, SAVE_EBX + mov PARAM_SRC2, %ebx + mov PARAM_DST, %edx + mov PARAM_SIZE, %ecx + + lea (%eax,%ecx,4), %eax C src1 end + lea (%ebx,%ecx,4), %ebx C src2 end + lea (%edx,%ecx,4), %edx C dst end + neg %ecx C -size + +L(top): + C eax src1 end + C ebx src2 end + C ecx counter, limbs, negative + C edx dst end + C mm0 carry bit + + movd (%eax,%ecx,4), %mm1 + movd (%ebx,%ecx,4), %mm2 + psubq %mm2, %mm1 + + psubq %mm0, %mm1 + movd %mm1, (%edx,%ecx,4) + + psrlq $63, %mm1 + + add $1, %ecx + jz L(done_mm1) + + movd (%eax,%ecx,4), %mm0 + movd (%ebx,%ecx,4), %mm2 + psubq %mm2, %mm0 + + psubq %mm1, %mm0 + movd %mm0, (%edx,%ecx,4) + + psrlq $63, %mm0 + + add $1, %ecx + jnz L(top) + + movd %mm0, %eax + mov SAVE_EBX, %ebx + emms + ret + +L(done_mm1): + movd %mm1, %eax + mov SAVE_EBX, %ebx + emms + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm new file mode 100644 index 0000000..020675b --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm @@ -0,0 +1,182 @@ +dnl Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright 2001, 2002, 2008, 2010 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P6 model 0-8,10-12 - +C P6 model 9 (Banias) 6.8 +C P6 model 13 (Dothan) 6.9 +C P4 model 0-1 (Willamette) ? +C P4 model 2 (Northwood) 5.87 +C P4 model 3-4 (Prescott) 6.5 + +C This code represents a step forwards compared to the code available before +C GMP 5.1, but it is not carefully tuned for either P6 or P4. In fact, it is +C not good for P6. For P4 it saved a bit over 1 c/l for both Northwood and +C Prescott compared to the old code. +C +C The arrangements made here to get a two instruction dependent chain are +C slightly subtle. In the loop the carry (or borrow rather) is a negative so +C that a paddq can be used to give a low limb ready to store, and a high limb +C ready to become the new carry after a psrlq. 
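+
+C Concretely, one step of the loop below computes the following (an
+C illustrative C sketch of the arrangement, not GMP code; 32-bit limbs
+C assumed, v being the multiplier limb -- the next paragraphs explain
+C why the offsets are chosen this way):
+C
+C    /* c starts as 0xFFFFFFFF minus the incoming borrow; at the end the
+C       true borrow is recovered as ~(uint32_t) c, matching the final
+C       "not %eax".  */
+C    uint64_t t = (uint64_t) rp[i] + 0xFFFFFFFE00000001  /* net offset */
+C                 - (uint64_t) up[i] * v                 /* product    */
+C                 + c;       /* c = 0xFFFFFFFF + (negative) real borrow */
+C    rp[i] = (uint32_t) t;   /* low limb ready to store */
+C    c = t >> 32;            /* new offset borrow; always nonnegative,
+C                               so psrlq shifting in 0 bits is correct */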
+C +C If the carry was a simple twos complement negative then the psrlq shift would +C need to bring in 0 bits or 1 bits according to whether the high was zero or +C non-zero, since a non-zero value would represent a negative needing sign +C extension. That wouldn't be particularly easy to arrange and certainly would +C add an instruction to the dependent chain, so instead an offset is applied so +C that the high limb will be 0xFFFFFFFF+c. With c in the range -0xFFFFFFFF to +C 0, the value 0xFFFFFFFF+c is in the range 0 to 0xFFFFFFFF and is therefore +C always positive and can always have 0 bits shifted in, which is what psrlq +C does. +C +C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be +C done off the dependent chain. The total adjustment then is to add +C 0xFFFFFFFF00000000 to offset the new carry, and subtract 0x00000000FFFFFFFF +C to remove the offset from the current carry, for a net add of +C 0xFFFFFFFE00000001. In the code this is applied to the destination limb when +C fetched. +C +C It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement +C negative, which is how it's undone for the return value, but that doesn't +C seem as clear. + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(16) + +PROLOGUE(mpn_submul_1c) +deflit(`FRAME',0) + movd PARAM_CARRY, %mm1 + jmp L(start_1c) +EPILOGUE() + +PROLOGUE(mpn_submul_1) +deflit(`FRAME',0) + pxor %mm1, %mm1 C initial borrow + +L(start_1c): + mov PARAM_SRC, %eax + pcmpeqd %mm0, %mm0 + + movd PARAM_MULTIPLIER, %mm7 + pcmpeqd %mm6, %mm6 + + mov PARAM_DST, %edx + psrlq $32, %mm0 C 0x00000000FFFFFFFF + + mov PARAM_SIZE, %ecx + psllq $32, %mm6 C 0xFFFFFFFF00000000 + + psubq %mm0, %mm6 C 0xFFFFFFFE00000001 + + psubq %mm1, %mm0 C 0xFFFFFFFF - borrow + + + movd (%eax), %mm3 C up + movd (%edx), %mm4 C rp + + add $-1, %ecx + paddq %mm6, %mm4 C add 0xFFFFFFFE00000001 + pmuludq %mm7, %mm3 + jnz L(gt1) + psubq %mm3, %mm4 C prod + paddq %mm4, %mm0 C borrow + movd %mm0, (%edx) C result + jmp L(rt) + +L(gt1): movd 4(%eax), %mm1 C up + movd 4(%edx), %mm2 C rp + + add $-1, %ecx + jz L(eev) + + ALIGN(16) +L(top): paddq %mm6, %mm2 C add 0xFFFFFFFE00000001 + pmuludq %mm7, %mm1 + psubq %mm3, %mm4 C prod + movd 8(%eax), %mm3 C up + paddq %mm4, %mm0 C borrow + movd 8(%edx), %mm4 C rp + movd %mm0, (%edx) C result + psrlq $32, %mm0 + + add $-1, %ecx + jz L(eod) + + paddq %mm6, %mm4 C add 0xFFFFFFFE00000001 + pmuludq %mm7, %mm3 + psubq %mm1, %mm2 C prod + movd 12(%eax), %mm1 C up + paddq %mm2, %mm0 C borrow + movd 12(%edx), %mm2 C rp + movd %mm0, 4(%edx) C result + psrlq $32, %mm0 + + lea 8(%eax), %eax + lea 8(%edx), %edx + add $-1, %ecx + jnz L(top) + + +L(eev): paddq %mm6, %mm2 C add 0xFFFFFFFE00000001 + pmuludq %mm7, %mm1 + psubq %mm3, %mm4 C prod + paddq %mm4, %mm0 C borrow + movd %mm0, (%edx) C result + psrlq $32, %mm0 + psubq %mm1, %mm2 C prod + paddq %mm2, %mm0 C borrow + movd %mm0, 4(%edx) C result +L(rt): psrlq $32, %mm0 + movd %mm0, %eax + not %eax + emms + ret + +L(eod): paddq %mm6, %mm4 C add 0xFFFFFFFE00000001 + pmuludq %mm7, %mm3 + psubq %mm1, %mm2 C prod + paddq %mm2, %mm0 C borrow + movd %mm0, 4(%edx) C result + psrlq $32, %mm0 + psubq %mm3, %mm4 C prod + paddq %mm4, %mm0 C borrow + movd %mm0, 8(%edx) C result + jmp L(rt) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/rshift.asm b/gmp-6.3.0/mpn/x86/rshift.asm new file mode 100644 index 0000000..a60dcaa --- /dev/null +++ b/gmp-6.3.0/mpn/x86/rshift.asm @@ -0,0 +1,108 
@@ +dnl x86 mpn_rshift -- mpn right shift. + +dnl Copyright 1992, 1994, 1996, 1999-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P54 7.5 +C P55 7.0 +C P6 2.5 +C K6 4.5 +C K7 5.0 +C P4 16.5 + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_rshift) + + pushl %edi + pushl %esi + pushl %ebx +deflit(`FRAME',12) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%edx + movl PARAM_SHIFT,%ecx + + leal -4(%edi,%edx,4),%edi + leal (%esi,%edx,4),%esi + negl %edx + + movl (%esi,%edx,4),%ebx C read least significant limb + xorl %eax,%eax + shrdl( %cl, %ebx, %eax) C compute carry limb + incl %edx + jz L(end) + pushl %eax C push carry limb onto stack + testb $1,%dl + jnz L(1) C enter loop in the middle + movl %ebx,%eax + + ALIGN(8) +L(oop): movl (%esi,%edx,4),%ebx C load next higher limb + shrdl( %cl, %ebx, %eax) C compute result limb + movl %eax,(%edi,%edx,4) C store it + incl %edx +L(1): movl (%esi,%edx,4),%eax + shrdl( %cl, %eax, %ebx) + movl %ebx,(%edi,%edx,4) + incl %edx + jnz L(oop) + + shrl %cl,%eax C compute most significant limb + movl %eax,(%edi) C store it + + popl %eax C pop carry limb + + popl %ebx + popl %esi + popl %edi + ret + +L(end): shrl %cl,%ebx C compute most significant limb + movl %ebx,(%edi) C store it + + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/sec_tabselect.asm b/gmp-6.3.0/mpn/x86/sec_tabselect.asm new file mode 100644 index 0000000..d9d9952 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/sec_tabselect.asm @@ -0,0 +1,106 @@ +dnl x86 mpn_sec_tabselect. + +dnl Copyright 2011, 2021 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C P5 ? +C P6 model 0-8,10-12 ? +C P6 model 9 (Banias) ? +C P6 model 13 (Dothan) ? +C P4 model 0 (Willamette) ? +C P4 model 1 (?) ? +C P4 model 2 (Northwood) 4.5 +C P4 model 3 (Prescott) ? +C P4 model 4 (Nocona) ? +C Intel Atom ? +C AMD K6 ? +C AMD K7 3.4 +C AMD K8 ? +C AMD K10 ? + +C NOTES +C * This has not been tuned for any specific processor. Its speed should not +C be too bad, though. +C * Using SSE2 could result in many-fold speedup. + +C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which) +define(`rp', `%edi') +define(`tp', `%esi') +define(`n', `%ebx') +define(`nents', `32(%esp)') +define(`which', `36(%esp)') + +define(`i', `%ebp') +define(`mask', `%ecx') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sec_tabselect) + push %edi + push %esi + push %ebx + push %ebp + mov 20(%esp), rp + mov 24(%esp), tp + mov 28(%esp), n + + lea (rp,n,4), rp + lea (tp,n,4), tp +L(outer): + subl $1, which + sbb mask, mask + + mov n, i + neg i + + ALIGN(16) +L(top): mov (tp,i,4), %eax + mov (rp,i,4), %edx + xor %edx, %eax + and mask, %eax + xor %edx, %eax + mov %eax, (rp,i,4) + inc i + js L(top) + +L(end): lea (tp,n,4), tp + decl nents + jne L(outer) + +L(outer_end): + pop %ebp + pop %ebx + pop %esi + pop %edi + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/silvermont/gmp-mparam.h b/gmp-6.3.0/mpn/x86/silvermont/gmp-mparam.h new file mode 100644 index 0000000..e9f1d8f --- /dev/null +++ b/gmp-6.3.0/mpn/x86/silvermont/gmp-mparam.h @@ -0,0 +1,222 @@ +/* Intel Silvermont/32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 2400 MHz Intel Atom C2758 Silvermont/Rangeley */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-30, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 5 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 9 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 16 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 64.62% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 32 + +#define DIV_1_VS_MUL_1_PERCENT 204 + +#define MUL_TOOM22_THRESHOLD 26 +#define MUL_TOOM33_THRESHOLD 105 +#define MUL_TOOM44_THRESHOLD 236 +#define MUL_TOOM6H_THRESHOLD 351 +#define MUL_TOOM8H_THRESHOLD 502 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 105 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 163 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 137 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 174 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 215 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 36 +#define SQR_TOOM3_THRESHOLD 138 +#define SQR_TOOM4_THRESHOLD 360 +#define SQR_TOOM6_THRESHOLD 494 +#define SQR_TOOM8_THRESHOLD 620 + +#define MULMID_TOOM42_THRESHOLD 58 + +#define MULMOD_BNM1_THRESHOLD 15 +#define SQRMOD_BNM1_THRESHOLD 19 + +#define MUL_FFT_MODF_THRESHOLD 460 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 460, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47, 8}, { 95,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 143, 9}, \ + { 287,10}, { 159,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671,10}, { 351, 9}, { 703,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 415, 9}, { 831,12}, \ + { 127,11}, { 255,10}, { 543, 9}, { 1087,11}, \ + { 287,10}, { 607, 9}, { 1215,11}, { 319,10}, \ + { 671,11}, { 351,10}, { 735,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,10}, { 831,13}, \ + { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \ + { 863,10}, { 1727,12}, { 447,11}, { 959,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \ + { 1535,12}, { 831,11}, { 1727,10}, { 3455,12}, \ + { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1471,11}, \ + { 2943,13}, { 767,12}, { 1727,11}, { 3455,13}, \ + { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \ + { 2111,13}, { 1151,12}, { 2431,13}, { 1407,12}, \ + { 2943,14}, { 767,13}, { 1663,12}, { 3455,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 
2175,12}, \ + { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \ + { 3967,12}, { 7935,15}, { 1023,14}, { 2047,13}, \ + { 4479,14}, { 2303,13}, { 4991,12}, { 9983,14}, \ + { 2815,13}, { 5887,15}, { 1535,14}, { 3839,13}, \ + { 7935,16} } +#define MUL_FFT_TABLE3_SIZE 177 +#define MUL_FFT_THRESHOLD 4544 + +#define SQR_FFT_MODF_THRESHOLD 400 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 400, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 13, 5}, { 28, 6}, \ + { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \ + { 28, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 127,10}, \ + { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 143, 9}, { 287, 8}, \ + { 575,10}, { 159,11}, { 95,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \ + { 351, 9}, { 735,11}, { 191,10}, { 383, 9}, \ + { 799,10}, { 415, 9}, { 831,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 543, 9}, { 1087,11}, \ + { 287,10}, { 607, 9}, { 1215,11}, { 319,10}, \ + { 671,11}, { 351,10}, { 735, 9}, { 1471,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 863,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,11}, { 607,10}, { 1215,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,10}, { 1471,12}, \ + { 383,11}, { 863,10}, { 1727,12}, { 447,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,10}, { 2431,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 831,11}, { 1727,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \ + { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2111,13}, \ + { 1151,12}, { 2431,13}, { 1407,12}, { 2943,14}, \ + { 767,13}, { 1663,12}, { 3455,13}, { 1919,15}, \ + { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \ + { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \ + { 1535,13}, { 3455,14}, { 1791,13}, { 3967,15}, \ + { 1023,14}, { 2047,13}, { 4479,14}, { 2303,13}, \ + { 4991,12}, { 9983,14}, { 2815,13}, { 5887,15}, \ + { 1535,14}, { 3839,13}, { 7679,16} } +#define SQR_FFT_TABLE3_SIZE 175 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 56 +#define MULLO_MUL_N_THRESHOLD 8907 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 137 +#define SQRLO_SQR_THRESHOLD 7373 + +#define DC_DIV_QR_THRESHOLD 76 +#define DC_DIVAPPR_Q_THRESHOLD 336 +#define DC_BDIV_QR_THRESHOLD 66 +#define DC_BDIV_Q_THRESHOLD 218 + +#define INV_MULMOD_BNM1_THRESHOLD 50 +#define INV_NEWTON_THRESHOLD 345 +#define INV_APPR_THRESHOLD 342 + +#define BINV_NEWTON_THRESHOLD 366 +#define REDC_1_TO_REDC_N_THRESHOLD 91 + +#define MU_DIV_QR_THRESHOLD 1652 +#define MU_DIVAPPR_Q_THRESHOLD 1858 +#define MUPI_DIV_QR_THRESHOLD 171 +#define MU_BDIV_QR_THRESHOLD 1442 +#define MU_BDIV_Q_THRESHOLD 1830 + +#define POWM_SEC_TABLE 3,17,102,404,1185 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 21 +#define SET_STR_DC_THRESHOLD 272 +#define SET_STR_PRECOMPUTE_THRESHOLD 788 + +#define FAC_DSC_THRESHOLD 132 
+#define FAC_ODD_THRESHOLD 34 + +#define MATRIX22_STRASSEN_THRESHOLD 19 +#define HGCD2_DIV1_METHOD 1 /* 0.59% faster than 3 */ +#define HGCD_THRESHOLD 142 +#define HGCD_APPR_THRESHOLD 181 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 492 +#define GCDEXT_DC_THRESHOLD 365 +#define JACOBI_BASE_METHOD 1 /* 0.41% faster than 2 */ + +/* Tuneup completed successfully, took 147027 seconds */ diff --git a/gmp-6.3.0/mpn/x86/skylake/gmp-mparam.h b/gmp-6.3.0/mpn/x86/skylake/gmp-mparam.h new file mode 100644 index 0000000..fb87957 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/skylake/gmp-mparam.h @@ -0,0 +1,211 @@ +/* x86/skylake gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 3600-4000 MHz Intel Xeon E3-1270v5 Skylake */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-21, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 15 +#define MOD_1_UNNORM_THRESHOLD 16 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 10 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 10 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 5.63% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 12 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 17 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 18 + +#define DIV_1_VS_MUL_1_PERCENT 348 + +#define MUL_TOOM22_THRESHOLD 24 +#define MUL_TOOM33_THRESHOLD 81 +#define MUL_TOOM44_THRESHOLD 208 +#define MUL_TOOM6H_THRESHOLD 303 +#define MUL_TOOM8H_THRESHOLD 454 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 149 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 137 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 145 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 196 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 40 +#define SQR_TOOM3_THRESHOLD 129 +#define SQR_TOOM4_THRESHOLD 220 +#define SQR_TOOM6_THRESHOLD 354 +#define SQR_TOOM8_THRESHOLD 608 + +#define MULMID_TOOM42_THRESHOLD 72 + +#define MULMOD_BNM1_THRESHOLD 17 +#define SQRMOD_BNM1_THRESHOLD 21 + +#define MUL_FFT_MODF_THRESHOLD 530 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 530, 5}, { 29, 6}, { 15, 5}, { 31, 6}, \ + { 29, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ + { 36, 7}, { 19, 6}, { 39, 7}, { 21, 6}, \ + { 43, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 
43, 8}, \ + { 23, 7}, { 49, 8}, { 27, 7}, { 55, 9}, \ + { 15, 8}, { 31, 7}, { 63, 8}, { 43, 9}, \ + { 23, 8}, { 51, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 83, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 159,10}, { 95, 9}, { 191,10}, \ + { 111,11}, { 63,10}, { 143, 9}, { 287,10}, \ + { 159,11}, { 95,10}, { 191,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,10}, { 287,11}, { 159,10}, { 351,11}, \ + { 191,10}, { 415,12}, { 127,11}, { 255,10}, \ + { 543,11}, { 287,10}, { 607,11}, { 319,10}, \ + { 671,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,13}, { 127,12}, { 255,11}, \ + { 543,10}, { 1087,11}, { 607,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,12}, { 383,11}, \ + { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \ + { 703,13}, { 383,12}, { 767,11}, { 1599,12}, \ + { 831,11}, { 1727,12}, { 959,14}, { 255,13}, \ + { 511,12}, { 1087,11}, { 2239,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1471,13}, { 767,12}, \ + { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \ + { 1023,12}, { 2239,13}, { 1151,12}, { 2431,13}, \ + { 1279,12}, { 2623,13}, { 1407,12}, { 2815,14}, \ + { 767,13}, { 1663,12}, { 3455,13}, { 1919,15}, \ + { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \ + { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \ + { 1535,13}, { 3455,14}, { 1791,13}, { 3967,15}, \ + { 1023,14}, { 2047,13}, { 4479,14}, { 2303,13}, \ + { 4991,12}, { 9983,14}, { 2815,13}, { 5887,15}, \ + { 1535,14}, { 3839,16} } +#define MUL_FFT_TABLE3_SIZE 154 +#define MUL_FFT_THRESHOLD 6784 + +#define SQR_FFT_MODF_THRESHOLD 460 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 460, 5}, { 29, 6}, { 15, 5}, { 31, 6}, \ + { 29, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ + { 36, 7}, { 19, 6}, { 39, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 49, 8}, { 27, 7}, { 55, 9}, \ + { 15, 8}, { 31, 7}, { 63, 8}, { 43, 9}, \ + { 23, 8}, { 55,10}, { 15, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 143, 9}, \ + { 287,10}, { 159,11}, { 95,12}, { 63,11}, \ + { 127,10}, { 271, 9}, { 543,10}, { 287,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 351,11}, \ + { 191,10}, { 415,12}, { 127,11}, { 255,10}, \ + { 543,11}, { 287,10}, { 575,11}, { 319,10}, \ + { 671,11}, { 351,10}, { 703,12}, { 191,11}, \ + { 383,10}, { 799,11}, { 415,10}, { 831,13}, \ + { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \ + { 543,10}, { 1087,11}, { 607,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,12}, { 383,11}, \ + { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \ + { 927,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1407,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1727,12}, { 895,11}, \ + { 1791,14}, { 255,13}, { 511,12}, { 1087,11}, \ + { 2239,12}, { 1215,13}, { 639,12}, { 1471,13}, \ + { 767,12}, { 1727,13}, { 895,12}, { 1919,14}, \ + { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \ + { 2431,13}, { 1279,12}, { 2623,13}, { 1407,12}, \ + { 2815,14}, { 767,13}, { 1663,12}, { 3455,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \ + { 3967,15}, { 1023,14}, { 2047,13}, { 4479,14}, \ 
+ { 2303,13}, { 4991,12}, { 9983,14}, { 2815,13}, \ + { 5887,15}, { 1535,14}, { 3839,16} } +#define SQR_FFT_TABLE3_SIZE 155 +#define SQR_FFT_THRESHOLD 5568 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 68 +#define MULLO_MUL_N_THRESHOLD 13555 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 117 +#define SQRLO_SQR_THRESHOLD 10988 + +#define DC_DIV_QR_THRESHOLD 42 +#define DC_DIVAPPR_Q_THRESHOLD 163 +#define DC_BDIV_QR_THRESHOLD 66 +#define DC_BDIV_Q_THRESHOLD 160 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define INV_NEWTON_THRESHOLD 165 +#define INV_APPR_THRESHOLD 157 + +#define BINV_NEWTON_THRESHOLD 300 +#define REDC_1_TO_REDC_N_THRESHOLD 68 + +#define MU_DIV_QR_THRESHOLD 1718 +#define MU_DIVAPPR_Q_THRESHOLD 1685 +#define MUPI_DIV_QR_THRESHOLD 62 +#define MU_BDIV_QR_THRESHOLD 1589 +#define MU_BDIV_Q_THRESHOLD 1830 + +#define POWM_SEC_TABLE 1,17,129,547,1317 + +#define GET_STR_DC_THRESHOLD 10 +#define GET_STR_PRECOMPUTE_THRESHOLD 16 +#define SET_STR_DC_THRESHOLD 354 +#define SET_STR_PRECOMPUTE_THRESHOLD 860 + +#define FAC_DSC_THRESHOLD 141 +#define FAC_ODD_THRESHOLD 34 + +#define MATRIX22_STRASSEN_THRESHOLD 20 +#define HGCD2_DIV1_METHOD 5 /* 1.04% faster than 3 */ +#define HGCD_THRESHOLD 114 +#define HGCD_APPR_THRESHOLD 132 +#define HGCD_REDUCE_THRESHOLD 3524 +#define GCD_DC_THRESHOLD 474 +#define GCDEXT_DC_THRESHOLD 379 +#define JACOBI_BASE_METHOD 1 /* 27.39% faster than 4 */ + +/* Tuneup completed successfully, took 31721 seconds */ diff --git a/gmp-6.3.0/mpn/x86/sqr_basecase.asm b/gmp-6.3.0/mpn/x86/sqr_basecase.asm new file mode 100644 index 0000000..39f8a89 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/sqr_basecase.asm @@ -0,0 +1,359 @@ +dnl x86 generic mpn_sqr_basecase -- square an mpn number. + +dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + + +C cycles/crossproduct cycles/triangleproduct +C P5 +C P6 +C K6 +C K7 +C P4 + + +C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a +C lot of function call overheads are avoided, especially when the size is +C small. +C +C The mul1 loop is not unrolled like mul_1.asm, it doesn't seem worth the +C code size to do so here. +C +C Enhancements: +C +C The addmul loop here is also not unrolled like aorsmul_1.asm and +C mul_basecase.asm are. Perhaps it should be done. 
It'd add to the
+C complexity, but if it's worth doing in the other places then it should be
+C worthwhile here.
+C
+C A fully-unrolled style like other sqr_basecase.asm versions (k6, k7, p6)
+C might be worth considering.  That'd add quite a bit to the code size, but
+C only as much as is used would be dragged into L1 cache.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %edx
+
+	movl	PARAM_SRC, %eax
+
+	cmpl	$2, %edx
+	movl	PARAM_DST, %ecx
+
+	je	L(two_limbs)
+	ja	L(three_or_more)
+
+
+C -----------------------------------------------------------------------------
+C one limb only
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx
+
+	movl	(%eax), %eax
+	mull	%eax
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(two_limbs):
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx
+
+	pushl	%ebx
+	pushl	%ebp
+
+	movl	%eax, %ebx
+	movl	(%eax), %eax
+
+	mull	%eax		C src[0]^2
+
+	pushl	%esi
+	pushl	%edi
+
+	movl	%edx, %esi	C dst[1]
+	movl	%eax, (%ecx)	C dst[0]
+
+	movl	4(%ebx), %eax
+	mull	%eax		C src[1]^2
+
+	movl	%eax, %edi	C dst[2]
+	movl	%edx, %ebp	C dst[3]
+
+	movl	(%ebx), %eax
+	mull	4(%ebx)		C src[0]*src[1]
+
+	addl	%eax, %esi
+
+	adcl	%edx, %edi
+
+	adcl	$0, %ebp
+	addl	%esi, %eax
+
+	adcl	%edi, %edx
+	movl	%eax, 4(%ecx)
+
+	adcl	$0, %ebp
+
+	movl	%edx, 8(%ecx)
+	movl	%ebp, 12(%ecx)
+
+	popl	%edi
+	popl	%esi
+
+	popl	%ebp
+	popl	%ebx
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(three_or_more):
+deflit(`FRAME',0)
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx	size
+
+	pushl	%ebx	FRAME_pushl()
+	pushl	%edi	FRAME_pushl()
+
+	pushl	%esi	FRAME_pushl()
+	pushl	%ebp	FRAME_pushl()
+
+	leal	(%ecx,%edx,4), %edi	C &dst[size], end of this mul1
+	leal	(%eax,%edx,4), %esi	C &src[size]
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+	movl	(%eax), %ebp	C src[0], multiplier
+	movl	%edx, %ecx
+
+	negl	%ecx		C -size
+	xorl	%ebx, %ebx	C clear carry limb
+
+	incl	%ecx		C -(size-1)
+
+L(mul1):
+	C eax	scratch
+	C ebx	carry
+	C ecx	counter, limbs, negative
+	C edx	scratch
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp	multiplier
+
+	movl	(%esi,%ecx,4), %eax
+	mull	%ebp
+	addl	%eax, %ebx
+	adcl	$0, %edx
+	movl	%ebx, (%edi,%ecx,4)
+	movl	%edx, %ebx
+	incl	%ecx
+	jnz	L(mul1)
+
+	movl	%ebx, (%edi)
+
+
+	C Add products src[n]*src[n+1..size-1] at dst[2*n+1...], for
+	C n=1..size-2.
+	C
+	C The last product src[size-2]*src[size-1], which is the end corner
+	C of the product triangle, is handled separately at the end to save
+	C looping overhead.  If size is 3 then it's only this that needs to
+	C be done.
+	C
+	C In the outer loop %esi is a constant, and %edi just advances by 1
+	C limb each time.  The size of the operation decreases by 1 limb
+	C each time.
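+	C
+	C In C, the whole of this pass is roughly the following sketch (in
+	C the style of mpn/generic/sqr_basecase.c; the names are
+	C illustrative only, and the code below splits the n=size-2 corner
+	C case out of the loop):
+	C
+	C	for (n = 1; n <= size-2; n++)
+	C	  dst[n+size] = mpn_addmul_1 (dst+2*n+1, src+n+1,
+	C	                              size-1-n, src[n]);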
+
+	C eax
+	C ebx	carry (needing carry flag added)
+	C ecx
+	C edx
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp
+
+	movl	PARAM_SIZE, %ecx
+	subl	$3, %ecx
+	jz	L(corner)
+
+	negl	%ecx
+
+dnl  re-use parameter space
+define(VAR_OUTER,`PARAM_DST')
+
+L(outer):
+	C eax
+	C ebx
+	C ecx
+	C edx	outer loop counter, -(size-3) to -1
+	C esi	&src[size]
+	C edi	dst, pointing at stored carry limb of previous loop
+	C ebp
+
+	movl	%ecx, VAR_OUTER
+	addl	$4, %edi		C advance dst end
+
+	movl	-8(%esi,%ecx,4), %ebp	C next multiplier
+	subl	$1, %ecx
+
+	xorl	%ebx, %ebx		C initial carry limb
+
+L(inner):
+	C eax	scratch
+	C ebx	carry (needing carry flag added)
+	C ecx	counter, -n-1 to -1
+	C edx	scratch
+	C esi	&src[size]
+	C edi	dst end of this addmul
+	C ebp	multiplier
+
+	movl	(%esi,%ecx,4), %eax
+	mull	%ebp
+	addl	%ebx, %eax
+	adcl	$0, %edx
+	addl	%eax, (%edi,%ecx,4)
+	adcl	$0, %edx
+	movl	%edx, %ebx
+	addl	$1, %ecx
+	jl	L(inner)
+
+
+	movl	%ebx, (%edi)
+	movl	VAR_OUTER, %ecx
+	incl	%ecx
+	jnz	L(outer)
+
+
+L(corner):
+	C esi	&src[size]
+	C edi	&dst[2*size-3]
+
+	movl	-4(%esi), %eax
+	mull	-8(%esi)		C src[size-1]*src[size-2]
+	addl	%eax, 0(%edi)
+	adcl	$0, %edx
+	movl	%edx, 4(%edi)		C dst high limb
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
+
+	movl	PARAM_SIZE, %eax
+	negl	%eax
+	addl	$1, %eax		C -(size-1) and clear carry
+
+L(lshift):
+	C eax	counter, negative
+	C ebx	next limb
+	C ecx
+	C edx
+	C esi
+	C edi	&dst[2*size-3]
+	C ebp
+
+	rcll	8(%edi,%eax,8)
+	rcll	12(%edi,%eax,8)
+	incl	%eax
+	jnz	L(lshift)
+
+
+	adcl	%eax, %eax		C high bit out
+	movl	%eax, 8(%edi)		C dst most significant limb
+
+
+C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
+C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+	movl	PARAM_SRC, %esi
+	movl	(%esi), %eax		C src[0]
+	mull	%eax			C src[0]^2
+
+	movl	PARAM_SIZE, %ecx
+	leal	(%esi,%ecx,4), %esi	C src end
+
+	negl	%ecx			C -size
+	movl	%edx, %ebx		C initial carry
+
+	movl	%eax, 12(%edi,%ecx,8)	C dst[0]
+	incl	%ecx			C -(size-1)
+
+L(diag):
+	C eax	scratch (low product)
+	C ebx	carry limb
+	C ecx	counter, -(size-1) to -1
+	C edx	scratch (high product)
+	C esi	&src[size]
+	C edi	&dst[2*size-3]
+	C ebp	scratch (fetched dst limbs)
+
+	movl	(%esi,%ecx,4), %eax
+	mull	%eax
+
+	addl	%ebx, 8(%edi,%ecx,8)
+	movl	%edx, %ebx
+
+	adcl	%eax, 12(%edi,%ecx,8)
+	adcl	$0, %ebx
+
+	incl	%ecx
+	jnz	L(diag)
+
+
+	addl	%ebx, 8(%edi)		C dst most significant limb
+
+	popl	%ebp
+	popl	%esi
+
+	popl	%edi
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/t-zdisp.sh b/gmp-6.3.0/mpn/x86/t-zdisp.sh
new file mode 100755
index 0000000..61efdd6
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/t-zdisp.sh
@@ -0,0 +1,71 @@
+#! /bin/sh
+#
+# Copyright 2000 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of either:
+#
+#   * the GNU Lesser General Public License as published by the Free
+#     Software Foundation; either version 3 of the License, or (at your
+#     option) any later version.
+#
+# or
+#
+#   * the GNU General Public License as published by the Free Software
+#     Foundation; either version 2 of the License, or (at your option) any
+#     later version.
+#
+# or both in parallel, as here.
+# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received copies of the GNU General Public License and the +# GNU Lesser General Public License along with the GNU MP Library. If not, +# see https://www.gnu.org/licenses/. + + +# Usage: cd $(builddir)/mpn +# $(srcdir)/x86/t-zdisp.sh +# +# Run the Zdisp() macro instructions through the assembler to check +# the encodings used. Mismatches are printed, no output means all ok. +# +# This program is only meant for use during development. It can be +# run in the mpn build directory of any x86 configuration. +# +# For this test the assembler needs to generate byte sized 0 +# displacements when given something like 0(%eax). Recent versions of +# gas are suitable (eg. 2.9.x or 2.10.x). + +set -e + +cat >tmp-zdisptest.asm <<\EOF + +include(`../config.m4') + +dnl Redefine Zdisp_match to output its pattern and encoding. +define(`Zdisp_match', +`define(`Zdisp_found',1)dnl +ifelse(`$2',0,` $1 $2$3, $4')`'dnl +ifelse(`$3',0,` $1 $2, $3$4')`'dnl + + .byte $5 +') + .text + Zdisp() +EOF + +m4 tmp-zdisptest.asm >tmp-zdisptest.s +as -o tmp-zdisptest.o tmp-zdisptest.s + +# Demand duplicates from the instruction patterns and byte encodings. +objdump -d tmp-zdisptest.o | awk ' +/^ *[a-z0-9]+:/ { + sub(/^ *[a-z0-9]+:/,"") + print +}' | sort | uniq -u diff --git a/gmp-6.3.0/mpn/x86/t-zdisp2.pl b/gmp-6.3.0/mpn/x86/t-zdisp2.pl new file mode 100755 index 0000000..b441b65 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/t-zdisp2.pl @@ -0,0 +1,147 @@ +#!/usr/bin/perl -w +# +# Copyright 2001, 2002 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of either: +# +# * the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# or +# +# * the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# or both in parallel, as here. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received copies of the GNU General Public License and the +# GNU Lesser General Public License along with the GNU MP Library. If not, +# see https://www.gnu.org/licenses/. + + +# Usage: cd $(builddir)/mpn +# $(srcdir)/x86/t-zdisp2.pl +# +# Grep for any "0(reg...)" addressing modes coming out of the x86 .asm +# files. Additive expressions like "12+4-16" are recognised too. +# +# Old gas doesn't preserve the "0" displacement, so if it's wanted then +# Zdisp ought to be used to give explicit .byte sequences. See +# mpn/x86/README. +# +# No output means everything is ok. All the asm files are put through m4 in +# PIC and non-PIC modes, and in each multi-function form, all of which can +# take a while to run. +# +# This program is only meant for use during development. 
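+#
+# For example (illustrative only), m4 output containing
+#
+#	movl	0(%esi), %eax
+#	movl	12+4-16(%ebx), %ecx
+#
+# would have both lines reported, since each displacement evaluates to
+# zero.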
+
+use strict;
+use File::Find;
+use File::Basename;
+use Getopt::Std;
+
+my %opt;
+getopts('t', \%opt);
+
+
+my $srcdir;
+open IN, '<Makefile' or die;
+while (<IN>) {
+  if (/^srcdir[ \t]*=[ \t]*(.*)/) {
+    $srcdir = $1;
+    last;
+  }
+}
+close IN or die;
+defined $srcdir or die "Cannot find \$srcdir in Makefile\n";
+
+my $filecount = 0;
+
+my $tempfile = 't-zdisp2.tmp';
+open KARA, ">$tempfile" or die;
+close KARA or die;
+
+find({ wanted => \&process, preprocess => \&process_mparam, no_chdir => 1 },
+     "$srcdir/x86");
+
+sub process {
+  if (/gmp-mparam.h$/) {
+    process_mparam($_);
+  } elsif (/\.asm$/) {
+    process_asm($_);
+  }
+}
+
+# Ensure we're using the right SQR_TOOM2_THRESHOLD for the part of the
+# tree being processed.
+sub process_mparam {
+  my $file = "$File::Find::dir/gmp-mparam.h";
+  if (-f $file) {
+    print "$file\n" if $opt{'t'};
+    open MPARAM, "<$file" or die;
+    while (<MPARAM>) {
+      if (/^#define SQR_TOOM2_THRESHOLD[ \t]*([0-9][0-9]*)/) {
+        open KARA, ">$tempfile" or die;
+        print KARA "define(\`SQR_TOOM2_THRESHOLD',$1)\n\n";
+        print "define(\`SQR_TOOM2_THRESHOLD',$1)\n" if $opt{'t'};
+        close KARA or die;
+        last;
+      }
+    }
+    close MPARAM or die;
+  }
+  return @_;
+}
+
+sub process_asm {
+  my ($file) = @_;
+  my $base = basename ($file, '.asm');
+
+  my @funs;
+  if    ($base eq 'aors_n')    { @funs = qw(add_n sub_n); }
+  elsif ($base eq 'aorsmul_1') { @funs = qw(addmul_1 submul_1); }
+  elsif ($base eq 'popham')    { @funs = qw(popcount hamdist); }
+  elsif ($base eq 'logops_n')  { @funs = qw(and_n andn_n nand_n ior_n iorn_n nior_n xor_n xnor_n); }
+  elsif ($base eq 'lorrshift') { @funs = qw(lshift rshift); }
+  else                         { @funs = ($base); }
+
+  foreach my $fun (@funs) {
+    foreach my $pic ('', ' -DPIC') {
+      my $header = "$file: 0: $pic\n";
+      $filecount++;
+
+      my $m4 = "m4 -DHAVE_HOST_CPU_athlon -DOPERATION_$fun $pic ../config.m4 $tempfile $file";
+      print "$m4\n" if $opt{'t'};
+
+      open IN, "$m4 |" or die;
+      while (<IN>) {
+        next unless /([0-9+-][0-9 \t+-]*)\(%/;
+        my $pat=$1;
+        $pat = eval($pat);
+        next if ($pat != 0);
+        print "$header$_";
+        $header='';
+      }
+      close IN or die;
+    }
+  }
+}
+
+unlink($tempfile);
+print "total $filecount processed\n";
+exit 0;
+
+
+# Local variables:
+# perl-indent-level: 2
+# End:
diff --git a/gmp-6.3.0/mpn/x86/udiv.asm b/gmp-6.3.0/mpn/x86/udiv.asm
new file mode 100644
index 0000000..a3ee088
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/udiv.asm
@@ -0,0 +1,52 @@
+dnl x86 mpn_udiv_qrnnd -- 2 by 1 limb division
+
+dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
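+
+dnl In C terms the routine behaves roughly as follows (an informal sketch,
+dnl not part of the original file; note the 32-bit limb size):
+dnl
+dnl	uint64_t nn = ((uint64_t) high << 32) | low;
+dnl	*remptr = (uint32_t) (nn % divisor);
+dnl	return (uint32_t) (nn / divisor);    /* the quotient */
+dnl
+dnl As with the underlying divl instruction, the caller must arrange that
+dnl high < divisor, otherwise the quotient overflows a limb.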
+ +include(`../config.m4') + + +C mp_limb_t mpn_udiv_qrnnd (mp_limb_t *remptr, mp_limb_t high, mp_limb_t low, +C mp_limb_t divisor); + +defframe(PARAM_DIVISOR, 16) +defframe(PARAM_LOW, 12) +defframe(PARAM_HIGH, 8) +defframe(PARAM_REMPTR, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_udiv_qrnnd) +deflit(`FRAME',0) + movl PARAM_LOW, %eax + movl PARAM_HIGH, %edx + divl PARAM_DIVISOR + movl PARAM_REMPTR, %ecx + movl %edx, (%ecx) + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/umul.asm b/gmp-6.3.0/mpn/x86/umul.asm new file mode 100644 index 0000000..34fe434 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/umul.asm @@ -0,0 +1,51 @@ +dnl mpn_umul_ppmm -- 1x1->2 limb multiplication + +dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2); +C + +defframe(PARAM_M2, 12) +defframe(PARAM_M1, 8) +defframe(PARAM_LOWPTR, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_umul_ppmm) +deflit(`FRAME',0) + movl PARAM_LOWPTR, %ecx + movl PARAM_M1, %eax + mull PARAM_M2 + movl %eax, (%ecx) + movl %edx, %eax + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86/x86-defs.m4 b/gmp-6.3.0/mpn/x86/x86-defs.m4 new file mode 100644 index 0000000..81309b2 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/x86-defs.m4 @@ -0,0 +1,1024 @@ +divert(-1) + +dnl m4 macros for x86 assembler. + +dnl Copyright 1999-2003, 2007, 2010, 2012, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ + +dnl Notes: +dnl +dnl m4 isn't perfect for processing BSD style x86 assembler code, the main +dnl problems are, +dnl +dnl 1. Doing define(foo,123) and then using foo in an addressing mode like +dnl foo(%ebx) expands as a macro rather than a constant. This is worked +dnl around by using deflit() from asm-defs.m4, instead of define(). +dnl +dnl 2. Immediates in macro definitions need a space or `' to stop the $ +dnl looking like a macro parameter. For example, +dnl +dnl define(foo, `mov $ 123, %eax') +dnl +dnl This is only a problem in macro definitions, not in ordinary text, +dnl and not in macro parameters like text passed to forloop() or ifdef(). + + +deflit(GMP_LIMB_BYTES, 4) + + +dnl Libtool gives -DPIC -DDLL_EXPORT to indicate a cygwin or mingw DLL. We +dnl undefine PIC since we don't need to be position independent in this +dnl case and definitely don't want the ELF style _GLOBAL_OFFSET_TABLE_ etc. + +ifdef(`DLL_EXPORT',`undefine(`PIC')') + + +dnl Usage: CPUVEC_FUNCS_LIST +dnl +dnl A list of the functions from gmp-impl.h x86 struct cpuvec_t, in the +dnl order they appear in that structure. + +define(CPUVEC_FUNCS_LIST, +``add_n', +`addlsh1_n', +`addlsh2_n', +`addmul_1', +`addmul_2', +`bdiv_dbm1c', +`cnd_add_n', +`cnd_sub_n', +`com', +`copyd', +`copyi', +`divexact_1', +`divrem_1', +`gcd_11', +`lshift', +`lshiftc', +`mod_1', +`mod_1_1p', +`mod_1_1p_cps', +`mod_1s_2p', +`mod_1s_2p_cps', +`mod_1s_4p', +`mod_1s_4p_cps', +`mod_34lsub1', +`modexact_1c_odd', +`mul_1', +`mul_basecase', +`mullo_basecase', +`preinv_divrem_1', +`preinv_mod_1', +`redc_1', +`redc_2', +`rshift', +`sqr_basecase', +`sub_n', +`sublsh1_n', +`submul_1'') + + +dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo) +dnl +dnl In the x86 code we use explicit TEXT and ALIGN() calls in the code, +dnl since different alignments are wanted in various circumstances. So for +dnl instance, +dnl +dnl TEXT +dnl ALIGN(16) +dnl PROLOGUE(mpn_add_n) +dnl ... +dnl EPILOGUE() + +define(`PROLOGUE_cpu', +m4_assert_numargs(1) +m4_assert_defined(`WANT_PROFILING') + `GLOBL $1 + TYPE($1,`function') + COFF_TYPE($1) +$1: +ifelse(WANT_PROFILING,`prof', ` call_mcount') +ifelse(WANT_PROFILING,`gprof', ` call_mcount') +ifelse(WANT_PROFILING,`instrument',` call_instrument(enter)') +') + + +dnl Usage: COFF_TYPE(GSYM_PREFIX`'foo) +dnl +dnl Emit COFF style ".def ... .endef" type information for a function, when +dnl supported. The argument should include any GSYM_PREFIX. +dnl +dnl See autoconf macro GMP_ASM_COFF_TYPE for HAVE_COFF_TYPE. + +define(COFF_TYPE, +m4_assert_numargs(1) +m4_assert_defined(`HAVE_COFF_TYPE') +`ifelse(HAVE_COFF_TYPE,yes, + `.def $1 + .scl 2 + .type 32 + .endef')') + + +dnl Usage: call_mcount +dnl +dnl For `gprof' style profiling, %ebp is setup as a frame pointer. None of +dnl the assembler routines use %ebp this way, so it's done only for the +dnl benefit of mcount. glibc sysdeps/i386/i386-mcount.S shows how mcount +dnl gets the current function from (%esp) and the parent from 4(%ebp). +dnl +dnl For `prof' style profiling gcc generates mcount calls without setting +dnl up %ebp, and the same is done here. 
+ +define(`call_mcount', +m4_assert_numargs(-1) +m4_assert_defined(`WANT_PROFILING') +m4_assert_defined(`MCOUNT_PIC_REG') +m4_assert_defined(`MCOUNT_NONPIC_REG') +m4_assert_defined(`MCOUNT_PIC_CALL') +m4_assert_defined(`MCOUNT_NONPIC_CALL') +`ifelse(ifdef(`PIC',`MCOUNT_PIC_REG',`MCOUNT_NONPIC_REG'),,, +` DATA + ALIGN(4) +L(mcount_data_`'mcount_counter): + W32 0 + TEXT +')dnl +ifelse(WANT_PROFILING,`gprof', +` pushl %ebp + movl %esp, %ebp +')dnl +ifdef(`PIC', +` pushl %ebx + call_movl_eip_to_ebx +L(mcount_here_`'mcount_counter): + addl $_GLOBAL_OFFSET_TABLE_+[.-L(mcount_here_`'mcount_counter)], %ebx +ifelse(MCOUNT_PIC_REG,,, +` leal L(mcount_data_`'mcount_counter)@GOTOFF(%ebx), MCOUNT_PIC_REG') +MCOUNT_PIC_CALL + popl %ebx +',`dnl non-PIC +ifelse(MCOUNT_NONPIC_REG,,, +` movl `$'L(mcount_data_`'mcount_counter), MCOUNT_NONPIC_REG +')dnl +MCOUNT_NONPIC_CALL +')dnl +ifelse(WANT_PROFILING,`gprof', +` popl %ebp +') +define(`mcount_counter',incr(mcount_counter)) +') + +define(mcount_counter,1) + + +dnl Usage: call_instrument(enter|exit) +dnl +dnl Call __cyg_profile_func_enter or __cyg_profile_func_exit. +dnl +dnl For PIC, most routines don't require _GLOBAL_OFFSET_TABLE_ themselves +dnl so %ebx is just setup for these calls. It's a bit wasteful to repeat +dnl the setup for the exit call having done it earlier for the enter, but +dnl there's nowhere very convenient to hold %ebx through the length of a +dnl routine, in general. +dnl +dnl For PIC, because instrument_current_function will be within the current +dnl object file we can get it just as an offset from %eip, there's no need +dnl to use the GOT. +dnl +dnl No attempt is made to maintain the stack alignment gcc generates with +dnl -mpreferred-stack-boundary. This wouldn't be hard, but it seems highly +dnl unlikely the instrumenting functions would be doing anything that'd +dnl benefit from alignment, in particular they're unlikely to be using +dnl doubles or long doubles on the stack. +dnl +dnl The FRAME scheme is used to conveniently account for the register saves +dnl before accessing the return address. Any previous value is saved and +dnl restored, since plenty of code keeps a value across a "ret" in the +dnl middle of a routine. + +define(call_instrument, +m4_assert_numargs(1) +` pushdef(`FRAME',0) +ifelse($1,exit, +` pushl %eax FRAME_pushl() C return value +') +ifdef(`PIC', +` pushl %ebx FRAME_pushl() + call_movl_eip_to_ebx +L(instrument_here_`'instrument_count): + movl %ebx, %ecx + addl $_GLOBAL_OFFSET_TABLE_+[.-L(instrument_here_`'instrument_count)], %ebx + C use addl rather than leal to avoid old gas bugs, see mpn/x86/README + addl $instrument_current_function-L(instrument_here_`'instrument_count), %ecx + pushl m4_empty_if_zero(FRAME)(%esp) FRAME_pushl() C return addr + pushl %ecx FRAME_pushl() C this function + call GSYM_PREFIX`'__cyg_profile_func_$1@PLT + addl $`'8, %esp + popl %ebx +', +` C non-PIC + pushl m4_empty_if_zero(FRAME)(%esp) FRAME_pushl() C return addr + pushl $instrument_current_function FRAME_pushl() C this function + call GSYM_PREFIX`'__cyg_profile_func_$1 + addl $`'8, %esp +') +ifelse($1,exit, +` popl %eax C return value +') + popdef(`FRAME') +define(`instrument_count',incr(instrument_count)) +') +define(instrument_count,1) + + +dnl Usage: instrument_current_function +dnl +dnl Return the current function name for instrumenting purposes. This is +dnl PROLOGUE_current_function, but it sticks at the first such name seen. 
+dnl +dnl Sticking to the first name seen ensures that multiple-entrypoint +dnl functions like mpn_add_nc and mpn_add_n will make enter and exit calls +dnl giving the same function address. + +define(instrument_current_function, +m4_assert_numargs(-1) +`ifdef(`instrument_current_function_seen', +`instrument_current_function_seen', +`define(`instrument_current_function_seen',PROLOGUE_current_function)dnl +PROLOGUE_current_function')') + + +dnl Usage: call_movl_eip_to_ebx +dnl +dnl Generate a call to L(movl_eip_to_ebx), and record the need for that +dnl routine. + +define(call_movl_eip_to_ebx, +m4_assert_numargs(-1) +`call L(movl_eip_to_ebx) +define(`movl_eip_to_ebx_needed',1)') + +dnl Usage: generate_movl_eip_to_ebx +dnl +dnl Emit a L(movl_eip_to_ebx) routine, if needed and not already generated. + +define(generate_movl_eip_to_ebx, +m4_assert_numargs(-1) +`ifelse(movl_eip_to_ebx_needed,1, +`ifelse(movl_eip_to_ebx_done,1,, +`L(movl_eip_to_ebx): + movl (%esp), %ebx + ret_internal +define(`movl_eip_to_ebx_done',1) +')')') + + +dnl Usage: ret +dnl +dnl Generate a "ret", but if doing instrumented profiling then call +dnl __cyg_profile_func_exit first. + +define(ret, +m4_assert_numargs(-1) +m4_assert_defined(`WANT_PROFILING') +`ifelse(WANT_PROFILING,instrument, +`ret_instrument', +`ret_internal') +generate_movl_eip_to_ebx +') + + +dnl Usage: ret_internal +dnl +dnl A plain "ret", without any __cyg_profile_func_exit call. This can be +dnl used for a return which is internal to some function, such as when +dnl getting %eip for PIC. + +define(ret_internal, +m4_assert_numargs(-1) +``ret'') + + +dnl Usage: ret_instrument +dnl +dnl Generate call to __cyg_profile_func_exit and then a ret. If a ret has +dnl already been seen from this function then jump to that chunk of code, +dnl rather than emitting it again. + +define(ret_instrument, +m4_assert_numargs(-1) +`ifelse(m4_unquote(ret_instrument_seen_`'instrument_current_function),1, +`jmp L(instrument_exit_`'instrument_current_function)', +`define(ret_instrument_seen_`'instrument_current_function,1) +L(instrument_exit_`'instrument_current_function): +call_instrument(exit) + ret_internal')') + + +dnl Usage: _GLOBAL_OFFSET_TABLE_ +dnl +dnl Expand to _GLOBAL_OFFSET_TABLE_ plus any necessary underscore prefix. +dnl This lets us write plain _GLOBAL_OFFSET_TABLE_ in SVR4 style, but still +dnl work with systems requiring an extra underscore such as OpenBSD. +dnl +dnl deflit is used so "leal _GLOBAL_OFFSET_TABLE_(%eax), %ebx" will come +dnl out right, though that form doesn't work properly in gas (see +dnl mpn/x86/README). + +deflit(_GLOBAL_OFFSET_TABLE_, +m4_assert_defined(`GOT_GSYM_PREFIX') +`GOT_GSYM_PREFIX`_GLOBAL_OFFSET_TABLE_'') + + +dnl -------------------------------------------------------------------------- +dnl Various x86 macros. +dnl + + +dnl Usage: ALIGN_OFFSET(bytes,offset) +dnl +dnl Align to `offset' away from a multiple of `bytes'. +dnl +dnl This is useful for testing, for example align to something very strict +dnl and see what effect offsets from it have, "ALIGN_OFFSET(256,32)". +dnl +dnl Generally you wouldn't execute across the padding, but it's done with +dnl nop's so it'll work. + +define(ALIGN_OFFSET, +m4_assert_numargs(2) +`ALIGN($1) +forloop(`i',1,$2,` nop +')') + + +dnl Usage: defframe(name,offset) +dnl +dnl Make a definition like the following with which to access a parameter +dnl or variable on the stack. 
+dnl +dnl define(name,`FRAME+offset(%esp)') +dnl +dnl Actually m4_empty_if_zero(FRAME+offset) is used, which will save one +dnl byte if FRAME+offset is zero, by putting (%esp) rather than 0(%esp). +dnl Use define(`defframe_empty_if_zero_disabled',1) if for some reason the +dnl zero offset is wanted. +dnl +dnl The new macro also gets a check that when it's used FRAME is actually +dnl defined, and that the final %esp offset isn't negative, which would +dnl mean an attempt to access something below the current %esp. +dnl +dnl deflit() is used rather than a plain define(), so the new macro won't +dnl delete any following parenthesized expression. name(%edi) will come +dnl out say as 16(%esp)(%edi). This isn't valid assembler and should +dnl provoke an error, which is better than silently giving just 16(%esp). +dnl +dnl See README for more on the suggested way to access the stack frame. + +define(defframe, +m4_assert_numargs(2) +`deflit(`$1', +m4_assert_defined(`FRAME') +`defframe_check_notbelow(`$1',$2,FRAME)dnl +defframe_empty_if_zero(FRAME+($2))(%esp)')') + +dnl Called: defframe_empty_if_zero(expression) +define(defframe_empty_if_zero, +m4_assert_numargs(1) +`ifelse(defframe_empty_if_zero_disabled,1, +`eval($1)', +`m4_empty_if_zero($1)')') + +dnl Called: defframe_check_notbelow(`name',offset,FRAME) +define(defframe_check_notbelow, +m4_assert_numargs(3) +`ifelse(eval(($3)+($2)<0),1, +`m4_error(`$1 at frame offset $2 used when FRAME is only $3 bytes +')')') + + +dnl Usage: FRAME_pushl() +dnl FRAME_popl() +dnl FRAME_addl_esp(n) +dnl FRAME_subl_esp(n) +dnl +dnl Adjust FRAME appropriately for a pushl or popl, or for an addl or subl +dnl %esp of n bytes. +dnl +dnl Using these macros is completely optional. Sometimes it makes more +dnl sense to put explicit deflit(`FRAME',N) forms, especially when there's +dnl jumps and different sequences of FRAME values need to be used in +dnl different places. + +define(FRAME_pushl, +m4_assert_numargs(0) +m4_assert_defined(`FRAME') +`deflit(`FRAME',eval(FRAME+4))') + +define(FRAME_popl, +m4_assert_numargs(0) +m4_assert_defined(`FRAME') +`deflit(`FRAME',eval(FRAME-4))') + +define(FRAME_addl_esp, +m4_assert_numargs(1) +m4_assert_defined(`FRAME') +`deflit(`FRAME',eval(FRAME-($1)))') + +define(FRAME_subl_esp, +m4_assert_numargs(1) +m4_assert_defined(`FRAME') +`deflit(`FRAME',eval(FRAME+($1)))') + + +dnl Usage: defframe_pushl(name) +dnl +dnl Do a combination FRAME_pushl() and a defframe() to name the stack +dnl location just pushed. This should come after a pushl instruction. +dnl Putting it on the same line works and avoids lengthening the code. For +dnl example, +dnl +dnl pushl %eax defframe_pushl(VAR_COUNTER) +dnl +dnl Notice the defframe() is done with an unquoted -FRAME thus giving its +dnl current value without tracking future changes. + +define(defframe_pushl, +m4_assert_numargs(1) +`FRAME_pushl()defframe(`$1',-FRAME)') + + +dnl -------------------------------------------------------------------------- +dnl Assembler instruction macros. +dnl + + +dnl Usage: emms_or_femms +dnl femms_available_p +dnl +dnl femms_available_p expands to 1 or 0 according to whether the AMD 3DNow +dnl femms instruction is available. emms_or_femms expands to femms if +dnl available, or emms if not. +dnl +dnl emms_or_femms is meant for use in the K6 directory where plain K6 +dnl (without femms) and K6-2 and K6-3 (with a slightly faster femms) are +dnl supported together. +dnl +dnl On K7 femms is no longer faster and is just an alias for emms, so plain +dnl emms may as well be used. 
+ +define(femms_available_p, +m4_assert_numargs(-1) +`m4_ifdef_anyof_p( + `HAVE_HOST_CPU_k62', + `HAVE_HOST_CPU_k63', + `HAVE_HOST_CPU_athlon')') + +define(emms_or_femms, +m4_assert_numargs(-1) +`ifelse(femms_available_p,1,`femms',`emms')') + + +dnl Usage: femms +dnl +dnl Gas 2.9.1 which comes with FreeBSD 3.4 doesn't support femms, so the +dnl following is a replacement using .byte. + +define(femms, +m4_assert_numargs(-1) +`.byte 15,14 C AMD 3DNow femms') + + +dnl Usage: jadcl0(op) +dnl +dnl Generate a jnc/incl as a substitute for adcl $0,op. Note this isn't an +dnl exact replacement, since it doesn't set the flags like adcl does. +dnl +dnl This finds a use in K6 mpn_addmul_1, mpn_submul_1, mpn_mul_basecase and +dnl mpn_sqr_basecase because on K6 an adcl is slow, the branch +dnl misprediction penalty is small, and the multiply algorithm used leads +dnl to a carry bit on average only 1/4 of the time. +dnl +dnl jadcl0_disabled can be set to 1 to instead generate an ordinary adcl +dnl for comparison. For example, +dnl +dnl define(`jadcl0_disabled',1) +dnl +dnl When using a register operand, eg. "jadcl0(%edx)", the jnc/incl code is +dnl the same size as an adcl. This makes it possible to use the exact same +dnl computed jump code when testing the relative speed of the two. + +define(jadcl0, +m4_assert_numargs(1) +`ifelse(jadcl0_disabled,1, + `adcl $`'0, $1', + `jnc L(jadcl0_`'jadcl0_counter) + incl $1 +L(jadcl0_`'jadcl0_counter): +define(`jadcl0_counter',incr(jadcl0_counter))')') + +define(jadcl0_counter,1) + + +dnl Usage: x86_lookup(target, key,value, key,value, ...) +dnl x86_lookup_p(target, key,value, key,value, ...) +dnl +dnl Look for `target' among the `key' parameters. +dnl +dnl x86_lookup expands to the corresponding `value', or generates an error +dnl if `target' isn't found. +dnl +dnl x86_lookup_p expands to 1 if `target' is found, or 0 if not. + +define(x86_lookup, +m4_assert_numargs_range(1,999) +`ifelse(eval($#<3),1, +`m4_error(`unrecognised part of x86 instruction: $1 +')', +`ifelse(`$1',`$2', `$3', +`x86_lookup(`$1',shift(shift(shift($@))))')')') + +define(x86_lookup_p, +m4_assert_numargs_range(1,999) +`ifelse(eval($#<3),1, `0', +`ifelse(`$1',`$2', `1', +`x86_lookup_p(`$1',shift(shift(shift($@))))')')') + + +dnl Usage: x86_opcode_reg32(reg) +dnl x86_opcode_reg32_p(reg) +dnl +dnl x86_opcode_reg32 expands to the standard 3 bit encoding for the given +dnl 32-bit register, eg. `%ebp' turns into 5. +dnl +dnl x86_opcode_reg32_p expands to 1 if reg is a valid 32-bit register, or 0 +dnl if not. + +define(x86_opcode_reg32, +m4_assert_numargs(1) +`x86_lookup(`$1',x86_opcode_reg32_list)') + +define(x86_opcode_reg32_p, +m4_assert_onearg() +`x86_lookup_p(`$1',x86_opcode_reg32_list)') + +define(x86_opcode_reg32_list, +``%eax',0, +`%ecx',1, +`%edx',2, +`%ebx',3, +`%esp',4, +`%ebp',5, +`%esi',6, +`%edi',7') + + +dnl Usage: x86_opcode_tttn(cond) +dnl +dnl Expand to the 4-bit "tttn" field value for the given x86 branch +dnl condition (like `c', `ae', etc). 
+
+define(x86_opcode_tttn,
+m4_assert_numargs(1)
+`x86_lookup(`$1',x86_opcode_tttn_list)')
+
+define(x86_opcode_tttn_list,
+``o',  0,
+`no',  1,
+`b',   2, `c',  2, `nae',2,
+`nb',  3, `nc', 3, `ae', 3,
+`e',   4, `z',  4,
+`ne',  5, `nz', 5,
+`be',  6, `na', 6,
+`nbe', 7, `a',  7,
+`s',   8,
+`ns',  9,
+`p',   10, `pe', 10, `npo',10,
+`np',  11, `npe',11, `po', 11,
+`l',   12, `nge',12,
+`nl',  13, `ge', 13,
+`le',  14, `ng', 14,
+`nle',15, `g',  15')
+
+
+dnl Usage: cmovCC(%srcreg,%dstreg)
+dnl
+dnl Emit a cmov instruction, using a .byte sequence, since various past
+dnl versions of gas don't know cmov.  For example,
+dnl
+dnl         cmovz(  %eax, %ebx)
+dnl
+dnl The source operand can only be a plain register.  (m4 code implementing
+dnl full memory addressing modes exists, believe it or not, but isn't
+dnl currently needed and isn't included.)
+dnl
+dnl All the standard conditions are defined.  Attempting to use one without
+dnl the macro parentheses, such as just "cmovbe %eax, %ebx", will provoke
+dnl an error.  This protects against writing something old gas wouldn't
+dnl understand.
+
+dnl Called: define_cmov_many(cond,tttn,cond,tttn,...)
+define(define_cmov_many,
+`ifelse(m4_length(`$1'),0,,
+`define_cmov(`$1',`$2')define_cmov_many(shift(shift($@)))')')
+
+dnl Called: define_cmov(cond,tttn)
+dnl Emit basically define(cmov<cond>,`cmov_internal(<cond>,<tttn>,`$1',`$2')')
+define(define_cmov,
+m4_assert_numargs(2)
+`define(`cmov$1',
+m4_instruction_wrapper()
+m4_assert_numargs(2)
+`cmov_internal'(m4_doublequote($`'0),``$2'',dnl
+m4_doublequote($`'1),m4_doublequote($`'2)))')
+
+define_cmov_many(x86_opcode_tttn_list)
+
+dnl Called: cmov_internal(name,tttn,src,dst)
+define(cmov_internal,
+m4_assert_numargs(4)
+`.byte	dnl
+15,	dnl
+eval(64+$2),	dnl
+eval(192+8*x86_opcode_reg32(`$4')+x86_opcode_reg32(`$3'))	dnl
+	C `$1 $3, $4'')
+
+
+dnl Usage: x86_opcode_regmmx(reg)
+dnl
+dnl Validate the given mmx register, and return its number, 0 to 7.
+
+define(x86_opcode_regmmx,
+m4_assert_numargs(1)
+`x86_lookup(`$1',x86_opcode_regmmx_list)')
+
+define(x86_opcode_regmmx_list,
+``%mm0',0,
+`%mm1',1,
+`%mm2',2,
+`%mm3',3,
+`%mm4',4,
+`%mm5',5,
+`%mm6',6,
+`%mm7',7')
+
+
+dnl Usage: psadbw(%srcreg,%dstreg)
+dnl
+dnl Oldish versions of gas don't know psadbw, in particular gas 2.9.1 on
+dnl FreeBSD 3.3 and 3.4 doesn't, so instead emit .byte sequences.  For
+dnl example,
+dnl
+dnl         psadbw( %mm1, %mm2)
+dnl
+dnl Only register->register forms are supported here, which suffices for
+dnl the current code.
+
+define(psadbw,
+m4_instruction_wrapper()
+m4_assert_numargs(2)
+`.byte 0x0f,0xf6,dnl
+eval(192+x86_opcode_regmmx(`$2')*8+x86_opcode_regmmx(`$1'))	dnl
+	C `psadbw $1, $2'')
+
+
+dnl Usage: Zdisp(inst,op,op,op)
+dnl
+dnl Generate explicit .byte sequences if necessary to force a byte-sized
+dnl zero displacement on an instruction.  For example,
+dnl
+dnl         Zdisp(  movl,   0,(%esi), %eax)
+dnl
+dnl expands to
+dnl
+dnl         .byte   139,70,0  C movl 0(%esi), %eax
+dnl
+dnl If the displacement given isn't 0, then normal assembler code is
+dnl generated.  For example,
+dnl
+dnl         Zdisp(  movl,   4,(%esi), %eax)
+dnl
+dnl expands to
+dnl
+dnl         movl    4(%esi), %eax
+dnl
+dnl This means a single Zdisp() form can be used with an expression for the
+dnl displacement, and .byte will be used only if necessary.  The
+dnl displacement argument is eval()ed.
+dnl
+dnl Because there aren't many places a 0(reg) form is wanted, Zdisp is
+dnl implemented with a table of instructions and encodings.  A new entry is
+dnl needed for any different operation or registers.
The table is split +dnl into separate macros to avoid overflowing BSD m4 macro expansion space. + +define(Zdisp, +m4_assert_numargs(4) +`define(`Zdisp_found',0)dnl +Zdisp_1($@)dnl +Zdisp_2($@)dnl +Zdisp_3($@)dnl +Zdisp_4($@)dnl +ifelse(Zdisp_found,0, +`m4_error(`unrecognised instruction in Zdisp: $1 $2 $3 $4 +')')') + +define(Zdisp_1,`dnl +Zdisp_match( adcl, 0,(%edx), %eax, `0x13,0x42,0x00', $@)`'dnl +Zdisp_match( adcl, 0,(%edx), %ebx, `0x13,0x5a,0x00', $@)`'dnl +Zdisp_match( adcl, 0,(%edx), %esi, `0x13,0x72,0x00', $@)`'dnl +Zdisp_match( addl, %ebx, 0,(%edi), `0x01,0x5f,0x00', $@)`'dnl +Zdisp_match( addl, %ecx, 0,(%edi), `0x01,0x4f,0x00', $@)`'dnl +Zdisp_match( addl, %esi, 0,(%edi), `0x01,0x77,0x00', $@)`'dnl +Zdisp_match( sbbl, 0,(%edx), %eax, `0x1b,0x42,0x00', $@)`'dnl +Zdisp_match( sbbl, 0,(%edx), %esi, `0x1b,0x72,0x00', $@)`'dnl +Zdisp_match( subl, %ecx, 0,(%edi), `0x29,0x4f,0x00', $@)`'dnl +Zdisp_match( movzbl, 0,(%eax,%ebp), %eax, `0x0f,0xb6,0x44,0x28,0x00', $@)`'dnl +Zdisp_match( movzbl, 0,(%ecx,%edi), %edi, `0x0f,0xb6,0x7c,0x39,0x00', $@)`'dnl +Zdisp_match( adc, 0,(%ebx,%ecx,4), %eax, `0x13,0x44,0x8b,0x00', $@)`'dnl +Zdisp_match( sbb, 0,(%ebx,%ecx,4), %eax, `0x1b,0x44,0x8b,0x00', $@)`'dnl +') +define(Zdisp_2,`dnl +Zdisp_match( movl, %eax, 0,(%edi), `0x89,0x47,0x00', $@)`'dnl +Zdisp_match( movl, %ebx, 0,(%edi), `0x89,0x5f,0x00', $@)`'dnl +Zdisp_match( movl, %esi, 0,(%edi), `0x89,0x77,0x00', $@)`'dnl +Zdisp_match( movl, 0,(%ebx), %eax, `0x8b,0x43,0x00', $@)`'dnl +Zdisp_match( movl, 0,(%ebx), %esi, `0x8b,0x73,0x00', $@)`'dnl +Zdisp_match( movl, 0,(%edx), %eax, `0x8b,0x42,0x00', $@)`'dnl +Zdisp_match( movl, 0,(%esi), %eax, `0x8b,0x46,0x00', $@)`'dnl +Zdisp_match( movl, 0,(%esi,%ecx,4), %eax, `0x8b,0x44,0x8e,0x00', $@)`'dnl +Zdisp_match( mov, 0,(%esi,%ecx,4), %eax, `0x8b,0x44,0x8e,0x00', $@)`'dnl +Zdisp_match( mov, %eax, 0,(%edi,%ecx,4), `0x89,0x44,0x8f,0x00', $@)`'dnl +') +define(Zdisp_3,`dnl +Zdisp_match( movq, 0,(%eax,%ecx,8), %mm0, `0x0f,0x6f,0x44,0xc8,0x00', $@)`'dnl +Zdisp_match( movq, 0,(%ebx,%eax,4), %mm0, `0x0f,0x6f,0x44,0x83,0x00', $@)`'dnl +Zdisp_match( movq, 0,(%ebx,%eax,4), %mm2, `0x0f,0x6f,0x54,0x83,0x00', $@)`'dnl +Zdisp_match( movq, 0,(%ebx,%ecx,4), %mm0, `0x0f,0x6f,0x44,0x8b,0x00', $@)`'dnl +Zdisp_match( movq, 0,(%edx), %mm0, `0x0f,0x6f,0x42,0x00', $@)`'dnl +Zdisp_match( movq, 0,(%esi), %mm0, `0x0f,0x6f,0x46,0x00', $@)`'dnl +Zdisp_match( movq, %mm0, 0,(%edi), `0x0f,0x7f,0x47,0x00', $@)`'dnl +Zdisp_match( movq, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7f,0x54,0x81,0x00', $@)`'dnl +Zdisp_match( movq, %mm2, 0,(%edx,%eax,4), `0x0f,0x7f,0x54,0x82,0x00', $@)`'dnl +Zdisp_match( movq, %mm0, 0,(%edx,%ecx,8), `0x0f,0x7f,0x44,0xca,0x00', $@)`'dnl +') +define(Zdisp_4,`dnl +Zdisp_match( movd, 0,(%eax,%ecx,4), %mm0, `0x0f,0x6e,0x44,0x88,0x00', $@)`'dnl +Zdisp_match( movd, 0,(%eax,%ecx,8), %mm1, `0x0f,0x6e,0x4c,0xc8,0x00', $@)`'dnl +Zdisp_match( movd, 0,(%edx,%ecx,8), %mm0, `0x0f,0x6e,0x44,0xca,0x00', $@)`'dnl +Zdisp_match( movd, %mm0, 0,(%eax,%ecx,4), `0x0f,0x7e,0x44,0x88,0x00', $@)`'dnl +Zdisp_match( movd, %mm0, 0,(%ecx,%eax,4), `0x0f,0x7e,0x44,0x81,0x00', $@)`'dnl +Zdisp_match( movd, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7e,0x54,0x81,0x00', $@)`'dnl +Zdisp_match( movd, %mm0, 0,(%edx,%ecx,4), `0x0f,0x7e,0x44,0x8a,0x00', $@)`'dnl +') + +define(Zdisp_match, +m4_assert_numargs(9) +`ifelse(eval(m4_stringequal_p(`$1',`$6') + && m4_stringequal_p(`$2',0) + && m4_stringequal_p(`$3',`$8') + && m4_stringequal_p(`$4',`$9')),1, +`define(`Zdisp_found',1)dnl +ifelse(eval(`$7'),0, +` .byte $5 C `$1 0$3, $4'', +` 
$6	$7$8, $9')',
+
+`ifelse(eval(m4_stringequal_p(`$1',`$6')
+	&& m4_stringequal_p(`$2',`$7')
+	&& m4_stringequal_p(`$3',0)
+	&& m4_stringequal_p(`$4',`$9')),1,
+`define(`Zdisp_found',1)dnl
+ifelse(eval(`$8'),0,
+`	.byte	$5 C `$1 $2, 0$4'',
+`	$6	$7, $8$9')')')')
+
+
+dnl Usage: shldl(count,src,dst)
+dnl        shrdl(count,src,dst)
+dnl        shldw(count,src,dst)
+dnl        shrdw(count,src,dst)
+dnl
+dnl Generate a double-shift instruction, possibly omitting a %cl count
+dnl parameter if that's what the assembler requires, as indicated by
+dnl WANT_SHLDL_CL in config.m4.  For example,
+dnl
+dnl         shldl( %cl, %eax, %ebx)
+dnl
+dnl turns into either
+dnl
+dnl         shldl %cl, %eax, %ebx
+dnl or
+dnl         shldl %eax, %ebx
+dnl
+dnl Immediate counts are always passed through unchanged.  For example,
+dnl
+dnl         shrdl( $2, %esi, %edi)
+dnl becomes
+dnl         shrdl $2, %esi, %edi
+dnl
+dnl
+dnl If you forget to use the macro form "shldl( ...)" and instead write
+dnl just a plain "shldl ...", an error results.  This ensures the necessary
+dnl variant treatment of %cl isn't accidentally bypassed.
+
+define(define_shd_instruction,
+m4_assert_numargs(1)
+`define($1,
+m4_instruction_wrapper()
+m4_assert_numargs(3)
+`shd_instruction'(m4_doublequote($`'0),m4_doublequote($`'1),dnl
+m4_doublequote($`'2),m4_doublequote($`'3)))')
+
+dnl Effectively: define(shldl,`shd_instruction(`$0',`$1',`$2',`$3')') etc
+define_shd_instruction(shldl)
+define_shd_instruction(shrdl)
+define_shd_instruction(shldw)
+define_shd_instruction(shrdw)
+
+dnl Called: shd_instruction(op,count,src,dst)
+define(shd_instruction,
+m4_assert_numargs(4)
+m4_assert_defined(`WANT_SHLDL_CL')
+`ifelse(eval(m4_stringequal_p(`$2',`%cl') && !WANT_SHLDL_CL),1,
+``$1'	`$3', `$4'',
+``$1'	`$2', `$3', `$4'')')
+
+
+dnl Usage: ASSERT([cond][,instructions])
+dnl
+dnl If WANT_ASSERT is 1, output the given instructions and expect the given
+dnl flags condition to then be satisfied.  For example,
+dnl
+dnl         ASSERT(ne, `cmpl %eax, %ebx')
+dnl
+dnl The instructions can be omitted to just assert a flags condition with
+dnl no extra calculation.  For example,
+dnl
+dnl         ASSERT(nc)
+dnl
+dnl When `instructions' is not empty, a pushf/popf is added to preserve the
+dnl flags, but the instructions themselves must preserve any registers that
+dnl matter.  FRAME is adjusted for the push and pop, so the instructions
+dnl given can use defframe() stack variables.
+dnl
+dnl The condition can be omitted to just output the given instructions when
+dnl assertion checking is wanted.  In this case the pushf/popf is omitted.
+dnl For example,
+dnl
+dnl         ASSERT(, `movl %eax, VAR_KEEPVAL')
+
+define(ASSERT,
+m4_assert_numargs_range(1,2)
+m4_assert_defined(`WANT_ASSERT')
+`ifelse(WANT_ASSERT,1,
+`ifelse(`$1',,
+	`$2',
+	`C ASSERT
+ifelse(`$2',,,`	pushf	ifdef(`FRAME',`FRAME_pushl()')')
+	$2
+	j`$1'	L(ASSERT_ok`'ASSERT_counter)
+	ud2	C assertion failed
+L(ASSERT_ok`'ASSERT_counter):
+ifelse(`$2',,,`	popf	ifdef(`FRAME',`FRAME_popl()')')
+define(`ASSERT_counter',incr(ASSERT_counter))')')')
+
+define(ASSERT_counter,1)
+
+
+dnl Usage: movl_text_address(label,register)
+dnl
+dnl Get the address of a text segment label, using either a plain movl or a
+dnl position-independent calculation, as necessary.  For example,
+dnl
+dnl        movl_text_address(L(foo),%eax)
+dnl
+dnl This macro is only meant for use in ASSERT()s or when testing, since
+dnl the PIC sequence it generates will want to be done with a ret balancing
+dnl the call on CPUs with return address branch prediction.
+dnl
+dnl The addl generated here has a backward reference to the label, and so
+dnl won't suffer from the two forward references bug in old gas (described
+dnl in mpn/x86/README).
+
+define(movl_text_address,
+m4_assert_numargs(2)
+`ifdef(`PIC',
+	`call	L(movl_text_address_`'movl_text_address_counter)
+L(movl_text_address_`'movl_text_address_counter):
+	popl	$2	C %eip
+	addl	`$'$1-L(movl_text_address_`'movl_text_address_counter), $2
+define(`movl_text_address_counter',incr(movl_text_address_counter))',
+	`movl	`$'$1, $2')')
+
+define(movl_text_address_counter,1)
+
+
+dnl Usage: notl_or_xorl_GMP_NUMB_MASK(reg)
+dnl
+dnl Expand to either "notl `reg'" or "xorl $GMP_NUMB_MASK,`reg'" as
+dnl appropriate for nails in use or not.
+
+define(notl_or_xorl_GMP_NUMB_MASK,
+m4_assert_numargs(1)
+`ifelse(GMP_NAIL_BITS,0,
+`notl	`$1'',
+`xorl	$GMP_NUMB_MASK, `$1'')')
+
+
+dnl Usage: LEA(symbol,reg)
+dnl Usage: LEAL(symbol_local_to_file,reg)
+
+define(`LEA',
+m4_assert_numargs(2)
+`ifdef(`PIC',`dnl
+ifelse(index(defn(`load_eip'), `$2'),-1,
+`m4append(`load_eip',
+`	TEXT
+	ALIGN(16)
+L(movl_eip_`'substr($2,1)):
+	movl	(%esp), $2
+	ret_internal
+')')dnl
+	call	L(movl_eip_`'substr($2,1))
+	addl	$_GLOBAL_OFFSET_TABLE_, $2
+	movl	$1@GOT($2), $2
+',`
+	movl	`$'$1, $2
+')')
+
+define(`LEAL',
+m4_assert_numargs(2)
+`ifdef(`PIC',`dnl
+ifelse(index(defn(`load_eip'), `$2'),-1,
+`m4append(`load_eip',
+`	TEXT
+	ALIGN(16)
+L(movl_eip_`'substr($2,1)):
+	movl	(%esp), $2
+	ret_internal
+')')dnl
+	call	L(movl_eip_`'substr($2,1))
+	addl	$_GLOBAL_OFFSET_TABLE_, $2
+	leal	$1@GOTOFF($2), $2
+',`
+	movl	`$'$1, $2
+')')
+
+dnl ASM_END
+
+define(`ASM_END',`load_eip')
+
+define(`load_eip', `')	dnl updated in LEA/LEAL
+
+
+define(`DEF_OBJECT',
+m4_assert_numargs_range(1,2)
+	`RODATA
+	ALIGN(ifelse($#,1,2,$2))
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1)
+`	SIZE(`$1',.-`$1')')
+
+dnl Usage: CALL(funcname)
+dnl
+
+define(`CALL',
+m4_assert_numargs(1)
+`ifdef(`PIC',
+	`call	GSYM_PREFIX`'$1@PLT',
+	`call	GSYM_PREFIX`'$1')')
+
+ifdef(`PIC',
+`define(`PIC_WITH_EBX')',
+`undefine(`PIC_WITH_EBX')')
+
+divert`'dnl
diff --git a/gmp-6.3.0/mpn/x86/zn1/gmp-mparam.h b/gmp-6.3.0/mpn/x86/zn1/gmp-mparam.h
new file mode 100644
index 0000000..8e6c052
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/zn1/gmp-mparam.h
@@ -0,0 +1,220 @@
+/* AMD zn1/32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 3700-4300 MHz Pinnacle Ridge */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-21, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 3 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 14.00% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 4 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 22 + +#define DIV_1_VS_MUL_1_PERCENT 248 + +#define MUL_TOOM22_THRESHOLD 28 +#define MUL_TOOM33_THRESHOLD 91 +#define MUL_TOOM44_THRESHOLD 137 +#define MUL_TOOM6H_THRESHOLD 222 +#define MUL_TOOM8H_THRESHOLD 454 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 85 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 103 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 88 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 105 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 63 +#define SQR_TOOM3_THRESHOLD 98 +#define SQR_TOOM4_THRESHOLD 172 +#define SQR_TOOM6_THRESHOLD 286 +#define SQR_TOOM8_THRESHOLD 478 + +#define MULMID_TOOM42_THRESHOLD 64 + +#define MULMOD_BNM1_THRESHOLD 21 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 606 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 606, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 15, 5}, { 31, 6}, { 27, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 43, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 95,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 127,10}, \ + { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \ + { 159,11}, { 95,10}, { 191,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543, 8}, { 1087,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335, 9}, { 671,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 399,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 543, 9}, { 1087,11}, \ + { 287,10}, { 607, 9}, { 1215,11}, { 319,10}, \ + { 671, 9}, { 1343,11}, { 351,12}, { 191,11}, \ + { 383,10}, { 799,11}, { 415,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \ + { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \ + { 735,10}, { 1471,12}, { 383,11}, { 799,10}, \ + { 1599,11}, { 863,10}, { 1727,12}, { 447,11}, \ + { 959,10}, { 1919,11}, { 991,13}, { 255,12}, \ + { 511,11}, { 1087,12}, { 575,11}, { 1215,10}, \ + { 2431,12}, { 639,11}, { 1343,12}, { 703,11}, \ + { 1471,10}, { 2943,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1727,10}, { 3455,12}, \ + { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \ + { 1087,11}, { 2239,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \ + { 1727,11}, { 3455,13}, { 895,12}, { 1983,14}, \ + { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \ + { 2495,13}, { 1279,12}, { 2623,13}, { 1407,12}, \ + { 2943,14}, { 767,13}, { 1663,12}, { 3455,13}, \ + { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \ + { 2175,12}, { 4479,13}, { 2431,14}, { 1279,13}, \ + { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \ 
+ { 1791,13}, { 3967,12}, { 7935,11}, { 15871,15}, \ + { 1023,14}, { 2047,13}, { 4479,14}, { 2303,13}, \ + { 4991,12}, { 9983,14}, { 2815,13}, { 5887,15}, \ + { 1535,14}, { 3839,13}, { 7935,12}, { 15871,16} } +#define MUL_FFT_TABLE3_SIZE 172 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 464 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 464, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 25, 7}, { 13, 6}, { 28, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \ + { 159,11}, { 95,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,11}, { 159, 9}, { 639,10}, \ + { 335, 9}, { 671,10}, { 351, 9}, { 703,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \ + { 799,10}, { 415,12}, { 127,11}, { 255,10}, \ + { 543,11}, { 287,10}, { 607,11}, { 319,10}, \ + { 671,11}, { 351,10}, { 703,12}, { 191,11}, \ + { 383,10}, { 799,11}, { 415,10}, { 831,13}, \ + { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \ + { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,10}, { 2431,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 831,11}, { 1727,10}, \ + { 3455,12}, { 959,11}, { 1919,14}, { 255,13}, \ + { 511,12}, { 1087,11}, { 2239,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \ + { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2239,13}, \ + { 1151,12}, { 2431,13}, { 1279,12}, { 2623,13}, \ + { 1407,12}, { 2943,14}, { 767,13}, { 1663,12}, \ + { 3455,13}, { 1919,12}, { 3839,15}, { 511,14}, \ + { 1023,13}, { 2175,12}, { 4479,13}, { 2431,14}, \ + { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \ + { 3455,14}, { 1791,13}, { 3839,12}, { 7679,13}, \ + { 3967,12}, { 7935,15}, { 1023,14}, { 2047,13}, \ + { 4479,14}, { 2303,13}, { 4991,12}, { 9983,14}, \ + { 2815,13}, { 5887,15}, { 1535,14}, { 3839,13}, \ + { 7935,16} } +#define SQR_FFT_TABLE3_SIZE 173 +#define SQR_FFT_THRESHOLD 4736 + +#define MULLO_BASECASE_THRESHOLD 3 +#define MULLO_DC_THRESHOLD 60 +#define MULLO_MUL_N_THRESHOLD 11278 +#define SQRLO_BASECASE_THRESHOLD 8 +#define SQRLO_DC_THRESHOLD 161 +#define SQRLO_SQR_THRESHOLD 9335 + +#define DC_DIV_QR_THRESHOLD 71 +#define DC_DIVAPPR_Q_THRESHOLD 206 +#define DC_BDIV_QR_THRESHOLD 63 +#define DC_BDIV_Q_THRESHOLD 126 + +#define INV_MULMOD_BNM1_THRESHOLD 78 +#define INV_NEWTON_THRESHOLD 274 +#define INV_APPR_THRESHOLD 228 + +#define BINV_NEWTON_THRESHOLD 274 +#define REDC_1_TO_REDC_N_THRESHOLD 71 + +#define MU_DIV_QR_THRESHOLD 1652 +#define MU_DIVAPPR_Q_THRESHOLD 1718 +#define MUPI_DIV_QR_THRESHOLD 122 +#define MU_BDIV_QR_THRESHOLD 1470 +#define MU_BDIV_Q_THRESHOLD 1589 + +#define POWM_SEC_TABLE 3,28,54,386,1337 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 19 +#define SET_STR_DC_THRESHOLD 262 +#define SET_STR_PRECOMPUTE_THRESHOLD 558 + +#define FAC_DSC_THRESHOLD 109 +#define FAC_ODD_THRESHOLD 39 + +#define MATRIX22_STRASSEN_THRESHOLD 21 +#define HGCD2_DIV1_METHOD 1 /* 7.49% 
faster than 3 */ +#define HGCD_THRESHOLD 74 +#define HGCD_APPR_THRESHOLD 70 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 440 +#define GCDEXT_DC_THRESHOLD 327 +#define JACOBI_BASE_METHOD 1 /* 11.98% faster than 3 */ + +/* Tuneup completed successfully, took 36916 seconds */ diff --git a/gmp-6.3.0/mpn/x86/zn2/gmp-mparam.h b/gmp-6.3.0/mpn/x86/zn2/gmp-mparam.h new file mode 100644 index 0000000..152e6b7 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/zn2/gmp-mparam.h @@ -0,0 +1,226 @@ +/* AMD zn2/32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 3600-4400 MHz Matisse */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 15 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 4.78% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 3 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 7 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 23 + +#define DIV_1_VS_MUL_1_PERCENT 274 + +#define MUL_TOOM22_THRESHOLD 24 +#define MUL_TOOM33_THRESHOLD 85 +#define MUL_TOOM44_THRESHOLD 166 +#define MUL_TOOM6H_THRESHOLD 290 +#define MUL_TOOM8H_THRESHOLD 430 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 26 +#define SQR_TOOM3_THRESHOLD 153 +#define SQR_TOOM4_THRESHOLD 214 +#define SQR_TOOM6_THRESHOLD 318 +#define SQR_TOOM8_THRESHOLD 478 + +#define MULMID_TOOM42_THRESHOLD 48 + +#define MULMOD_BNM1_THRESHOLD 18 +#define SQRMOD_BNM1_THRESHOLD 24 + +#define MUL_FFT_MODF_THRESHOLD 444 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 444, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 
51,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 127,10}, \ + { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \ + { 287, 8}, { 575,10}, { 159,11}, { 95,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671, 8}, { 1343,10}, { 351, 9}, { 703,10}, \ + { 367, 9}, { 735,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 415,11}, { 223,10}, { 447,12}, \ + { 127,11}, { 255,10}, { 543, 9}, { 1087,11}, \ + { 287,10}, { 607,11}, { 319,10}, { 671, 9}, \ + { 1343,11}, { 351,10}, { 735,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,10}, { 831,11}, \ + { 447,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,11}, { 607,10}, { 1215,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,10}, { 1471, 9}, \ + { 2943,12}, { 383,11}, { 799,10}, { 1599,11}, \ + { 863,12}, { 447,11}, { 959,10}, { 1919,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,10}, { 2943,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 831,11}, { 1727,10}, \ + { 3455,12}, { 959,11}, { 1919,10}, { 3839,14}, \ + { 255,13}, { 511,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1471,11}, { 2943,10}, { 5887,13}, \ + { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \ + { 1919,11}, { 3839,14}, { 511,13}, { 1023,12}, \ + { 2111,13}, { 1151,12}, { 2431,13}, { 1407,12}, \ + { 2943,11}, { 5887,14}, { 767,13}, { 1663,12}, \ + { 3455,13}, { 1919,12}, { 3839,15}, { 511,14}, \ + { 1023,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \ + { 3839,12}, { 7679,13}, { 3967,12}, { 7935,11}, \ + { 15871,15}, { 1023,14}, { 2047,13}, { 4351,14}, \ + { 2303,13}, { 4991,12}, { 9983,14}, { 2815,13}, \ + { 5887,15}, { 1535,14}, { 3839,13}, { 7935,12}, \ + { 15871,16} } +#define MUL_FFT_TABLE3_SIZE 189 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 404 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 404, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \ + { 47, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 47,10}, { 15, 9}, { 31, 8}, { 63, 9}, \ + { 39, 8}, { 79, 9}, { 47,10}, { 31, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \ + { 127,10}, { 95,11}, { 63,10}, { 127, 9}, \ + { 255, 8}, { 511, 9}, { 271,10}, { 143, 9}, \ + { 287, 8}, { 607, 7}, { 1215,11}, { 95,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543, 8}, { 1087, 9}, { 607, 8}, \ + { 1215,11}, { 159, 9}, { 671, 8}, { 1343,10}, \ + { 351, 9}, { 735, 8}, { 1471,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 415,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 543, 9}, { 1087,10}, \ + { 607, 9}, { 1215, 8}, { 2431,10}, { 671, 9}, \ + { 1343,10}, { 735, 9}, { 1471,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,10}, { 831,13}, \ + { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \ + { 607,10}, { 1215, 9}, { 2431,11}, { 671,10}, \ + { 1343,11}, { 735,10}, { 1471, 9}, { 2943,12}, \ + { 383,11}, { 863,12}, { 447,11}, { 959,10}, \ + { 1919,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,10}, { 2431,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,10}, { 2943, 9}, \ + { 5887,12}, { 767,11}, { 1599,12}, { 831,11}, \ + { 1727,12}, { 959,11}, { 
1919,10}, { 3839,14}, \ + { 255,13}, { 511,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1471,11}, { 2943,10}, { 5887,13}, \ + { 767,12}, { 1727,13}, { 895,12}, { 1919,11}, \ + { 3839,14}, { 511,13}, { 1023,12}, { 2111,13}, \ + { 1151,12}, { 2431,13}, { 1279,12}, { 2623,13}, \ + { 1407,12}, { 2943,11}, { 5887,14}, { 767,13}, \ + { 1663,12}, { 3455,13}, { 1919,12}, { 3839,15}, \ + { 511,14}, { 1023,13}, { 2431,14}, { 1279,13}, \ + { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \ + { 1791,13}, { 3839,12}, { 7679,13}, { 3967,12}, \ + { 7935,11}, { 15871,15}, { 1023,14}, { 2047,13}, \ + { 4223,14}, { 2303,13}, { 4991,12}, { 9983,14}, \ + { 2815,13}, { 5887,15}, { 1535,14}, { 3839,13}, \ + { 7935,12}, { 15871,16} } +#define SQR_FFT_TABLE3_SIZE 178 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 4 +#define MULLO_DC_THRESHOLD 62 +#define MULLO_MUL_N_THRESHOLD 8907 +#define SQRLO_BASECASE_THRESHOLD 8 +#define SQRLO_DC_THRESHOLD 107 +#define SQRLO_SQR_THRESHOLD 6633 + +#define DC_DIV_QR_THRESHOLD 54 +#define DC_DIVAPPR_Q_THRESHOLD 206 +#define DC_BDIV_QR_THRESHOLD 55 +#define DC_BDIV_Q_THRESHOLD 136 + +#define INV_MULMOD_BNM1_THRESHOLD 74 +#define INV_NEWTON_THRESHOLD 212 +#define INV_APPR_THRESHOLD 204 + +#define BINV_NEWTON_THRESHOLD 292 +#define REDC_1_TO_REDC_N_THRESHOLD 67 + +#define MU_DIV_QR_THRESHOLD 1442 +#define MU_DIVAPPR_Q_THRESHOLD 1528 +#define MUPI_DIV_QR_THRESHOLD 97 +#define MU_BDIV_QR_THRESHOLD 1142 +#define MU_BDIV_Q_THRESHOLD 1470 + +#define POWM_SEC_TABLE 1,16,96,386,1555 + +#define GET_STR_DC_THRESHOLD 10 +#define GET_STR_PRECOMPUTE_THRESHOLD 16 +#define SET_STR_DC_THRESHOLD 303 +#define SET_STR_PRECOMPUTE_THRESHOLD 748 + +#define FAC_DSC_THRESHOLD 141 +#define FAC_ODD_THRESHOLD 55 + +#define MATRIX22_STRASSEN_THRESHOLD 20 +#define HGCD2_DIV1_METHOD 1 /* 14.03% faster than 3 */ +#define HGCD_THRESHOLD 103 +#define HGCD_APPR_THRESHOLD 127 +#define HGCD_REDUCE_THRESHOLD 3014 +#define GCD_DC_THRESHOLD 396 +#define GCDEXT_DC_THRESHOLD 265 +#define JACOBI_BASE_METHOD 1 /* 47.88% faster than 4 */ + +/* Tuneup completed successfully, took 29014 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/README b/gmp-6.3.0/mpn/x86_64/README new file mode 100644 index 0000000..9c8a586 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/README @@ -0,0 +1,74 @@ +Copyright 2003, 2004, 2006, 2008 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + + AMD64 MPN SUBROUTINES + + +This directory contains mpn functions for AMD64 chips. It is also useful +for 64-bit Pentiums, and "Core 2". 
+
+
+	RELEVANT OPTIMIZATION ISSUES
+
+The Opteron and Athlon64 can sustain up to 3 instructions per cycle, but in
+practice that is only possible for integer instructions.  But almost any
+three integer instructions can issue simultaneously, including any 3 ALU
+operations, even shifts.  Up to two memory operations can issue each
+cycle.
+
+Scheduling typically requires that load-use instructions are split into
+separate load and use instructions.  That requires more decode resources,
+and it is rarely a win.  Opteron/Athlon64 have a deep out-of-order core.
+
+
+Optimizing for 64-bit Pentium4 is probably a waste of time, as the most
+critical instructions are very poorly implemented there.  Perhaps we could
+save a cycle or two, but the most common loops now run at between 10 and 22
+cycles, so a saved cycle isn't too exciting.
+
+
+The new spin of the venerable P6 core, the "Core 2", is much better than
+the Pentium4 for the GMP loops.  Its integer pipeline is somewhat similar
+to the Opteron/Athlon64 pipeline, except that the GMP favourites ADC/SBB
+and MUL are slower.  Furthermore, an INC/DEC followed by ADC/SBB incurs a
+pipeline stall of around 10 cycles.  The default mpn_add_n and mpn_sub_n
+code suffers badly from the stall.  The code in the core2 subdirectory uses
+the almost forgotten instruction JRCXZ for loop control, and updates the
+induction variable using LEA.
+
+
+
+REFERENCES
+
+"System V Application Binary Interface AMD64 Architecture Processor
+Supplement", draft version 0.99, December 2007.
+http://www.x86-64.org/documentation/abi.pdf
diff --git a/gmp-6.3.0/mpn/x86_64/alderlake/addmul_1.asm b/gmp-6.3.0/mpn/x86_64/alderlake/addmul_1.asm
new file mode 100644
index 0000000..d105da6
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/alderlake/addmul_1.asm
@@ -0,0 +1,168 @@
+dnl AMD64 mpn_addmul_1 for CPUs with mulx and adx.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012, 2013, 2022 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl   * the GNU Lesser General Public License as published by the Free
+dnl     Software Foundation; either version 3 of the License, or (at your
+dnl     option) any later version.
+dnl
+dnl or
+dnl
+dnl   * the GNU General Public License as published by the Free Software
+dnl     Foundation; either version 2 of the License, or (at your option) any
+dnl     later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 -
+C AMD K10	 -
+C AMD bd1	 -
+C AMD bd2	 -
+C AMD bd3	 -
+C AMD bd4	 -
+C AMD zn1	 ?
+C AMD zn2	 ?
+C AMD zn3	 ?
+C AMD bt1	 -
+C AMD bt2	 -
+C Intel P4	 -
+C Intel CNR	 -
+C Intel PNR	 -
+C Intel NHM	 -
+C Intel WSM	 -
+C Intel SBR	 -
+C Intel IBR	 -
+C Intel HWL	 -
+C Intel BWL	 ?
+C Intel SKL	 ?
+C Intel RKL	 ?
+C Intel ALD 1.29 +C Intel atom - +C Intel SLM - +C Intel GLM - +C VIA nano - + +define(`rp', `%rdi') dnl rcx +define(`up', `%rsi') dnl rdx +define(`n_param', `%rdx') dnl r8 +define(`v0_param',`%rcx') dnl r9 + +define(`n', `%rcx') dnl +define(`v0', `%rdx') dnl + + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_addmul_1) + mov (up), %r8 + + push %rbx + push %r12 + push %r13 + + mov %rdx, %rax + mov %rcx, v0 + mov %rax, n + + and $3, R8(%rax) + jz L(b0) + cmp $2, R8(%rax) + jl L(b1) + jz L(b2) + +L(b3): mulx( %r8, %r11, %r10) + mulx( 8,(up), %r13, %r12) + mulx( 16,(up), %rbx, %rax) + inc n + lea -8(up), up + lea -24(rp), rp + jmp L(lo3) + +L(b0): mulx( %r8, %r9, %r8) + mulx( 8,(up), %r11, %r10) + mulx( 16,(up), %r13, %r12) + lea -16(rp), rp + jmp L(lo0) + +L(b2): mulx( %r8, %r13, %r12) + mulx( 8,(up), %rbx, %rax) + lea -2(n), n + jrcxz L(n2) + mulx( 16,(up), %r9, %r8) + lea 16(up), up + jmp L(lo2) +L(n2): jmp L(wd2) + +L(b1): mulx( %r8, %rbx, %rax) + sub $1, n + jrcxz L(n1) + mulx( 8,(up), %r9, %r8) + mulx( 16,(up), %r11, %r10) + lea 8(up), up + lea -8(rp), rp + jmp L(lo1) +L(n1): add (rp), %rbx + adc %rcx, %rax + mov %rbx, (rp) + pop %r13 + pop %r12 + pop %rbx + ret + +L(top): mulx( (up), %r9, %r8) + adcx( %r10, %r13) + mov %r11, -8(rp) +L(lo2): adox( (rp), %r13) + mulx( 8,(up), %r11, %r10) + adcx( %r12, %rbx) + mov %r13, (rp) +L(lo1): adox( 8,(rp), %rbx) + mulx( 16,(up), %r13, %r12) + adcx( %rax, %r9) + mov %rbx, 8(rp) +L(lo0): adox( 16,(rp), %r9) + mulx( 24,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, 16(rp) +L(lo3): adox( 24,(rp), %r11) + lea 32(up), up + lea 32(rp), rp + lea -4(n), n + jrcxz L(end) + jmp L(top) + +L(end): adcx( %r10, %r13) + mov %r11, -8(rp) +L(wd2): adox( (rp), %r13) + adcx( %r12, %rbx) + mov %r13, (rp) + adox( 8,(rp), %rbx) + adcx( %rcx, %rax) + adox( %rcx, %rax) + mov %rbx, 8(rp) + pop %r13 + pop %r12 + pop %rbx + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/alderlake/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/alderlake/gmp-mparam.h new file mode 100644 index 0000000..0bffc3d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/alderlake/gmp-mparam.h @@ -0,0 +1,225 @@ +/* Intel Alder Lake gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2022 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. 
*/ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 3700-4900 MHz Alder Lake */ +/* FFT tuning limit = 10,000,000 */ +/* Generated by tuneup.c, 2022-03-15, gcc 11.2 */ + +#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 23 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 34 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 30 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 23 + +#define DIV_1_VS_MUL_1_PERCENT 559 + +#define MUL_TOOM22_THRESHOLD 13 +#define MUL_TOOM33_THRESHOLD 97 +#define MUL_TOOM44_THRESHOLD 148 +#define MUL_TOOM6H_THRESHOLD 562 +#define MUL_TOOM8H_THRESHOLD 608 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 259 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 98 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 98 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 144 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 24 +#define SQR_TOOM3_THRESHOLD 86 +#define SQR_TOOM4_THRESHOLD 582 +#define SQR_TOOM6_THRESHOLD 0 /* always */ +#define SQR_TOOM8_THRESHOLD 753 + +#define MULMID_TOOM42_THRESHOLD 40 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 16 + +#define MUL_FFT_MODF_THRESHOLD 384 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 384, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 24, 7}, { 24, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 49, 9}, { 27,10}, { 15, 9}, { 31, 8}, \ + { 63, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 83,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511,10}, { 135,11}, { 79, 9}, { 319, 8}, \ + { 639, 9}, { 335, 8}, { 671,11}, { 95,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543, 8}, { 1087, 9}, { 575,10}, \ + { 303, 9}, { 607,10}, { 319, 9}, { 639,10}, \ + { 335, 9}, { 671,10}, { 351,12}, { 95,11}, \ + { 191,10}, { 383,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543, 9}, \ + { 1087,11}, { 287,10}, { 575,11}, { 303,10}, \ + { 607, 9}, { 1215,11}, { 319,10}, { 671,11}, \ + { 351,10}, { 703,11}, { 367,10}, { 735, 9}, \ + { 1471, 8}, { 2943,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,12}, { 223,11}, \ + { 447,10}, { 895,11}, { 479,10}, { 959,13}, \ + { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \ + { 543,10}, { 1087, 9}, { 2175,12}, { 287,11}, \ + { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \ + { 639,10}, { 1279,11}, { 671,12}, { 351,11}, \ + { 703,10}, { 1407,11}, { 735,10}, { 1471, 9}, \ + { 2943, 8}, { 5887,12}, { 383,11}, { 767,10}, \ + { 1535,12}, { 415,11}, { 831,10}, { 1663,12}, \ + { 447,11}, { 895,10}, { 1791,12}, { 479,11}, \ + { 959,14}, { 127,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \ + { 1151,12}, { 607,11}, { 1215,13}, { 319,12}, \ + { 639,11}, { 1279,12}, { 671,11}, { 1343,12}, \ + { 703,11}, { 1407,12}, { 735,11}, { 1471,10}, \ + { 2943,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 831,13}, { 447,12}, { 959,11}, { 1919,13}, \ + { 511,12}, { 1087,13}, { 
575,12}, { 1215,13}, \ + { 639,12}, { 1343,13}, { 703,12}, { 1471,11}, \ + { 2943,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 959,12}, { 1919,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1215,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 190 +#define MUL_FFT_THRESHOLD 2496 + +#define SQR_FFT_MODF_THRESHOLD 344 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 344, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 63,10}, { 39, 9}, { 79,10}, { 47,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511,11}, { 79, 9}, { 319,11}, { 95,10}, \ + { 191, 9}, { 383,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303, 9}, \ + { 607,10}, { 319, 9}, { 639,12}, { 95,11}, \ + { 191,10}, { 383,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543,11}, \ + { 287,10}, { 575,11}, { 303,10}, { 607,11}, \ + { 319,10}, { 639,11}, { 335,10}, { 671,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,12}, { 223,11}, \ + { 447,10}, { 895,11}, { 479,10}, { 959,13}, \ + { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \ + { 543,10}, { 1087,12}, { 287,11}, { 575,10}, \ + { 1151,11}, { 607,10}, { 1215,12}, { 319,11}, \ + { 639,10}, { 1279,11}, { 671,12}, { 351,11}, \ + { 703,10}, { 1407,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 415,11}, { 831,12}, { 447,11}, \ + { 895,12}, { 479,11}, { 959,10}, { 1919,14}, \ + { 127,13}, { 255,12}, { 511,11}, { 1023,12}, \ + { 543,11}, { 1087,12}, { 575,11}, { 1151,12}, \ + { 607,11}, { 1215,13}, { 319,12}, { 639,11}, \ + { 1279,12}, { 671,11}, { 1343,12}, { 703,11}, \ + { 1407,13}, { 383,12}, { 831,13}, { 447,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1087,13}, \ + { 575,12}, { 1215,13}, { 639,12}, { 1343,13}, \ + { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1663,13}, { 959,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1215,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 166 +#define SQR_FFT_THRESHOLD 2240 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 56 +#define MULLO_MUL_N_THRESHOLD 4940 +#define SQRLO_BASECASE_THRESHOLD 10 +#define SQRLO_DC_THRESHOLD 73 +#define SQRLO_SQR_THRESHOLD 4392 + +#define DC_DIV_QR_THRESHOLD 19 +#define DC_DIVAPPR_Q_THRESHOLD 139 +#define DC_BDIV_QR_THRESHOLD 62 +#define DC_BDIV_Q_THRESHOLD 126 + +#define INV_MULMOD_BNM1_THRESHOLD 24 +#define INV_NEWTON_THRESHOLD 108 +#define INV_APPR_THRESHOLD 108 + +#define BINV_NEWTON_THRESHOLD 208 +#define REDC_1_TO_REDC_2_THRESHOLD 36 +#define REDC_2_TO_REDC_N_THRESHOLD 53 + +#define MU_DIV_QR_THRESHOLD 855 +#define MU_DIVAPPR_Q_THRESHOLD 1120 +#define MUPI_DIV_QR_THRESHOLD 0 /* always */ +#define MU_BDIV_QR_THRESHOLD 807 +#define MU_BDIV_Q_THRESHOLD 1470 + +#define POWM_SEC_TABLE 1,11,70,702,2499 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 17 +#define SET_STR_DC_THRESHOLD 2150 +#define SET_STR_PRECOMPUTE_THRESHOLD 2943 + +#define 
FAC_DSC_THRESHOLD 298 +#define FAC_ODD_THRESHOLD 51 + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD2_DIV1_METHOD 1 /* 2.38% faster than 3 */ +#define HGCD_THRESHOLD 142 +#define HGCD_APPR_THRESHOLD 159 +#define HGCD_REDUCE_THRESHOLD 2384 +#define GCD_DC_THRESHOLD 483 +#define GCDEXT_DC_THRESHOLD 492 +#define JACOBI_BASE_METHOD 1 /* 0.94% faster than 3 */ diff --git a/gmp-6.3.0/mpn/x86_64/alderlake/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/alderlake/mul_basecase.asm new file mode 100644 index 0000000..9400fe5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/alderlake/mul_basecase.asm @@ -0,0 +1,474 @@ +dnl AMD64 mpn_mul_basecase. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013, 2022 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bd1 - +C AMD bd2 - +C AMD bd3 - +C AMD bd4 - +C AMD zn1 ? +C AMD zn2 ? +C AMD zn3 ? +C AMD bt1 - +C AMD bt2 - +C Intel P4 - +C Intel CNR - +C Intel PNR - +C Intel NHM - +C Intel WSM - +C Intel SBR - +C Intel IBR - +C Intel HWL - +C Intel BWL ? +C Intel SKL ? +C Intel RKL ? +C Intel ALD 1.29 +C Intel atom - +C Intel SLM - +C Intel GLM - +C VIA nano - + +C TODO +C * Do overlapped software pipelining. +C * Try shallower pipeline, which would result in using fewer registers. +C * There are false dependencies on CF/OF between iterations. Try breaking +C them to see if it helps. 
+ +define(`rp', `%rdi') dnl rcx +define(`up', `%rsi') dnl rdx +define(`un_arg',`%rdx') dnl r8 +define(`vp_arg',`%rcx') dnl r9 +define(`vn_arg',`%r8') dnl stack + +define(`un', `%r14') +define(`vp', `%r15') +define(`vn', `%rbp') + +define(`n', `%rcx') +define(`v0', `%rdx') + + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + cmp $2, un_arg + ja L(gen) + mov (vp_arg), %rdx + mulx( (up), %rax, %r9) + mov %rax, (rp) + je L(s2x) + + mov %r9, 8(rp) + ret + +L(s2x): mulx( 8,(up), %rax, %r10) + add %r9, %rax + adc $0, %r10 + cmp $2, R32(vn_arg) + je L(s22) + +L(s21): mov %rax, 8(rp) + mov %r10, 16(rp) + ret + +L(s22): mov 8(vp_arg), %rdx + mulx( (up), %r8, %r9) + add %r8, %rax + adc %r10, %r9 + mov %rax, 8(rp) + mulx( 8,(up), %rax, %r10) + adc $0, %r10 + adc %r9, %rax + mov %rax, 16(rp) + adc $0, %r10 + mov %r10, 24(rp) + ret + +L(gen): push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov un_arg, un + neg un + shl $3, un + mov vp_arg, vp + mov vn_arg, vn + + test $1, R8(un_arg) + mov (vp), %rdx + jz L(bx0) + +L(bx1): test $16, R8(un) + jnz L(b01) + +L(b11): lea 24(un), n + mulx( (up), %r11, %r10) + mulx( 8,(up), %r13, %r12) + mulx( 16,(up), %rbx, %rax) + lea 8(rp), rp + lea 24(up), up + jrcxz L(med3) +L(mtp3):mulx( (up), %r9, %r8) + adcx( %r10, %r13) + mov %r11, -8(rp) + mulx( 8,(up), %r11, %r10) + adcx( %r12, %rbx) + mov %r13, (rp) + mulx( 16,(up), %r13, %r12) + adcx( %rax, %r9) + mov %rbx, 8(rp) + mulx( 24,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, 16(rp) + lea 32(up), up + lea 32(rp), rp + lea 32(n), n + jrcxz L(med3) + jmp L(mtp3) +L(med3):adcx( %r10, %r13) + mov %r11, -8(rp) + adcx( %r12, %rbx) + mov %r13, (rp) + adcx( %rcx, %rax) + mov %rbx, 8(rp) + mov %rax, 16(rp) + dec vn + jz L(ret) +L(out3):lea 32(rp,un), rp + lea 24(up,un), up + lea 8(vp), vp + xor R32(%rdx), R32(%rdx) + mov (vp), %rdx + mulx( -24,(up), %r11, %r10) + mulx( -16,(up), %r13, %r12) + mulx( -8,(up), %rbx, %rax) + lea 24(un), n + adox( -8,(rp), %r11) + jrcxz L(ed3) +L(tp3): mulx( (up), %r9, %r8) + adcx( %r10, %r13) + mov %r11, -8(rp) + adox( (rp), %r13) + mulx( 8,(up), %r11, %r10) + adcx( %r12, %rbx) + mov %r13, (rp) + adox( 8,(rp), %rbx) + mulx( 16,(up), %r13, %r12) + adcx( %rax, %r9) + mov %rbx, 8(rp) + adox( 16,(rp), %r9) + mulx( 24,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, 16(rp) + adox( 24,(rp), %r11) + lea 32(up), up + lea 32(rp), rp + lea 32(n), n + jrcxz L(ed3) + jmp L(tp3) +L(ed3): adcx( %r10, %r13) + mov %r11, -8(rp) + adox( (rp), %r13) + adcx( %r12, %rbx) + mov %r13, (rp) + adox( 8,(rp), %rbx) + adcx( %rcx, %rax) + adox( %rcx, %rax) + mov %rbx, 8(rp) + mov %rax, 16(rp) + dec vn + jnz L(out3) + jmp L(ret) + + +L(b01): mulx( (up), %rbx, %rax) + lea 8(un), n + mulx( 8,(up), %r9, %r8) + mulx( 16,(up), %r11, %r10) + lea 8(up), up + lea -8(rp), rp + jmp L(ml1) +L(mtp1):mulx( (up), %r9, %r8) + adcx( %r10, %r13) + mov %r11, -8(rp) + mulx( 8,(up), %r11, %r10) + adcx( %r12, %rbx) + mov %r13, (rp) +L(ml1): mulx( 16,(up), %r13, %r12) + adcx( %rax, %r9) + mov %rbx, 8(rp) + mulx( 24,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, 16(rp) + lea 32(up), up + lea 32(rp), rp + lea 32(n), n + jrcxz L(med1) + jmp L(mtp1) +L(med1):adcx( %r10, %r13) + mov %r11, -8(rp) + adcx( %r12, %rbx) + mov %r13, (rp) + adcx( %rcx, %rax) + mov %rbx, 8(rp) + mov %rax, 16(rp) + dec vn + jz L(ret) +L(out1):lea 16(rp,un), rp + lea 8(up,un), up + lea 8(vp), vp + xor R32(%rdx), R32(%rdx) + mov (vp), %rdx + lea 8(un), n + mulx( -8,(up), %rbx, %rax) + mulx( (up), %r9, %r8) + mulx( 8,(up), %r11, %r10) + jmp 
L(lo1) +L(tp1): mulx( (up), %r9, %r8) + adcx( %r10, %r13) + mov %r11, -8(rp) + adox( (rp), %r13) + mulx( 8,(up), %r11, %r10) + adcx( %r12, %rbx) + mov %r13, (rp) +L(lo1): adox( 8,(rp), %rbx) + mulx( 16,(up), %r13, %r12) + adcx( %rax, %r9) + mov %rbx, 8(rp) + adox( 16,(rp), %r9) + mulx( 24,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, 16(rp) + adox( 24,(rp), %r11) + lea 32(up), up + lea 32(rp), rp + lea 32(n), n + jrcxz L(ed1) + jmp L(tp1) +L(ed1): adcx( %r10, %r13) + mov %r11, -8(rp) + adox( (rp), %r13) + adcx( %r12, %rbx) + mov %r13, (rp) + adox( 8,(rp), %rbx) + adcx( %rcx, %rax) + adox( %rcx, %rax) + mov %rbx, 8(rp) + mov %rax, 16(rp) + dec vn + jnz L(out1) + jmp L(ret) + + +L(bx0): test $16, R8(un) + jz L(b00) + +L(b10): mulx( (up), %r13, %r12) + mulx( 8,(up), %rbx, %rax) + lea 16(un), n + mulx( 16,(up), %r9, %r8) + lea 16(up), up + jmp L(ml2) +L(mtp2):mulx( (up), %r9, %r8) + adcx( %r10, %r13) + mov %r11, -8(rp) +L(ml2): mulx( 8,(up), %r11, %r10) + adcx( %r12, %rbx) + mov %r13, (rp) + mulx( 16,(up), %r13, %r12) + adcx( %rax, %r9) + mov %rbx, 8(rp) + mulx( 24,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, 16(rp) + lea 32(up), up + lea 32(rp), rp + lea 32(n), n + jrcxz L(med2) + jmp L(mtp2) +L(med2):adcx( %r10, %r13) + mov %r11, -8(rp) + adcx( %r12, %rbx) + mov %r13, (rp) + adcx( %rcx, %rax) + mov %rbx, 8(rp) + mov %rax, 16(rp) + dec vn + jz L(ret) +L(out2):lea 24(rp,un), rp + lea 16(up,un), up + lea 8(vp), vp + xor R32(%rdx), R32(%rdx) + mov (vp), %rdx + mulx( -16,(up), %r13, %r12) + mulx( -8,(up), %rbx, %rax) + lea 16(un), n + mulx( (up), %r9, %r8) + jmp L(lo2) +L(tp2): mulx( (up), %r9, %r8) + adcx( %r10, %r13) + mov %r11, -8(rp) +L(lo2): adox( (rp), %r13) + mulx( 8,(up), %r11, %r10) + adcx( %r12, %rbx) + mov %r13, (rp) + adox( 8,(rp), %rbx) + mulx( 16,(up), %r13, %r12) + adcx( %rax, %r9) + mov %rbx, 8(rp) + adox( 16,(rp), %r9) + mulx( 24,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, 16(rp) + adox( 24,(rp), %r11) + lea 32(up), up + lea 32(rp), rp + lea 32(n), n + jrcxz L(ed2) + jmp L(tp2) +L(ed2): adcx( %r10, %r13) + mov %r11, -8(rp) + adox( (rp), %r13) + adcx( %r12, %rbx) + mov %r13, (rp) + adox( 8,(rp), %rbx) + adcx( %rcx, %rax) + adox( %rcx, %rax) + mov %rbx, 8(rp) + mov %rax, 16(rp) + dec vn + jnz L(out2) + jmp L(ret) + + +L(b00): lea 32(un), n + mulx( (up), %r9, %r8) + mulx( 8,(up), %r11, %r10) + mulx( 16,(up), %r13, %r12) + mulx( 24,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, (rp) + lea 32(up), up + lea 16(rp), rp + jrcxz L(med0) +L(mtp0):mulx( (up), %r9, %r8) + adcx( %r10, %r13) + mov %r11, -8(rp) + mulx( 8,(up), %r11, %r10) + adcx( %r12, %rbx) + mov %r13, (rp) + mulx( 16,(up), %r13, %r12) + adcx( %rax, %r9) + mov %rbx, 8(rp) + mulx( 24,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, 16(rp) + lea 32(up), up + lea 32(rp), rp + lea 32(n), n + jrcxz L(med0) + jmp L(mtp0) +L(med0):adcx( %r10, %r13) + mov %r11, -8(rp) + adcx( %r12, %rbx) + mov %r13, (rp) + adcx( %rcx, %rax) + mov %rbx, 8(rp) + mov %rax, 16(rp) + dec vn + jz L(ret) +L(out0):lea 40(rp,un), rp + lea 32(up,un), up + lea 8(vp), vp + xor R32(%rdx), R32(%rdx) + mov (vp), %rdx + lea 32(un), n + mulx( -32,(up), %r9, %r8) + mulx( -24,(up), %r11, %r10) + mulx( -16,(up), %r13, %r12) + adox( -16,(rp), %r9) + mulx( -8,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, -16(rp) + adox( -8,(rp), %r11) + jrcxz L(ed0) +L(tp0): mulx( (up), %r9, %r8) + adcx( %r10, %r13) + mov %r11, -8(rp) + adox( (rp), %r13) + mulx( 8,(up), %r11, %r10) + adcx( %r12, %rbx) + mov %r13, (rp) + adox( 8,(rp), %rbx) + mulx( 16,(up), %r13, %r12) + adcx( 
%rax, %r9) + mov %rbx, 8(rp) + adox( 16,(rp), %r9) + mulx( 24,(up), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, 16(rp) + adox( 24,(rp), %r11) + lea 32(up), up + lea 32(rp), rp + lea 32(n), n + jrcxz L(ed0) + jmp L(tp0) +L(ed0): adcx( %r10, %r13) + mov %r11, -8(rp) + adox( (rp), %r13) + adcx( %r12, %rbx) + mov %r13, (rp) + adox( 8,(rp), %rbx) + adcx( %rcx, %rax) + adox( %rcx, %rax) + mov %rbx, 8(rp) + mov %rax, 16(rp) + dec vn + jnz L(out0) + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/alderlake/submul_1.asm b/gmp-6.3.0/mpn/x86_64/alderlake/submul_1.asm new file mode 100644 index 0000000..d7d6b0d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/alderlake/submul_1.asm @@ -0,0 +1,140 @@ +dnl AMD64 mpn_submul_1 for CPUs with mulx and adx. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2022 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bd1 - +C AMD bd2 - +C AMD bd3 - +C AMD bd4 - +C AMD zn1 ? +C AMD zn2 ? +C AMD zn3 2.0 +C AMD bt1 - +C AMD bt2 - +C Intel P4 - +C Intel CNR - +C Intel PNR - +C Intel NHM - +C Intel WSM - +C Intel SBR - +C Intel IBR - +C Intel HWL - +C Intel BWL ? +C Intel SKL ? 
+C Intel RKL 2.0 +C Intel ALD 1.53 +C Intel atom - +C Intel SLM - +C Intel GLM - +C VIA nano - + +define(`rp', `%rdi') dnl rcx +define(`up', `%rsi') dnl rdx +define(`n_param', `%rdx') dnl r8 +define(`v0_param',`%rcx') dnl r9 + +define(`n', `%rcx') dnl +define(`v0', `%rdx') dnl + + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_submul_1) + mov n_param, %rax + mov v0_param, v0 + mov %rax, n + test $1, R8(n) + jz L(bx0) + +L(bx1): mulx( (up), %r9, %rax) + test $2, R8(n) + stc + jz L(b01) + +L(b11): lea 1(n), n + lea 16(up), up + lea 16(rp), rp + jmp L(lo3) + +L(b01): lea 3(n), n + jmp L(lo1) + +L(bx0): mulx( (up), %r9, %r8) + test $2, R8(n) + stc + jz L(b00) + +L(b10): lea 8(up), up + lea 8(rp), rp + lea 2(n), n + jmp L(lo2) + +L(b00): lea 24(up), up + lea 24(rp), rp + jmp L(lo0) + +L(top): lea 32(up), up + lea 32(rp), rp + mulx( -24,(up), %r9, %r8) + adox( %rax, %r9) +L(lo0): not %r9 + adcx( -24,(rp), %r9) + mov %r9, -24(rp) + mulx( -16,(up), %r9, %rax) + adox( %r8, %r9) +L(lo3): not %r9 + adcx( -16,(rp), %r9) + mov %r9, -16(rp) + mulx( -8,(up), %r9, %r8) + adox( %rax, %r9) +L(lo2): not %r9 + adcx( -8,(rp), %r9) + mov %r9, -8(rp) + mulx( (up), %r9, %rax) + adox( %r8, %r9) +L(lo1): not %r9 + adcx( (rp), %r9) + mov %r9, (rp) + lea -4(n), n + jrcxz L(end) + jmp L(top) + +L(end): adox( %rcx, %rax) + sbb $-1, %rax + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/aorrlsh1_n.asm new file mode 100644 index 0000000..6ee0872 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/aorrlsh1_n.asm @@ -0,0 +1,170 @@ +dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1) +dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[] + +dnl Copyright 2003, 2005-2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 2 +C AMD K10 2 +C AMD bd1 ? +C AMD bobcat ? +C Intel P4 13 +C Intel core2 3.45 +C Intel NHM ? +C Intel SBR ? +C Intel atom ? +C VIA nano ? + + +C Sometimes speed degenerates, supposedly related to that some operand +C alignments cause cache conflicts. + +C The speed is limited by decoding/issue bandwidth. There are 22 instructions +C in the loop, which corresponds to ceil(22/3)/4 = 1.83 c/l. 
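For orientation before the code: what this file computes, as a hedged C model assuming 64-bit limbs and no nails (ref_addlsh1_n is an illustrative name, not a GMP entry point). The scy/acy names match the carry-save comments in the loop below; mpn_rsblsh1_n instead computes (vp[] << 1) - up[] and returns the borrow as a signed limb.

#include <gmp.h>

mp_limb_t
ref_addlsh1_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
               mp_size_t n)
{
  mp_limb_t scy = 0, acy = 0;             /* shift carry, add carry */
  for (mp_size_t i = 0; i < n; i++)
    {
      mp_limb_t s = (vp[i] << 1) | scy;   /* one limb of vp[] << 1 */
      scy = vp[i] >> (GMP_NUMB_BITS - 1); /* bit shifted out the top */
      mp_limb_t r = up[i] + s;
      mp_limb_t c = r < s;                /* carry from first add */
      r += acy;
      c += r < acy;                       /* carry from second add */
      rp[i] = r;
      acy = c;
    }
  return scy + acy;                       /* residual carry, 0..2 */
}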
+ +C INPUT PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`vp',`%rdx') +define(`n', `%rcx') + +ifdef(`OPERATION_addlsh1_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_addlsh1_n)') +ifdef(`OPERATION_rsblsh1_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_rsblsh1_n)') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + push %rbp + + mov (vp), %r8 + mov R32(n), R32(%rax) + lea (rp,n,8), rp + lea (up,n,8), up + lea (vp,n,8), vp + neg n + xor R32(%rbp), R32(%rbp) + and $3, R32(%rax) + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): add %r8, %r8 + mov 8(vp,n,8), %r9 + adc %r9, %r9 + mov 16(vp,n,8), %r10 + adc %r10, %r10 + sbb R32(%rax), R32(%rax) C save scy + ADDSUB (up,n,8), %r8 + ADCSBB 8(up,n,8), %r9 + mov %r8, (rp,n,8) + mov %r9, 8(rp,n,8) + ADCSBB 16(up,n,8), %r10 + mov %r10, 16(rp,n,8) + sbb R32(%rbp), R32(%rbp) C save acy + add $3, n + jmp L(ent) + +L(b10): add %r8, %r8 + mov 8(vp,n,8), %r9 + adc %r9, %r9 + sbb R32(%rax), R32(%rax) C save scy + ADDSUB (up,n,8), %r8 + ADCSBB 8(up,n,8), %r9 + mov %r8, (rp,n,8) + mov %r9, 8(rp,n,8) + sbb R32(%rbp), R32(%rbp) C save acy + add $2, n + jmp L(ent) + +L(b01): add %r8, %r8 + sbb R32(%rax), R32(%rax) C save scy + ADDSUB (up,n,8), %r8 + mov %r8, (rp,n,8) + sbb R32(%rbp), R32(%rbp) C save acy + inc n +L(ent): jns L(end) + + ALIGN(16) +L(top): add R32(%rax), R32(%rax) C restore scy + + mov (vp,n,8), %r8 +L(b00): adc %r8, %r8 + mov 8(vp,n,8), %r9 + adc %r9, %r9 + mov 16(vp,n,8), %r10 + adc %r10, %r10 + mov 24(vp,n,8), %r11 + adc %r11, %r11 + + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + + ADCSBB (up,n,8), %r8 + nop C Hammer speedup! + ADCSBB 8(up,n,8), %r9 + mov %r8, (rp,n,8) + mov %r9, 8(rp,n,8) + ADCSBB 16(up,n,8), %r10 + ADCSBB 24(up,n,8), %r11 + mov %r10, 16(rp,n,8) + mov %r11, 24(rp,n,8) + + sbb R32(%rbp), R32(%rbp) C save acy + add $4, n + js L(top) + +L(end): +ifdef(`OPERATION_addlsh1_n',` + add R32(%rbp), R32(%rax) + neg R32(%rax)') +ifdef(`OPERATION_rsblsh1_n',` + sub R32(%rax), R32(%rbp) + movslq R32(%rbp), %rax') + + pop %rbp + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86_64/aorrlsh2_n.asm new file mode 100644 index 0000000..999e972 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/aorrlsh2_n.asm @@ -0,0 +1,53 @@ +dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2) +dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[] + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2009-2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 2) +define(RSH, 62) + +ifdef(`OPERATION_addlsh2_n',` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_addlsh2_n)') +ifdef(`OPERATION_rsblsh2_n',` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_rsblsh2_n)') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/aorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/aorrlshC_n.asm b/gmp-6.3.0/mpn/x86_64/aorrlshC_n.asm new file mode 100644 index 0000000..de00154 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/aorrlshC_n.asm @@ -0,0 +1,172 @@ +dnl AMD64 mpn_addlshC_n -- rp[] = up[] + (vp[] << C) +dnl AMD64 mpn_rsblshC_n -- rp[] = (vp[] << C) - up[] + +dnl Copyright 2009-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +C cycles/limb +C AMD K8,K9 2.1 +C AMD K10 2.0 +C AMD bd1 ~2.7 +C AMD bd2 ~2.7 +C AMD bd3 ? +C AMD bd4 ? +C AMD zen 2.0 +C AMD bt1 3.3 +C AMD bt2 3.0 +C Intel P4 ? +C Intel PNR 3.0 +C Intel NHM 2.75 +C Intel SBR 2.55 +C Intel IBR 2.49 +C Intel HWL 2.25 +C Intel BWL 1.89 +C Intel SKL 1.90 +C Intel atom 8.4 +C Intel SLM 4.0 +C VIA nano ? 
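A note on the constant-shift idiom used throughout this file: with M = 1 << LSH and RSH = 64 - LSH, each limb of vp[] << LSH is built by an lea that scales the fresh limb by M and adds in the bits carried out of the previous limb, followed by a shr that extracts the next carry-out. Roughly, in C (a sketch assuming 64-bit limbs and LSH in 1..3, since lea scales only by 1, 2, 4 or 8):

#include <stdint.h>

static inline uint64_t
lshC_limb (uint64_t v, uint64_t *hi, unsigned lsh)
{
  uint64_t limb = *hi + (v << lsh); /* lea (hi,v,M): bits disjoint, add == or */
  *hi = v >> (64 - lsh);            /* shr $RSH: carry-out for the next limb */
  return limb;
}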
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') + +define(M, eval(m4_lshift(1,LSH))) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + push %r12 + push %r13 + push %r14 + push %r15 + + mov (vp), %r8 + lea (,%r8,M), %r12 + shr $RSH, %r8 + + mov R32(n), R32(%rax) + lea (rp,n,8), rp + lea (up,n,8), up + lea (vp,n,8), vp + neg n + and $3, R8(%rax) + je L(b00) + cmp $2, R8(%rax) + jc L(b01) + je L(b10) + +L(b11): mov 8(vp,n,8), %r10 + lea (%r8,%r10,M), %r14 + shr $RSH, %r10 + mov 16(vp,n,8), %r11 + lea (%r10,%r11,M), %r15 + shr $RSH, %r11 + ADDSUB (up,n,8), %r12 + ADCSBB 8(up,n,8), %r14 + ADCSBB 16(up,n,8), %r15 + sbb R32(%rax), R32(%rax) C save carry for next + mov %r12, (rp,n,8) + mov %r14, 8(rp,n,8) + mov %r15, 16(rp,n,8) + add $3, n + js L(top) + jmp L(end) + +L(b01): mov %r8, %r11 + ADDSUB (up,n,8), %r12 + sbb R32(%rax), R32(%rax) C save carry for next + mov %r12, (rp,n,8) + add $1, n + js L(top) + jmp L(end) + +L(b10): mov 8(vp,n,8), %r11 + lea (%r8,%r11,M), %r15 + shr $RSH, %r11 + ADDSUB (up,n,8), %r12 + ADCSBB 8(up,n,8), %r15 + sbb R32(%rax), R32(%rax) C save carry for next + mov %r12, (rp,n,8) + mov %r15, 8(rp,n,8) + add $2, n + js L(top) + jmp L(end) + +L(b00): mov 8(vp,n,8), %r9 + mov 16(vp,n,8), %r10 + jmp L(e00) + + ALIGN(16) +L(top): mov 16(vp,n,8), %r10 + mov (vp,n,8), %r8 + mov 8(vp,n,8), %r9 + lea (%r11,%r8,M), %r12 + shr $RSH, %r8 +L(e00): lea (%r8,%r9,M), %r13 + shr $RSH, %r9 + mov 24(vp,n,8), %r11 + lea (%r9,%r10,M), %r14 + shr $RSH, %r10 + lea (%r10,%r11,M), %r15 + shr $RSH, %r11 + add R32(%rax), R32(%rax) C restore carry + ADCSBB (up,n,8), %r12 + ADCSBB 8(up,n,8), %r13 + ADCSBB 16(up,n,8), %r14 + ADCSBB 24(up,n,8), %r15 + mov %r12, (rp,n,8) + mov %r13, 8(rp,n,8) + mov %r14, 16(rp,n,8) + sbb R32(%rax), R32(%rax) C save carry for next + mov %r15, 24(rp,n,8) + add $4, n + js L(top) +L(end): + +ifelse(ADDSUB,add,` + sub R32(%r11), R32(%rax) + neg R32(%rax) +',` + add R32(%r11), R32(%rax) + movslq R32(%rax), %rax +') + pop %r15 + pop %r14 + pop %r13 + pop %r12 + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/aorrlsh_n.asm new file mode 100644 index 0000000..5ca128f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/aorrlsh_n.asm @@ -0,0 +1,176 @@ +dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V2^k +- U. + +dnl Copyright 2006, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
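Unlike the fixed-shift variants above, this file takes the shift count as a run-time argument: rp[] = up[] + (vp[] << cnt) for mpn_addlsh_n, rp[] = (vp[] << cnt) - up[] for mpn_rsblsh_n. The complementary count 64-cnt is never stored anywhere; x86-64 shifts mask their count to 6 bits, so the loop simply negates %cl in place to flip between the two shifts (these repeated negations are what the TODO list below wants to unroll away). A hedged C model of one limb, assuming 1 <= cnt <= 63:

#include <stdint.h>

static inline uint64_t
shift_limb (uint64_t v, unsigned cnt, uint64_t *hi)
{
  uint64_t limb = (v << cnt) | *hi; /* shl %cl, then or with saved bits */
  *hi = v >> (64 - cnt);            /* shr %cl after neg %cl:
                                       (-cnt) mod 64 == 64 - cnt */
  return limb;
}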
+ +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 3.1 < 3.85 for lshift + add_n +C AMD K10 3.1 < 3.85 for lshift + add_n +C Intel P4 14.6 > 7.33 for lshift + add_n +C Intel core2 3.87 > 3.27 for lshift + add_n +C Intel NHM 4 > 3.75 for lshift + add_n +C Intel SBR (5.8) > 3.46 for lshift + add_n +C Intel atom (7.75) < 8.75 for lshift + add_n +C VIA nano 4.7 < 6.25 for lshift + add_n + +C This was written quickly and not optimized at all. Surely one could get +C closer to 3 c/l or perhaps even under 3 c/l. Ideas: +C 1) Use indexing to save the 3 LEA +C 2) Write reasonable feed-in code +C 3) Be more clever about register usage +C 4) Unroll more, handling CL negation, carry save/restore cost much now +C 5) Reschedule + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cnt', `%r8') + +ifdef(`OPERATION_addlsh_n',` + define(ADCSBB, `adc') + define(func, mpn_addlsh_n) +') +ifdef(`OPERATION_rsblsh_n',` + define(ADCSBB, `sbb') + define(func, mpn_rsblsh_n) +') + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + push %r12 + push %r13 + push %r14 + push %rbp + push %rbx + + mov n, %rax + xor R32(%rbx), R32(%rbx) C clear carry save register + mov R32(%r8), R32(%rcx) C shift count + xor R32(%rbp), R32(%rbp) C limb carry + + mov R32(%rax), R32(%r11) + and $3, R32(%r11) + je L(4) + sub $1, R32(%r11) + +L(012): mov (vp), %r8 + mov %r8, %r12 + shl R8(%rcx), %r8 + or %rbp, %r8 + neg R8(%rcx) + mov %r12, %rbp + shr R8(%rcx), %rbp + neg R8(%rcx) + add R32(%rbx), R32(%rbx) + ADCSBB (up), %r8 + mov %r8, (rp) + sbb R32(%rbx), R32(%rbx) + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp + sub $1, R32(%r11) + jnc L(012) + +L(4): sub $4, %rax + jc L(end) + + ALIGN(16) +L(top): mov (vp), %r8 + mov %r8, %r12 + mov 8(vp), %r9 + mov %r9, %r13 + mov 16(vp), %r10 + mov %r10, %r14 + mov 24(vp), %r11 + + shl R8(%rcx), %r8 + shl R8(%rcx), %r9 + shl R8(%rcx), %r10 + or %rbp, %r8 + mov %r11, %rbp + shl R8(%rcx), %r11 + + neg R8(%rcx) + + shr R8(%rcx), %r12 + shr R8(%rcx), %r13 + shr R8(%rcx), %r14 + shr R8(%rcx), %rbp C used next iteration + + or %r12, %r9 + or %r13, %r10 + or %r14, %r11 + + neg R8(%rcx) + + add R32(%rbx), R32(%rbx) C restore carry flag + + ADCSBB (up), %r8 + ADCSBB 8(up), %r9 + ADCSBB 16(up), %r10 + ADCSBB 24(up), %r11 + + mov %r8, (rp) + mov %r9, 8(rp) + mov %r10, 16(rp) + mov %r11, 24(rp) + + sbb R32(%rbx), R32(%rbx) C save carry flag + + lea 32(up), up + lea 32(vp), vp + lea 32(rp), rp + + sub $4, %rax + jnc L(top) + +L(end): add R32(%rbx), R32(%rbx) + ADCSBB $0, %rbp + mov %rbp, %rax + pop %rbx + pop %rbp + pop %r14 + pop %r13 + pop %r12 + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/aors_err1_n.asm b/gmp-6.3.0/mpn/x86_64/aors_err1_n.asm new file mode 100644 index 0000000..54d0b3f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/aors_err1_n.asm @@ -0,0 +1,225 @@ +dnl AMD64 mpn_add_err1_n, mpn_sub_err1_n + +dnl Contributed by David Harvey. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.75 (degenerates to 3 c/l for some alignments) +C AMD K10 ? +C Intel P4 ? +C Intel core2 ? +C Intel corei ? +C Intel atom ? +C VIA nano ? + + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`ep', `%rcx') +define(`yp', `%r8') +define(`n', `%r9') +define(`cy_param', `8(%rsp)') + +define(`el', `%rbx') +define(`eh', `%rbp') +define(`t0', `%r10') +define(`t1', `%r11') +define(`t2', `%r12') +define(`t3', `%r13') +define(`w0', `%r14') +define(`w1', `%r15') + +ifdef(`OPERATION_add_err1_n', ` + define(ADCSBB, adc) + define(func, mpn_add_err1_n)') +ifdef(`OPERATION_sub_err1_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_err1_n)') + +MULFUNC_PROLOGUE(mpn_add_err1_n mpn_sub_err1_n) + + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + mov cy_param, %rax + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + lea (up,n,8), up + lea (vp,n,8), vp + lea (rp,n,8), rp + + mov R32(n), R32(%r10) + and $3, R32(%r10) + jz L(0mod4) + cmp $2, R32(%r10) + jc L(1mod4) + jz L(2mod4) +L(3mod4): + xor R32(el), R32(el) + xor R32(eh), R32(eh) + xor R32(t0), R32(t0) + xor R32(t1), R32(t1) + lea -24(yp,n,8), yp + neg n + + shr $1, %al C restore carry + mov (up,n,8), w0 + mov 8(up,n,8), w1 + ADCSBB (vp,n,8), w0 + mov w0, (rp,n,8) + cmovc 16(yp), el + ADCSBB 8(vp,n,8), w1 + mov w1, 8(rp,n,8) + cmovc 8(yp), t0 + mov 16(up,n,8), w0 + ADCSBB 16(vp,n,8), w0 + mov w0, 16(rp,n,8) + cmovc (yp), t1 + setc %al C save carry + add t0, el + adc $0, eh + add t1, el + adc $0, eh + + add $3, n + jnz L(loop) + jmp L(end) + + ALIGN(16) +L(0mod4): + xor R32(el), R32(el) + xor R32(eh), R32(eh) + lea (yp,n,8), yp + neg n + jmp L(loop) + + ALIGN(16) +L(1mod4): + xor R32(el), R32(el) + xor R32(eh), R32(eh) + lea -8(yp,n,8), yp + neg n + + shr $1, %al C restore carry + mov (up,n,8), w0 + ADCSBB (vp,n,8), w0 + mov w0, (rp,n,8) + cmovc (yp), el + setc %al C save carry + + add $1, n + jnz L(loop) + jmp L(end) + + ALIGN(16) +L(2mod4): + xor R32(el), R32(el) + xor R32(eh), R32(eh) + xor R32(t0), R32(t0) + lea -16(yp,n,8), yp + neg n + + shr $1, %al C restore carry + mov (up,n,8), w0 + mov 8(up,n,8), w1 + ADCSBB (vp,n,8), w0 + mov w0, (rp,n,8) + cmovc 8(yp), el + ADCSBB 8(vp,n,8), w1 + mov w1, 8(rp,n,8) + cmovc (yp), t0 + setc %al C save carry + add t0, el + adc $0, eh + + add $2, n + jnz L(loop) + jmp L(end) + + ALIGN(32) +L(loop): + shr $1, %al C restore carry + mov -8(yp), t0 + mov $0, R32(t3) + mov (up,n,8), w0 + mov 8(up,n,8), w1 + ADCSBB (vp,n,8), w0 + cmovnc t3, t0 + ADCSBB 8(vp,n,8), w1 + mov -16(yp), t1 + mov w0, (rp,n,8) + mov 16(up,n,8), w0 + mov w1, 8(rp,n,8) + cmovnc t3, t1 + mov -24(yp), t2 + ADCSBB 16(vp,n,8), w0 + cmovnc t3, t2 + mov 24(up,n,8), w1 + ADCSBB 24(vp,n,8), w1 + cmovc -32(yp), t3 + setc %al C save carry + add t0, el + 
adc $0, eh + add t1, el + adc $0, eh + add t2, el + adc $0, eh + mov w0, 16(rp,n,8) + add t3, el + lea -32(yp), yp + adc $0, eh + mov w1, 24(rp,n,8) + add $4, n + jnz L(loop) + +L(end): + mov el, (ep) + mov eh, 8(ep) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/aors_err2_n.asm b/gmp-6.3.0/mpn/x86_64/aors_err2_n.asm new file mode 100644 index 0000000..ce5c2a4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/aors_err2_n.asm @@ -0,0 +1,172 @@ +dnl AMD64 mpn_add_err2_n, mpn_sub_err2_n + +dnl Contributed by David Harvey. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.5 +C AMD K10 ? +C Intel P4 ? +C Intel core2 6.9 +C Intel corei ? +C Intel atom ? +C VIA nano ? 
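The err family (the err1 file above, this err2 file, and err3 below) returns, besides the plain sum or difference, small accumulators driven by the carry chain: whenever limb i produces a carry, the limb yp[n-1-i] of a fixed vector is added into a two-limb accumulator stored at ep (err2 keeps two such accumulators over two vectors, err3 three). A hedged C reconstruction of the err1 case as read from the loop above, assuming 64-bit limbs; the ref_ name is illustrative:

#include <gmp.h>

mp_limb_t
ref_add_err1_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
                mp_limb_t *ep, const mp_limb_t *yp, mp_size_t n,
                mp_limb_t cy)
{
  mp_limb_t el = 0, eh = 0;           /* two-limb error accumulator */
  for (mp_size_t i = 0; i < n; i++)
    {
      mp_limb_t s = up[i] + vp[i];
      mp_limb_t c = s < vp[i];
      s += cy;
      c += s < cy;
      rp[i] = s;
      cy = c;                         /* 0 or 1 */
      if (cy)                         /* the cmovc in the asm */
        {
          el += yp[n - 1 - i];
          eh += el < yp[n - 1 - i];   /* ripple into the high limb */
        }
    }
  ep[0] = el;
  ep[1] = eh;
  return cy;
}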
+ + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`ep', `%rcx') +define(`yp1', `%r8') +define(`yp2', `%r9') +define(`n_param', `8(%rsp)') +define(`cy_param', `16(%rsp)') + +define(`cy1', `%r14') +define(`cy2', `%rax') + +define(`n', `%r10') + +define(`w', `%rbx') +define(`e1l', `%rbp') +define(`e1h', `%r11') +define(`e2l', `%r12') +define(`e2h', `%r13') + + +ifdef(`OPERATION_add_err2_n', ` + define(ADCSBB, adc) + define(func, mpn_add_err2_n)') +ifdef(`OPERATION_sub_err2_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_err2_n)') + +MULFUNC_PROLOGUE(mpn_add_err2_n mpn_sub_err2_n) + + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + mov cy_param, cy2 + mov n_param, n + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + xor R32(e1l), R32(e1l) + xor R32(e1h), R32(e1h) + xor R32(e2l), R32(e2l) + xor R32(e2h), R32(e2h) + + sub yp1, yp2 + + lea (rp,n,8), rp + lea (up,n,8), up + lea (vp,n,8), vp + + test $1, n + jnz L(odd) + + lea -8(yp1,n,8), yp1 + neg n + jmp L(top) + + ALIGN(16) +L(odd): + lea -16(yp1,n,8), yp1 + neg n + shr $1, cy2 + mov (up,n,8), w + ADCSBB (vp,n,8), w + cmovc 8(yp1), e1l + cmovc 8(yp1,yp2), e2l + mov w, (rp,n,8) + sbb cy2, cy2 + inc n + jz L(end) + + ALIGN(16) +L(top): + mov (up,n,8), w + shr $1, cy2 C restore carry + ADCSBB (vp,n,8), w + mov w, (rp,n,8) + sbb cy1, cy1 C generate mask, preserve CF + + mov 8(up,n,8), w + ADCSBB 8(vp,n,8), w + mov w, 8(rp,n,8) + sbb cy2, cy2 C generate mask, preserve CF + + mov (yp1), w C (e1h:e1l) += cy1 * yp1 limb + and cy1, w + add w, e1l + adc $0, e1h + + and (yp1,yp2), cy1 C (e2h:e2l) += cy1 * yp2 limb + add cy1, e2l + adc $0, e2h + + mov -8(yp1), w C (e1h:e1l) += cy2 * next yp1 limb + and cy2, w + add w, e1l + adc $0, e1h + + mov -8(yp1,yp2), w C (e2h:e2l) += cy2 * next yp2 limb + and cy2, w + add w, e2l + adc $0, e2h + + add $2, n + lea -16(yp1), yp1 + jnz L(top) +L(end): + + mov e1l, (ep) + mov e1h, 8(ep) + mov e2l, 16(ep) + mov e2h, 24(ep) + + and $1, %eax C return carry + + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/aors_err3_n.asm b/gmp-6.3.0/mpn/x86_64/aors_err3_n.asm new file mode 100644 index 0000000..bb6d0c5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/aors_err3_n.asm @@ -0,0 +1,156 @@ +dnl AMD64 mpn_add_err3_n, mpn_sub_err3_n + +dnl Contributed by David Harvey. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 7.0 +C AMD K10 ? 
+C Intel P4 ? +C Intel core2 ? +C Intel corei ? +C Intel atom ? +C VIA nano ? + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`ep', `%rcx') +define(`yp1', `%r8') +define(`yp2', `%r9') +define(`yp3_param', `8(%rsp)') +define(`n_param', `16(%rsp)') +define(`cy_param', `24(%rsp)') + +define(`n', `%r10') +define(`yp3', `%rcx') +define(`t', `%rbx') + +define(`e1l', `%rbp') +define(`e1h', `%r11') +define(`e2l', `%r12') +define(`e2h', `%r13') +define(`e3l', `%r14') +define(`e3h', `%r15') + + + +ifdef(`OPERATION_add_err3_n', ` + define(ADCSBB, adc) + define(func, mpn_add_err3_n)') +ifdef(`OPERATION_sub_err3_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_err3_n)') + +MULFUNC_PROLOGUE(mpn_add_err3_n mpn_sub_err3_n) + + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + mov cy_param, %rax + mov n_param, n + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + push ep + mov 64(%rsp), yp3 C load from yp3_param + + xor R32(e1l), R32(e1l) + xor R32(e1h), R32(e1h) + xor R32(e2l), R32(e2l) + xor R32(e2h), R32(e2h) + xor R32(e3l), R32(e3l) + xor R32(e3h), R32(e3h) + + sub yp1, yp2 + sub yp1, yp3 + + lea -8(yp1,n,8), yp1 + lea (rp,n,8), rp + lea (up,n,8), up + lea (vp,n,8), vp + neg n + + ALIGN(16) +L(top): + shr $1, %rax C restore carry + mov (up,n,8), %rax + ADCSBB (vp,n,8), %rax + mov %rax, (rp,n,8) + sbb %rax, %rax C save carry and generate mask + + mov (yp1), t + and %rax, t + add t, e1l + adc $0, e1h + + mov (yp1,yp2), t + and %rax, t + add t, e2l + adc $0, e2h + + mov (yp1,yp3), t + and %rax, t + add t, e3l + adc $0, e3h + + lea -8(yp1), yp1 + inc n + jnz L(top) + +L(end): + and $1, %eax + pop ep + + mov e1l, (ep) + mov e1h, 8(ep) + mov e2l, 16(ep) + mov e2h, 24(ep) + mov e3l, 32(ep) + mov e3h, 40(ep) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/aors_n.asm b/gmp-6.3.0/mpn/x86_64/aors_n.asm new file mode 100644 index 0000000..d5a314a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/aors_n.asm @@ -0,0 +1,178 @@ +dnl AMD64 mpn_add_n, mpn_sub_n + +dnl Copyright 2003-2005, 2007, 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 1.5 +C AMD K10 1.5 +C AMD bd1 1.8 +C AMD bd2 1.74 +C AMD bd3 ? 
+C AMD bd4 1.78 +C AMD zen 1.5 +C AMD bt1 2.54 +C AMD bt2 2.15 +C Intel P4 11.5 +C Intel core2 4.9 +C Intel NHM 5.53 +C Intel SBR 1.59 +C Intel IBR 1.55 +C Intel HWL 1.44 +C Intel BWL 1.14 +C Intel SKL 1.21 +C Intel atom 4 +C Intel SLM 3 +C VIA nano 3.25 + +C The loop of this code is the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C INPUT PARAMETERS +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`vp', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc) + +ifdef(`OPERATION_add_n', ` + define(ADCSBB, adc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + mov R32(n), R32(%rax) + shr $2, n + and $3, R32(%rax) + bt $0, %r8 C cy flag <- carry parameter + jrcxz L(lt4) + + mov (up), %r8 + mov 8(up), %r9 + dec n + jmp L(mid) + +EPILOGUE() + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + mov R32(n), R32(%rax) + shr $2, n + and $3, R32(%rax) + jrcxz L(lt4) + + mov (up), %r8 + mov 8(up), %r9 + dec n + jmp L(mid) + +L(lt4): dec R32(%rax) + mov (up), %r8 + jnz L(2) + ADCSBB (vp), %r8 + mov %r8, (rp) + adc R32(%rax), R32(%rax) + FUNC_EXIT() + ret + +L(2): dec R32(%rax) + mov 8(up), %r9 + jnz L(3) + ADCSBB (vp), %r8 + ADCSBB 8(vp), %r9 + mov %r8, (rp) + mov %r9, 8(rp) + adc R32(%rax), R32(%rax) + FUNC_EXIT() + ret + +L(3): mov 16(up), %r10 + ADCSBB (vp), %r8 + ADCSBB 8(vp), %r9 + ADCSBB 16(vp), %r10 + mov %r8, (rp) + mov %r9, 8(rp) + mov %r10, 16(rp) + setc R8(%rax) + FUNC_EXIT() + ret + + ALIGN(16) +L(top): ADCSBB (vp), %r8 + ADCSBB 8(vp), %r9 + ADCSBB 16(vp), %r10 + ADCSBB 24(vp), %r11 + mov %r8, (rp) + lea 32(up), up + mov %r9, 8(rp) + mov %r10, 16(rp) + dec n + mov %r11, 24(rp) + lea 32(vp), vp + mov (up), %r8 + mov 8(up), %r9 + lea 32(rp), rp +L(mid): mov 16(up), %r10 + mov 24(up), %r11 + jnz L(top) + +L(end): lea 32(up), up + ADCSBB (vp), %r8 + ADCSBB 8(vp), %r9 + ADCSBB 16(vp), %r10 + ADCSBB 24(vp), %r11 + lea 32(vp), vp + mov %r8, (rp) + mov %r9, 8(rp) + mov %r10, 16(rp) + mov %r11, 24(rp) + lea 32(rp), rp + + inc R32(%rax) + dec R32(%rax) + jnz L(lt4) + adc R32(%rax), R32(%rax) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/aorsmul_1.asm new file mode 100644 index 0000000..dfe4dc4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/aorsmul_1.asm @@ -0,0 +1,190 @@ +dnl AMD64 mpn_addmul_1 and mpn_submul_1. + +dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.52 +C AMD K10 2.51 +C AMD bd1 4.43 +C AMD bd2 5.03 5.63 +C AMD bd3 ? +C AMD bd4 ? +C AMD zen ? +C AMD bobcat 6.20 +C AMD jaguar 5.57 6.56 +C Intel P4 14.9 17.1 +C Intel core2 5.15 +C Intel NHM 4.93 +C Intel SBR 3.95 +C Intel IBR 3.75 +C Intel HWL 3.62 +C Intel BWL 2.53 +C Intel SKL 2.53 +C Intel atom 21.3 +C Intel SLM 9.0 +C VIA nano 5.0 + +C The loop of this code is the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * The loop is great, but the prologue and epilogue code was quickly written. +C Tune it! + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vl', `%rcx') C r9 + +define(`n', `%r11') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +IFDOS(` define(`up', ``%rsi'') ') dnl +IFDOS(` define(`rp', ``%rcx'') ') dnl +IFDOS(` define(`vl', ``%r9'') ') dnl +IFDOS(` define(`r9', ``rdi'') ') dnl +IFDOS(` define(`n', ``%r8'') ') dnl +IFDOS(` define(`r8', ``r11'') ') dnl + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + + mov (up), %rax C read first u limb early + push %rbx +IFSTD(` mov n_param, %rbx ') C move away n from rdx, mul uses it +IFDOS(` mov n, %rbx ') + mul vl +IFSTD(` mov %rbx, n ') + + and $3, R32(%rbx) + jz L(b0) + cmp $2, R32(%rbx) + jz L(b2) + jg L(b3) + +L(b1): dec n + jne L(gt1) + ADDSUB %rax, (rp) + jmp L(ret) +L(gt1): lea 8(up,n,8), up + lea -8(rp,n,8), rp + neg n + xor %r10, %r10 + xor R32(%rbx), R32(%rbx) + mov %rax, %r9 + mov (up,n,8), %rax + mov %rdx, %r8 + jmp L(L1) + +L(b0): lea (up,n,8), up + lea -16(rp,n,8), rp + neg n + xor %r10, %r10 + mov %rax, %r8 + mov %rdx, %rbx + jmp L(L0) + +L(b3): lea -8(up,n,8), up + lea -24(rp,n,8), rp + neg n + mov %rax, %rbx + mov %rdx, %r10 + jmp L(L3) + +L(b2): lea -16(up,n,8), up + lea -32(rp,n,8), rp + neg n + xor %r8, %r8 + xor R32(%rbx), R32(%rbx) + mov %rax, %r10 + mov 24(up,n,8), %rax + mov %rdx, %r9 + jmp L(L2) + + ALIGN(16) +L(top): ADDSUB %r10, (rp,n,8) + adc %rax, %r9 + mov (up,n,8), %rax + adc %rdx, %r8 + mov $0, R32(%r10) +L(L1): mul vl + ADDSUB %r9, 8(rp,n,8) + adc %rax, %r8 + adc %rdx, %rbx +L(L0): mov 8(up,n,8), %rax + mul vl + ADDSUB %r8, 16(rp,n,8) + adc %rax, %rbx + adc %rdx, %r10 +L(L3): mov 16(up,n,8), %rax + mul vl + ADDSUB %rbx, 24(rp,n,8) + mov $0, R32(%r8) C zero + mov %r8, %rbx C zero + adc %rax, %r10 + mov 24(up,n,8), %rax + mov %r8, %r9 C zero + adc %rdx, %r9 +L(L2): mul vl + add $4, n + js L(top) + + ADDSUB %r10, (rp,n,8) + adc %rax, %r9 + adc %r8, %rdx + ADDSUB %r9, 8(rp,n,8) +L(ret): adc $0, %rdx + mov %rdx, %rax + + pop %rbx +IFDOS(``pop %rdi '') +IFDOS(``pop %rsi '') + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm 
b/gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm new file mode 100644 index 0000000..c1dcdc4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm @@ -0,0 +1,186 @@ +dnl AMD64 mpn_addmul_2 optimised for Intel Atom. + +dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb best +C AMD K8,K9 +C AMD K10 +C AMD bd1 +C AMD bd2 +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel PNR +C Intel NHM +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel atom 18.8 this +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vp', `%rcx') C r9 + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_addmul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (up), %rax + + mov (vp), v0 + mov 8(vp), v1 + + mov n_param, n + mul v0 + + test $1, R8(n) + jnz L(bx1) + +L(bx0): test $2, R8(n) + jnz L(b10) + +L(b00): mov %rax, w0 + mov (up), %rax + mov %rdx, w1 + xor R32(w2), R32(w2) + lea -8(rp), rp + jmp L(lo0) + +L(b10): mov %rax, w2 + mov (up), %rax + mov %rdx, w3 + xor R32(w0), R32(w0) + lea -16(up), up + lea -24(rp), rp + jmp L(lo2) + +L(bx1): test $2, R8(n) + jnz L(b11) + +L(b01): mov %rax, w3 + mov %rdx, w0 + mov (up), %rax + xor R32(w1), R32(w1) + lea 8(up), up + dec n + jmp L(lo1) + +L(b11): mov %rax, w1 + mov (up), %rax + mov %rdx, w2 + xor R32(w3), R32(w3) + lea -8(up), up + lea -16(rp), rp + jmp L(lo3) + + ALIGN(16) +L(top): +L(lo1): mul v1 + add w3, (rp) + mov $0, R32(w2) + adc %rax, w0 + mov (up), %rax + adc %rdx, w1 + mul v0 + add %rax, w0 + mov (up), %rax + adc %rdx, w1 + adc $0, R32(w2) +L(lo0): mul v1 + add w0, 8(rp) + adc %rax, w1 + mov 8(up), %rax + mov $0, R32(w3) + adc %rdx, w2 + mul v0 + add %rax, w1 + mov 8(up), %rax + adc %rdx, w2 + adc $0, R32(w3) +L(lo3): mul v1 + add w1, 16(rp) + adc %rax, w2 + mov 16(up), %rax + mov $0, R32(w0) + adc %rdx, w3 + mul v0 + add %rax, w2 + mov 16(up), %rax + adc %rdx, w3 + adc $0, R32(w0) +L(lo2): mul v1 + add w2, 24(rp) + adc %rax, w3 + mov 24(up), %rax + adc %rdx, w0 + mov $0, R32(w1) + lea 32(rp), rp + mul v0 + lea 32(up), up + add %rax, w3 + adc %rdx, w0 + mov -8(up), %rax + adc $0, R32(w1) + sub $4, n + 
ja L(top) + +L(end): mul v1 + add w3, (rp) + adc %rax, w0 + adc %rdx, w1 + mov w0, 8(rp) + mov w1, %rax + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm new file mode 100644 index 0000000..f44de19 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm @@ -0,0 +1,238 @@ +dnl AMD64 mpn_addlsh1_n, mpn_rsblsh1_n optimised for Intel Atom. +dnl Used also for AMD bd1. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO +C * This code is slightly large at 433 bytes. +C * sublsh1_n.asm and this file use the same basic pattern. + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bd1 2.3 +C AMD bobcat ? +C Intel P4 ? +C Intel core2 ? +C Intel NHM ? +C Intel SBR ? +C Intel atom 4.875 (4.75 is probably possible) +C VIA nano ? 
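The loop below interleaves two carry chains (the vp[] doubling chain, commented scy, and the up[] add/subtract chain, acy) through the single x86 carry flag, parking each one as a 0 / all-ones register mask between uses. The two primitives, as a hedged C model of what the flag dance does:

#include <stdint.h>

/* sbb %reg,%reg: materialise CF as a mask (0 if clear, ~0 if set). */
static inline uint64_t
save_cf (int cf)
{
  return cf ? ~(uint64_t) 0 : 0;
}

/* add %reg,%reg: re-sets CF iff the mask's top bit is set, i.e. iff
   the saved mask was ~0 (assumes only the two values above occur). */
static inline int
restore_cf (uint64_t mask)
{
  return mask != 0;
}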
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cy', `%r8') + +ifdef(`OPERATION_addlsh1_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_addlsh1_n) + define(func_nc, mpn_addlsh1_nc)') +ifdef(`OPERATION_rsblsh1_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsblsh1_n) + define(func_nc, mpn_rsblsh1_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + push %rbp + xor R32(%rbp), R32(%rbp) +L(ent): mov R32(n), R32(%rax) + and $3, R32(%rax) + jz L(b0) + cmp $2, R32(%rax) + jz L(b2) + jg L(b3) + +L(b1): mov (vp), %r8 + add %r8, %r8 + lea 8(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + ADCSBB (up), %r8 + mov %r8, (rp) + sbb R32(%rbp), R32(%rbp) C save acy + lea 8(up), up + lea 8(rp), rp + jmp L(b0) + +L(b2): mov (vp), %r8 + add %r8, %r8 + mov 8(vp), %r9 + adc %r9, %r9 + lea 16(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + ADCSBB (up), %r8 + mov %r8, (rp) + ADCSBB 8(up), %r9 + mov %r9, 8(rp) + sbb R32(%rbp), R32(%rbp) C save acy + lea 16(up), up + lea 16(rp), rp + jmp L(b0) + +L(b3): mov (vp), %r8 + add %r8, %r8 + mov 8(vp), %r9 + adc %r9, %r9 + mov 16(vp), %r10 + adc %r10, %r10 + lea 24(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + ADCSBB (up), %r8 + mov %r8, (rp) + ADCSBB 8(up), %r9 + mov %r9, 8(rp) + ADCSBB 16(up), %r10 + mov %r10, 16(rp) + sbb R32(%rbp), R32(%rbp) C save acy + lea 24(up), up + lea 24(rp), rp + +L(b0): test $4, R8(n) + jz L(skp) + add R32(%rax), R32(%rax) C restore scy + mov (vp), %r8 + adc %r8, %r8 + mov 8(vp), %r9 + adc %r9, %r9 + mov 16(vp), %r10 + adc %r10, %r10 + mov 24(vp), %r11 + adc %r11, %r11 + lea 32(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + ADCSBB (up), %r8 + mov %r8, (rp) + ADCSBB 8(up), %r9 + mov %r9, 8(rp) + ADCSBB 16(up), %r10 + mov %r10, 16(rp) + ADCSBB 24(up), %r11 + mov %r11, 24(rp) + lea 32(up), up + lea 32(rp), rp + sbb R32(%rbp), R32(%rbp) C save acy + +L(skp): cmp $8, n + jl L(rtn) + + push %r12 + push %r13 + push %r14 + push %rbx + lea -64(rp), rp + jmp L(x) + + ALIGN(16) +L(top): add R32(%rax), R32(%rax) C restore scy + lea 64(rp), rp + mov (vp), %r8 + adc %r8, %r8 + mov 8(vp), %r9 + adc %r9, %r9 + mov 16(vp), %r10 + adc %r10, %r10 + mov 24(vp), %r11 + adc %r11, %r11 + mov 32(vp), %r12 + adc %r12, %r12 + mov 40(vp), %r13 + adc %r13, %r13 + mov 48(vp), %r14 + adc %r14, %r14 + mov 56(vp), %rbx + adc %rbx, %rbx + lea 64(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + ADCSBB (up), %r8 + mov %r8, (rp) + ADCSBB 8(up), %r9 + mov %r9, 8(rp) + ADCSBB 16(up), %r10 + mov %r10, 16(rp) + ADCSBB 24(up), %r11 + mov %r11, 24(rp) + ADCSBB 32(up), %r12 + mov %r12, 32(rp) + ADCSBB 40(up), %r13 + mov %r13, 40(rp) + ADCSBB 48(up), %r14 + mov %r14, 48(rp) + ADCSBB 56(up), %rbx + mov %rbx, 56(rp) + sbb R32(%rbp), R32(%rbp) C save acy + lea 64(up), up +L(x): sub $8, n + jge L(top) + +L(end): pop %rbx + pop %r14 + pop %r13 + pop %r12 +L(rtn): +ifdef(`OPERATION_addlsh1_n',` + add R32(%rbp), R32(%rax) + neg R32(%rax)') +ifdef(`OPERATION_rsblsh1_n',` + sub R32(%rax), R32(%rbp) + movslq R32(%rbp), %rax') + + pop %rbp + FUNC_EXIT() + ret +EPILOGUE() +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + 
push %rbp + neg %r8 C set CF + sbb R32(%rbp), R32(%rbp) C save acy + jmp L(ent) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm new file mode 100644 index 0000000..02fb29d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm @@ -0,0 +1,191 @@ +dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2) +dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[] +dnl Optimised for Intel Atom. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 ? +C Intel NHM ? +C Intel SBR ? +C Intel atom 5.75 +C VIA nano ? + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') + +define(`LSH', 2) +define(`RSH', 62) +define(M, eval(m4_lshift(1,LSH))) + +ifdef(`OPERATION_addlsh2_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_addlsh2_n) + define(func_nc, mpn_addlsh2_nc)') +ifdef(`OPERATION_rsblsh2_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsblsh2_n) + define(func_nc, mpn_rsblsh2_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov R32(n), R32(%rax) + and $3, R32(%rax) + jz L(b0) C we rely on rax = 0 at target + cmp $2, R32(%rax) + mov $0, R32(%rax) + jz L(b2) + jg L(b3) + +L(b1): mov (vp), %r9 + lea (%rax,%r9,M), %rbp + shr $RSH, %r9 + sub $1, n + lea -8(up), up + lea -8(rp), rp + jz L(cj1) + mov 8(vp), %r10 + lea (%r9,%r10,M), %r9 + shr $RSH, %r10 + mov 16(vp), %r11 + lea 24(vp), vp + mov (vp), %r8 + lea (%r10,%r11,M), %r10 + shr $RSH, %r11 + add R32(%rax), R32(%rax) + jmp L(L1) + +L(b2): lea -32(rp), rp + mov (vp), %r8 + lea -32(up), up + lea (%rax,%r8,M), %rbx + shr $RSH, %r8 + mov 8(vp), %r9 + sub $2, n + jle L(end) + jmp L(top) + +L(b3): lea -24(up), up + mov (vp), %r11 + lea -24(rp), rp + mov 8(vp), %r8 + lea (%rax,%r11,M), %r10 + shr $RSH, %r11 + lea 8(vp), vp + lea (%r11,%r8,M), %rbx + add $1, n + jmp L(L3) + +L(b0): lea -16(up), up + mov (vp), %r10 + lea (%rax,%r10,M), %r9 + shr $RSH, %r10 + mov 8(vp), %r11 + lea -16(rp), rp + mov 16(vp), %r8 + lea (%r10,%r11,M), %r10 + shr $RSH, %r11 + add R32(%rax), R32(%rax) + lea 16(vp), vp + jmp L(L0) + + ALIGN(16) +L(top): lea (%r8,%r9,M), %rbp + shr $RSH, %r9 + lea 32(up), up + 
mov 16(vp), %r10 + lea (%r9,%r10,M), %r9 + shr $RSH, %r10 + mov 24(vp), %r11 + lea 32(rp), rp + lea 32(vp), vp + mov (vp), %r8 + lea (%r10,%r11,M), %r10 + shr $RSH, %r11 + add R32(%rax), R32(%rax) + ADCSBB (up), %rbx + mov %rbx, (rp) +L(L1): ADCSBB 8(up), %rbp + mov %rbp, 8(rp) +L(L0): ADCSBB 16(up), %r9 + lea (%r11,%r8,M), %rbx + mov %r9, 16(rp) +L(L3): ADCSBB 24(up), %r10 + sbb R32(%rax), R32(%rax) +L(L2): shr $RSH, %r8 + mov 8(vp), %r9 + mov %r10, 24(rp) + sub $4, n + jg L(top) + +L(end): lea (%r8,%r9,M), %rbp + shr $RSH, %r9 + lea 32(up), up + lea 32(rp), rp + add R32(%rax), R32(%rax) + ADCSBB (up), %rbx + mov %rbx, (rp) +L(cj1): ADCSBB 8(up), %rbp + mov %rbp, 8(rp) + +ifdef(`OPERATION_addlsh2_n',` + mov R32(n), R32(%rax) C zero rax + adc %r9, %rax') +ifdef(`OPERATION_rsblsh2_n',` + sbb n, %r9 C subtract 0 + mov %r9, %rax') + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/atom/aors_n.asm b/gmp-6.3.0/mpn/x86_64/atom/aors_n.asm new file mode 100644 index 0000000..83b8df9 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/aors_n.asm @@ -0,0 +1,128 @@ +dnl X86-64 mpn_add_n, mpn_sub_n, optimised for Intel Atom. + +dnl Copyright 2011, 2017 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Marco Bodrato. Ported to 64-bit by +dnl Torbjörn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
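Entry-dispatch note for the code below: a single shr of n both halves the count for the two-limbs-per-iteration loop and drops n's parity into the carry flag, so the jz/jc pair picks among three entries. A compilable sketch of just that dispatch, assuming n >= 1 (the puts calls stand in for the three asm paths):

#include <stdio.h>

static void
dispatch (unsigned long n)       /* models: shr n; jz L(1); jc L(1m2) */
{
  unsigned long pairs = n >> 1;  /* iterations of the two-limb loop */
  int odd = (int) (n & 1);       /* the bit shr drops into CF */
  if (pairs == 0)
    puts ("L(1): single limb");  /* n == 1 */
  else if (odd)
    puts ("L(1m2): odd entry");  /* one limb, then the pair loop */
  else
    puts ("L(0m2): even entry"); /* pair loop only */
}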
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2 +C AMD K10 2 +C AMD bull 2.34\2.63 +C AMD pile 2.27\2.52 +C AMD steam +C AMD excavator +C AMD bobcat 2.79 +C AMD jaguar 2.78 +C Intel P4 11 +C Intel core2 7.5 +C Intel NHM 8.5 +C Intel SBR 2.11 +C Intel IBR 2.07 +C Intel HWL 1.75 +C Intel BWL 1.51 +C Intel SKL 1.52 +C Intel atom 3 +C Intel SLM 4 +C VIA nano + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`vp', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc) + +ifdef(`OPERATION_add_n', ` + define(ADCSBB, adc) + define(func_n, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADCSBB, sbb) + define(func_n, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + xor cy, cy C carry + +L(com): shr n C n >> 1 + jz L(1) C n == 1 + jc L(1m2) C n % 2 == 1 + +L(0m2): shr cy + mov (up), %r10 + lea 8(up), up + lea 8(vp), vp + lea -8(rp), rp + jmp L(mid) + +L(1): shr cy + mov (up), %r9 + jmp L(end) + +L(1m2): shr cy + mov (up), %r9 + + ALIGN(16) +L(top): ADCSBB (vp), %r9 + lea 16(up), up + mov -8(up), %r10 + lea 16(vp), vp + mov %r9, (rp) +L(mid): ADCSBB -8(vp), %r10 + lea 16(rp), rp + dec n + mov (up), %r9 + mov %r10, -8(rp) + jnz L(top) + +L(end): ADCSBB (vp), %r9 + mov $0, R32(%rax) + mov %r9, (rp) + adc R32(%rax), R32(%rax) + FUNC_EXIT() + ret +EPILOGUE() + +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), cy ') + jmp L(com) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm new file mode 100644 index 0000000..7cbc085 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm @@ -0,0 +1,194 @@ +dnl AMD64 mpn_addmul_1/mpn_submul_1 optimised for Intel Atom. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.5 +C AMD K10 4.5 +C AMD bull 4.73 +C AMD pile 4.60 4.80 +C AMD steam +C AMD excavator +C AMD bobcat 5.48 +C AMD jaguar 5.61 +C Intel P4 16.6 +C Intel core2 5.09 +C Intel NHM 4.79 +C Intel SBR 3.88 +C Intel IBR 3.65 +C Intel HWL 3.53 +C Intel BWL 2.75 +C Intel SKL 2.76 +C Intel atom 19.4 +C Intel SLM 8 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. 
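The contract both entry points implement, as a hedged C model (ref_addmul_1 is an illustrative name; unsigned __int128 is a GCC/Clang extension standing in for the 64x64->128 multiply, and 64-bit limbs are assumed): addmul accumulates rp[] += up[] * v0 and returns the limb carried out of the top, while submul subtracts the product and returns the borrow.

#include <gmp.h>

mp_limb_t
ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, mp_limb_t v0)
{
  mp_limb_t cy = 0;
  for (mp_size_t i = 0; i < n; i++)
    {
      /* up[i] * v0 + rp[i] + cy fits in 128 bits exactly. */
      unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
      rp[i] = (mp_limb_t) t;
      cy = (mp_limb_t) (t >> 64);
    }
  return cy;
}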
+ +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 + +define(`n', `%rbx') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + push %rbx + + mov (up), %rax + lea -8(up,n_param,8), up + lea -16(rp,n_param,8), rp + + test $1, R8(n_param) + jnz L(bx1) + +L(bx0): test $2, R8(n_param) + jnz L(b10) + +L(b00): mov $1, R32(n) + sub n_param, n + mul v0 + mov %rax, %r11 + mov 8(up,n,8), %rax + mov %rdx, %r10 + mul v0 + mov %rax, %r8 + mov 16(up,n,8), %rax + jmp L(lo0) + +L(b10): mov $3, R32(n) + sub n_param, n + mul v0 + mov %rax, %r11 + mov -8(up,n,8), %rax + mov %rdx, %r10 + mul v0 + test n, n + jns L(cj2) + mov %rax, %r8 + mov (up,n,8), %rax + mov %rdx, %r9 + jmp L(lo2) + +L(bx1): test $2, R8(n_param) + jnz L(b11) + +L(b01): mov $2, R32(n) + sub n_param, n + mul v0 + test n, n + jns L(cj1) + mov %rax, %r8 + mov (up,n,8), %rax + mov %rdx, %r9 + mul v0 + mov %rax, %r11 + mov 8(up,n,8), %rax + mov %rdx, %r10 + jmp L(lo1) + +L(b11): xor R32(n), R32(n) + sub n_param, n + mul v0 + mov %rax, %r8 + mov 16(up,n,8), %rax + mov %rdx, %r9 + mul v0 + mov %rax, %r11 + mov 24(up,n,8), %rax + jmp L(lo3) + + ALIGN(16) +L(top): mul v0 + ADDSUB %r8, -16(rp,n,8) + mov %rax, %r8 + mov (up,n,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 +L(lo2): mul v0 + ADDSUB %r11, -8(rp,n,8) + mov %rax, %r11 + mov 8(up,n,8), %rax + adc %r10, %r8 + mov %rdx, %r10 + adc $0, %r9 +L(lo1): mul v0 + ADDSUB %r8, (rp,n,8) + mov %rax, %r8 + adc %r9, %r11 + mov 16(up,n,8), %rax + adc $0, %r10 +L(lo0): mov %rdx, %r9 + mul v0 + ADDSUB %r11, 8(rp,n,8) + mov %rax, %r11 + adc %r10, %r8 + mov 24(up,n,8), %rax + adc $0, %r9 +L(lo3): add $4, n + mov %rdx, %r10 + js L(top) + +L(end): mul v0 + ADDSUB %r8, -16(rp,n,8) + adc %r9, %r11 + adc $0, %r10 +L(cj2): ADDSUB %r11, -8(rp,n,8) + adc %r10, %rax + adc $0, %rdx +L(cj1): ADDSUB %rax, (rp,n,8) + mov $0, R32(%rax) + adc %rdx, %rax + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm b/gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm new file mode 100644 index 0000000..fcb9a0f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm @@ -0,0 +1,38 @@ +dnl X86-64 mpn_cnd_add_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_cnd_add_n) +include_mpn(`x86_64/coreisbr/cnd_add_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm b/gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm new file mode 100644 index 0000000..9eee1c1 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm @@ -0,0 +1,38 @@ +dnl X86-64 mpn_cnd_sub_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_cnd_sub_n) +include_mpn(`x86_64/coreisbr/cnd_sub_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/atom/com.asm b/gmp-6.3.0/mpn/x86_64/atom/com.asm new file mode 100644 index 0000000..6b6460f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/com.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_com optimised for Intel Atom. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_com) +include_mpn(`x86_64/fastsse/com-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/atom/copyd.asm b/gmp-6.3.0/mpn/x86_64/atom/copyd.asm new file mode 100644 index 0000000..e309279 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/copyd.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyd optimised for Intel Atom. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
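The two cnd_* wrappers above merely pull in the Sandy Bridge implementation, but the contract they export is worth stating: mpn_cnd_add_n and mpn_cnd_sub_n perform the operation, or leave the operand unchanged, in time independent of the condition, which is what the side-channel-hardened sec_* code relies on. A branch-free C sketch (hypothetical helper; limb and conventions as in the earlier sketch):

    /* rp = up + (cnd ? vp : 0), constant time with respect to cnd;
       returns the carry (0 when cnd is zero). */
    static limb ref_cnd_add_n(limb cnd, limb *rp, const limb *up,
                              const limb *vp, long n)
    {
        limb mask = -(limb) (cnd != 0);    /* all ones or all zeros */
        limb cy = 0;
        for (long i = 0; i < n; i++) {
            limb v = vp[i] & mask;         /* vp[i] or 0, no branch */
            limb s = up[i] + cy;
            cy = s < cy;
            rp[i] = s + v;
            cy += rp[i] < v;
        }
        return cy;
    }

mpn_cnd_sub_n is the same shape with the addition replaced by a borrow-propagating subtraction.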
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyd) +include_mpn(`x86_64/fastsse/copyd-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/atom/copyi.asm b/gmp-6.3.0/mpn/x86_64/atom/copyi.asm new file mode 100644 index 0000000..00ec3c2 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/copyi.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyi optimised for Intel Atom. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyi) +include_mpn(`x86_64/fastsse/copyi-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/atom/dive_1.asm b/gmp-6.3.0/mpn/x86_64/atom/dive_1.asm new file mode 100644 index 0000000..d9ba5fe --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/dive_1.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_divexact_1 -- mpn by limb exact division. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
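The two copy wrappers above both pull in PALIGNR-based SSE loops; the only difference is traversal direction, which is what makes each safe for a particular kind of operand overlap. In portable terms (sketch only):

    /* mpn_copyi: ascending copy, safe when rp <= up (dst at or below src). */
    static void ref_copyi(limb *rp, const limb *up, long n)
    {
        for (long i = 0; i < n; i++)
            rp[i] = up[i];
    }

    /* mpn_copyd: descending copy, safe when rp >= up (dst at or above src). */
    static void ref_copyd(limb *rp, const limb *up, long n)
    {
        for (long i = n - 1; i >= 0; i--)
            rp[i] = up[i];
    }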
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_divexact_1) +include_mpn(`x86_64/nano/dive_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h new file mode 100644 index 0000000..2cd90f6 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h @@ -0,0 +1,222 @@ +/* Intel Atom/64 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +#define SHLD_SLOW 1 +#define SHRD_SLOW 1 + +/* 1600 MHz Diamondville (Atom 330) */ +/* FFT tuning limit = 50,646,641 */ +/* Generated by tuneup.c, 2019-10-16, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD MP_SIZE_T_MAX +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 16 + +#define DIV_1_VS_MUL_1_PERCENT 201 + +#define MUL_TOOM22_THRESHOLD 12 +#define MUL_TOOM33_THRESHOLD 74 +#define MUL_TOOM44_THRESHOLD 106 +#define MUL_TOOM6H_THRESHOLD 155 +#define MUL_TOOM8H_THRESHOLD 212 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 77 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 72 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 58 + +#define SQR_BASECASE_THRESHOLD 5 +#define SQR_TOOM2_THRESHOLD 22 +#define SQR_TOOM3_THRESHOLD 73 +#define SQR_TOOM4_THRESHOLD 130 +#define SQR_TOOM6_THRESHOLD 159 +#define SQR_TOOM8_THRESHOLD 236 + +#define MULMID_TOOM42_THRESHOLD 16 + +#define MULMOD_BNM1_THRESHOLD 9 +#define SQRMOD_BNM1_THRESHOLD 9 + +#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 220, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \ + { 13, 7}, { 7, 6}, { 15, 7}, { 8, 6}, \ + { 17, 7}, { 13, 8}, { 7, 7}, { 17, 8}, \ + { 9, 7}, { 19, 8}, { 11, 7}, { 23, 8}, \ + { 13, 9}, { 7, 8}, { 19, 9}, { 11, 8}, \ + { 25,10}, { 7, 9}, { 15, 8}, { 33, 9}, \ + { 19, 8}, { 39, 9}, { 23, 8}, { 47, 9}, \ + { 27,10}, { 15, 9}, { 39,10}, { 23, 9}, \ + { 47,11}, { 15,10}, { 31, 9}, { 67,10}, \ + { 39, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \ + { 71, 9}, { 143, 8}, { 287,10}, { 79,11}, \ + { 47,10}, { 95, 9}, { 191,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \ + { 143, 9}, { 287,11}, { 79,10}, { 159, 9}, \ + { 319,10}, { 175, 9}, { 351,11}, { 95,10}, \ + { 191, 9}, { 383,10}, { 207,11}, { 111,10}, \ + { 223,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,11}, { 143,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319,11}, { 175,10}, { 351,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415,11}, { 223,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 287,10}, { 575,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 767,12}, { 223,11}, \ + { 447,13}, { 127,12}, { 255,11}, { 511,12}, \ + { 287,11}, { 575,12}, { 319,11}, { 639,12}, \ + { 351,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 447,14}, { 127,13}, { 255,12}, { 575,13}, \ + { 319,12}, { 703,13}, { 383,12}, { 767,13}, \ + { 447,14}, { 255,13}, { 511,12}, { 1023,13}, \ + { 575,12}, { 1151,13}, { 703,14}, { 383,13}, \ + { 831,12}, { 1663,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1151,14}, { 639,13}, \ + { 1407,12}, { 2815,14}, { 767,13}, { 1663,14}, \ + { 895,13}, { 1791,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1407,13}, { 2815,15}, { 767,14}, { 1791,16}, \ + { 511,15}, { 1023,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,15}, { 1535,14}, { 16384,15}, \ + { 
32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 169 +#define MUL_FFT_THRESHOLD 2240 + +#define SQR_FFT_MODF_THRESHOLD 184 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 184, 5}, { 11, 6}, { 13, 7}, { 7, 6}, \ + { 15, 7}, { 8, 6}, { 17, 7}, { 13, 8}, \ + { 7, 7}, { 17, 8}, { 9, 7}, { 19, 8}, \ + { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ + { 19, 9}, { 11, 8}, { 25,10}, { 7, 9}, \ + { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \ + { 23,10}, { 15, 9}, { 39,10}, { 23, 9}, \ + { 47,11}, { 15,10}, { 31, 9}, { 63, 8}, \ + { 127, 7}, { 255,10}, { 39, 8}, { 159,10}, \ + { 47, 9}, { 95, 8}, { 191,11}, { 31,10}, \ + { 63, 9}, { 127, 8}, { 255, 7}, { 511,10}, \ + { 71, 9}, { 143, 8}, { 287, 7}, { 575, 9}, \ + { 159, 8}, { 319,11}, { 47,10}, { 95, 9}, \ + { 191, 8}, { 383,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \ + { 287, 8}, { 575,10}, { 159, 9}, { 319, 8}, \ + { 639,10}, { 175, 9}, { 351,11}, { 95,10}, \ + { 191, 9}, { 383,11}, { 111,10}, { 223, 9}, \ + { 447,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,11}, { 143,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319, 9}, { 639,11}, { 175,10}, \ + { 351,12}, { 95,11}, { 191,10}, { 383, 9}, \ + { 767,11}, { 223,10}, { 447,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 287,10}, \ + { 575,12}, { 159,11}, { 319,10}, { 639,11}, \ + { 351,12}, { 191,11}, { 383,10}, { 767,12}, \ + { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 511,12}, { 287,11}, { 575,12}, { 319,11}, \ + { 639,12}, { 351,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 447,14}, { 127,13}, { 255,12}, \ + { 575,13}, { 319,12}, { 703,13}, { 383,12}, \ + { 767,13}, { 447,14}, { 255,13}, { 511,12}, \ + { 1023,13}, { 575,12}, { 1151,13}, { 703,14}, \ + { 383,13}, { 831,12}, { 1663,15}, { 255,14}, \ + { 511,13}, { 1151,14}, { 639,13}, { 1407,12}, \ + { 2815,14}, { 767,13}, { 1663,14}, { 895,13}, \ + { 1791,15}, { 511,14}, { 1023,13}, { 2047,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1407,13}, \ + { 2815,15}, { 767,14}, { 1791,16}, { 511,15}, \ + { 1023,14}, { 2431,13}, { 4863,15}, { 1279,14}, \ + { 2943,15}, { 1535,14}, { 16384,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 172 +#define SQR_FFT_THRESHOLD 1728 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 33 +#define MULLO_MUL_N_THRESHOLD 4392 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 85 +#define SQRLO_SQR_THRESHOLD 3176 + +#define DC_DIV_QR_THRESHOLD 34 +#define DC_DIVAPPR_Q_THRESHOLD 119 +#define DC_BDIV_QR_THRESHOLD 31 +#define DC_BDIV_Q_THRESHOLD 76 + +#define INV_MULMOD_BNM1_THRESHOLD 22 +#define INV_NEWTON_THRESHOLD 149 +#define INV_APPR_THRESHOLD 123 + +#define BINV_NEWTON_THRESHOLD 179 +#define REDC_1_TO_REDC_2_THRESHOLD 24 +#define REDC_2_TO_REDC_N_THRESHOLD 39 + +#define MU_DIV_QR_THRESHOLD 807 +#define MU_DIVAPPR_Q_THRESHOLD 807 +#define MUPI_DIV_QR_THRESHOLD 77 +#define MU_BDIV_QR_THRESHOLD 748 +#define MU_BDIV_Q_THRESHOLD 807 + +#define POWM_SEC_TABLE 1,22,114,326,1486 + +#define GET_STR_DC_THRESHOLD 16 +#define GET_STR_PRECOMPUTE_THRESHOLD 30 +#define SET_STR_DC_THRESHOLD 381 +#define SET_STR_PRECOMPUTE_THRESHOLD 1565 + +#define FAC_DSC_THRESHOLD 960 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 13 +#define HGCD2_DIV1_METHOD 3 /* 5.86% faster than 4 */ +#define HGCD_THRESHOLD 
88 +#define HGCD_APPR_THRESHOLD 88 +#define HGCD_REDUCE_THRESHOLD 1182 +#define GCD_DC_THRESHOLD 241 +#define GCDEXT_DC_THRESHOLD 192 +#define JACOBI_BASE_METHOD 3 /* 9.43% faster than 2 */ + +/* Tuneup completed successfully, took 193098 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/atom/lshift.asm b/gmp-6.3.0/mpn/x86_64/atom/lshift.asm new file mode 100644 index 0000000..1b37d5d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/lshift.asm @@ -0,0 +1,123 @@ +dnl AMD64 mpn_lshift -- mpn left shift, optimised for Atom. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 ? +C Intel NHM ? +C Intel SBR ? +C Intel atom 4.5 +C VIA nano ? + +C TODO +C * Consider using 4-way unrolling. We reach 4 c/l, but the code is 2.5 times +C larger. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_lshift) + FUNC_ENTRY(4) + lea -8(up,n,8), up + lea -8(rp,n,8), rp + shr R32(n) + mov (up), %rax + jnc L(evn) + + mov %rax, %r11 + shl R8(%rcx), %r11 + neg R8(%rcx) + shr R8(%rcx), %rax + test n, n + jnz L(gt1) + mov %r11, (rp) + FUNC_EXIT() + ret + +L(gt1): mov -8(up), %r8 + mov %r8, %r10 + shr R8(%rcx), %r8 + jmp L(lo1) + +L(evn): mov %rax, %r10 + neg R8(%rcx) + shr R8(%rcx), %rax + mov -8(up), %r9 + mov %r9, %r11 + shr R8(%rcx), %r9 + neg R8(%rcx) + dec n + lea 8(rp), rp + lea -8(up), up + jz L(end) + + ALIGN(8) +L(top): shl R8(%rcx), %r10 + or %r10, %r9 + shl R8(%rcx), %r11 + neg R8(%rcx) + mov -8(up), %r8 + mov %r8, %r10 + mov %r9, -8(rp) + shr R8(%rcx), %r8 + lea -16(rp), rp +L(lo1): mov -16(up), %r9 + or %r11, %r8 + mov %r9, %r11 + shr R8(%rcx), %r9 + lea -16(up), up + neg R8(%rcx) + mov %r8, (rp) + dec n + jg L(top) + +L(end): shl R8(%rcx), %r10 + or %r10, %r9 + shl R8(%rcx), %r11 + mov %r9, -8(rp) + mov %r11, -16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm new file mode 100644 index 0000000..7385f8f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm @@ -0,0 +1,127 @@ +dnl AMD64 mpn_lshiftc -- mpn left shift with complement, optimised for Atom. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. 
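Stepping back to the Atom gmp-mparam.h above: each *_THRESHOLD is an operand size in limbs at which the generic code switches algorithm, so the header fully determines the algorithm ladder for this core. A simplified sketch of how the multiply thresholds are consumed, using the tuned Atom values from the header (the real dispatch in mpn/generic/mul.c also handles unbalanced operands and scratch allocation):

    /* Which algorithm an n-limb by n-limb mpn_mul lands in on this core. */
    static const char *atom_mul_algorithm(long n)
    {
        if (n < 12)   return "basecase";  /* MUL_TOOM22_THRESHOLD */
        if (n < 74)   return "toom22";    /* MUL_TOOM33_THRESHOLD */
        if (n < 106)  return "toom33";    /* MUL_TOOM44_THRESHOLD */
        if (n < 155)  return "toom44";    /* MUL_TOOM6H_THRESHOLD */
        if (n < 212)  return "toom6h";    /* MUL_TOOM8H_THRESHOLD */
        if (n < 2240) return "toom8h";    /* MUL_FFT_THRESHOLD */
        return "fft";
    }

The MUL_FFT_TABLE3 entries are {size,k} pairs: roughly, from that operand size onwards a 2^k-way FFT split is best, as measured by tuneup.c on the 1600 MHz Diamondville noted in the header.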
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 ? +C Intel NHM ? +C Intel SBR ? +C Intel atom 5 +C VIA nano ? + +C TODO +C * Consider using 4-way unrolling. We reach 4.5 c/l, but the code is 2.5 +C times larger. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_lshiftc) + FUNC_ENTRY(4) + lea -8(up,n,8), up + lea -8(rp,n,8), rp + shr R32(n) + mov (up), %rax + jnc L(evn) + + mov %rax, %r11 + shl R8(%rcx), %r11 + neg R8(%rcx) + shr R8(%rcx), %rax + test n, n + jnz L(gt1) + not %r11 + mov %r11, (rp) + FUNC_EXIT() + ret + +L(gt1): mov -8(up), %r8 + mov %r8, %r10 + shr R8(%rcx), %r8 + jmp L(lo1) + +L(evn): mov %rax, %r10 + neg R8(%rcx) + shr R8(%rcx), %rax + mov -8(up), %r9 + mov %r9, %r11 + shr R8(%rcx), %r9 + neg R8(%rcx) + lea 8(rp), rp + lea -8(up), up + jmp L(lo0) + +C ALIGN(16) +L(top): shl R8(%rcx), %r10 + or %r10, %r9 + shl R8(%rcx), %r11 + not %r9 + neg R8(%rcx) + mov -8(up), %r8 + lea -16(rp), rp + mov %r8, %r10 + shr R8(%rcx), %r8 + mov %r9, 8(rp) +L(lo1): or %r11, %r8 + mov -16(up), %r9 + mov %r9, %r11 + shr R8(%rcx), %r9 + lea -16(up), up + neg R8(%rcx) + not %r8 + mov %r8, (rp) +L(lo0): dec n + jg L(top) + +L(end): shl R8(%rcx), %r10 + or %r10, %r9 + not %r9 + shl R8(%rcx), %r11 + not %r11 + mov %r9, -8(rp) + mov %r11, -16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/atom/mul_1.asm b/gmp-6.3.0/mpn/x86_64/atom/mul_1.asm new file mode 100644 index 0000000..a0dcf1e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/mul_1.asm @@ -0,0 +1,147 @@ +dnl AMD64 mpn_mul_1 optimised for Intel Atom. + +dnl Copyright 2003-2005, 2007, 2008, 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
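Both Atom shift routines above, mpn_lshift and mpn_lshiftc, walk from the most significant limb downwards, which keeps them correct for the overlapping case rp >= up that mpn_lshift supports. Their contract in portable C (sketch; cnt must be in 1..63, conventions as in the earlier sketches):

    /* rp = up << cnt; return the bits shifted out of the top limb
       (mpn_lshift).  mpn_lshiftc additionally stores the ones'
       complement of every result limb, as the extra not insns show. */
    static limb ref_lshift(limb *rp, const limb *up, long n, unsigned cnt)
    {
        limb ret = up[n-1] >> (64 - cnt);
        for (long i = n - 1; i > 0; i--)
            rp[i] = (up[i] << cnt) | (up[i-1] >> (64 - cnt));
        rp[0] = up[0] << cnt;
        return ret;
    }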
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 3.03 +C AMD K10 3.03 +C AMD bull 4.74 +C AMD pile 4.56 +C AMD steam +C AMD excavator +C AMD bobcat 5.56 6.04 +C AMD jaguar 5.55 5.84 +C Intel P4 13.05 +C Intel core2 4.03 +C Intel NHM 3.80 +C Intel SBR 2.75 +C Intel IBR 2.69 +C Intel HWL 2.50 +C Intel BWL 2.55 +C Intel SKL 2.57 +C Intel atom 17.3 +C Intel SLM 14.7 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 + +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_1) + FUNC_ENTRY(4) + xor %r8, %r8 +L(com): mov (up), %rax + lea -16(up,n_param,8), up + lea -8(rp,n_param,8), rp + test $1, R8(n_param) + jnz L(bx1) + +L(bx0): mov %r8, %r9 + test $2, R8(n_param) + jnz L(b10) + +L(b00): mov $2, R32(n) + sub n_param, n + jmp L(lo0) + +L(bx1): test $2, R8(n_param) + jnz L(b11) + +L(b01): mov $3, R32(n) + sub n_param, n + mul v0 + cmp $2, n + jnz L(lo1) + jmp L(cj1) + +L(b11): mov $1, R32(n) + sub n_param, n + jmp L(lo3) + +L(b10): xor R32(n), R32(n) + sub n_param, n + jmp L(lo2) + +L(top): mul v0 + mov %r9, -24(rp,n,8) +L(lo1): xor %r9d, %r9d + add %rax, %r8 + mov (up,n,8), %rax + adc %rdx, %r9 + mov %r8, -16(rp,n,8) +L(lo0): xor %r8d, %r8d + mul v0 + add %rax, %r9 + mov 8(up,n,8), %rax + adc %rdx, %r8 + mov %r9, -8(rp,n,8) +L(lo3): xor %r9d, %r9d + mul v0 + add %rax, %r8 + mov 16(up,n,8), %rax + adc %rdx, %r9 + mov %r8, (rp,n,8) +L(lo2): xor %r8d, %r8d + mul v0 + add %rax, %r9 + mov 24(up,n,8), %rax + adc %rdx, %r8 + add $4, n + js L(top) + +L(end): mul v0 + mov %r9, -8(rp) +L(cj1): add %rax, %r8 + mov $0, R32(%rax) + adc %rdx, %rax + mov %r8, (rp) + FUNC_EXIT() + ret +EPILOGUE() + +PROLOGUE(mpn_mul_1c) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + jmp L(com) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/atom/mul_2.asm b/gmp-6.3.0/mpn/x86_64/atom/mul_2.asm new file mode 100644 index 0000000..4bc22cd --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/mul_2.asm @@ -0,0 +1,190 @@ +dnl AMD64 mpn_mul_2 optimised for Intel Atom. + +dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
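In the mul_1 code above, the mpn_mul_1c entry point only loads the caller's carry-in (zeroed by mpn_mul_1's xor) and falls into the shared code at L(com), so one loop serves both functions. The shared contract in portable C (sketch, conventions as before):

    /* rp = up * v0 + cy; return the high limb.  mpn_mul_1 is the
       cy == 0 special case of mpn_mul_1c. */
    static limb ref_mul_1c(limb *rp, const limb *up, long n, limb v0, limb cy)
    {
        for (long i = 0; i < n; i++) {
            unsigned __int128 t = (unsigned __int128) up[i] * v0 + cy;
            rp[i] = (limb) t;
            cy = (limb) (t >> 64);
        }
        return cy;
    }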
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb best +C AMD K8,K9 5.78 +C AMD K10 5.78 +C AMD bull 9.10 +C AMD pile 9.17 +C AMD steam +C AMD excavator +C AMD bobcat 11.3 +C AMD jaguar 10.9 +C Intel P4 24.6 +C Intel core2 8.06 +C Intel NHM 7.65 +C Intel SBR 6.28 +C Intel IBR 6.10 +C Intel HWL 6.09 +C Intel BWL 4.73 +C Intel SKL 4.77 +C Intel atom 35.3 +C Intel SLM 25.6 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vp', `%rcx') C r9 + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (up), %rax + + mov (vp), v0 + mov 8(vp), v1 + + mov n_param, n + mul v0 + + test $1, R8(n) + jnz L(bx1) + +L(bx0): test $2, R8(n) + jnz L(b10) + +L(b00): mov %rax, w0 + mov (up), %rax + mov %rdx, w1 + xor R32(w2), R32(w2) + lea -8(rp), rp + jmp L(lo0) + +L(b10): mov %rax, w2 + mov (up), %rax + mov %rdx, w3 + xor R32(w0), R32(w0) + lea -16(up), up + lea -24(rp), rp + jmp L(lo2) + +L(bx1): test $2, R8(n) + jnz L(b11) + +L(b01): mov %rax, w3 + mov %rdx, w0 + mov (up), %rax + xor R32(w1), R32(w1) + lea 8(up), up + dec n + jmp L(lo1) + +L(b11): mov %rax, w1 + mov (up), %rax + mov %rdx, w2 + xor R32(w3), R32(w3) + lea -8(up), up + lea -16(rp), rp + jmp L(lo3) + + ALIGN(16) +L(top): +L(lo1): mul v1 + add %rax, w0 + mov (up), %rax + mov $0, R32(w2) + mov w3, (rp) + adc %rdx, w1 + mul v0 + add %rax, w0 + mov (up), %rax + adc %rdx, w1 + adc $0, R32(w2) +L(lo0): mul v1 + add %rax, w1 + mov 8(up), %rax + mov w0, 8(rp) + adc %rdx, w2 + mul v0 + add %rax, w1 + mov 8(up), %rax + adc %rdx, w2 + mov $0, R32(w3) + adc $0, R32(w3) +L(lo3): mul v1 + add %rax, w2 + mov 16(up), %rax + mov w1, 16(rp) + mov $0, R32(w0) + adc %rdx, w3 + mul v0 + add %rax, w2 + mov 16(up), %rax + adc %rdx, w3 +L(lo2): mov $0, R32(w1) + mov w2, 24(rp) + adc $0, R32(w0) + mul v1 + add %rax, w3 + mov 24(up), %rax + lea 32(up), up + adc %rdx, w0 + mul v0 + lea 32(rp), rp + add %rax, w3 + adc %rdx, w0 + mov -8(up), %rax + adc $0, R32(w1) + sub $4, n + ja L(top) + +L(end): mul v1 + mov w3, (rp) + add %rax, w0 + adc %rdx, w1 + mov w0, 8(rp) + mov w1, %rax + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/atom/popcount.asm b/gmp-6.3.0/mpn/x86_64/atom/popcount.asm new file mode 100644 index 0000000..fb14dd3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/popcount.asm @@ -0,0 +1,35 @@ +dnl x86-64 mpn_popcount. + +dnl Copyright 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
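The mul_2 routine above multiplies {up,n} by the two-limb value {vp,2}, stores the n+1 low limbs of the product at rp, and returns the most significant limb. Functionally it is a fused mul_1/addmul_1 pair, which the interleaved w0..w3 loop performs in a single pass over up; a sketch in terms of the earlier ref_ helpers:

    /* rp[0..n] = up * {vp[0], vp[1]}; return limb n+1 of the product
       (the contract of the internal mpn_mul_2). */
    static limb ref_mul_2(limb *rp, const limb *up, long n, const limb *vp)
    {
        rp[n] = ref_mul_1c(rp, up, n, vp[0], 0);
        return ref_addmul_1(rp + 1, up, n, vp[1]);
    }

Fusing the two passes saves loop control and one full sweep over memory, which is why mul_2 exists as an internal building block at all.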
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_popcount) +include_mpn(`x86/pentium4/sse2/popcount.asm') diff --git a/gmp-6.3.0/mpn/x86_64/atom/redc_1.asm b/gmp-6.3.0/mpn/x86_64/atom/redc_1.asm new file mode 100644 index 0000000..62b9a84 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/redc_1.asm @@ -0,0 +1,579 @@ +dnl X86-64 mpn_redc_1 optimised for Intel Atom. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bull ? +C AMD pile ? +C AMD steam ? +C AMD bobcat 5.0 +C AMD jaguar ? +C Intel P4 ? +C Intel core ? +C Intel NHM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel atom ? +C VIA nano ? + +C TODO +C * Micro-optimise, none performed thus far. +C * Consider inlining mpn_add_n. +C * Single basecases out before the pushes. +C * Make lead-in code for the inner loops be more similar. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. 
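Before the register map below, a note on what mpn_redc_1 computes: given a 2n-limb value {up,2n}, an odd modulus {mp,n}, and u0inv = -1/mp[0] mod 2^64, it forms {up,2n} * 2^(-64n) mod {mp,n} by one Montgomery reduction step per limb, returning the carry of the final addition (the caller subtracts the modulus when that carry is set). A portable sketch of the whole function, reusing ref_addmul_1 from the addmul_1 commentary further up:

    /* Montgomery REDC, one limb at a time.  Clobbers {up,2n}. */
    static limb ref_redc_1(limb *rp, limb *up, const limb *mp, long n,
                           limb u0inv)
    {
        for (long i = 0; i < n; i++) {
            limb q = up[0] * u0inv;              /* zeroes low limb mod 2^64 */
            up[0] = ref_addmul_1(up, mp, n, q);  /* stash the addmul carry */
            up++;
        }
        /* add the n stashed carries (now at up[-n..-1]) to the high half */
        limb cy = 0;
        for (long i = 0; i < n; i++) {
            limb s = up[i] + cy;
            cy = s < cy;
            rp[i] = s + up[i - n];
            cy += rp[i] < s;
        }
        return cy;                               /* caller subtracts m on 1 */
    }

The assembly below specialises the outer loop per residue of n mod 4 (L(otp0)..L(otp3), with L(n1)..L(n4) basecases) and calls mpn_add_n for the final addition, as the TODO notes. The I() macro defined next implements the conservative-indexing fallback described in the comment above it.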
+define(`I',`$1') + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`mp_param', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`u0inv', `%r8') C stack + +define(`i', `%r14') +define(`j', `%r15') +define(`mp', `%r12') +define(`q0', `%r13') +define(`w0', `%rbp') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`ALIGNx', `ALIGN(16)') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_redc_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov (up), q0 + mov n, j C outer loop induction var + lea (mp_param,n,8), mp + lea (up,n,8), up + neg n + imul u0inv, q0 C first iteration q0 + + test $1, R8(n) + jz L(bx0) + +L(bx1): test $2, R8(n) + jz L(b3) + +L(b1): cmp $-1, R32(n) + jz L(n1) + +L(otp1):lea 1(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, %rbp + mov 8(mp,n,8), %rax + mov %rdx, %r9 + mul q0 + mov %rax, %rbx + mov 16(mp,n,8), %rax + mov %rdx, %r10 + mul q0 + add (up,n,8), %rbp + mov %rax, %rbp + adc %r9, %rbx + mov 24(mp,n,8), %rax + adc $0, %r10 + mov %rdx, %r9 + mul q0 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + mov %rax, %r11 + adc %r10, %rbp + mov 32(mp,n,8), %rax + adc $0, %r9 + imul u0inv, %rbx C next q limb + jmp L(e1) + + ALIGNx +L(tp1): mul q0 + add %rbp, -24(up,i,8) + mov %rax, %rbp + mov (mp,i,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 + mul q0 + add %r11, -16(up,i,8) + mov %rax, %r11 + mov 8(mp,i,8), %rax + adc %r10, %rbp + mov %rdx, %r10 + adc $0, %r9 + mul q0 + add %rbp, -8(up,i,8) + mov %rax, %rbp + adc %r9, %r11 + mov 16(mp,i,8), %rax + adc $0, %r10 + mov %rdx, %r9 + mul q0 + add %r11, (up,i,8) + mov %rax, %r11 + adc %r10, %rbp + mov 24(mp,i,8), %rax + adc $0, %r9 +L(e1): add $4, i + mov %rdx, %r10 + js L(tp1) + +L(ed1): mul q0 + add %rbp, I(-24(up),-24(up,i,8)) + adc %r9, %r11 + adc $0, %r10 + add %r11, I(-16(up),-16(up,i,8)) + adc %r10, %rax + adc $0, %rdx + add %rax, I(-8(up),-8(up,i,8)) + adc $0, %rdx + mov %rdx, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp1) + jmp L(cj) + +L(b3): cmp $-3, R32(n) + jz L(n3) + +L(otp3):lea 3(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, %rbp + mov 8(mp,n,8), %rax + mov %rdx, %r9 + mul q0 + mov %rax, %rbx + mov 16(mp,n,8), %rax + mov %rdx, %r10 + mul q0 + add (up,n,8), %rbp + mov %rax, %rbp + mov 24(mp,n,8), %rax + adc %r9, %rbx + mov %rdx, %r9 + adc $0, %r10 + mul q0 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + mov %rax, %r11 + mov 32(mp,n,8), %rax + adc %r10, %rbp + mov %rdx, %r10 + adc $0, %r9 + imul u0inv, %rbx C next q limb + jmp L(e3) + + ALIGNx +L(tp3): mul q0 + add %rbp, -24(up,i,8) + mov %rax, %rbp + mov (mp,i,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 + mul q0 + add %r11, -16(up,i,8) + mov %rax, %r11 + mov 8(mp,i,8), %rax + adc %r10, %rbp + mov %rdx, %r10 + adc $0, %r9 +L(e3): mul q0 + add %rbp, -8(up,i,8) + mov %rax, %rbp + adc %r9, %r11 + mov 16(mp,i,8), %rax + adc $0, %r10 + mov %rdx, %r9 + mul q0 + add %r11, (up,i,8) + mov %rax, %r11 + adc %r10, %rbp + mov 24(mp,i,8), %rax + adc $0, %r9 + add $4, i + mov %rdx, %r10 + js L(tp3) + +L(ed3): mul q0 + add %rbp, I(-24(up),-24(up,i,8)) + adc %r9, %r11 + adc $0, %r10 + add %r11, I(-16(up),-16(up,i,8)) + adc %r10, %rax + adc $0, %rdx + add %rax, I(-8(up),-8(up,i,8)) + adc $0, %rdx + mov %rdx, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C 
up++ + dec j + jnz L(otp3) +C jmp L(cj) + +L(cj): +IFSTD(` lea (up,n,8), up C param 2: up + lea (up,n,8), %rdx C param 3: up - n + neg R32(n) ') C param 4: n + +IFDOS(` lea (up,n,8), %rdx C param 2: up + lea (%rdx,n,8), %r8 C param 3: up - n + neg R32(n) + mov n, %r9 C param 4: n + mov rp, %rcx ') C param 1: rp + +IFSTD(` sub $8, %rsp ') +IFDOS(` sub $40, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_add_n) +IFSTD(` add $8, %rsp ') +IFDOS(` add $40, %rsp ') + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(bx0): test $2, R8(n) + jnz L(b2) + +L(b0): cmp $-4, R32(n) + jz L(n4) + +L(otp0):lea 4(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, %r11 + mov 8(mp,n,8), %rax + mov %rdx, %r10 + mul q0 + mov %rax, %rbx + mov 16(mp,n,8), %rax + mov %rdx, %r9 + mul q0 + add (up,n,8), %r11 + mov %rax, %r11 + adc %r10, %rbx + mov 24(mp,n,8), %rax + adc $0, %r9 + mov %rdx, %r10 + mul q0 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + mov %rax, %rbp + mov 32(mp,n,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 + imul u0inv, %rbx C next q limb + jmp L(e0) + + ALIGNx +L(tp0): mul q0 + add %rbp, -24(up,i,8) + mov %rax, %rbp + mov (mp,i,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 +L(e0): mul q0 + add %r11, -16(up,i,8) + mov %rax, %r11 + mov 8(mp,i,8), %rax + adc %r10, %rbp + mov %rdx, %r10 + adc $0, %r9 + mul q0 + add %rbp, -8(up,i,8) + mov %rax, %rbp + adc %r9, %r11 + mov 16(mp,i,8), %rax + adc $0, %r10 + mov %rdx, %r9 + mul q0 + add %r11, (up,i,8) + mov %rax, %r11 + adc %r10, %rbp + mov 24(mp,i,8), %rax + adc $0, %r9 + add $4, i + mov %rdx, %r10 + js L(tp0) + +L(ed0): mul q0 + add %rbp, I(-24(up),-24(up,i,8)) + adc %r9, %r11 + adc $0, %r10 + add %r11, I(-16(up),-16(up,i,8)) + adc %r10, %rax + adc $0, %rdx + add %rax, I(-8(up),-8(up,i,8)) + adc $0, %rdx + mov %rdx, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp0) + jmp L(cj) + +L(b2): cmp $-2, R32(n) + jz L(n2) + +L(otp2):lea 2(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, %r11 + mov 8(mp,n,8), %rax + mov %rdx, %r10 + mul q0 + mov %rax, %rbx + mov 16(mp,n,8), %rax + mov %rdx, %r9 + mul q0 + add (up,n,8), %r11 + mov %rax, %r11 + adc %r10, %rbx + mov 24(mp,n,8), %rax + adc $0, %r9 + mov %rdx, %r10 + mul q0 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + mov %rax, %rbp + mov 32(mp,n,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 + imul u0inv, %rbx C next q limb + jmp L(e2) + + ALIGNx +L(tp2): mul q0 + add %rbp, -24(up,i,8) + mov %rax, %rbp + mov (mp,i,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 + mul q0 + add %r11, -16(up,i,8) + mov %rax, %r11 + mov 8(mp,i,8), %rax + adc %r10, %rbp + mov %rdx, %r10 + adc $0, %r9 + mul q0 + add %rbp, -8(up,i,8) + mov %rax, %rbp + adc %r9, %r11 + mov 16(mp,i,8), %rax + adc $0, %r10 + mov %rdx, %r9 +L(e2): mul q0 + add %r11, (up,i,8) + mov %rax, %r11 + adc %r10, %rbp + mov 24(mp,i,8), %rax + adc $0, %r9 + add $4, i + mov %rdx, %r10 + js L(tp2) + +L(ed2): mul q0 + add %rbp, I(-24(up),-24(up,i,8)) + adc %r9, %r11 + adc $0, %r10 + add %r11, I(-16(up),-16(up,i,8)) + adc %r10, %rax + adc $0, %rdx + add %rax, I(-8(up),-8(up,i,8)) + adc $0, %rdx + mov %rdx, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp2) + jmp L(cj) + +L(n1): mov (mp_param), %rax + mul q0 + add -8(up), %rax + adc (up), %rdx + mov %rdx, (rp) + mov $0, R32(%rax) + adc R32(%rax), R32(%rax) + jmp L(ret) + +L(n2): mov (mp_param), %rax + mov -16(up), %rbp + mul 
q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov -8(up), %r10 + mul q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + add %r9, %r10 + adc $0, %r11 + mov %r10, q0 + imul u0inv, q0 C next q0 + mov -16(mp), %rax + mul q0 + add %rax, %r10 + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov (up), %r14 + mul q0 + add %rax, %r14 + adc $0, %rdx + add %r9, %r14 + adc $0, %rdx + xor R32(%rax), R32(%rax) + add %r11, %r14 + adc 8(up), %rdx + mov %r14, (rp) + mov %rdx, 8(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + ALIGNx +L(n3): mov -24(mp), %rax + mov -24(up), %r10 + mul q0 + add %rax, %r10 + mov -16(mp), %rax + mov %rdx, %r11 + adc $0, %r11 + mov -16(up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + add %r11, %rbp + mov -8(up), %r10 + adc $0, %r9 + mul q0 + mov %rbp, q0 + imul u0inv, q0 C next q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + mov %rbp, -16(up) + add %r9, %r10 + adc $0, %r11 + mov %r10, -8(up) + mov %r11, -24(up) C up[0] + lea 8(up), up C up++ + dec j + jnz L(n3) + + mov -48(up), %rdx + mov -40(up), %rbx + xor R32(%rax), R32(%rax) + add %rbp, %rdx + adc %r10, %rbx + adc -8(up), %r11 + mov %rdx, (rp) + mov %rbx, 8(rp) + mov %r11, 16(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + +L(n4): mov -32(mp), %rax + mul q0 + mov %rax, %r11 + mov -24(mp), %rax + mov %rdx, %r10 + mul q0 + mov %rax, %rbx + mov -16(mp), %rax + mov %rdx, %r9 + mul q0 + add -32(up), %r11 + mov %rax, %r11 + adc %r10, %rbx + mov -8(mp), %rax + adc $0, %r9 + mov %rdx, %r10 + mul q0 + add -24(up), %rbx + mov %rbx, -24(up) + adc %r9, %r11 + adc $0, %r10 + imul u0inv, %rbx C next q limb + add %r11, -16(up) + adc %r10, %rax + adc $0, %rdx + add %rax, -8(up) + adc $0, %rdx + mov %rdx, -32(up) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + dec j + lea 8(up), up C up++ + jnz L(n4) + jmp L(cj) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm b/gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm new file mode 100644 index 0000000..6f5f638 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm @@ -0,0 +1,287 @@ +dnl x86-64 mpn_rsh1add_n/mpn_rsh1sub_n. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO +C * Schedule loop less. It is now almost surely overscheduled, resulting in +C large feed-in and wind-down code. + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 ? 
+C Intel NMH ? +C Intel SBR ? +C Intel atom 5.25 +C VIA nano ? + +C INPUT PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`vp',`%rdx') +define(`n',`%rcx') + +ifdef(`OPERATION_rsh1add_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_rsh1add_n) + define(func_nc, mpn_rsh1add_nc)') +ifdef(`OPERATION_rsh1sub_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsh1sub_n) + define(func_nc, mpn_rsh1sub_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov (up), %r15 + ADDSUB (vp), %r15 + sbb R32(%rbx), R32(%rbx) + xor R32(%rax), R32(%rax) + shr %r15 + adc R32(%rax), R32(%rax) C return value + + mov R32(n), R32(%rbp) + and $3, R32(%rbp) + jz L(b0) + cmp $2, R32(%rbp) + jae L(b23) + +L(b1): dec n + jnz L(gt1) + shl $63, %rbx + add %rbx, %r15 + mov %r15, (rp) + jmp L(cj1) +L(gt1): lea 24(up), up + lea 24(vp), vp + mov -16(up), %r9 + add R32(%rbx), R32(%rbx) + mov -8(up), %r10 + lea 24(rp), rp + mov (up), %r11 + ADCSBB -16(vp), %r9 + ADCSBB -8(vp), %r10 + mov %r15, %r12 + ADCSBB (vp), %r11 + mov %r9, %r13 + sbb R32(%rbx), R32(%rbx) + mov %r11, %r15 + mov %r10, %r14 + shl $63, %r11 + shl $63, %r10 + shl $63, %r9 + or %r9, %r12 + shr %r13 + mov 8(up), %r8 + shr %r14 + or %r10, %r13 + shr %r15 + or %r11, %r14 + sub $4, n + jz L(cj5) +L(gt5): mov 16(up), %r9 + add R32(%rbx), R32(%rbx) + mov 24(up), %r10 + ADCSBB 8(vp), %r8 + mov %r15, %rbp + mov 32(up), %r11 + jmp L(lo1) + +L(b23): jnz L(b3) + mov 8(up), %r8 + sub $2, n + jnz L(gt2) + add R32(%rbx), R32(%rbx) + ADCSBB 8(vp), %r8 + mov %r8, %r12 + jmp L(cj2) +L(gt2): mov 16(up), %r9 + add R32(%rbx), R32(%rbx) + mov 24(up), %r10 + ADCSBB 8(vp), %r8 + mov %r15, %rbp + mov 32(up), %r11 + ADCSBB 16(vp), %r9 + lea 32(up), up + ADCSBB 24(vp), %r10 + mov %r9, %r13 + ADCSBB 32(vp), %r11 + mov %r8, %r12 + jmp L(lo2) + +L(b3): lea 40(up), up + lea 8(vp), vp + mov %r15, %r14 + add R32(%rbx), R32(%rbx) + mov -32(up), %r11 + ADCSBB 0(vp), %r11 + lea 8(rp), rp + sbb R32(%rbx), R32(%rbx) + mov %r11, %r15 + shl $63, %r11 + mov -24(up), %r8 + shr %r15 + or %r11, %r14 + sub $3, n + jnz L(gt3) + add R32(%rbx), R32(%rbx) + ADCSBB 8(vp), %r8 + jmp L(cj3) +L(gt3): mov -16(up), %r9 + add R32(%rbx), R32(%rbx) + mov -8(up), %r10 + ADCSBB 8(vp), %r8 + mov %r15, %rbp + mov (up), %r11 + ADCSBB 16(vp), %r9 + ADCSBB 24(vp), %r10 + mov %r8, %r12 + jmp L(lo3) + +L(b0): lea 48(up), up + lea 16(vp), vp + add R32(%rbx), R32(%rbx) + mov -40(up), %r10 + lea 16(rp), rp + mov -32(up), %r11 + ADCSBB -8(vp), %r10 + mov %r15, %r13 + ADCSBB (vp), %r11 + sbb R32(%rbx), R32(%rbx) + mov %r11, %r15 + mov %r10, %r14 + shl $63, %r11 + shl $63, %r10 + mov -24(up), %r8 + shr %r14 + or %r10, %r13 + shr %r15 + or %r11, %r14 + sub $4, n + jnz L(gt4) + add R32(%rbx), R32(%rbx) + ADCSBB 8(vp), %r8 + jmp L(cj4) +L(gt4): mov -16(up), %r9 + add R32(%rbx), R32(%rbx) + mov -8(up), %r10 + ADCSBB 8(vp), %r8 + mov %r15, %rbp + mov (up), %r11 + ADCSBB 16(vp), %r9 + jmp L(lo0) + + ALIGN(8) +L(top): mov 16(up), %r9 + shr %r14 + or %r10, %r13 + shr %r15 + or %r11, %r14 + add R32(%rbx), R32(%rbx) + mov 24(up), %r10 + mov %rbp, (rp) + ADCSBB 8(vp), %r8 + mov %r15, %rbp + lea 32(rp), rp + mov 32(up), %r11 +L(lo1): ADCSBB 16(vp), %r9 + lea 32(up), up + mov %r12, -24(rp) +L(lo0): ADCSBB 24(vp), %r10 + mov %r8, %r12 + mov %r13, -16(rp) +L(lo3): ADCSBB 32(vp), %r11 + mov %r9, %r13 + mov 
%r14, -8(rp) +L(lo2): sbb R32(%rbx), R32(%rbx) + shl $63, %r8 + mov %r11, %r15 + shr %r12 + mov %r10, %r14 + shl $63, %r9 + lea 32(vp), vp + shl $63, %r10 + or %r8, %rbp + shl $63, %r11 + or %r9, %r12 + shr %r13 + mov 8(up), %r8 + sub $4, n + jg L(top) + +L(end): shr %r14 + or %r10, %r13 + shr %r15 + or %r11, %r14 + mov %rbp, (rp) + lea 32(rp), rp +L(cj5): add R32(%rbx), R32(%rbx) + ADCSBB 8(vp), %r8 + mov %r12, -24(rp) +L(cj4): mov %r13, -16(rp) +L(cj3): mov %r8, %r12 + mov %r14, -8(rp) +L(cj2): sbb R32(%rbx), R32(%rbx) + shl $63, %r8 + shr %r12 + or %r8, %r15 + shl $63, %rbx + add %rbx, %r12 + mov %r15, (rp) + mov %r12, 8(rp) +L(cj1): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/atom/rshift.asm b/gmp-6.3.0/mpn/x86_64/atom/rshift.asm new file mode 100644 index 0000000..29c027d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/rshift.asm @@ -0,0 +1,121 @@ +dnl AMD64 mpn_rshift -- mpn right shift, optimised for Atom. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 ? +C Intel NHM ? +C Intel SBR ? +C Intel atom 4.5 +C VIA nano ? + +C TODO +C * Consider using 4-way unrolling. We reach 4 c/l, but the code is 2.5 times +C larger. 
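Returning to the rsh1add_n/rsh1sub_n code above: these internal entry points compute rp = (up + vp) >> 1, respectively rp = (up - vp) >> 1, returning the bit shifted out at the bottom; the carry (or borrow) of the full-width operation becomes the new top bit. A two-pass C sketch of the add flavour, where the assembly instead does everything in one heavily scheduled pass, hence the large feed-in and wind-down code its TODO mentions:

    /* rp = (up + vp) >> 1, n >= 1; return the low bit of up + vp
       (mpn_rsh1add_n; mpn_rsh1sub_n is the borrow-propagating analogue). */
    static limb ref_rsh1add_n(limb *rp, const limb *up, const limb *vp, long n)
    {
        limb s[n];                      /* C99 VLA, sketch only */
        limb cy = 0;
        for (long i = 0; i < n; i++) {
            limb t = up[i] + cy;
            cy = t < cy;
            s[i] = t + vp[i];
            cy += s[i] < t;
        }
        limb ret = s[0] & 1;            /* the bit shifted out */
        for (long i = 0; i < n - 1; i++)
            rp[i] = (s[i] >> 1) | (s[i+1] << 63);
        rp[n-1] = (s[n-1] >> 1) | (cy << 63);
        return ret;
    }

The plain mpn_rshift that follows has the same downward-combining structure minus the addition.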
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_rshift) + FUNC_ENTRY(4) + shr R32(n) + mov (up), %rax + jnc L(evn) + + mov %rax, %r11 + shr R8(cnt), %r11 + neg R8(cnt) + shl R8(cnt), %rax + test n, n + jnz L(gt1) + mov %r11, (rp) + FUNC_EXIT() + ret + +L(gt1): mov 8(up), %r8 + mov %r8, %r10 + shl R8(cnt), %r8 + jmp L(lo1) + +L(evn): mov %rax, %r10 + neg R8(cnt) + shl R8(cnt), %rax + mov 8(up), %r9 + mov %r9, %r11 + shl R8(cnt), %r9 + neg R8(cnt) + dec n + lea -8(rp), rp + lea 8(up), up + jz L(end) + + ALIGN(8) +L(top): shr R8(cnt), %r10 + or %r10, %r9 + shr R8(cnt), %r11 + neg R8(cnt) + mov 8(up), %r8 + mov %r8, %r10 + mov %r9, 8(rp) + shl R8(cnt), %r8 + lea 16(rp), rp +L(lo1): mov 16(up), %r9 + or %r11, %r8 + mov %r9, %r11 + shl R8(cnt), %r9 + lea 16(up), up + neg R8(cnt) + mov %r8, (rp) + dec n + jg L(top) + +L(end): shr R8(cnt), %r10 + or %r10, %r9 + shr R8(cnt), %r11 + mov %r9, 8(rp) + mov %r11, 16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm b/gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm new file mode 100644 index 0000000..1306acd --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm @@ -0,0 +1,242 @@ +dnl AMD64 mpn_sublsh1_n optimised for Intel Atom. +dnl Used also for AMD bd1. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO +C * This code is slightly large at 501 bytes. +C * aorrlsh1_n.asm and this file use the same basic pattern. + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bd1 2.3 +C AMD bobcat ? +C Intel P4 ? +C Intel core2 ? +C Intel NHM ? +C Intel SBR ? +C Intel atom 5 (4.875 is probably possible) +C VIA nano ? 
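This sublsh1_n routine computes rp = up - 2*vp. Its return value folds together the borrow of the subtraction and the bit shifted out of the top of 2*vp, so it lies in 0..2; the code keeps the two in %rax and %rbp and combines them with the add/neg at L(rtn). A portable sketch (conventions as in the earlier sketches):

    /* rp = up - 2*vp; return value in 0..2 (mpn_sublsh1_n).
       mpn_sublsh1_nc additionally takes such a value as carry-in. */
    static limb ref_sublsh1_n(limb *rp, const limb *up, const limb *vp, long n)
    {
        limb scy = 0;                   /* carry of the 1-bit left shift */
        limb bw  = 0;                   /* borrow of the subtraction */
        for (long i = 0; i < n; i++) {
            limb twice = (vp[i] << 1) | scy;
            scy = vp[i] >> 63;
            limb d = up[i] - twice;
            limb b2 = d > up[i];        /* borrow from up[i] - twice */
            rp[i] = d - bw;
            bw = b2 + (d < bw);
        }
        return scy + bw;
    }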
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cy', `%r8') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sublsh1_n) + FUNC_ENTRY(4) + push %rbp + push %r15 + xor R32(%rbp), R32(%rbp) +L(ent): mov R32(n), R32(%rax) + and $3, R32(%rax) + jz L(b0) + cmp $2, R32(%rax) + jz L(b2) + jg L(b3) + +L(b1): mov (vp), %r8 + add %r8, %r8 + lea 8(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + mov (up), %r15 + sbb %r8, %r15 + mov %r15, (rp) + sbb R32(%rbp), R32(%rbp) C save acy + lea 8(up), up + lea 8(rp), rp + jmp L(b0) + +L(b2): mov (vp), %r8 + add %r8, %r8 + mov 8(vp), %r9 + adc %r9, %r9 + lea 16(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + mov (up), %r15 + sbb %r8, %r15 + mov %r15, (rp) + mov 8(up), %r15 + sbb %r9, %r15 + mov %r15, 8(rp) + sbb R32(%rbp), R32(%rbp) C save acy + lea 16(up), up + lea 16(rp), rp + jmp L(b0) + +L(b3): mov (vp), %r8 + add %r8, %r8 + mov 8(vp), %r9 + adc %r9, %r9 + mov 16(vp), %r10 + adc %r10, %r10 + lea 24(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + mov (up), %r15 + sbb %r8, %r15 + mov %r15, (rp) + mov 8(up), %r15 + sbb %r9, %r15 + mov %r15, 8(rp) + mov 16(up), %r15 + sbb %r10, %r15 + mov %r15, 16(rp) + sbb R32(%rbp), R32(%rbp) C save acy + lea 24(up), up + lea 24(rp), rp + +L(b0): test $4, R8(n) + jz L(skp) + add R32(%rax), R32(%rax) C restore scy + mov (vp), %r8 + adc %r8, %r8 + mov 8(vp), %r9 + adc %r9, %r9 + mov 16(vp), %r10 + adc %r10, %r10 + mov 24(vp), %r11 + adc %r11, %r11 + lea 32(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + mov (up), %r15 + sbb %r8, %r15 + mov %r15, (rp) + mov 8(up), %r15 + sbb %r9, %r15 + mov %r15, 8(rp) + mov 16(up), %r15 + sbb %r10, %r15 + mov %r15, 16(rp) + mov 24(up), %r15 + sbb %r11, %r15 + mov %r15, 24(rp) + lea 32(up), up + lea 32(rp), rp + sbb R32(%rbp), R32(%rbp) C save acy + +L(skp): cmp $8, n + jl L(rtn) + + push %r12 + push %r13 + push %r14 + push %rbx + lea -64(rp), rp + jmp L(x) + + ALIGN(16) +L(top): mov (vp), %r8 + add R32(%rax), R32(%rax) + lea 64(vp), vp + adc %r8, %r8 + mov -56(vp), %r9 + adc %r9, %r9 + mov -48(vp), %r10 + adc %r10, %r10 + mov -40(vp), %r11 + adc %r11, %r11 + mov -32(vp), %r12 + adc %r12, %r12 + mov -24(vp), %r13 + adc %r13, %r13 + mov -16(vp), %r14 + adc %r14, %r14 + mov -8(vp), %r15 + adc %r15, %r15 + sbb R32(%rax), R32(%rax) + add R32(%rbp), R32(%rbp) + mov (up), %rbp + lea 64(rp), rp + mov 8(up), %rbx + sbb %r8, %rbp + mov 32(up), %r8 + mov %rbp, (rp) + sbb %r9, %rbx + mov 16(up), %rbp + mov %rbx, 8(rp) + sbb %r10, %rbp + mov 24(up), %rbx + mov %rbp, 16(rp) + sbb %r11, %rbx + mov %rbx, 24(rp) + sbb %r12, %r8 + mov 40(up), %r9 + mov %r8, 32(rp) + sbb %r13, %r9 + mov 48(up), %rbp + mov %r9, 40(rp) + sbb %r14, %rbp + mov 56(up), %rbx + mov %rbp, 48(rp) + sbb %r15, %rbx + lea 64(up), up + mov %rbx, 56(rp) + sbb R32(%rbp), R32(%rbp) +L(x): sub $8, n + jge L(top) + +L(end): pop %rbx + pop %r14 + pop %r13 + pop %r12 +L(rtn): + add R32(%rbp), R32(%rax) + neg R32(%rax) + + pop %r15 + pop %rbp + FUNC_EXIT() + ret +EPILOGUE() +PROLOGUE(mpn_sublsh1_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbp + push %r15 + neg %r8 C set CF + sbb R32(%rbp), R32(%rbp) C save acy + jmp L(ent) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bd1/README b/gmp-6.3.0/mpn/x86_64/bd1/README new file mode 100644 index 0000000..ccd210e --- 
/dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/README @@ -0,0 +1,11 @@ +This directory contains code for AMD bulldozer including its piledriver update. + +We currently make limited use of SIMD instructions, both via the MPN_PATH and +via inclusion of x86_64/fastsse files. + +The bd1 cores share one SIMD/FPU pipeline for two integer units. This probably +means that an all-core GMP load (such as a HPC load) might run slower if there +is significant SIMD dependency. + +We should perhaps allow a special 'bd1nosimd' pseudo cpu-name excluding any +SIMD code. diff --git a/gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm new file mode 100644 index 0000000..b54e91a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm @@ -0,0 +1,235 @@ +dnl AMD64 mpn_addmul_2 optimised for AMD Bulldozer. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 4.2 +C AMD bd2 4.4 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bt1 +C AMD bt2 +C Intel P4 +C Intel PNR +C Intel NHM +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel SKL +C Intel atom +C Intel SLM +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vp', `%rcx') C r9 + +define(`n', `%rcx') +define(`v0', `%rbx') +define(`v1', `%rbp') +define(`X0', `%r12') +define(`X1', `%r13') + +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_addmul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + push %r12 + push %r13 + + mov (vp), v0 + mov 8(vp), v1 + + mov (up), %rax + mov $0, R32(w2) C abuse w2 + + lea (up,n_param,8), up + lea (rp,n_param,8), rp + sub n_param, w2 + mul v0 + + test $1, R8(w2) + jnz L(bx1) + +L(bx0): mov %rdx, X0 + mov %rax, X1 + test $2, R8(w2) + jnz L(b10) + +L(b00): lea (w2), n C un = 4, 8, 12, ... + mov (up,w2,8), %rax + mov (rp,w2,8), w3 + mul v1 + mov %rax, w0 + mov 8(up,w2,8), %rax + mov %rdx, w1 + jmp L(lo0) + +L(b10): lea 2(w2), n C un = 2, 6, 10, ... + mov (up,w2,8), %rax + mov (rp,w2,8), w1 + mul v1 + mov %rdx, w3 + mov %rax, w2 + mov -8(up,n,8), %rax + test n, n + jz L(end) + jmp L(top) + +L(bx1): mov %rax, X0 + mov %rdx, X1 + test $2, R8(w2) + jz L(b11) + +L(b01): lea 1(w2), n C un = 1, 5, 9, ... 
+ mov (up,w2,8), %rax + mul v1 + mov (rp,w2,8), w2 + mov %rdx, w0 + mov %rax, w3 + jmp L(lo1) + +L(b11): lea -1(w2), n C un = 3, 7, 11, ... + mov (up,w2,8), %rax + mul v1 + mov (rp,w2,8), w0 + mov %rax, w1 + mov 8(up,w2,8), %rax + mov %rdx, w2 + jmp L(lo3) + + ALIGN(32) +L(top): +L(lo2): mul v0 + add w1, X1 + mov X1, -16(rp,n,8) + mov %rdx, X1 + adc %rax, X0 + adc $0, X1 + mov -8(up,n,8), %rax + mul v1 + mov -8(rp,n,8), w1 + mov %rdx, w0 + add w1, w2 + adc %rax, w3 + adc $0, w0 +L(lo1): mov (up,n,8), %rax + mul v0 + add w2, X0 + mov X0, -8(rp,n,8) + mov %rdx, X0 + adc %rax, X1 + mov (up,n,8), %rax + adc $0, X0 + mov (rp,n,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + mov 8(up,n,8), %rax + mov %rdx, w1 + adc $0, w1 +L(lo0): mul v0 + add w3, X1 + mov X1, (rp,n,8) + adc %rax, X0 + mov 8(up,n,8), %rax + mov %rdx, X1 + adc $0, X1 + mov 8(rp,n,8), w3 + mul v1 + add w3, w0 + adc %rax, w1 + mov 16(up,n,8), %rax + mov %rdx, w2 + adc $0, w2 +L(lo3): mul v0 + add w0, X0 + mov X0, 8(rp,n,8) + mov %rdx, X0 + adc %rax, X1 + adc $0, X0 + mov 16(up,n,8), %rax + mov 16(rp,n,8), w0 + mul v1 + mov %rdx, w3 + add w0, w1 + adc %rax, w2 + adc $0, w3 + mov 24(up,n,8), %rax + add $4, n + jnc L(top) + +L(end): mul v0 + add w1, X1 + mov X1, -16(rp) + mov %rdx, X1 + adc %rax, X0 + adc $0, X1 + mov -8(up), %rax + mul v1 + mov -8(rp), w1 + add w1, w2 + adc %rax, w3 + adc $0, %rdx + add w2, X0 + adc $0, X1 + mov X0, -8(rp) + add w3, X1 + mov X1, (rp) + adc $0, %rdx + mov %rdx, %rax + + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm new file mode 100644 index 0000000..c34a5fa --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_addlsh1_n and mpn_rsblsh1_n + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc) +include_mpn(`x86_64/atom/aorrlsh1_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm new file mode 100644 index 0000000..5516c9d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm @@ -0,0 +1,38 @@ +dnl X86-64 mpn_addlsh_n and mpn_rsblsh_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+include_mpn(`x86_64/aorrlsh_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm b/gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm
new file mode 100644
index 0000000..143c42e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_add_n, mpn_sub_n, optimised for AMD bd1.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+include_mpn(`x86_64/coreihwl/aors_n.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm
new file mode 100644
index 0000000..fc0d2fe
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm
@@ -0,0 +1,190 @@
+dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD Bulldozer.
+
+dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 3.30 3.58 +C AMD K10 3.09 +C AMD bull 4.47 4.72 +C AMD pile 4.66 +C AMD steam +C AMD excavator +C AMD bobcat 6.30 +C AMD jaguar 6.29 +C Intel P4 17.3 17.8 +C Intel core2 5.13 +C Intel NHM 4.85 +C Intel SBR 3.83 +C Intel IBR 3.75 +C Intel HWL 3.45 +C Intel BWL 2.56 +C Intel SKL 2.53 +C Intel atom 20.3 +C Intel SLM 9 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Try to make loop run closer to 4 c/l in Bulldozer and Piledriver. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 + +define(`n', `%r11') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +IFDOS(` define(`up', ``%rsi'') ') dnl +IFDOS(` define(`rp', ``%rcx'') ') dnl +IFDOS(` define(`v0', ``%r9'') ') dnl +IFDOS(` define(`r9', ``rdi'') ') dnl +IFDOS(` define(`n', ``%r8'') ') dnl +IFDOS(` define(`r8', ``r11'') ') dnl + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + + mov (up), %rax C read first u limb early + push %rbx +IFSTD(` mov n_param, %rbx ') C move away n from rdx, mul uses it +IFDOS(` mov n, %rbx ') + mul v0 + +IFSTD(` mov %rbx, n ') + + and $3, R32(%rbx) + lea -16(rp,n,8), rp + jz L(b0) + cmp $2, R32(%rbx) + jb L(b1) + jz L(b2) + +L(b3): mov $0, R32(%r8) + mov %rax, %rbx + mov $0, R32(%r9) + mov 8(up), %rax + mov %rdx, %r10 + lea (up,n,8), up + not n + jmp L(L3) + +L(b0): mov $0, R32(%r10) + mov %rax, %r8 + mov %rdx, %rbx + mov 8(up), %rax + lea (up,n,8), up + neg n + jmp L(L0) + +L(b1): cmp $1, n + jz L(n1) + mov %rax, %r9 + mov 8(up), %rax + mov %rdx, %r8 + mov $0, R32(%rbx) + lea (up,n,8), up + neg n + inc n + jmp L(L1) + +L(b2): mov $0, R32(%rbx) + mov %rax, %r10 + mov %rdx, %r9 + mov 8(up), %rax + mov $0, R32(%r8) + lea (up,n,8), up + neg n + add $2, n + jns L(end) + + ALIGN(32) +L(top): mul v0 + ADDSUB %r10, (rp,n,8) + adc %rax, %r9 + mov (up,n,8), %rax + adc %rdx, %r8 +L(L1): mul v0 + mov $0, R32(%r10) + ADDSUB %r9, 8(rp,n,8) + adc %rax, %r8 + adc %rdx, %rbx + mov 8(up,n,8), %rax +L(L0): mul v0 + ADDSUB %r8, 16(rp,n,8) + mov $0, R32(%r8) + adc %rax, %rbx + mov $0, R32(%r9) + mov 16(up,n,8), %rax + adc %rdx, %r10 +L(L3): mul v0 + ADDSUB %rbx, 24(rp,n,8) + mov $0, R32(%rbx) + adc %rax, %r10 + adc %rdx, %r9 + mov 24(up,n,8), %rax + add $4, n + js L(top) + +L(end): mul v0 + ADDSUB %r10, (rp) + adc %r9, %rax + adc %r8, %rdx +L(n1): ADDSUB %rax, 8(rp) + adc $0, %rdx + mov %rdx, %rax + + pop %rbx 
+IFDOS(``pop %rdi '') +IFDOS(``pop %rsi '') + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/bd1/com.asm b/gmp-6.3.0/mpn/x86_64/bd1/com.asm new file mode 100644 index 0000000..43f3561 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/com.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_com optimised for AMD bd1. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_com) +include_mpn(`x86_64/fastsse/com-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bd1/copyd.asm b/gmp-6.3.0/mpn/x86_64/bd1/copyd.asm new file mode 100644 index 0000000..675cdc3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/copyd.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyd optimised for AMD bd1. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyd) +include_mpn(`x86_64/fastsse/copyd-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bd1/copyi.asm b/gmp-6.3.0/mpn/x86_64/bd1/copyi.asm new file mode 100644 index 0000000..ceef036 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/copyi.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyi optimised for AMD bd1. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyi) +include_mpn(`x86_64/fastsse/copyi-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm new file mode 100644 index 0000000..4723093 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_11. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_11) +include_mpn(`x86_64/core2/gcd_11.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h new file mode 100644 index 0000000..210f382 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h @@ -0,0 +1,265 @@ +/* AMD bd1 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3600-3800 MHz Bulldozer Zambezi */ +/* FFT tuning limit = 464,627,200 */ +/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 31 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 2 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 27 + +#define DIV_1_VS_MUL_1_PERCENT 275 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 57 +#define MUL_TOOM44_THRESHOLD 161 +#define MUL_TOOM6H_THRESHOLD 226 +#define MUL_TOOM8H_THRESHOLD 339 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 61 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 108 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 91 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 24 +#define SQR_TOOM3_THRESHOLD 85 +#define SQR_TOOM4_THRESHOLD 234 +#define SQR_TOOM6_THRESHOLD 286 +#define SQR_TOOM8_THRESHOLD 466 + +#define MULMID_TOOM42_THRESHOLD 20 + +#define MULMOD_BNM1_THRESHOLD 12 +#define SQRMOD_BNM1_THRESHOLD 15 + +#define MUL_FFT_MODF_THRESHOLD 412 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 412, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 25, 7}, { 13, 6}, \ + { 28, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 99,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 103,12}, { 31,11}, { 63, 7}, \ + { 1023, 8}, { 543, 9}, { 303,10}, { 167,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255,11}, { 143,10}, { 287,11}, { 159,12}, \ + { 95,11}, { 191,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543,11}, \ + { 287,12}, { 159,11}, { 319,10}, { 639,11}, \ + { 351,12}, { 191,11}, { 383,10}, { 767,12}, \ + { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \ + { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \ + { 639,10}, { 1279,11}, { 671,12}, { 351,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,12}, { 447,14}, { 127,13}, { 255,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,10}, \ + { 2175,12}, { 575,11}, { 1151,12}, { 607,13}, \ + { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \ + { 1343,10}, { 2687,12}, { 703,11}, { 1407,13}, \ + { 383,12}, { 767,11}, { 1535,12}, { 799,11}, \ + { 1599,12}, { 831,13}, { 447,12}, { 895,14}, \ + { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \ + { 1087,11}, { 2175,13}, { 
575,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1343,11}, { 2687,13}, \ + { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \ + { 1599,13}, { 831,12}, { 1727,11}, { 3455,13}, \ + { 895,15}, { 255,14}, { 511,13}, { 1023,12}, \ + { 2047,13}, { 1087,12}, { 2175,13}, { 1215,12}, \ + { 2431,11}, { 4863,14}, { 639,13}, { 1343,12}, \ + { 2687,13}, { 1471,12}, { 2943,11}, { 5887,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \ + { 3455,14}, { 895,13}, { 1919,15}, { 511,14}, \ + { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2815,12}, { 5631,13}, { 2943,12}, { 5887,15}, \ + { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \ + { 3455,12}, { 6911,14}, { 1791,13}, { 3583,14}, \ + { 1919,13}, { 3839,16}, { 511,15}, { 1023,14}, \ + { 2175,13}, { 4479,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \ + { 4479,15}, { 2303,14}, { 4863,15}, { 2559,14}, \ + { 5247,15}, { 2815,14}, { 5887,13}, { 11775,16}, \ + { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,13}, { 15359,17}, { 1023,16}, { 2047,15}, \ + { 4351,14}, { 8959,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,15}, { 7935,17}, \ + { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \ + { 9983,14}, { 19967,16}, { 5119,15}, { 10239,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 251 +#define MUL_FFT_THRESHOLD 4544 + +#define SQR_FFT_MODF_THRESHOLD 364 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 364, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 159,11}, { 95, 7}, \ + { 1535, 8}, { 799, 7}, { 1599, 8}, { 831, 9}, \ + { 447,10}, { 239,11}, { 127,10}, { 255,11}, \ + { 143,10}, { 303,11}, { 159,12}, { 95,11}, \ + { 191,10}, { 383,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 303,12}, { 159,11}, \ + { 351,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,12}, { 223,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,12}, { 287,11}, \ + { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \ + { 639,10}, { 1279,11}, { 671,12}, { 351,13}, \ + { 191,12}, { 383,11}, { 767,10}, { 1535,12}, \ + { 415,11}, { 831,12}, { 447,14}, { 127,13}, \ + { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \ + { 1087,10}, { 2175,12}, { 575,11}, { 1151,12}, \ + { 607,13}, { 319,12}, { 639,11}, { 1279,12}, \ + { 671,11}, { 1343,12}, { 703,11}, { 1407,12}, \ + { 735,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 799,11}, { 1599,12}, { 831,13}, { 447,12}, \ + { 895,14}, { 255,13}, { 511,12}, { 1023,11}, \ + { 2047,12}, { 1087,11}, { 2175,13}, { 575,12}, \ + { 1151,11}, { 2303,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1343,13}, { 703,12}, { 1407,14}, \ + { 383,13}, { 767,12}, { 1599,11}, { 3199,13}, \ + { 831,12}, { 1727,11}, { 3455,13}, { 895,15}, \ + { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1087,12}, { 
2175,13}, { 1151,12}, { 2303,13}, \ + { 1215,12}, { 2431,14}, { 639,13}, { 1343,12}, \ + { 2687,13}, { 1471,12}, { 2943,11}, { 5887,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \ + { 3455,11}, { 6911,14}, { 895,13}, { 1791,12}, \ + { 3583,13}, { 1919,12}, { 3839,15}, { 511,14}, \ + { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2943,12}, { 5887,11}, { 11775,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \ + { 6911,14}, { 1791,13}, { 3583,14}, { 1919,13}, \ + { 3839,16}, { 511,15}, { 1023,14}, { 2175,13}, \ + { 4351,12}, { 8703,13}, { 4479,12}, { 8959,14}, \ + { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2815,13}, { 5631,14}, { 2943,13}, \ + { 5887,12}, { 11775,15}, { 1535,14}, { 3455,13}, \ + { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \ + { 1023,15}, { 2047,14}, { 4351,13}, { 8703,14}, \ + { 4479,13}, { 8959,15}, { 2303,14}, { 4991,13}, \ + { 9983,15}, { 2559,14}, { 5119,15}, { 2815,14}, \ + { 5887,13}, { 11775,16}, { 1535,15}, { 3071,14}, \ + { 6143,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,13}, { 15359,17}, { 1023,16}, { 2047,15}, \ + { 4095,14}, { 8191,15}, { 4351,14}, { 8959,15}, \ + { 4863,14}, { 9983,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7679,14}, { 15359,15}, { 7935,14}, { 15871,17}, \ + { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \ + { 9983,14}, { 19967,16}, { 5119,15}, { 10239,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 275 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 23 +#define MULLO_MUL_N_THRESHOLD 8907 +#define SQRLO_BASECASE_THRESHOLD 9 +#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */ +#define SQRLO_SQR_THRESHOLD 6440 + +#define DC_DIV_QR_THRESHOLD 52 +#define DC_DIVAPPR_Q_THRESHOLD 167 +#define DC_BDIV_QR_THRESHOLD 48 +#define DC_BDIV_Q_THRESHOLD 93 + +#define INV_MULMOD_BNM1_THRESHOLD 38 +#define INV_NEWTON_THRESHOLD 197 +#define INV_APPR_THRESHOLD 179 + +#define BINV_NEWTON_THRESHOLD 230 +#define REDC_1_TO_REDC_2_THRESHOLD 32 +#define REDC_2_TO_REDC_N_THRESHOLD 55 + +#define MU_DIV_QR_THRESHOLD 1387 +#define MU_DIVAPPR_Q_THRESHOLD 1387 +#define MUPI_DIV_QR_THRESHOLD 92 +#define MU_BDIV_QR_THRESHOLD 1142 +#define MU_BDIV_Q_THRESHOLD 1334 + +#define POWM_SEC_TABLE 1,22,194,434,452 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 20 +#define SET_STR_DC_THRESHOLD 438 +#define SET_STR_PRECOMPUTE_THRESHOLD 1254 + +#define FAC_DSC_THRESHOLD 189 +#define FAC_ODD_THRESHOLD 26 + +#define MATRIX22_STRASSEN_THRESHOLD 14 +#define HGCD2_DIV1_METHOD 3 /* 2.31% faster than 4 */ +#define HGCD_THRESHOLD 104 +#define HGCD_APPR_THRESHOLD 52 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 465 +#define GCDEXT_DC_THRESHOLD 283 +#define JACOBI_BASE_METHOD 4 /* 5.81% faster than 1 */ + +/* Tuneup completed successfully, took 554602 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm b/gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm new file mode 100644 index 0000000..799cdda --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm @@ -0,0 +1,206 @@ +dnl AMD64 SSSE3/XOP mpn_hamdist -- hamming distance. + +dnl Copyright 2010-2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb good for cpu? +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 1.51-2.0 y +C AMD bd2 1.50-1.9 y +C AMD bd3 ? +C AMD bd4 ? +C AMD zen n/a +C AMD bobcat n/a +C AMD jaguar n/a +C Intel P4 n/a +C Intel PNR n/a +C Intel NHM n/a +C Intel SBR n/a +C Intel IBR n/a +C Intel HWL n/a +C Intel BWL n/a +C Intel SKL n/a +C Intel atom n/a +C Intel SLM n/a +C VIA nano n/a + +C TODO +C * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we +C intend to support old systems. + +C We use vpshlb and vpperm below, which are XOP extensions to AVX. Some +C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX. +C We fall back to the core2 code. +ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',` +MULFUNC_PROLOGUE(mpn_hamdist) +include_mpn(`x86_64/core2/hamdist.asm') +',` + +define(`up', `%rdi') +define(`vp', `%rsi') +define(`n', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_hamdist) + FUNC_ENTRY(3) + cmp $5, n + jl L(sma) + + lea L(cnsts)(%rip), %r9 + + xor R32(%r10), R32(%r10) + test $8, R8(vp) + jz L(ali) + mov (up), %r8 + xor (vp), %r8 + add $8, up + add $8, vp + dec n + popcnt %r8, %r10 +L(ali): + +ifdef(`PIC', `define(`OFF1',16) define(`OFF2',32) define(`OFF3',48)', + `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)') + movdqa OFF1`'(%r9), %xmm7 C nibble counts table + movdqa OFF2`'(%r9), %xmm6 C splat shift counts + movdqa OFF3`'(%r9), %xmm5 C masks + pxor %xmm4, %xmm4 + pxor %xmm8, %xmm8 C grand total count + + mov R32(n), R32(%rax) + and $6, R32(%rax) + lea -64(up,%rax,8), up + lea -64(vp,%rax,8), vp +ifdef(`PIC',` + movslq (%r9,%rax,2), %r11 + add %r9, %r11 + jmp *%r11 +',` + jmp *(%r9,%rax,4) +') + +L(0): add $64, up + add $64, vp + sub $2, n + + ALIGN(32) +L(top): lddqu (up), %xmm0 + pxor (vp), %xmm0 + .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1 + pand %xmm5, %xmm0 + pand %xmm5, %xmm1 + .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2 + .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(6): lddqu 16(up), %xmm0 + pxor 16(vp), %xmm0 + .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1 + pand %xmm5, %xmm0 + pand %xmm5, %xmm1 + .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2 + .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(4): lddqu 32(up), %xmm0 + pxor 32(vp), %xmm0 + .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, 
%xmm0, %xmm1 + pand %xmm5, %xmm0 + pand %xmm5, %xmm1 + .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2 + .byte 0x8f,0xe9,0x78,0xd3,0xc4 C vphaddubq %xmm4, %xmm0 + .byte 0x8f,0xe8,0x40,0xa3,0xe7,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm4 + paddb %xmm2, %xmm3 + paddb %xmm2, %xmm4 + paddq %xmm0, %xmm8 C sum to 2 x 64-bit counts +L(2): mov 48(up), %r8 + mov 56(up), %r9 + add $64, up + xor 48(vp), %r8 + xor 56(vp), %r9 + add $64, vp + popcnt %r8, %r8 + popcnt %r9, %r9 + add %r8, %r10 + add %r9, %r10 + sub $8, n + jg L(top) + + test $1, R8(n) + jz L(x) + mov (up), %r8 + xor (vp), %r8 + popcnt %r8, %r8 + add %r8, %r10 +L(x): .byte 0x8f,0xe9,0x78,0xd3,0xc4 C vphaddubq %xmm4, %xmm0 + paddq %xmm0, %xmm8 + pshufd $14, %xmm8, %xmm0 + paddq %xmm8, %xmm0 + movd %xmm0, %rax + add %r10, %rax + FUNC_EXIT() + ret + +L(sma): mov (up), %r8 + xor (vp), %r8 + popcnt %r8, %rax + dec n + jz L(ed) +L(tp): mov 8(up), %r8 + add $8, up + xor 8(vp), %r8 + add $8, vp + popcnt %r8, %r8 + add %r8, %rax + dec n + jnz L(tp) +L(ed): FUNC_EXIT() + ret +EPILOGUE() +DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') + JMPENT( L(0), L(cnsts)) + JMPENT( L(2), L(cnsts)) + JMPENT( L(4), L(cnsts)) + JMPENT( L(6), L(cnsts)) + .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03 + .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04 + .byte -4,-4,-4,-4,-4,-4,-4,-4 + .byte -4,-4,-4,-4,-4,-4,-4,-4 + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f +END_OBJECT(L(cnsts)) +') diff --git a/gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm b/gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm new file mode 100644 index 0000000..2fb097f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm @@ -0,0 +1,193 @@ +dnl AMD64 mpn_mul_1 optimised for AMD Bulldozer. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 3.65 +C AMD K10 3.30 3.68 +C AMD bull 4.04 4.29 +C AMD pile 4.33 +C AMD steam +C AMD excavator +C AMD bobcat 5.73 +C AMD jaguar 5.87 +C Intel P4 12.5 +C Intel core2 4.38 +C Intel NHM 4.28 +C Intel SBR 2.69 +C Intel IBR 2.55 +C Intel HWL 2.41 +C Intel BWL 2.49 +C Intel SKL 2.50 +C Intel atom 20.3 +C Intel SLM 7.8 +C VIA nano 4.25 + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Move loop code into feed-in blocks, to save insn for zeroing regs. 
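For orientation, the function defined below computes {rp,n} = {up,n} * v0 and returns the carry-out limb. A portable C model — an illustration only, not the GMP source; it assumes 64-bit limbs and the GCC/Clang unsigned __int128 extension — looks like this:

typedef unsigned long long mp_limb_t;	/* assumption: 64-bit limbs */

mp_limb_t
ref_mul_1 (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v0)
{
  mp_limb_t cy = 0;			/* carry limb between iterations */
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * v0 + cy;
      rp[i] = (mp_limb_t) p;		/* low half of the product */
      cy = (mp_limb_t) (p >> 64);	/* high half carries into the next limb */
    }
  return cy;				/* the asm returns this in %rax */
}

mpn_mul_1c is the same computation with a caller-supplied initial carry, which is why the mpn_mul_1c prologue below merely adds the carry argument into the first product and branches to L(common).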
+ +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 + +define(`n', `%rbx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +IFDOS(` define(`up', ``%rsi'') ') dnl +IFDOS(` define(`rp', ``%rcx'') ') dnl +IFDOS(` define(`v0', ``%r9'') ') dnl +IFDOS(` define(`r9', ``rdi'') ') dnl +IFDOS(` define(`n', ``%r8'') ') dnl +IFDOS(` define(`r8', ``rbx'') ') dnl + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_1c) +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + + mov (up), %rax C read first u limb early + push %rbx +IFSTD(` mov n_param, %r11 ') C move away n from rdx, mul uses it +IFDOS(` mov n, %r11 ') + mul v0 + +IFSTD(` add %r8, %rax ') +IFDOS(` add 64(%rsp), %rax ') C 40 + 3*8 (3 push insns) + adc $0, %rdx + jmp L(common) + +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_mul_1) +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + + mov (up), %rax C read first u limb early + push %rbx +IFSTD(` mov n_param, %r11 ') C move away n from rdx, mul uses it +IFDOS(` mov n, %r11 ') + mul v0 + +L(common): +IFSTD(` mov %r11, n ') + + and $3, R32(%r11) + lea -16(rp,n,8), rp + jz L(b0) + cmp $2, R32(%r11) + jb L(b1) + jz L(b2) + +L(b3): mov %rax, %r10 + mov %rdx, %r11 + mov 8(up), %rax + mul v0 + lea (up,n,8), up + not n + jmp L(L3) + +L(b0): mov %rax, %r9 + mov %rdx, %r10 + mov 8(up), %rax + lea (up,n,8), up + neg n + jmp L(L0) + +L(b1): mov %rax, %r8 + cmp $1, n + jz L(n1) + mov %rdx, %r9 + lea (up,n,8), up + neg n + mov %r8, 16(rp,n,8) + inc n + jmp L(L1) + +L(b2): mov %rax, %r11 + mov %rdx, %r8 + mov 8(up), %rax + lea (up,n,8), up + neg n + add $2, n + jns L(end) + + ALIGN(16) +L(top): mul v0 + mov %rdx, %r9 + add %rax, %r8 + adc $0, %r9 + mov %r8, 8(rp,n,8) + mov %r11, (rp,n,8) +L(L1): mov (up,n,8), %rax + mul v0 + add %rax, %r9 + mov %rdx, %r10 + mov 8(up,n,8), %rax + adc $0, %r10 +L(L0): mul v0 + add %rax, %r10 + mov %rdx, %r11 + mov 16(up,n,8), %rax + adc $0, %r11 + mul v0 + mov %r9, 16(rp,n,8) +L(L3): add %rax, %r11 + mov %r10, 24(rp,n,8) + mov %rdx, %r8 + adc $0, %r8 + add $4, n + mov -8(up,n,8), %rax + js L(top) + +L(end): mul v0 + add %rax, %r8 + adc $0, %rdx + mov %r11, (rp) +L(n1): mov %r8, 8(rp) + mov %rdx, %rax + + pop %rbx +IFDOS(``pop %rdi '') +IFDOS(``pop %rsi '') + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm b/gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm new file mode 100644 index 0000000..85fa7aa --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm @@ -0,0 +1,195 @@ +dnl AMD64 mpn_mul_2 optimised for AMD Bulldozer. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 6.78 +C AMD K10 6.78 +C AMD bd1 8.39 8.65 +C AMD bd2 8.47 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bt1 12.1 +C AMD bt2 11.5 +C Intel P4 24.0 +C Intel PNR 8.14 +C Intel NHM 7.78 +C Intel SBR 6.34 +C Intel IBR 6.15 +C Intel HWL 6.04 +C Intel BWL 4.33 +C Intel SKL 4.41 +C Intel atom 39.5 +C Intel SLM 27.8 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vp', `%rcx') C r9 + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (up), %rax + + mov (vp), v0 + mov 8(vp), v1 + + lea (up,n_param,8), up + lea (rp,n_param,8), rp + + mov n_param, n + mul v0 + neg n + + test $1, R8(n) + jnz L(bx1) + +L(bx0): test $2, R8(n) + jnz L(b10) + +L(b00): mov %rax, w0 + mov %rdx, w1 + xor R32(w2), R32(w2) + mov (up,n,8), %rax + jmp L(lo0) + +L(b10): mov %rax, w2 + mov %rdx, w3 + mov (up,n,8), %rax + xor R32(w0), R32(w0) + mul v1 + add $-2, n + jmp L(lo2) + +L(bx1): test $2, R8(n) + jz L(b11) + +L(b01): mov %rax, w3 + mov %rdx, w0 + mov (up,n,8), %rax + mul v1 + xor R32(w1), R32(w1) + inc n + jmp L(lo1) + +L(b11): mov %rax, w1 + mov %rdx, w2 + mov (up,n,8), %rax + xor R32(w3), R32(w3) + dec n + jmp L(lo3) + + ALIGN(32) +L(top): mov -8(up,n,8), %rax + mul v1 + mov w2, -16(rp,n,8) +L(lo1): add %rax, w0 + mov w3, -8(rp,n,8) + adc %rdx, w1 + mov (up,n,8), %rax + mul v0 + mov $0, R32(w2) + add %rax, w0 + adc %rdx, w1 + adc $0, R32(w2) + mov (up,n,8), %rax +L(lo0): mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,n,8), %rax + mul v0 + add %rax, w1 + mov w0, (rp,n,8) + mov $0, R32(w3) + mov 8(up,n,8), %rax + adc %rdx, w2 + adc $0, R32(w3) +L(lo3): mul v1 + add %rax, w2 + mov 16(up,n,8), %rax + adc %rdx, w3 + mul v0 + add %rax, w2 + mov 16(up,n,8), %rax + mov $0, R32(w0) + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov w1, 8(rp,n,8) +L(lo2): add %rax, w3 + adc %rdx, w0 + mov 24(up,n,8), %rax + mul v0 + add %rax, w3 + adc %rdx, w0 + mov $0, R32(w1) + adc $0, R32(w1) + add $4, n + jnc L(top) + +L(end): mov -8(up), %rax + mul v1 + mov w2, -16(rp) + add %rax, w0 + mov w3, -8(rp) + adc %rdx, w1 + mov w0, (rp) + mov w1, %rax + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm new file mode 100644 index 0000000..e47ba58 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm @@ -0,0 +1,416 @@ +dnl AMD64 mpn_mul_basecase optimised for AMD Bulldozer and Piledriver. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_1 mul_2 mul_3 addmul_2 +C AMD K8,K9 +C AMD K10 +C AMD bull ~4.8 ~4.55 - ~4.3 +C AMD pile ~4.6 ~4.55 - ~4.55 +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core +C Intel NHM +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel atom +C VIA nano + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Merge bull-specific mul_1, if it is not slower the TOOM22 range. +C Alternatively, we could tweak the present code (which was loopmixed for a +C different CPU). +C * Merge faster mul_2, such as the one in the same directory as this file. +C * Further micro-optimise. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') +define(`vp', `%rcx') +define(`vn', `%r8') + +define(`un', `%rbx') + +define(`w0', `%r10') +define(`w1', `%r11') +define(`w2', `%r12') +define(`w3', `%r13') +define(`n', `%rbp') +define(`v0', `%r9') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + push %rbx + push %rbp + mov un_param, un C free up rdx + neg un + + mov (up), %rax C shared for mul_1 and mul_2 + lea (up,un_param,8), up C point at operand end + lea (rp,un_param,8), rp C point at rp[un-1] + + mov (vp), v0 C shared for mul_1 and mul_2 + mul v0 C shared for mul_1 and mul_2 + + test $1, R8(vn) + jz L(do_mul_2) + +L(do_mul_1): + test $1, R8(un) + jnz L(m1x1) + +L(m1x0):mov %rax, w0 C un = 2, 4, 6, 8, ... + mov %rdx, w1 + mov 8(up,un,8), %rax + test $2, R8(un) + jnz L(m110) + +L(m100):lea 2(un), n C un = 4, 8, 12, ... + jmp L(m1l0) + +L(m110):lea (un), n C un = 2, 6, 10, ... + jmp L(m1l2) + +L(m1x1):mov %rax, w1 C un = 1, 3, 5, 7, ... + mov %rdx, w0 + test $2, R8(un) + jz L(m111) + +L(m101):lea 3(un), n C un = 1, 5, 9, ... + test n, n + js L(m1l1) + mov %rax, -8(rp) + mov %rdx, (rp) + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(m111):lea 1(un), n C un = 3, 7, 11, ... 
+ mov 8(up,un,8), %rax + jmp L(m1l3) + + ALIGN(16) +L(m1tp):mov %rdx, w0 + add %rax, w1 +L(m1l1):mov -16(up,n,8), %rax + adc $0, w0 + mul v0 + add %rax, w0 + mov w1, -24(rp,n,8) + mov -8(up,n,8), %rax + mov %rdx, w1 + adc $0, w1 +L(m1l0):mul v0 + mov w0, -16(rp,n,8) + add %rax, w1 + mov %rdx, w0 + mov (up,n,8), %rax + adc $0, w0 +L(m1l3):mul v0 + mov w1, -8(rp,n,8) + mov %rdx, w1 + add %rax, w0 + mov 8(up,n,8), %rax + adc $0, w1 +L(m1l2):mul v0 + mov w0, (rp,n,8) + add $4, n + jnc L(m1tp) + +L(m1ed):add %rax, w1 + adc $0, %rdx + mov w1, I(-8(rp),-24(rp,n,8)) + mov %rdx, I((rp),-16(rp,n,8)) + + dec R32(vn) + jz L(ret2) + + lea 8(vp), vp + lea 8(rp), rp + push %r12 + push %r13 + push %r14 + jmp L(do_addmul) + +L(do_mul_2): +define(`v1', `%r14') + push %r12 + push %r13 + push %r14 + + mov 8(vp), v1 + + test $1, R8(un) + jnz L(m2b1) + +L(m2b0):lea (un), n + mov %rax, w2 C 0 + mov (up,un,8), %rax + mov %rdx, w1 C 1 + mul v1 + mov %rax, w0 C 1 + mov w2, (rp,un,8) C 0 + mov 8(up,un,8), %rax + mov %rdx, w2 C 2 + jmp L(m2l0) + +L(m2b1):lea 1(un), n + mov %rax, w0 C 1 + mov %rdx, w3 C 2 + mov (up,un,8), %rax + mul v1 + mov w0, (rp,un,8) C 1 + mov %rdx, w0 C 3 + mov %rax, w2 C 0 + mov 8(up,un,8), %rax + jmp L(m2l1) + + ALIGN(32) +L(m2tp):add %rax, w2 C 0 + mov (up,n,8), %rax + adc $0, w0 C 1 +L(m2l1):mul v0 + add %rax, w2 C 0 + mov (up,n,8), %rax + mov %rdx, w1 C 1 + adc $0, w1 C 1 + mul v1 + add w3, w2 C 0 + adc $0, w1 C 1 + add %rax, w0 C 1 + mov w2, (rp,n,8) C 0 + mov 8(up,n,8), %rax + mov %rdx, w2 C 2 + adc $0, w2 C 2 +L(m2l0):mul v0 + add %rax, w0 C 1 + mov %rdx, w3 C 2 + adc $0, w3 C 2 + add w1, w0 C 1 + adc $0, w3 C 2 + mov 8(up,n,8), %rax + mul v1 + add $2, n + mov w0, -8(rp,n,8) C 1 + mov %rdx, w0 C 3 + jnc L(m2tp) + +L(m2ed):add %rax, w2 + adc $0, %rdx + add w3, w2 + adc $0, %rdx + mov w2, I((rp),(rp,n,8)) + mov %rdx, I(8(rp),8(rp,n,8)) + + add $-2, R32(vn) + jz L(ret5) + + lea 16(vp), vp + lea 16(rp), rp + + +L(do_addmul): + push %r15 + push vn C save vn in new stack slot +define(`vn', `(%rsp)') +define(`X0', `%r14') +define(`X1', `%r15') +define(`v1', `%r8') + +L(outer): + mov (vp), v0 + mov 8(vp), v1 + + mov (up,un,8), %rax + mul v0 + + test $1, R8(un) + jnz L(bx1) + +L(bx0): mov %rax, X1 + mov (up,un,8), %rax + mov %rdx, X0 + mul v1 + test $2, R8(un) + jnz L(b10) + +L(b00): lea (un), n C un = 4, 8, 12, ... + mov (rp,un,8), w3 + mov %rax, w0 + mov 8(up,un,8), %rax + mov %rdx, w1 + jmp L(lo0) + +L(b10): lea 2(un), n C un = 2, 6, 10, ... + mov (rp,un,8), w1 + mov %rdx, w3 + mov %rax, w2 + mov 8(up,un,8), %rax + jmp L(lo2) + +L(bx1): mov %rax, X0 + mov (up,un,8), %rax + mov %rdx, X1 + mul v1 + test $2, R8(un) + jz L(b11) + +L(b01): lea 1(un), n C un = 1, 5, 9, ... + mov (rp,un,8), w2 + mov %rdx, w0 + mov %rax, w3 + jmp L(lo1) + +L(b11): lea -1(un), n C un = 3, 7, 11, ... 
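+C Row structure: an odd vn starts with one mul_1 pass, an even vn with
+C one mul_2 pass; the L(outer) loop then folds in two v-limbs per
+C iteration with this addmul_2 kernel, stepping vp and rp by 16 bytes
+C until vn is exhausted.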
+ mov (rp,un,8), w0 + mov %rax, w1 + mov 8(up,un,8), %rax + mov %rdx, w2 + jmp L(lo3) + + ALIGN(32) +L(top): +L(lo2): mul v0 + add w1, X1 + mov X1, -16(rp,n,8) + mov %rdx, X1 + adc %rax, X0 + adc $0, X1 + mov -8(up,n,8), %rax + mul v1 + mov -8(rp,n,8), w1 + mov %rdx, w0 + add w1, w2 + adc %rax, w3 + adc $0, w0 +L(lo1): mov (up,n,8), %rax + mul v0 + add w2, X0 + mov X0, -8(rp,n,8) + mov %rdx, X0 + adc %rax, X1 + mov (up,n,8), %rax + adc $0, X0 + mov (rp,n,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + mov 8(up,n,8), %rax + mov %rdx, w1 + adc $0, w1 +L(lo0): mul v0 + add w3, X1 + mov X1, (rp,n,8) + adc %rax, X0 + mov 8(up,n,8), %rax + mov %rdx, X1 + adc $0, X1 + mov 8(rp,n,8), w3 + mul v1 + add w3, w0 + adc %rax, w1 + mov 16(up,n,8), %rax + mov %rdx, w2 + adc $0, w2 +L(lo3): mul v0 + add w0, X0 + mov X0, 8(rp,n,8) + mov %rdx, X0 + adc %rax, X1 + adc $0, X0 + mov 16(up,n,8), %rax + mov 16(rp,n,8), w0 + mul v1 + mov %rdx, w3 + add w0, w1 + adc %rax, w2 + adc $0, w3 + mov 24(up,n,8), %rax + add $4, n + jnc L(top) + +L(end): mul v0 + add w1, X1 + mov X1, I(-16(rp),-16(rp,n,8)) + mov %rdx, X1 + adc %rax, X0 + adc $0, X1 + mov I(-8(up),-8(up,n,8)), %rax + mul v1 + mov I(-8(rp),-8(rp,n,8)), w1 + add w1, w2 + adc %rax, w3 + adc $0, %rdx + add w2, X0 + adc $0, X1 + mov X0, I(-8(rp),-8(rp,n,8)) + add w3, X1 + mov X1, I((rp),(rp,n,8)) + adc $0, %rdx + mov %rdx, I(8(rp),8(rp,n,8)) + + + addl $-2, vn + lea 16(vp), vp + lea 16(rp), rp + jnz L(outer) + + pop %rax C deallocate vn slot + pop %r15 +L(ret5):pop %r14 + pop %r13 + pop %r12 +L(ret2):pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm b/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm new file mode 100644 index 0000000..7b084f4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm @@ -0,0 +1,191 @@ +dnl AMD64 SSSE3/XOP mpn_popcount -- population count. + +dnl Copyright 2010-2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb good for cpu? +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 1.27 y +C AMD bd2 1.24 y +C AMD bd3 ? +C AMD bd4 1.22 +C AMD zen n/a +C AMD bobcat n/a +C AMD jaguar n/a +C Intel P4 n/a +C Intel CNR n/a +C Intel PNR n/a +C Intel NHM n/a +C Intel SBR n/a +C Intel IBR n/a +C Intel HWL n/a +C Intel BWL n/a +C Intel SKL n/a +C Intel atom n/a +C Intel SLM n/a +C VIA nano n/a + +C TODO +C * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we +C intend to support old systems. 
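The vector loop below counts bits with the classic nibble-table method: vpshlb with a per-byte count of -4 shifts each byte right by four, the 0x0f mask isolates the low and high nibbles, and vpperm performs sixteen parallel lookups into the bit-count table stored at L(cnsts). A scalar C model of the per-byte step — an illustration only, not the GMP source:

#include <stdint.h>

/* Bit counts of the values 0x0..0xf; the same sixteen values appear as
   .byte data in L(cnsts) below.  */
static const uint8_t nibble_count[16] =
  { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 };

static unsigned
byte_popcount (uint8_t b)
{
  return nibble_count[b & 0x0f]		/* low nibble:  pand with 0x0f     */
       + nibble_count[b >> 4];		/* high nibble: vpshlb by -4, pand */
}

The bd1/hamdist.asm above applies the same kernel to up[i] ^ vp[i], since the Hamming distance is the population count of the exclusive or.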
+ +C We use vpshlb and vpperm below, which are XOP extensions to AVX. Some +C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX. +C We fall back to the core2 code. +ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',` +MULFUNC_PROLOGUE(mpn_popcount) +include_mpn(`x86_64/core2/popcount.asm') +',` + +define(`up', `%rdi') +define(`n', `%rsi') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_popcount) + FUNC_ENTRY(3) + lea L(cnsts)(%rip), %r9 + +ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)', + `define(`OFF1',64) define(`OFF2',80) define(`OFF3',96)') + movdqa OFF1`'(%r9), %xmm7 C nibble counts table + movdqa OFF2`'(%r9), %xmm6 C splat shift counts + movdqa OFF3`'(%r9), %xmm9 C masks + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 C 0-reg + pxor %xmm8, %xmm8 C grand total count + + xor R32(%rdx), R32(%rdx) + + mov R32(n), R32(%rax) + and $7, R32(%rax) +ifdef(`PIC',` + movslq (%r9,%rax,4), %rax + add %r9, %rax + jmp *%rax +',` + jmp *(%r9,%rax,8) +') + +L(1): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up),%rdx + add $8, up + dec n + jnz L(top) + mov %rdx, %rax + FUNC_EXIT() + ret + +L(2): add $-48, up + jmp L(e2) + +L(3): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up), %rdx + add $-40, up + jmp L(e2) + +L(4): add $-32, up + jmp L(e4) + +L(5): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up), %rdx + add $-24, up + jmp L(e4) + +L(6): add $-16, up + jmp L(e6) + +L(7): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up), %rdx + add $-8, up + jmp L(e6) + + ALIGN(32) +L(top): lddqu (up), %xmm0 + .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1 + pand %xmm9, %xmm0 + pand %xmm9, %xmm1 + .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2 + .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1, %xmm7, %xmm7, %xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e6): lddqu 16(up), %xmm0 + .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1 + pand %xmm9, %xmm0 + pand %xmm9, %xmm1 + .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2 + .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e4): lddqu 32(up), %xmm0 + .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1 + pand %xmm9, %xmm0 + pand %xmm9, %xmm1 + .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0, %xmm7, %xmm7, %xmm2 + .byte 0x8f,0xe9,0x78,0xd3,0xec C vphaddubq %xmm4, %xmm5 + .byte 0x8f,0xe8,0x40,0xa3,0xe7,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm4 + paddb %xmm2, %xmm4 +L(e2): popcnt 48(up), %r8 + popcnt 56(up), %r9 + add $64, up + paddq %xmm5, %xmm8 C sum to 2 x 64-bit counts + add %r8, %rdx + add %r9, %rdx + sub $8, n + jg L(top) + + .byte 0x8f,0xe9,0x78,0xd3,0xec C vphaddubq %xmm4, %xmm5 + paddq %xmm5, %xmm8 + pshufd $14, %xmm8, %xmm0 + paddq %xmm8, %xmm0 + movd %xmm0, %rax + add %rdx, %rax + FUNC_EXIT() + ret +EPILOGUE() +DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') + JMPENT( L(top), L(cnsts)) + JMPENT( L(1), L(cnsts)) + JMPENT( L(2), L(cnsts)) + JMPENT( L(3), L(cnsts)) + JMPENT( L(4), L(cnsts)) + JMPENT( L(5), L(cnsts)) + JMPENT( L(6), L(cnsts)) + JMPENT( L(7), L(cnsts)) + .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03 + .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04 + .byte -4,-4,-4,-4,-4,-4,-4,-4 + .byte -4,-4,-4,-4,-4,-4,-4,-4 + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f +END_OBJECT(L(cnsts)) +') diff --git a/gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm new file mode 100644 index 0000000..e436034 
--- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sec_tabselect. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sec_tabselect) +include_mpn(`x86_64/fastsse/sec_tabselect.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm b/gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm new file mode 100644 index 0000000..4ba673d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_sublsh1_n + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sublsh1_n mpn_sublsh1_nc) +include_mpn(`x86_64/atom/sublsh1_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bd2/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/bd2/gcd_11.asm new file mode 100644 index 0000000..b167077 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd2/gcd_11.asm @@ -0,0 +1,96 @@ +dnl AMD64 mpn_gcd_11 optimised for AMD BD2, BD3, BT2. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017, 2019 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C AMD K8,K9 ? +C AMD K10 ? +C AMD bd1 5.4 +C AMD bd2 3.72 +C AMD bd3 ? +C AMD bd4 4.12 +C AMD bt1 9.0 +C AMD bt2 3.97 +C AMD zn1 3.36 +C AMD zn2 3.33 +C Intel P4 ? +C Intel CNR ? +C Intel PNR ? +C Intel NHM ? +C Intel WSM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom ? +C Intel SLM ? +C Intel GLM ? +C Intel GLM+ ? +C VIA nano ? + +define(`u0', `%rdi') +define(`v0', `%rsi') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_11) + FUNC_ENTRY(2) + mov v0, %rdx + sub u0, %rdx + jz L(end) + + ALIGN(16) +L(top): rep;bsf %rdx, %rcx C tzcnt! + mov u0, %rax + sub v0, u0 C u - v + cmovc %rdx, u0 C u = |u - v| + cmovc %rax, v0 C v = min(u,v) + shr R8(%rcx), u0 + mov v0, %rdx + sub u0, %rdx C v - u + jnz L(top) + +L(end): mov v0, %rax + C rax = result + C rdx = 0 for the benefit of internal gcd_22 call + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bd2/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/bd2/gcd_22.asm new file mode 100644 index 0000000..a4f30ea --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd2/gcd_22.asm @@ -0,0 +1,142 @@ +dnl AMD64 mpn_gcd_22. Assumes useless bsf, useless shrd, tzcnt, no shlx. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 12.3 +C AMD K10 8.0 +C AMD bd1 10.0 +C AMD bd2 7.2 +C AMD bd3 ? +C AMD bd4 6.7 +C AMD bt1 13.6 +C AMD bt2 8.9 +C AMD zn1 5.7 +C AMD zn2 5.6 +C Intel P4 ? 
+C Intel CNR 9.7 +C Intel PNR 9.7 +C Intel NHM 9.4 +C Intel WSM 9.5 +C Intel SBR 10.3 +C Intel IBR ? +C Intel HWL 8.2 +C Intel BWL 7.4 +C Intel SKL 7.3 +C Intel atom 26.5 +C Intel SLM 17.4 +C Intel GLM 13.4 +C Intel GLM+ 12.4 +C VIA nano ? + + +define(`u1', `%rdi') +define(`u0', `%rsi') +define(`v1', `%rdx') +define(`v0_param', `%rcx') + +define(`v0', `%rax') +define(`cnt', `%rcx') + +define(`s0', `%r8') +define(`s1', `%r9') +define(`t0', `%r10') +define(`t1', `%r11') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_22) + FUNC_ENTRY(4) + mov v0_param, v0 + + ALIGN(16) +L(top): mov v0, t0 + sub u0, t0 + jz L(lowz) C jump when low limb result = 0 + mov v1, t1 + sbb u1, t1 + + rep;bsf t0, cnt C tzcnt! + mov u0, s0 + mov u1, s1 + + sub v0, u0 + sbb v1, u1 + +L(bck): cmovc t0, u0 C u = |u - v| + cmovc t1, u1 C u = |u - v| + cmovc s0, v0 C v = min(u,v) + cmovc s1, v1 C v = min(u,v) + +C Rightshift (u1,,u0) into (u1,,u0) +L(shr): shr R8(cnt), u0 + mov u1, t1 + shr R8(cnt), u1 + neg cnt + shl R8(cnt), t1 + or t1, u0 + + test v1, v1 + jnz L(top) + test u1, u1 + jnz L(top) + +L(gcd_11): + mov v0, %rdi +C mov u0, %rsi + TCALL( mpn_gcd_11) + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + mov v1, t0 + sub u1, t0 + je L(end) + + xor t1, t1 + rep;bsf t0, cnt C tzcnt! + mov u0, s0 + mov u1, s1 + mov u1, u0 + xor u1, u1 + sub v1, u0 + jmp L(bck) + +L(end): C mov v0, %rax + C mov v1, %rdx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bd2/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/bd2/gmp-mparam.h new file mode 100644 index 0000000..61573ea --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd2/gmp-mparam.h @@ -0,0 +1,263 @@ +/* AMD bd2 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 4000-4200 MHz Piledriver Vishera */ +/* FFT tuning limit = 464,626,631 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 23 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 34 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 2 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 22 + +#define DIV_1_VS_MUL_1_PERCENT 293 + +#define MUL_TOOM22_THRESHOLD 16 +#define MUL_TOOM33_THRESHOLD 57 +#define MUL_TOOM44_THRESHOLD 152 +#define MUL_TOOM6H_THRESHOLD 230 +#define MUL_TOOM8H_THRESHOLD 309 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 103 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 142 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 20 +#define SQR_TOOM3_THRESHOLD 73 +#define SQR_TOOM4_THRESHOLD 200 +#define SQR_TOOM6_THRESHOLD 286 +#define SQR_TOOM8_THRESHOLD 430 + +#define MULMID_TOOM42_THRESHOLD 20 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 13 + +#define MUL_FFT_MODF_THRESHOLD 372 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 372, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 10, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \ + { 63, 9}, { 39,10}, { 23, 9}, { 55,11}, \ + { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 83,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 135,11}, { 79, 8}, \ + { 639, 9}, { 335,10}, { 175, 9}, { 351,10}, \ + { 191,12}, { 63,11}, { 127,10}, { 255,11}, \ + { 143,10}, { 287,11}, { 159,12}, { 95,11}, \ + { 191,13}, { 63,12}, { 127,11}, { 271,10}, \ + { 543,11}, { 287,12}, { 159,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ + { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \ + { 575,12}, { 319,11}, { 639,10}, { 1279,12}, \ + { 351,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,10}, { 1663,12}, { 447,14}, \ + { 127,13}, { 255,12}, { 511,11}, { 1023,12}, \ + { 543,11}, { 1087,10}, { 2175,12}, { 575,11}, \ + { 1151,13}, { 319,12}, { 639,11}, { 1279,12}, \ + { 671,11}, { 1343,10}, { 2687,12}, { 703,11}, \ + { 1407,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 799,11}, { 1599,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \ + { 2431,10}, { 4863,13}, { 639,12}, { 1343,11}, \ + { 2687,13}, { 703,12}, { 1407,11}, { 2815,14}, \ + { 383,13}, { 767,12}, { 1599,13}, { 831,12}, \ + { 1727,11}, { 3455,13}, { 895,15}, { 255,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1215,12}, \ + { 2431,11}, { 4863,14}, { 639,13}, { 1343,12}, \ + { 2687,13}, { 1407,12}, { 2815,13}, { 1471,12}, \ + { 2943,11}, { 5887,14}, { 767,13}, { 1599,12}, \ + { 3199,13}, { 1727,12}, { 3455,14}, 
{ 895,13}, \ + { 1791,12}, { 3583,13}, { 1919,12}, { 3839,11}, \ + { 7679,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2303,12}, { 4607,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2815,12}, { 5631,13}, { 2943,12}, { 5887,15}, \ + { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \ + { 3455,12}, { 6911,14}, { 1791,13}, { 3583,14}, \ + { 1919,13}, { 3839,12}, { 7679,16}, { 511,15}, \ + { 1023,14}, { 2175,13}, { 4479,14}, { 2303,13}, \ + { 4607,14}, { 2431,13}, { 4863,15}, { 1279,14}, \ + { 2815,13}, { 5631,14}, { 2943,13}, { 5887,12}, \ + { 11775,15}, { 1535,14}, { 3455,13}, { 6911,15}, \ + { 1791,14}, { 3839,13}, { 7679,16}, { 1023,15}, \ + { 2047,14}, { 4479,13}, { 8959,15}, { 2303,14}, \ + { 4863,15}, { 2815,14}, { 5887,13}, { 11775,16}, \ + { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,13}, { 15359,17}, { 1023,16}, { 2047,15}, \ + { 4351,14}, { 8959,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,15}, { 7935,14}, \ + { 15871,17}, { 2047,16}, { 4095,15}, { 8959,16}, \ + { 4607,15}, { 9983,14}, { 19967,16}, { 5631,15}, \ + { 11775,17}, { 3071,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 262 +#define MUL_FFT_THRESHOLD 4544 + +#define SQR_FFT_MODF_THRESHOLD 344 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 344, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \ + { 159,11}, { 95,10}, { 191,12}, { 63, 9}, \ + { 511,10}, { 271,11}, { 143,10}, { 303,11}, \ + { 159,12}, { 95,11}, { 191,13}, { 63,12}, \ + { 127,11}, { 287,10}, { 575,11}, { 303,12}, \ + { 159,11}, { 351,12}, { 191,11}, { 383,12}, \ + { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,12}, { 287,11}, { 575,10}, \ + { 1151,11}, { 607,12}, { 319,11}, { 639,10}, \ + { 1279,12}, { 351,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 415,11}, { 831,10}, { 1663,12}, \ + { 447,14}, { 127,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,10}, { 2175,12}, \ + { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \ + { 639,11}, { 1279,12}, { 671,11}, { 1343,10}, \ + { 2687,12}, { 703,11}, { 1407,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \ + { 831,11}, { 1663,13}, { 447,12}, { 895,14}, \ + { 255,13}, { 511,12}, { 1087,11}, { 2175,13}, \ + { 575,12}, { 1215,11}, { 2431,10}, { 4863,13}, \ + { 639,12}, { 1343,11}, { 2687,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1599,13}, \ + { 831,12}, { 1727,13}, { 895,15}, { 255,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1151,12}, \ + { 2303,13}, { 1215,12}, { 2431,11}, { 4863,14}, \ + { 639,13}, { 1343,12}, { 2687,13}, { 1407,12}, \ + { 2815,13}, { 1471,12}, { 2943,11}, { 5887,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \ + { 3455,14}, { 895,13}, { 1791,12}, { 3583,13}, \ + { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2303,12}, { 4607,13}, \ + { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \ + { 1407,13}, { 2943,12}, { 
5887,11}, { 11775,15}, \ + { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \ + { 3455,12}, { 6911,14}, { 1791,13}, { 3583,14}, \ + { 1919,13}, { 3839,16}, { 511,15}, { 1023,14}, \ + { 2175,13}, { 4479,14}, { 2303,13}, { 4607,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2815,13}, \ + { 5631,14}, { 2943,13}, { 5887,12}, { 11775,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \ + { 4479,13}, { 8959,15}, { 2303,14}, { 4863,15}, \ + { 2815,14}, { 5887,13}, { 11775,16}, { 1535,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4351,14}, { 8959,15}, \ + { 4863,16}, { 2559,15}, { 5887,14}, { 11775,16}, \ + { 3071,15}, { 6911,16}, { 3583,15}, { 7679,14}, \ + { 15359,15}, { 7935,14}, { 15871,17}, { 2047,16}, \ + { 4095,15}, { 8959,16}, { 4607,15}, { 9983,14}, \ + { 19967,16}, { 5119,15}, { 10239,16}, { 5631,15}, \ + { 11775,17}, { 3071,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 254 +#define SQR_FFT_THRESHOLD 2880 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 30 +#define MULLO_MUL_N_THRESHOLD 8907 +#define SQRLO_BASECASE_THRESHOLD 8 +#define SQRLO_DC_THRESHOLD 53 +#define SQRLO_SQR_THRESHOLD 5724 + +#define DC_DIV_QR_THRESHOLD 52 +#define DC_DIVAPPR_Q_THRESHOLD 159 +#define DC_BDIV_QR_THRESHOLD 44 +#define DC_BDIV_Q_THRESHOLD 79 + +#define INV_MULMOD_BNM1_THRESHOLD 30 +#define INV_NEWTON_THRESHOLD 172 +#define INV_APPR_THRESHOLD 172 + +#define BINV_NEWTON_THRESHOLD 226 +#define REDC_1_TO_REDC_2_THRESHOLD 40 +#define REDC_2_TO_REDC_N_THRESHOLD 51 + +#define MU_DIV_QR_THRESHOLD 1308 +#define MU_DIVAPPR_Q_THRESHOLD 1258 +#define MUPI_DIV_QR_THRESHOLD 85 +#define MU_BDIV_QR_THRESHOLD 1142 +#define MU_BDIV_Q_THRESHOLD 1210 + +#define POWM_SEC_TABLE 3,16,129,523,1297 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 20 +#define SET_STR_DC_THRESHOLD 228 +#define SET_STR_PRECOMPUTE_THRESHOLD 1033 + +#define FAC_DSC_THRESHOLD 172 +#define FAC_ODD_THRESHOLD 28 + +#define MATRIX22_STRASSEN_THRESHOLD 19 +#define HGCD2_DIV1_METHOD 1 /* 8.54% faster than 3 */ +#define HGCD_THRESHOLD 108 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 393 +#define GCDEXT_DC_THRESHOLD 278 +#define JACOBI_BASE_METHOD 4 /* 13.69% faster than 1 */ + +/* Tuneup completed successfully, took 463931 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/bd4/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/bd4/aorrlsh_n.asm new file mode 100644 index 0000000..ff0d27b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd4/aorrlsh_n.asm @@ -0,0 +1,38 @@ +dnl X86-64 mpn_addlsh_n and mpn_rsblsh_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) +include_mpn(`x86_64/zen/aorrlsh_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bd4/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/bd4/gcd_11.asm new file mode 100644 index 0000000..4176b85 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd4/gcd_11.asm @@ -0,0 +1,96 @@ +dnl AMD64 mpn_gcd_11 optimised for AMD BD4, ZN1. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017, 2019 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C AMD K8,K9 - +C AMD K10 - +C AMD bd1 - +C AMD bd2 - +C AMD bd3 - +C AMD bd4 3.73 +C AMD bt1 - +C AMD bt2 - +C AMD zn1 3.33 +C AMD zn2 3.48 +C Intel P4 - +C Intel CNR - +C Intel PNR - +C Intel NHM - +C Intel WSM - +C Intel SBR - +C Intel IBR - +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom - +C Intel SLM - +C Intel GLM - +C Intel GLM+ - +C VIA nano - + +define(`u0', `%rdi') +define(`v0', `%rsi') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_11) + FUNC_ENTRY(2) + mov u0, %rax + mov v0, %rdx + sub u0, %rdx C v - u + jz L(end) + + ALIGN(16) +L(top): rep;bsf %rdx, %rcx C tzcnt! + sub v0, u0 C u - v + cmovc %rdx, u0 C u = |u - v| + cmovc %rax, v0 C v = min(u,v) + shrx( %rcx, u0, %rax) + shrx( %rcx, u0, u0) + mov v0, %rdx + sub %rax, %rdx C v - u + jnz L(top) + +L(end): C rax = result + C rdx = 0 for the benefit of internal gcd_22 call + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bd4/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/bd4/gcd_22.asm new file mode 100644 index 0000000..5dfd9e3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd4/gcd_22.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_22. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_22) +include_mpn(`x86_64/coreihwl/gcd_22.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bd4/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/bd4/gmp-mparam.h new file mode 100644 index 0000000..9d2038c --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bd4/gmp-mparam.h @@ -0,0 +1,266 @@ +/* AMD bd4 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3800-4200 MHz Excavator/Bristol Ridge */ +/* FFT tuning limit = 461,179,335 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 17 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 52 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 25 + +#define DIV_1_VS_MUL_1_PERCENT 298 + +#define MUL_TOOM22_THRESHOLD 16 +#define MUL_TOOM33_THRESHOLD 53 +#define MUL_TOOM44_THRESHOLD 142 +#define MUL_TOOM6H_THRESHOLD 206 +#define MUL_TOOM8H_THRESHOLD 292 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 83 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 102 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 98 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 82 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 20 +#define SQR_TOOM3_THRESHOLD 71 +#define SQR_TOOM4_THRESHOLD 202 +#define SQR_TOOM6_THRESHOLD 298 +#define SQR_TOOM8_THRESHOLD 466 + +#define MULMID_TOOM42_THRESHOLD 20 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 14 + +#define MUL_FFT_MODF_THRESHOLD 316 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 316, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \ + { 25, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 33, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 99,10}, { 55,11}, { 31,10}, { 87,11}, \ + { 47,10}, { 95, 9}, { 191,10}, { 103,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 135, 9}, { 271, 5}, { 4351, 6}, { 2303, 7}, \ + { 1215, 8}, { 639,10}, { 175,11}, { 95,10}, \ + { 191, 9}, { 383,10}, { 207, 9}, { 415,11}, \ + { 111,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \ + { 319, 9}, { 639,11}, { 175,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207,10}, { 415, 9}, \ + { 831,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,12}, { 223,11}, \ + { 447,10}, { 895,11}, { 479,13}, { 127,12}, \ + { 255,11}, { 543,12}, { 287,11}, { 607,12}, \ + { 319,11}, { 639,12}, { 351,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,11}, { 895,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 543,11}, { 1087,12}, { 607,13}, \ + { 319,12}, { 671,11}, { 1343,10}, { 2687,12}, \ + { 703,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 831,13}, { 447,12}, { 895,11}, { 1791,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1087,13}, \ + { 575,12}, { 1151,11}, { 2303,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1343,11}, { 2687,13}, \ + { 703,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1151,12}, \ + { 2303,13}, { 1215,12}, { 2431,14}, { 639,13}, 
\ + { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \ + { 1471,14}, { 767,13}, { 1535,12}, { 3071,13}, \ + { 1663,14}, { 895,13}, { 1791,12}, { 3583,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2303,12}, { 4607,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2815,15}, { 767,14}, { 1535,13}, { 3071,14}, \ + { 1663,13}, { 3455,12}, { 6911,14}, { 1791,13}, \ + { 3583,14}, { 1919,16}, { 511,15}, { 1023,14}, \ + { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,13}, { 5887,15}, { 1535,14}, \ + { 3455,13}, { 6911,15}, { 1791,14}, { 3839,13}, \ + { 7679,16}, { 1023,15}, { 2047,14}, { 4351,15}, \ + { 2303,14}, { 4863,15}, { 2815,14}, { 5887,16}, \ + { 1535,15}, { 3071,14}, { 6143,15}, { 3327,14}, \ + { 6911,15}, { 3839,14}, { 7679,17}, { 1023,16}, \ + { 2047,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7679,17}, { 2047,16}, { 4095,15}, { 8191,16}, \ + { 4607,15}, { 9983,16}, { 5631,15}, { 11775,17}, \ + { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 253 +#define MUL_FFT_THRESHOLD 4224 + +#define SQR_FFT_MODF_THRESHOLD 300 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 300, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 63,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95, 9}, \ + { 191, 8}, { 383,10}, { 103,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511, 9}, \ + { 271, 8}, { 543,11}, { 79,10}, { 159, 9}, \ + { 319, 8}, { 639,10}, { 175,11}, { 95,10}, \ + { 191, 9}, { 383, 5}, { 6399, 6}, { 3327, 7}, \ + { 1727, 6}, { 3455, 7}, { 1791,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \ + { 319, 9}, { 639,11}, { 175,10}, { 351,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415, 9}, { 831,13}, { 63,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,10}, { 607,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,10}, { 895,11}, \ + { 479,12}, { 255,11}, { 511,10}, { 1023,11}, \ + { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,12}, { 319,11}, { 639,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,11}, { 895,12}, \ + { 479,13}, { 255,12}, { 511,11}, { 1023,12}, \ + { 543,11}, { 1087,12}, { 575,11}, { 1151,12}, \ + { 607,13}, { 319,12}, { 639,11}, { 1279,12}, \ + { 671,11}, { 1343,12}, { 703,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1087,13}, { 575,12}, { 1151,11}, { 2303,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1343,13}, \ + { 703,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 895,12}, { 1791,13}, \ + { 959,15}, { 255,14}, { 511,13}, { 1023,12}, \ + { 2047,13}, { 1087,12}, { 2175,13}, { 1151,12}, \ + { 2303,13}, { 1215,12}, { 2431,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \ + { 1471,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1663,14}, { 895,13}, { 1791,12}, 
{ 3583,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2303,12}, { 4607,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \ + { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \ + { 3455,14}, { 1791,13}, { 3583,14}, { 1919,16}, \ + { 511,15}, { 1023,14}, { 2303,13}, { 4607,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2815,13}, \ + { 5631,14}, { 2943,13}, { 5887,15}, { 1535,14}, \ + { 3455,15}, { 1791,14}, { 3583,13}, { 7167,14}, \ + { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \ + { 4223,15}, { 2303,14}, { 4863,15}, { 2815,14}, \ + { 5887,16}, { 1535,15}, { 3071,14}, { 6143,15}, \ + { 3327,14}, { 6911,15}, { 3583,14}, { 7167,15}, \ + { 3839,14}, { 7679,17}, { 1023,16}, { 2047,15}, \ + { 4095,14}, { 8191,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \ + { 4095,15}, { 8447,16}, { 4607,15}, { 9983,16}, \ + { 5119,15}, { 10239,16}, { 5631,15}, { 11775,17}, \ + { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 273 +#define SQR_FFT_THRESHOLD 2752 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 43 +#define MULLO_MUL_N_THRESHOLD 8397 +#define SQRLO_BASECASE_THRESHOLD 6 +#define SQRLO_DC_THRESHOLD 54 +#define SQRLO_SQR_THRESHOLD 5397 + +#define DC_DIV_QR_THRESHOLD 39 +#define DC_DIVAPPR_Q_THRESHOLD 165 +#define DC_BDIV_QR_THRESHOLD 39 +#define DC_BDIV_Q_THRESHOLD 76 + +#define INV_MULMOD_BNM1_THRESHOLD 30 +#define INV_NEWTON_THRESHOLD 177 +#define INV_APPR_THRESHOLD 155 + +#define BINV_NEWTON_THRESHOLD 230 +#define REDC_1_TO_REDC_2_THRESHOLD 28 +#define REDC_2_TO_REDC_N_THRESHOLD 43 + +#define MU_DIV_QR_THRESHOLD 1142 +#define MU_DIVAPPR_Q_THRESHOLD 1142 +#define MUPI_DIV_QR_THRESHOLD 66 +#define MU_BDIV_QR_THRESHOLD 998 +#define MU_BDIV_Q_THRESHOLD 1142 + +#define POWM_SEC_TABLE 1,16,175,269,839,1420 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 19 +#define SET_STR_DC_THRESHOLD 552 +#define SET_STR_PRECOMPUTE_THRESHOLD 1038 + +#define FAC_DSC_THRESHOLD 151 +#define FAC_ODD_THRESHOLD 23 + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD2_DIV1_METHOD 1 /* 8.11% faster than 3 */ +#define HGCD_THRESHOLD 87 +#define HGCD_APPR_THRESHOLD 96 +#define HGCD_REDUCE_THRESHOLD 2121 +#define GCD_DC_THRESHOLD 327 +#define GCDEXT_DC_THRESHOLD 241 +#define JACOBI_BASE_METHOD 4 /* 21.40% faster than 1 */ + +/* Tuneup completed successfully, took 431056 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/x86_64/bdiv_dbm1c.asm new file mode 100644 index 0000000..a53bd52 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bdiv_dbm1c.asm @@ -0,0 +1,106 @@ +dnl x86_64 mpn_bdiv_dbm1. + +dnl Copyright 2008, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.25 +C AMD K10 2.25 +C Intel P4 12.5 +C Intel core2 4 +C Intel NHM 3.75 +C Intel SBR 3.6 +C Intel atom 20 +C VIA nano 4 + +C TODO +C * Optimise feed-in code. + +C INPUT PARAMETERS +define(`qp', `%rdi') +define(`up', `%rsi') +define(`n_param', `%rdx') +define(`bd', `%rcx') +define(`cy', `%r8') + +define(`n', `%r9') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_bdiv_dbm1c) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + mov (up), %rax + mov n_param, n + mov R32(n_param), R32(%r11) + mul bd + lea (up,n,8), up + lea (qp,n,8), qp + neg n + and $3, R32(%r11) + jz L(lo0) + lea -4(n,%r11), n + cmp $2, R32(%r11) + jc L(lo1) + jz L(lo2) + jmp L(lo3) + + ALIGN(16) +L(top): mov (up,n,8), %rax + mul bd +L(lo0): sub %rax, %r8 + mov %r8, (qp,n,8) + sbb %rdx, %r8 + mov 8(up,n,8), %rax + mul bd +L(lo3): sub %rax, %r8 + mov %r8, 8(qp,n,8) + sbb %rdx, %r8 + mov 16(up,n,8), %rax + mul bd +L(lo2): sub %rax, %r8 + mov %r8, 16(qp,n,8) + sbb %rdx, %r8 + mov 24(up,n,8), %rax + mul bd +L(lo1): sub %rax, %r8 + mov %r8, 24(qp,n,8) + sbb %rdx, %r8 + add $4, n + jnz L(top) + + mov %r8, %rax + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bdiv_q_1.asm b/gmp-6.3.0/mpn/x86_64/bdiv_q_1.asm new file mode 100644 index 0000000..85538c9 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bdiv_q_1.asm @@ -0,0 +1,195 @@ +dnl AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor. + +dnl Copyright 2001, 2002, 2004-2006, 2010-2012, 2017 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
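The Hensel (exact) division implemented below needs no trial quotients: when d is odd and divides {up,n} exactly, each quotient limb is one multiply by d^-1 mod B, and the high limb of q*d becomes the carry into the next step. A minimal C sketch of the odd-divisor path, assuming 64-bit limbs and a compiler with unsigned __int128; binvert and bdiv_q_1_odd are illustrative names, not GMP's API:

#include <stdint.h>
#include <stddef.h>

/* Inverse of an odd d modulo 2^64 by Newton's method; each step doubles
   the number of correct low bits.  The asm below seeds 8 bits from
   binvert_limb_table and needs three doublings; the (3*d)^2 seed used
   here gives 5 bits and needs four. */
static uint64_t binvert(uint64_t d)
{
  uint64_t inv = (3 * d) ^ 2;   /* 5 correct bits */
  inv *= 2 - d * inv;           /* 10 bits */
  inv *= 2 - d * inv;           /* 20 bits */
  inv *= 2 - d * inv;           /* 40 bits */
  inv *= 2 - d * inv;           /* 64 bits */
  return inv;
}

/* {qp,n} = {up,n} / d for odd d dividing the input exactly: the same
   recurrence as the L(ntop) loop -- subtract the running carry, multiply
   by the inverse, take the high limb of q*d as the next carry. */
static void bdiv_q_1_odd(uint64_t *qp, const uint64_t *up,
                         size_t n, uint64_t d)
{
  uint64_t di = binvert(d), cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint64_t b = up[i] < cy;          /* borrow out of the subtract */
      uint64_t q = (up[i] - cy) * di;   /* quotient limb mod 2^64 */
      qp[i] = q;
      cy = (uint64_t) (((unsigned __int128) q * d) >> 64) + b;
    }
}

For even d, the asm's L(evn) entry counts the trailing zeros of d with bsf and divides by the shifted odd part, with the L(unorm) loop shifting each dividend limb on the fly.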
+ +include(`../config.m4') + +C cycles/limb cycles/limb +C norm unorm +C AMD K8,K9 11 11 +C AMD K10 11 11 +C AMD bull 13.5 14 +C AMD pile 14 15 +C AMD steam +C AMD excavator +C AMD bobcat 14 14 +C AMD jaguar 14.5 15 +C Intel P4 33 33 +C Intel core2 13.5 13.25 +C Intel NHM 14 14 +C Intel SBR 8 8.25 +C Intel IBR 7.75 7.85 +C Intel HWL 8 8 +C Intel BWL 8 8 +C Intel SKL 8 8 +C Intel atom 34 36 +C Intel SLM 13.7 13.5 +C VIA nano 19.25 19.25 needs re-measuring + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`d', `%rcx') +define(`di', `%r8') C just mpn_pi1_bdiv_q_1 +define(`ncnt', `%r9') C just mpn_pi1_bdiv_q_1 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_bdiv_q_1) + FUNC_ENTRY(4) + push %rbx + + mov %rcx, %rax + xor R32(%rcx), R32(%rcx) C ncnt count + mov %rdx, %r10 + + bt $0, R32(%rax) + jnc L(evn) C skip bsf unless divisor is even + +L(odd): mov %rax, %rbx + shr R32(%rax) + and $127, R32(%rax) C d/2, 7 bits + + LEA( binvert_limb_table, %rdx) + + movzbl (%rdx,%rax), R32(%rax) C inv 8 bits + + mov %rbx, %r11 C d without twos + + lea (%rax,%rax), R32(%rdx) C 2*inv + imul R32(%rax), R32(%rax) C inv*inv + imul R32(%rbx), R32(%rax) C inv*inv*d + sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits + + lea (%rdx,%rdx), R32(%rax) C 2*inv + imul R32(%rdx), R32(%rdx) C inv*inv + imul R32(%rbx), R32(%rdx) C inv*inv*d + sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits + + lea (%rax,%rax), %r8 C 2*inv + imul %rax, %rax C inv*inv + imul %rbx, %rax C inv*inv*d + sub %rax, %r8 C inv = 2*inv - inv*inv*d, 64 bits + + jmp L(pi1) + +L(evn): bsf %rax, %rcx + shr R8(%rcx), %rax + jmp L(odd) +EPILOGUE() + +PROLOGUE(mpn_pi1_bdiv_q_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +IFDOS(` mov 64(%rsp), %r9 ') + push %rbx + + mov %rcx, %r11 C d + mov %rdx, %r10 C n + mov %r9, %rcx C ncnt + +L(pi1): mov (up), %rax C up[0] + + dec %r10 + jz L(one) + + lea 8(up,%r10,8), up C up end + lea (rp,%r10,8), rp C rp end + neg %r10 C -n + + test R32(%rcx), R32(%rcx) + jnz L(unorm) C branch if count != 0 + xor R32(%rbx), R32(%rbx) + jmp L(nent) + + ALIGN(8) +L(ntop):mul %r11 C carry limb in rdx 0 10 + mov -8(up,%r10,8), %rax C + sub %rbx, %rax C apply carry bit + setc R8(%rbx) C + sub %rdx, %rax C apply carry limb 5 + adc $0, R32(%rbx) C 6 +L(nent):imul %r8, %rax C 6 + mov %rax, (rp,%r10,8) C + inc %r10 C + jnz L(ntop) + + mov -8(up), %r9 C up high limb + jmp L(com) + +L(unorm): + mov (up,%r10,8), %r9 C up[1] + shr R8(%rcx), %rax C + neg R32(%rcx) + shl R8(%rcx), %r9 C + neg R32(%rcx) + or %r9, %rax + xor R32(%rbx), R32(%rbx) + jmp L(uent) + + ALIGN(8) +L(utop):mul %r11 C carry limb in rdx 0 10 + mov (up,%r10,8), %rax C + shl R8(%rcx), %rax C + neg R32(%rcx) + or %r9, %rax + sub %rbx, %rax C apply carry bit + setc R8(%rbx) C + sub %rdx, %rax C apply carry limb 5 + adc $0, R32(%rbx) C 6 +L(uent):imul %r8, %rax C 6 + mov (up,%r10,8), %r9 C + shr R8(%rcx), %r9 C + neg R32(%rcx) + mov %rax, (rp,%r10,8) C + inc %r10 C + jnz L(utop) + +L(com): mul %r11 C carry limb in rdx + sub %rbx, %r9 C apply carry bit + sub %rdx, %r9 C apply carry limb + imul %r8, %r9 + mov %r9, (rp) + pop %rbx + FUNC_EXIT() + ret + +L(one): shr R8(%rcx), %rax + imul %r8, %rax + mov %rax, (rp) + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bt1/aors_n.asm b/gmp-6.3.0/mpn/x86_64/bt1/aors_n.asm new file mode 100644 index 0000000..9b6b5c7 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/aors_n.asm @@ -0,0 +1,159 @@ +dnl AMD64 mpn_add_n, 
mpn_sub_n optimised for bobcat. + +dnl Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 1.77 +C AMD K10 1.76\1.82 +C AMD bd1 1.67\2.12 +C AMD bd2 1.62\1.82 +C AMD bd3 +C AMD bd4 1.55\2.2 +C AMD zen +C AMD bt1 2.54 +C AMD bt2 2 +C Intel P4 11 +C Intel PNR 4.76 +C Intel NHM 5.27 +C Intel SBR 2 +C Intel IBR 1.94 +C Intel HWL 1.63 +C Intel BWL 1.51 +C Intel SKL 1.51 +C Intel atom 3.56 +C Intel SLM 4 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C INPUT PARAMETERS +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`vp', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc) + +ifdef(`OPERATION_add_n', ` + define(ADCSBB, adc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + xor %r8, %r8 +L(ent): test $1, R8(n) + jnz L(bx1) + +L(bx0): test $2, R8(n) + jnz L(b10) + +L(b00): shr $2, n + neg %r8 + mov $3, R32(%rax) + mov (up), %r10 + mov 8(up), %r11 + jmp L(lo0) + +L(b10): shr $2, n + neg %r8 + mov $1, R32(%rax) + mov (up), %r8 + mov 8(up), %r9 + jrcxz L(cj2) + jmp L(top) + +L(bx1): test $2, R8(n) + jnz L(b11) + +L(b01): shr $2, n + neg %r8 + mov $0, R32(%rax) + mov (up), %r9 + jrcxz L(cj1) + mov 8(up), %r10 + jmp L(lo1) + + ALIGN(8) +L(b11): inc n + shr $2, n + neg %r8 + mov $2, R32(%rax) + mov (up), %r11 + jmp L(lo3) + + ALIGN(4) +L(top): mov 8(up,%rax,8), %r10 + ADCSBB -8(vp,%rax,8), %r8 + mov %r8, -8(rp,%rax,8) +L(lo1): mov 16(up,%rax,8), %r11 + ADCSBB (vp,%rax,8), %r9 + lea 4(%rax), %rax + mov %r9, -32(rp,%rax,8) +L(lo0): ADCSBB -24(vp,%rax,8), %r10 + mov %r10, -24(rp,%rax,8) +L(lo3): ADCSBB -16(vp,%rax,8), %r11 + dec n + mov -8(up,%rax,8), %r8 + mov %r11, -16(rp,%rax,8) +L(lo2): mov (up,%rax,8), %r9 + jnz L(top) + +L(cj2): ADCSBB -8(vp,%rax,8), %r8 + mov %r8, -8(rp,%rax,8) +L(cj1): ADCSBB (vp,%rax,8), %r9 + mov %r9, (rp,%rax,8) + + mov $0, R32(%rax) + adc $0, R32(%rax) + + FUNC_EXIT() + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + jmp L(ent) +EPILOGUE() diff --git 
a/gmp-6.3.0/mpn/x86_64/bt1/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/bt1/aorsmul_1.asm new file mode 100644 index 0000000..41e1d8a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/aorsmul_1.asm @@ -0,0 +1,191 @@ +dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD bt1/bt2. + +dnl Copyright 2003-2005, 2007, 2008, 2011, 2012, 2018-2019 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.52 old measurement +C AMD K10 4.51 old measurement +C AMD bd1 4.66 old measurement +C AMD bd2 4.57 old measurement +C AMD bd3 ? +C AMD bd4 ? +C AMD zen ? +C AMD bt1 5.04 +C AMD bt2 5.07 +C Intel P4 16.8 18.6 old measurement +C Intel PNR 5.59 old measurement +C Intel NHM 5.39 old measurement +C Intel SBR 3.93 old measurement +C Intel IBR 3.59 old measurement +C Intel HWL 3.61 old measurement +C Intel BWL 2.76 old measurement +C Intel SKL 2.77 old measurement +C Intel atom 23 old measurement +C Intel SLM 8 old measurement +C Intel GLM ? +C VIA nano 5.63 old measurement + +C The ALIGNment here might look completely ad-hoc. They are not. 
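In plain C, the recurrence this file unrolls four ways is a one-limb multiply-accumulate. A sketch assuming 64-bit limbs and a compiler with unsigned __int128 (mpn_submul_1 is identical with the addition replaced by a borrow-propagating subtraction):

#include <stdint.h>
#include <stddef.h>

/* rp[0..n-1] += up[0..n-1] * v0, returning the carry limb off the top.
   The three-term sum up[i]*v0 + rp[i] + cy is at most
   (B-1)^2 + 2*(B-1) = B^2 - 1, so it always fits in two limbs. */
uint64_t addmul_1(uint64_t *rp, const uint64_t *up, size_t n, uint64_t v0)
{
  uint64_t cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
      rp[i] = (uint64_t) t;        /* low limb back to the destination */
      cy = (uint64_t) (t >> 64);   /* high limb feeds the next iteration */
    }
  return cy;
}

The 4-way unrolled loop below keeps four such chains in flight in w0..w3, so each mul's latency is hidden behind the ADDSUB/store of earlier limbs; the L(bx0)/L(bx1) feed-in code just routes n mod 4 to the matching loop entry point.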
+ +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') +') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +C Standard parameters +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n_param', `%rdx') +define(`v0', `%rcx') +C Standard allocations +define(`n', `%rbx') +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +C DOS64 parameters +IFDOS(` define(`rp', `%rcx') ') dnl +IFDOS(` define(`up', `%rsi') ') dnl +IFDOS(` define(`n_param', `%r8') ') dnl +IFDOS(` define(`v0', `%r9') ') dnl +C DOS64 allocations +IFDOS(` define(`n', `%rbx') ') dnl +IFDOS(` define(`w0', `%r8') ') dnl +IFDOS(` define(`w1', `%rdi') ') dnl +IFDOS(` define(`w2', `%r10') ') dnl +IFDOS(` define(`w3', `%r11') ') dnl + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(func) +IFDOS(` push %rsi ') +IFDOS(` push %rdi ') +IFDOS(` mov %rdx, %rsi ') + + push %rbx + mov (up), %rax + + lea (rp,n_param,8), rp + lea (up,n_param,8), up + mov n_param, n + + test $1, R8(n_param) + jne L(bx1) + +L(bx0): mul v0 + neg n + mov %rax, w0 + mov %rdx, w1 + test $2, R8(n) + jne L(L2) + +L(b00): add $2, n + jmp L(L0) + + ALIGN(16) +L(bx1): mul v0 + test $2, R8(n) + je L(b01) + +L(b11): mov %rax, w2 + mov %rdx, w3 + neg n + inc n + jmp L(L3) + + ALIGN(16) +L(b01): sub $3, n + jc L(n1) + mov %rax, w2 + mov %rdx, w3 + neg n + + ALIGN(16) +L(top): mov -16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + ADDSUB w2, -24(rp,n,8) + adc w3, w0 + adc $0, w1 +L(L0): mov -8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + ADDSUB w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 +L(L3): mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + ADDSUB w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 +L(L2): mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + ADDSUB w0, (rp,n,8) + adc w1, w2 + adc $0, w3 + add $4, n + js L(top) + +L(end): xor R32(%rax), R32(%rax) + ADDSUB w2, -8(rp) + adc w3, %rax + pop %rbx +IFDOS(` pop %rdi ') +IFDOS(` pop %rsi ') + ret + + ALIGN(32) +L(n1): ADDSUB %rax, -8(rp) + mov $0, R32(%rax) + adc %rdx, %rax + pop %rbx +IFDOS(` pop %rdi ') +IFDOS(` pop %rsi ') + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bt1/copyd.asm b/gmp-6.3.0/mpn/x86_64/bt1/copyd.asm new file mode 100644 index 0000000..877714e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/copyd.asm @@ -0,0 +1,91 @@ +dnl AMD64 mpn_copyd optimised for AMD bobcat. + +dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 1 +C AMD K10 1-2 (alignment fluctuations) +C AMD bd1 ? +C AMD bobcat 1.5 +C Intel P4 2.8 +C Intel core2 1 +C Intel NHM 1-1.25 +C Intel SBR 1 +C Intel atom 2.87 +C VIA nano 2 + +C INPUT PARAMETERS +C rp rdi +C up rsi +C n rdx + +define(`rp',`%rdi') +define(`up',`%rsi') +define(`n',`%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_copyd) + FUNC_ENTRY(3) + sub $4, n + jl L(end) + ALIGN(16) +L(top): mov 24(up,n,8), %r8 + mov %r8, 24(rp,n,8) + mov 16(up,n,8), %r8 + mov %r8, 16(rp,n,8) + mov 8(up,n,8), %r8 + mov %r8, 8(rp,n,8) + mov (up,n,8), %r8 + mov %r8, (rp,n,8) +L(ent): sub $4, n + jge L(top) + +L(end): cmp $-4, R32(n) + jz L(ret) + mov 24(up,n,8), %r8 + mov %r8, 24(rp,n,8) + cmp $-3, R32(n) + jz L(ret) + mov 16(up,n,8), %r8 + mov %r8, 16(rp,n,8) + cmp $-2, R32(n) + jz L(ret) + mov 8(up,n,8), %r8 + mov %r8, 8(rp,n,8) + +L(ret): FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bt1/copyi.asm b/gmp-6.3.0/mpn/x86_64/bt1/copyi.asm new file mode 100644 index 0000000..ee0f578 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/copyi.asm @@ -0,0 +1,94 @@ +dnl AMD64 mpn_copyi optimised for AMD bobcat. + +dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 1 +C AMD K10 1-2 (alignment fluctuations) +C AMD bd1 ? 
+C AMD bobcat 1.5 +C Intel P4 2.8 +C Intel core2 1 +C Intel NHM 1-1.25 +C Intel SBR 1 +C Intel atom 2.87 +C VIA nano 2 + +C INPUT PARAMETERS +C rp rdi +C up rsi +C n rdx + +define(`rp',`%rdi') +define(`up',`%rsi') +define(`n',`%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_copyi) + FUNC_ENTRY(3) + lea -32(up,n,8), up + lea -32(rp,n,8), rp + neg n + add $4, n + jg L(end) + ALIGN(16) +L(top): mov (up,n,8), %r8 + mov %r8, (rp,n,8) + mov 8(up,n,8), %r8 + mov %r8, 8(rp,n,8) + mov 16(up,n,8), %r8 + mov %r8, 16(rp,n,8) + mov 24(up,n,8), %r8 + mov %r8, 24(rp,n,8) +L(ent): add $4, n + jle L(top) + +L(end): cmp $4, R32(n) + jz L(ret) + mov (up,n,8), %r8 + mov %r8, (rp,n,8) + cmp $3, R32(n) + jz L(ret) + mov 8(up,n,8), %r8 + mov %r8, 8(rp,n,8) + cmp $2, R32(n) + jz L(ret) + mov 16(up,n,8), %r8 + mov %r8, 16(rp,n,8) + +L(ret): FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bt1/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/bt1/gcd_11.asm new file mode 100644 index 0000000..ef53392 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/gcd_11.asm @@ -0,0 +1,119 @@ +dnl AMD64 mpn_gcd_11 -- 1 x 1 gcd. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 ? +C AMD K10 ? +C AMD bd1 ? +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD bt1 5.4 +C AMD bt2 ? +C AMD zn1 ? +C AMD zn2 ? +C Intel P4 ? +C Intel CNR ? +C Intel PNR ? +C Intel NHM ? +C Intel WSM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom ? +C Intel SLM ? +C Intel GLM ? +C Intel GLM+ ? +C VIA nano ? + + +C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. 
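A C sketch of the table-driven loop that follows, assuming 64-bit operands that are both odd on entry (the callers arrange this); where the asm falls back to bsf at L(count_better), this sketch simply reapplies the table:

#include <stdint.h>

#define MAXSHIFT 8
static unsigned char ctz_table[1 << MAXSHIFT];

static void init_ctz_table(void)   /* run once before gcd_11 */
{
  ctz_table[0] = MAXSHIFT;         /* byte 0x00: at least MAXSHIFT zeros */
  for (int i = 1; i < (1 << MAXSHIFT); i++)
    ctz_table[i] = (unsigned char) __builtin_ctz(i);
}

/* Binary GCD step: replace the larger operand by |u - v| with its
   trailing zeros stripped, keep the smaller, stop when they meet. */
uint64_t gcd_11(uint64_t u, uint64_t v)
{
  while (u != v)
    {
      uint64_t t = u > v ? u - v : v - u;   /* |u - v|, always even */
      if (v > u)
        v = u;                              /* v = min(u, v) */
      unsigned c;
      while ((c = ctz_table[t & ((1 << MAXSHIFT) - 1)]) == MAXSHIFT)
        t >>= MAXSHIFT;                     /* rare: 8+ trailing zeros */
      u = t >> c;                           /* u is odd again */
    }
  return v;
}

The DEF_OBJECT/forloop block below generates the same 256-byte table at assembly time; keeping the common short-shift case in a table lookup avoids leaning on bsf, which is slow on these small AMD cores.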
+ +deflit(MAXSHIFT, 8) +deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) + +DEF_OBJECT(ctz_table,64) + .byte MAXSHIFT +forloop(i,1,MASK, +` .byte m4_count_trailing_zeros(i) +') +END_OBJECT(ctz_table) + +define(`u0', `%rdi') +define(`v0', `%rsi') + +define(`cnt', `%rcx') +define(`s0', `%rax') +define(`t0', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_11) + FUNC_ENTRY(2) + LEA( ctz_table, %r10) + mov v0, t0 + sub u0, t0 + jz L(end) + + ALIGN(16) +L(top): mov u0, s0 + sub v0, u0 + cmovc t0, u0 C u = |u - v| + cmovc s0, v0 C v = min(u,v) + and $MASK, R32(t0) + movzbl (%r10,t0), R32(cnt) + jz L(count_better) +L(shr): shr R8(cnt), u0 + mov v0, t0 + sub u0, t0 + jnz L(top) + +L(end): mov v0, %rax + C rdx = 0 for the benefit of internal gcd_22 call + FUNC_EXIT() + ret + +L(count_better): + bsf u0, cnt + jmp L(shr) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bt1/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/bt1/gcd_22.asm new file mode 100644 index 0000000..c9f221e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/gcd_22.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_22. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_22) +include_mpn(`x86_64/gcd_22.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bt1/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/bt1/gmp-mparam.h new file mode 100644 index 0000000..977a209 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/gmp-mparam.h @@ -0,0 +1,230 @@ +/* AMD Bobcat gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. */ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 1600 MHz AMD Bobcat/Zacate */ +/* FFT tuning limit = 110,472,704 */ +/* Generated by tuneup.c, 2019-10-12, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 31 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 71 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 14 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 20 + +#define DIV_1_VS_MUL_1_PERCENT 270 + +#define MUL_TOOM22_THRESHOLD 24 +#define MUL_TOOM33_THRESHOLD 66 +#define MUL_TOOM44_THRESHOLD 190 +#define MUL_TOOM6H_THRESHOLD 274 +#define MUL_TOOM8H_THRESHOLD 381 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 129 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 127 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 131 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 100 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 101 +#define SQR_TOOM4_THRESHOLD 278 +#define SQR_TOOM6_THRESHOLD 372 +#define SQR_TOOM8_THRESHOLD 478 + +#define MULMID_TOOM42_THRESHOLD 22 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 13 + +#define MUL_FFT_MODF_THRESHOLD 444 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 444, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83, 5}, { 1343, 4}, \ + { 2687, 5}, { 1407, 6}, { 735, 7}, { 415, 8}, \ + { 223,10}, { 79,11}, { 47,10}, { 103,12}, \ + { 31,11}, { 63,10}, { 135,11}, { 79,10}, \ + { 167,11}, { 95,10}, { 191,11}, { 111,12}, \ + { 63,11}, { 127,10}, { 255,11}, { 143,10}, \ + { 287, 9}, { 575,11}, { 159,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207,10}, { 415,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,12}, { 223,13}, { 127,12}, { 255,11}, \ + { 543,12}, { 287,11}, { 607,12}, { 319,11}, \ + { 671,12}, { 351,11}, { 703,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,14}, { 127,13}, { 255,12}, { 607,13}, \ + { 319,12}, { 703,13}, { 383,12}, { 831,13}, \ + { 447,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1023,13}, { 575,12}, { 1151,13}, { 703,14}, \ + { 383,13}, { 831,12}, { 1663,13}, { 959,15}, \ + { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \ + { 1151,14}, { 639,13}, { 1343,12}, { 2687,13}, \ + { 1407,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 
1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,16}, \ + { 511,15}, { 1023,14}, { 2175,13}, { 4479,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,15}, { 1535,14}, { 3455,13}, { 6911,15}, \ + { 1791,14}, { 3839,16}, { 1023,15}, { 2047,14}, \ + { 4479,15}, { 2303,14}, { 4991,15}, { 2559,14}, \ + { 5247,15}, { 2815,14}, { 5887,16}, { 1535,15}, \ + { 3327,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 183 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 380 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 380, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 25, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \ + { 63, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 63, 6}, { 1087, 7}, { 575, 8}, \ + { 303, 9}, { 159,10}, { 103,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 135,11}, \ + { 79,10}, { 159, 9}, { 319,11}, { 95,10}, \ + { 191, 9}, { 383,11}, { 111,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319,12}, { 95,11}, { 191,10}, \ + { 383,11}, { 207,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543,11}, \ + { 287,10}, { 575,11}, { 303,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 335,10}, { 671,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,12}, { 223,11}, { 447,13}, \ + { 127,12}, { 255,11}, { 543,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 671,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 607,13}, { 319,12}, { 703,13}, \ + { 383,12}, { 831,13}, { 447,12}, { 895,14}, \ + { 255,13}, { 511,12}, { 1023,13}, { 703,14}, \ + { 383,13}, { 831,12}, { 1663,13}, { 895,15}, \ + { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \ + { 1151,14}, { 639,13}, { 1343,12}, { 2687,13}, \ + { 1407,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,16}, \ + { 511,15}, { 1023,14}, { 2175,13}, { 4351,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,15}, { 1535,14}, { 3455,15}, { 1791,14}, \ + { 3839,16}, { 1023,15}, { 2047,14}, { 4479,15}, \ + { 2303,14}, { 4863,15}, { 2559,14}, { 5247,15}, \ + { 2815,14}, { 5887,16}, { 1535,15}, { 3327,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 186 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 42 +#define MULLO_MUL_N_THRESHOLD 10950 +#define SQRLO_BASECASE_THRESHOLD 7 +#define SQRLO_DC_THRESHOLD 100 +#define SQRLO_SQR_THRESHOLD 7293 + +#define DC_DIV_QR_THRESHOLD 70 +#define DC_DIVAPPR_Q_THRESHOLD 204 +#define DC_BDIV_QR_THRESHOLD 59 +#define DC_BDIV_Q_THRESHOLD 148 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define INV_NEWTON_THRESHOLD 246 +#define INV_APPR_THRESHOLD 236 + +#define 
BINV_NEWTON_THRESHOLD 252 +#define REDC_1_TO_REDC_2_THRESHOLD 67 +#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ + +#define MU_DIV_QR_THRESHOLD 1589 +#define MU_DIVAPPR_Q_THRESHOLD 1589 +#define MUPI_DIV_QR_THRESHOLD 108 +#define MU_BDIV_QR_THRESHOLD 1442 +#define MU_BDIV_Q_THRESHOLD 1470 + +#define POWM_SEC_TABLE 1,16,194,960,1603,1811,2499 + +#define GET_STR_DC_THRESHOLD 20 +#define GET_STR_PRECOMPUTE_THRESHOLD 34 +#define SET_STR_DC_THRESHOLD 345 +#define SET_STR_PRECOMPUTE_THRESHOLD 1787 + +#define FAC_DSC_THRESHOLD 781 +#define FAC_ODD_THRESHOLD 104 + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD2_DIV1_METHOD 3 /* 3.20% faster than 5 */ +#define HGCD_THRESHOLD 110 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 474 +#define GCDEXT_DC_THRESHOLD 293 +#define JACOBI_BASE_METHOD 2 /* 9.38% faster than 1 */ + +/* Tuneup completed successfully, took 358881 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/bt1/mul_1.asm b/gmp-6.3.0/mpn/x86_64/bt1/mul_1.asm new file mode 100644 index 0000000..4394d6e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/mul_1.asm @@ -0,0 +1,241 @@ +dnl AMD64 mpn_mul_1 optimised for AMD bt1/bt2. + +dnl Copyright 2003-2005, 2007, 2008, 2011, 2012, 2019 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.53 old measurement +C AMD K10 4.53 old measurement +C AMD bd1 4.56 old measurement +C AMD bd2 4.47 old measurement +C AMD bd3 ? +C AMD bd4 ? +C AMD zen ? +C AMD bt1 5.12 +C AMD bt2 5.17 +C Intel P4 12.6 old measurement +C Intel PNR 4.53 old measurement +C Intel NHM 4.36 old measurement +C Intel SBR 3.0 old measurement +C Intel IBR 2.55 old measurement +C Intel HWL 2.28 old measurement +C Intel BWL 2.36 old measurement +C Intel SKL 2.39 old measurement +C Intel atom 21.0 old measurement +C Intel SLM 9 old measurement +C Intel GLM ? +C VIA nano ? + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. 
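
Editorial sketch, not part of the patch: the generated loop below computes the classic mul_1 recurrence. A plain C reference, with an illustrative name and uint64_t standing in for mp_limb_t, is given here; the assembly unrolls it four ways, and the mpn_mul_1c entry point further down seeds the carry from a fifth argument instead of zero.

#include <stdint.h>
#include <stddef.h>

/* rp[0..n-1] = up[0..n-1] * v0, returning the carry-out limb.
   Each `mul v0` in the assembly produces the low product in %rax and the
   high product in %rdx; the 128-bit type stands in for that pair. */
static uint64_t mul_1_ref(uint64_t *rp, const uint64_t *up, size_t n, uint64_t v0)
{
    uint64_t cy = 0;
    for (size_t i = 0; i < n; i++) {
        unsigned __int128 p = (unsigned __int128) up[i] * v0 + cy;
        rp[i] = (uint64_t) p;                   /* low half (%rax) */
        cy = (uint64_t) (p >> 64);              /* high half (%rdx) plus carry-in */
    }
    return cy;                                  /* mpn_mul_1c starts with cy = cy_in */
}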
+ +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +C Standard parameters +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n_param', `%rdx') +define(`v0', `%rcx') +define(`cy', `%r8') +C Standard allocations +define(`n', `%rbx') +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +C DOS64 parameters +IFDOS(` define(`rp', `%rcx') ') dnl +IFDOS(` define(`up', `%rsi') ') dnl +IFDOS(` define(`n_param', `%r8') ') dnl +IFDOS(` define(`v0', `%r9') ') dnl +IFDOS(` define(`cy', `56(%rsp)')') dnl +C DOS64 allocations +IFDOS(` define(`n', `%rbx') ') dnl +IFDOS(` define(`w0', `%r8') ') dnl +IFDOS(` define(`w1', `%rdi') ') dnl +IFDOS(` define(`w2', `%r10') ') dnl +IFDOS(` define(`w3', `%r11') ') dnl + + ALIGN(64) +PROLOGUE(mpn_mul_1) +IFDOS(` push %rsi ') +IFDOS(` push %rdi ') +IFDOS(` mov %rdx, %rsi ') + + push %rbx + mov (up), %rax + + lea (rp,n_param,8), rp + lea (up,n_param,8), up + mov n_param, n + + test $1, R8(n_param) + jne L(bx1) + +L(bx0): mul v0 + neg n + mov %rax, w0 + mov %rdx, w1 + test $2, R8(n) + jne L(L2) + +L(b00): add $2, n + jmp L(L0) + + ALIGN(16) +L(b11): mov %rax, w2 + mov %rdx, w3 + neg n + inc n + jmp L(L3) + + ALIGN(16) +L(bx1): mul v0 + test $2, R8(n) + jne L(b11) + +L(b01): sub $3, n + jc L(n1) + mov %rax, w2 + mov %rdx, w3 + neg n + + ALIGN(16) +L(top): mov -16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, -24(rp,n,8) + add w3, w0 + adc $0, w1 +L(L0): mov -8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, -16(rp,n,8) + add w1, w2 + adc $0, w3 +L(L3): mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, -8(rp,n,8) + add w3, w0 + adc $0, w1 +L(L2): mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, (rp,n,8) + add w1, w2 + adc $0, w3 + add $4, n + js L(top) + +L(end): mov w2, -8(rp) + mov w3, %rax + pop %rbx +IFDOS(` pop %rdi ') +IFDOS(` pop %rsi ') + ret + + ALIGN(32) +L(n1): mov %rax, -8(rp) + mov %rdx, %rax + pop %rbx +IFDOS(` pop %rdi ') +IFDOS(` pop %rsi ') + ret +EPILOGUE() + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_mul_1c) +IFDOS(` push %rsi ') +IFDOS(` push %rdi ') +IFDOS(` mov %rdx, %rsi ') + mov cy, w2 + push %rbx + mov (up), %rax + + lea (rp,n_param,8), rp + lea (up,n_param,8), up + mov n_param, n + + test $1, R8(n_param) + jne L(cx1) + +L(cx0): mul v0 + neg n + mov %rax, w0 + mov %rdx, w1 + add w2, w0 + adc $0, w1 + test $2, R8(n) + jne L(L2) + +L(c00): add $2, n + jmp L(L0) + + ALIGN(16) +L(cx1): mul v0 + test $2, R8(n) + je L(c01) + +L(c11): neg n + inc n + add %rax, w2 + mov %rdx, w3 + adc $0, w3 + jmp L(L3) + +L(c01): cmp $1, n + jz L(m1) + neg n + add $3, n + add %rax, w2 + mov %rdx, w3 + adc $0, w3 + jmp L(top) + + ALIGN(32) +L(m1): add %rax, w2 + mov %rdx, %rax + mov w2, -8(rp) + adc $0, %rax + pop %rbx +IFDOS(` pop %rdi ') +IFDOS(` pop %rsi ') + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bt1/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/bt1/mul_basecase.asm new file mode 100644 index 0000000..e7d46bf --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/mul_basecase.asm @@ -0,0 +1,486 @@ +dnl AMD64 mpn_mul_basecase optimised for AMD bobcat. + +dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.5 +C AMD K10 4.5 +C AMD bd1 4.75 +C AMD bobcat 5 +C Intel P4 17.7 +C Intel core2 5.5 +C Intel NHM 5.43 +C Intel SBR 3.92 +C Intel atom 23 +C VIA nano 5.63 + +C This mul_basecase is based on mul_1 and addmul_1, since these both run at the +C multiply insn bandwidth, without any apparent loop branch exit pipeline +C replays experienced on K8. The structure is unusual: it falls into mul_1 in +C the same way for all n, then it splits into 4 different wind-down blocks and +C 4 separate addmul_1 loops. +C +C We have not tried using the same addmul_1 loops with a switch into feed-in +C code, as we do in other basecase implementations. Doing that could save +C substantial code volume, but would also probably add some overhead. + +C TODO +C * Tune un < 3 code. +C * Fix slowdown for un=vn=3 (67->71) compared to default code. +C * This is 1263 bytes, compared to 1099 bytes for default code. Consider +C combining addmul loops like that code. Tolerable slowdown? +C * Lots of space could be saved by replacing the "switch" code by gradual +C jumps out from mul_1 winddown code, perhaps with no added overhead. +C * Are the ALIGN(16) really necessary? They add about 25 bytes of padding. + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +C Standard parameters +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param', `%rdx') +define(`vp', `%rcx') +define(`vn', `%r8') +C Standard allocations +define(`un', `%rbx') +define(`w0', `%r10') +define(`w1', `%r11') +define(`w2', `%r12') +define(`w3', `%r13') +define(`n', `%rbp') +define(`v0', `%r9') + +C Temp macro for allowing control over indexing. +C Define to return $1 for more conservative ptr handling. 
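
Editorial sketch, not part of the patch: the structure the comments above describe is, in reference-C terms, the schoolbook scheme below. Names are illustrative, mul_1_ref is as in the mul_1 sketch earlier, and the real routine dispatches into four unrolled addmul_1 loops on n mod 4 instead of calling out to a helper. The m4 `X' macro defined just after this note is unrelated plumbing for pointer indexing.

#include <stdint.h>
#include <stddef.h>

/* rp[0..n-1] += up[0..n-1] * v0, returning the carry-out limb. */
static uint64_t addmul_1_ref(uint64_t *rp, const uint64_t *up, size_t n, uint64_t v0)
{
    uint64_t cy = 0;
    for (size_t i = 0; i < n; i++) {
        unsigned __int128 p = (unsigned __int128) up[i] * v0 + rp[i] + cy;
        rp[i] = (uint64_t) p;
        cy = (uint64_t) (p >> 64);
    }
    return cy;
}

/* Schoolbook mul_basecase: one mul_1 row, then vn-1 addmul_1 rows. */
static void mul_basecase_ref(uint64_t *rp, const uint64_t *up, size_t un,
                             const uint64_t *vp, size_t vn)
{
    rp[un] = mul_1_ref(rp, up, un, vp[0]);      /* mul_1_ref: see the earlier sketch */
    for (size_t j = 1; j < vn; j++)
        rp[un + j] = addmul_1_ref(rp + j, up, un, vp[j]);
}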
+define(`X',`$2') + + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + + mov (up), %rax + mov (vp), v0 + + cmp $2, un_param + ja L(ge3) + jz L(u2) + + mul v0 C u0 x v0 + mov %rax, (rp) + mov %rdx, 8(rp) + FUNC_EXIT() + ret + +L(u2): mul v0 C u0 x v0 + mov %rax, (rp) + mov 8(up), %rax + mov %rdx, w0 + mul v0 + add %rax, w0 + mov %rdx, w1 + adc $0, w1 + cmp $1, R32(vn) + jnz L(u2v2) + mov w0, 8(rp) + mov w1, 16(rp) + FUNC_EXIT() + ret + +L(u2v2):mov 8(vp), v0 + mov (up), %rax + mul v0 + add %rax, w0 + mov w0, 8(rp) + mov %rdx, %r8 C CAUTION: r8 realloc + adc $0, %r8 + mov 8(up), %rax + mul v0 + add w1, %r8 + adc $0, %rdx + add %r8, %rax + adc $0, %rdx + mov %rax, 16(rp) + mov %rdx, 24(rp) + FUNC_EXIT() + ret + + +L(ge3): push %rbx + push %rbp + push %r12 + push %r13 + + lea 8(vp), vp + + lea -24(rp,un_param,8), rp + lea -24(up,un_param,8), up + xor R32(un), R32(un) + mov $2, R32(n) + sub un_param, un + sub un_param, n + + mul v0 + mov %rax, w2 + mov %rdx, w3 + jmp L(L3) + + ALIGN(16) +L(top): mov w0, -16(rp,n,8) + add w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, -8(rp,n,8) + add w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, (rp,n,8) + add w1, w2 + adc $0, w3 +L(L3): mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, 8(rp,n,8) + add w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(top) + + mov w0, -16(rp,n,8) + add w1, w2 + adc $0, w3 + +C Switch on n into right addmul_l loop + test n, n + jz L(r2) + cmp $2, R32(n) + ja L(r3) + jz L(r0) + jmp L(r1) + + +L(r3): mov w2, X(-8(rp,n,8),16(rp)) + mov w3, X((rp,n,8),24(rp)) + add $2, un + +C outer loop(3) +L(to3): dec vn + jz L(ret) + mov (vp), v0 + mov 8(up,un,8), %rax + lea 8(vp), vp + lea 8(rp), rp + mov un, n + mul v0 + mov %rax, w2 + mov %rdx, w3 + jmp L(al3) + + ALIGN(16) +L(ta3): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 +L(al3): mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta3) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + jmp L(to3) + + +L(r2): mov X(0(up,n,8),(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(-8(rp,n,8),-8(rp)) + add w3, w0 + adc $0, w1 + mov X(8(up,n,8),8(up)), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, X((rp,n,8),(rp)) + add w1, w2 + adc $0, w3 + mov X(16(up,n,8),16(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(8(rp,n,8),8(rp)) + add w3, w0 + adc $0, w1 + mov w0, X(16(rp,n,8),16(rp)) + adc $0, w3 + mov w1, X(24(rp,n,8),24(rp)) + inc un + +C outer loop(2) +L(to2): dec vn + jz L(ret) + mov (vp), v0 + mov 16(up,un,8), %rax + lea 8(vp), vp + lea 8(rp), rp + mov un, n + mul v0 + mov %rax, w0 + mov %rdx, w1 + jmp L(al2) + + ALIGN(16) +L(ta2): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 + mov 16(up,n,8), %rax + mul v0 + mov 
%rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 +L(al2): mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta2) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + jmp L(to2) + + +L(r1): mov X(0(up,n,8),8(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(-8(rp,n,8),(rp)) + add w3, w0 + adc $0, w1 + mov X(8(up,n,8),16(up)), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, X((rp,n,8),8(rp)) + add w1, w2 + adc $0, w3 + mov w2, X(8(rp,n,8),16(rp)) + mov w3, X(16(rp,n,8),24(rp)) + add $4, un + +C outer loop(1) +L(to1): dec vn + jz L(ret) + mov (vp), v0 + mov -8(up,un,8), %rax + lea 8(vp), vp + lea 8(rp), rp + mov un, n + mul v0 + mov %rax, w2 + mov %rdx, w3 + jmp L(al1) + + ALIGN(16) +L(ta1): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 +L(al1): mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 + mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta1) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + jmp L(to1) + + +L(r0): mov X((up,n,8),16(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(-8(rp,n,8),8(rp)) + add w3, w0 + adc $0, w1 + mov w0, X((rp,n,8),16(rp)) + mov w1, X(8(rp,n,8),24(rp)) + add $3, un + +C outer loop(0) +L(to0): dec vn + jz L(ret) + mov (vp), v0 + mov (up,un,8), %rax + lea 8(vp), vp + lea 8(rp), rp + mov un, n + mul v0 + mov %rax, w0 + mov %rdx, w1 + jmp L(al0) + + ALIGN(16) +L(ta0): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 +L(al0): mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 + mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta0) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + jmp L(to0) + + +L(ret): pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bt1/redc_1.asm b/gmp-6.3.0/mpn/x86_64/bt1/redc_1.asm new file mode 100644 index 0000000..d55b1e5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/redc_1.asm @@ -0,0 +1,507 @@ +dnl X86-64 mpn_redc_1 optimised for AMD bobcat. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bull ? +C AMD pile ? +C AMD steam ? +C AMD bobcat 5.0 +C AMD jaguar ? +C Intel P4 ? +C Intel core ? +C Intel NHM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel atom ? +C VIA nano ? + +C TODO +C * Micro-optimise, none performed thus far. +C * Consider inlining mpn_add_n. +C * Single basecases out before the pushes. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`mp_param', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`u0inv', `%r8') C stack + +define(`i', `%r14') +define(`j', `%r15') +define(`mp', `%r12') +define(`q0', `%r13') +define(`w0', `%rbp') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`ALIGNx', `ALIGN(16)') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_redc_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov (up), q0 + mov n, j C outer loop induction var + lea (mp_param,n,8), mp + lea (up,n,8), up + neg n + imul u0inv, q0 C first iteration q0 + + test $1, R8(n) + jz L(bx0) + +L(bx1): test $2, R8(n) + jz L(b3) + +L(b1): cmp $-1, R32(n) + jz L(n1) + +L(otp1):lea 1(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + mov 8(mp,n,8), %rax + mul q0 + mov %rax, %rbx + mov %rdx, w1 + add (up,n,8), w2 + adc w3, %rbx + adc $0, w1 + mov 16(mp,n,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + adc w1, w2 + adc $0, w3 + imul u0inv, %rbx C next q limb + jmp L(e1) + + ALIGNx +L(tp1): add w0, -16(up,i,8) + adc w1, w2 + adc $0, w3 + mov (mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(up,i,8) + adc w3, w0 + adc $0, w1 + mov 8(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add w0, (up,i,8) + adc w1, w2 + adc $0, w3 +L(e1): mov 16(mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(up,i,8) + adc w3, w0 + adc $0, w1 + mov 24(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add $4, i + js L(tp1) + +L(ed1): add w0, I(-16(up),-16(up,i,8)) + adc w1, w2 + adc $0, w3 + add w2, I(-8(up),-8(up,i,8)) + adc $0, w3 + mov w3, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp1) + jmp L(cj) + +L(b3): cmp $-3, R32(n) + jz L(n3) + +L(otp3):lea 3(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + mov 8(mp,n,8), %rax + mul q0 + mov %rax, %rbx + mov %rdx, w1 + add (up,n,8), w2 + adc w3, %rbx + adc $0, w1 + mov 16(mp,n,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + adc w1, w2 + adc $0, w3 + imul u0inv, %rbx C next q limb + jmp L(e3) + + ALIGNx +L(tp3): add w0, -16(up,i,8) + adc w1, w2 + adc $0, w3 +L(e3): mov (mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(up,i,8) + 
adc w3, w0 + adc $0, w1 + mov 8(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add w0, (up,i,8) + adc w1, w2 + adc $0, w3 + mov 16(mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(up,i,8) + adc w3, w0 + adc $0, w1 + mov 24(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add $4, i + js L(tp3) + +L(ed3): add w0, I(-16(up),-16(up,i,8)) + adc w1, w2 + adc $0, w3 + add w2, I(-8(up),-8(up,i,8)) + adc $0, w3 + mov w3, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp3) +C jmp L(cj) + +L(cj): +IFSTD(` lea (up,n,8), up C param 2: up + lea (up,n,8), %rdx C param 3: up - n + neg R32(n) ') C param 4: n + +IFDOS(` lea (up,n,8), %rdx C param 2: up + lea (%rdx,n,8), %r8 C param 3: up - n + neg R32(n) + mov n, %r9 C param 4: n + mov rp, %rcx ') C param 1: rp + +IFSTD(` sub $8, %rsp ') +IFDOS(` sub $40, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_add_n) +IFSTD(` add $8, %rsp ') +IFDOS(` add $40, %rsp ') + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(bx0): test $2, R8(n) + jnz L(b2) + +L(b0): +L(otp0):lea (n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + mov 8(mp,n,8), %rax + mul q0 + mov %rax, %rbx + mov %rdx, w3 + add (up,n,8), w0 + adc w1, %rbx + adc $0, w3 + mov 16(mp,n,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + adc w3, w0 + adc $0, w1 + imul u0inv, %rbx C next q limb + jmp L(e0) + + ALIGNx +L(tp0): add w0, -16(up,i,8) + adc w1, w2 + adc $0, w3 + mov (mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(up,i,8) + adc w3, w0 + adc $0, w1 + mov 8(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add w0, (up,i,8) + adc w1, w2 + adc $0, w3 + mov 16(mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(up,i,8) + adc w3, w0 + adc $0, w1 +L(e0): mov 24(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add $4, i + js L(tp0) + +L(ed0): add w0, I(-16(up),-16(up,i,8)) + adc w1, w2 + adc $0, w3 + add w2, I(-8(up),-8(up,i,8)) + adc $0, w3 + mov w3, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp0) + jmp L(cj) + +L(b2): cmp $-2, R32(n) + jz L(n2) + +L(otp2):lea 2(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + mov 8(mp,n,8), %rax + mul q0 + mov %rax, %rbx + mov %rdx, w3 + add (up,n,8), w0 + adc w1, %rbx + adc $0, w3 + mov 16(mp,n,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + adc w3, w0 + adc $0, w1 + imul u0inv, %rbx C next q limb + jmp L(e2) + + ALIGNx +L(tp2): add w0, -16(up,i,8) + adc w1, w2 + adc $0, w3 + mov (mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(up,i,8) + adc w3, w0 + adc $0, w1 +L(e2): mov 8(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add w0, (up,i,8) + adc w1, w2 + adc $0, w3 + mov 16(mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(up,i,8) + adc w3, w0 + adc $0, w1 + mov 24(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add $4, i + js L(tp2) + +L(ed2): add w0, I(-16(up),-16(up,i,8)) + adc w1, w2 + adc $0, w3 + add w2, I(-8(up),-8(up,i,8)) + adc $0, w3 + mov w3, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp2) + jmp L(cj) + +L(n1): mov (mp_param), %rax + mul q0 + add -8(up), %rax + adc (up), %rdx + mov %rdx, (rp) + mov $0, R32(%rax) + adc R32(%rax), R32(%rax) + jmp L(ret) + +L(n2): mov (mp_param), %rax + mov 
-16(up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov -8(up), %r10 + mul q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + add %r9, %r10 + adc $0, %r11 + mov %r10, q0 + imul u0inv, q0 C next q0 + mov -16(mp), %rax + mul q0 + add %rax, %r10 + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov (up), %r14 + mul q0 + add %rax, %r14 + adc $0, %rdx + add %r9, %r14 + adc $0, %rdx + xor R32(%rax), R32(%rax) + add %r11, %r14 + adc 8(up), %rdx + mov %r14, (rp) + mov %rdx, 8(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + ALIGNx +L(n3): mov -24(mp), %rax + mov -24(up), %r10 + mul q0 + add %rax, %r10 + mov -16(mp), %rax + mov %rdx, %r11 + adc $0, %r11 + mov -16(up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + add %r11, %rbp + mov -8(up), %r10 + adc $0, %r9 + mul q0 + mov %rbp, q0 + imul u0inv, q0 C next q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + mov %rbp, -16(up) + add %r9, %r10 + adc $0, %r11 + mov %r10, -8(up) + mov %r11, -24(up) C up[0] + lea 8(up), up C up++ + dec j + jnz L(n3) + + mov -48(up), %rdx + mov -40(up), %rbx + xor R32(%rax), R32(%rax) + add %rbp, %rdx + adc %r10, %rbx + adc -8(up), %r11 + mov %rdx, (rp) + mov %rbx, 8(rp) + mov %r11, 16(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/bt1/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/bt1/sqr_basecase.asm new file mode 100644 index 0000000..0e417a1 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt1/sqr_basecase.asm @@ -0,0 +1,565 @@ +dnl AMD64 mpn_sqr_basecase optimised for AMD bobcat. + +dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.5 +C AMD K10 4.5 +C AMD bd1 4.75 +C AMD bobcat 5 +C Intel P4 17.7 +C Intel core2 5.5 +C Intel NHM 5.43 +C Intel SBR 3.92 +C Intel atom 23 +C VIA nano 5.63 + +C This sqr_basecase is based on mul_1 and addmul_1, since these both run at the +C multiply insn bandwidth, without any apparent loop branch exit pipeline +C replays experienced on K8. The structure is unusual: it falls into mul_1 in +C the same way for all n, then it splits into 4 different wind-down blocks and +C 4 separate addmul_1 loops. +C +C We have not tried using the same addmul_1 loops with a switch into feed-in +C code, as we do in other basecase implementations. Doing that could save +C substantial code volume, but would also probably add some overhead. + +C TODO +C * Tune un < 4 code. 
+C * Perhaps implement a larger final corner (it is now 2 x 1). +C * Lots of space could be saved by replacing the "switch" code by gradual +C jumps out from mul_1 winddown code, perhaps with no added overhead. +C * Are the ALIGN(16) really necessary? They add about 25 bytes of padding. + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +C Standard parameters +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param', `%rdx') +C Standard allocations +define(`un', `%rbx') +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') +define(`n', `%rbp') +define(`v0', `%rcx') + +C Temp macro for allowing control over indexing. +C Define to return $1 for more conservative ptr handling. +define(`X',`$2') +dnl define(`X',`$1') + + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_sqr_basecase) + FUNC_ENTRY(3) + + mov (up), %rax + + cmp $2, R32(un_param) + jae L(ge2) + + mul %rax + mov %rax, (rp) + mov %rdx, 8(rp) + FUNC_EXIT() + ret + +L(ge2): mov (up), v0 + jnz L(g2) + + mul %rax + mov %rax, (rp) + mov 8(up), %rax + mov %rdx, w0 + mul v0 + add %rax, w0 + mov %rdx, w1 + adc $0, w1 + mov 8(up), v0 + mov (up), %rax + mul v0 + add %rax, w0 + mov w0, 8(rp) + mov %rdx, w0 C CAUTION: r8 realloc + adc $0, w0 + mov 8(up), %rax + mul v0 + add w1, w0 + adc $0, %rdx + add w0, %rax + adc $0, %rdx + mov %rax, 16(rp) + mov %rdx, 24(rp) + FUNC_EXIT() + ret + +L(g2): cmp $3, R32(un_param) + ja L(g3) + mul %rax + mov %rax, (rp) + mov %rdx, 8(rp) + mov 8(up), %rax + mul %rax + mov %rax, 16(rp) + mov %rdx, 24(rp) + mov 16(up), %rax + mul %rax + mov %rax, 32(rp) + mov %rdx, 40(rp) + + mov (up), v0 + mov 8(up), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov 16(up), %rax + mul v0 + xor R32(w2), R32(w2) + add %rax, w1 + adc %rdx, w2 + + mov 8(up), v0 + mov 16(up), %rax + mul v0 + xor R32(w3), R32(w3) + add %rax, w2 + adc %rdx, w3 + add w0, w0 + adc w1, w1 + adc w2, w2 + adc w3, w3 + mov $0, R32(v0) + adc v0, v0 + add w0, 8(rp) + adc w1, 16(rp) + adc w2, 24(rp) + adc w3, 32(rp) + adc v0, 40(rp) + FUNC_EXIT() + ret + +L(g3): push %rbx + push %rbp + + mov 8(up), %rax + lea -24(rp,un_param,8), rp + lea -24(up,un_param,8), up + neg un_param + push un_param C for sqr_diag_addlsh1 + lea (un_param), un + lea 3(un_param), n + + mul v0 + mov %rax, w2 + mov %rdx, w3 + jmp L(L3) + + ALIGN(16) +L(top): mov w0, -16(rp,n,8) + add w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, -8(rp,n,8) + add w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, (rp,n,8) + add w1, w2 + adc $0, w3 +L(L3): mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, 8(rp,n,8) + add w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(top) + + mov w0, -16(rp,n,8) + add w1, w2 + adc $0, w3 + + test n, n + jz L(r2) + cmp $2, R32(n) + ja L(r3) + jz L(r0) + + +L(r1): mov X((up,n,8),8(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(-8(rp,n,8),(rp)) + add w3, w0 + adc $0, w1 + mov X(8(up,n,8),16(up)), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, X((rp,n,8),8(rp)) + add w1, w2 + adc $0, w3 + mov w2, X(8(rp,n,8),16(rp)) + mov w3, X(16(rp,n,8),24(rp)) + add $5, un + jmp L(to0) + +L(r2): mov X((up,n,8),(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(-8(rp,n,8),-8(rp)) + add w3, w0 + adc $0, w1 + mov X(8(up,n,8),8(up)), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, X((rp,n,8),(rp)) + add w1, w2 + adc $0, w3 + mov X(16(up,n,8),16(up)), %rax + mul v0 + mov %rax, w0 
+ mov %rdx, w1 + mov w2, X(8(rp,n,8),8(rp)) + add w3, w0 + adc $0, w1 + mov w0, X(16(rp,n,8),16(rp)) + adc $0, w3 + mov w1, X(24(rp,n,8),24(rp)) + add $6, un + jmp L(to1) + +L(r3): mov w2, X(-8(rp,n,8),16(rp)) + mov w3, X((rp,n,8),24(rp)) + add $3, un + jmp L(to2) + +L(r0): mov X((up,n,8),16(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(-8(rp,n,8),8(rp)) + add w3, w0 + adc $0, w1 + mov w0, X((rp,n,8),16(rp)) + mov w1, X(8(rp,n,8),24(rp)) + add $4, un +C jmp L(to3) +C fall through into main loop + + +L(outer): + mov un, n + mov (up,un,8), v0 + mov 8(up,un,8), %rax + lea 8(rp), rp + mul v0 + mov %rax, w2 + mov %rdx, w3 + jmp L(al3) + + ALIGN(16) +L(ta3): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 +L(al3): mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta3) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + + +L(to2): mov un, n + cmp $-4, R32(un) + jnc L(end) + add $4, un + mov 8(up,n,8), v0 + mov 16(up,n,8), %rax + lea 8(rp), rp + mul v0 + mov %rax, w0 + mov %rdx, w1 + jmp L(al2) + + ALIGN(16) +L(ta2): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 + mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 +L(al2): mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta2) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + + +L(to1): mov un, n + mov -16(up,un,8), v0 + mov -8(up,un,8), %rax + lea 8(rp), rp + mul v0 + mov %rax, w2 + mov %rdx, w3 + jmp L(al1) + + ALIGN(16) +L(ta1): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 +L(al1): mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 + mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta1) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + + +L(to0): mov un, n + mov -8(up,un,8), v0 + mov (up,un,8), %rax + lea 8(rp), rp + mul v0 + mov %rax, w0 + mov %rdx, w1 + jmp L(al0) + + ALIGN(16) +L(ta0): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 +L(al0): mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 + mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta0) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, 
X((rp,n,8),24(rp)) + jmp L(outer) + + +L(end): mov X(8(up,un,8),(up)), v0 + mov X(16(up,un,8),8(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov X(24(up,un,8),16(up)), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, X(24(rp,un,8),16(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(32(rp,un,8),24(rp)) + adc $0, w3 + mov X(16(up,un,8),8(up)), v0 + mov X(24(up,un,8),16(up)), %rax + mul v0 + add %rax, w3 + mov w3, X(40(rp,un,8),32(rp)) + adc $0, %rdx + mov %rdx, X(48(rp,un,8),40(rp)) + + +C sqr_diag_addlsh1 + + lea 16(up), up + lea 40(rp), rp + pop n + lea 2(n,n), n + + mov (up,n,4), %rax + mul %rax + xor R32(w2), R32(w2) + + mov 8(rp,n,8), w0 + mov %rax, (rp,n,8) + jmp L(lm) + + ALIGN(8) +L(tsd): add %rbx, w0 + adc %rax, w1 + mov w0, -8(rp,n,8) + mov 8(rp,n,8), w0 + mov w1, (rp,n,8) +L(lm): mov 16(rp,n,8), w1 + adc w0, w0 + adc w1, w1 + lea (%rdx,w2), %rbx + mov 8(up,n,4), %rax + setc R8(w2) + mul %rax + add $2, n + js L(tsd) + +L(esd): add %rbx, w0 + adc %rax, w1 + mov w0, X(-8(rp,n,8),-8(rp)) + mov w1, X((rp,n,8),(rp)) + adc w2, %rdx + mov %rdx, X(8(rp,n,8),8(rp)) + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/bt2/com.asm b/gmp-6.3.0/mpn/x86_64/bt2/com.asm new file mode 100644 index 0000000..87085ea --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt2/com.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_com. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_com) +include_mpn(`x86_64/fastsse/com.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bt2/copyd.asm b/gmp-6.3.0/mpn/x86_64/bt2/copyd.asm new file mode 100644 index 0000000..83c0618 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt2/copyd.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyd. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyd) +include_mpn(`x86_64/fastsse/copyd.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bt2/copyi.asm b/gmp-6.3.0/mpn/x86_64/bt2/copyi.asm new file mode 100644 index 0000000..148d0e5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt2/copyi.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyi. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyi) +include_mpn(`x86_64/fastsse/copyi.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bt2/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/bt2/gcd_11.asm new file mode 100644 index 0000000..0ffb6ca --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt2/gcd_11.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_11. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_11) +include_mpn(`x86_64/bd2/gcd_11.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bt2/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/bt2/gcd_22.asm new file mode 100644 index 0000000..d693628 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt2/gcd_22.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_22. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_22) +include_mpn(`x86_64/bd2/gcd_22.asm') diff --git a/gmp-6.3.0/mpn/x86_64/bt2/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/bt2/gmp-mparam.h new file mode 100644 index 0000000..3e26726 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/bt2/gmp-mparam.h @@ -0,0 +1,240 @@ +/* AMD Jaguar gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. 
*/ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 2050 MHz AMD Jaguar/Kabini */ +/* FFT tuning limit = 225,381,546 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 65 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 4 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 15 + +#define DIV_1_VS_MUL_1_PERCENT 267 + +#define MUL_TOOM22_THRESHOLD 25 +#define MUL_TOOM33_THRESHOLD 32 +#define MUL_TOOM44_THRESHOLD 93 +#define MUL_TOOM6H_THRESHOLD 366 +#define MUL_TOOM8H_THRESHOLD 537 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 63 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 172 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 63 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 67 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 91 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 20 +#define SQR_TOOM3_THRESHOLD 97 +#define SQR_TOOM4_THRESHOLD 220 +#define SQR_TOOM6_THRESHOLD 318 +#define SQR_TOOM8_THRESHOLD 434 + +#define MULMID_TOOM42_THRESHOLD 20 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 13 + +#define MUL_FFT_MODF_THRESHOLD 348 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 348, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 23, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 29, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 79,10}, { 55,11}, \ + { 31,10}, { 63, 6}, { 1087, 8}, { 303, 9}, \ + { 159,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,11}, { 79,10}, { 159, 9}, \ + { 319,10}, { 167,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207, 9}, { 415,11}, { 111,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271,11}, { 143,10}, { 287, 9}, { 575,10}, \ + { 303,11}, { 159,10}, { 319,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207,10}, { 415,11}, \ + { 223,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 351,12}, { 191,11}, { 415,12}, \ + { 223,11}, { 479,13}, { 127,12}, { 255,11}, \ + { 543,12}, { 287,11}, { 607,12}, { 319,11}, \ + { 639,12}, { 351,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 415,11}, { 831,12}, { 479,14}, \ + { 127,13}, { 255,12}, { 543,11}, { 1087,12}, \ + { 607,13}, { 319,12}, { 703,13}, { 383,12}, \ + { 831,13}, { 447,12}, { 895,14}, { 255,13}, \ + { 511,12}, { 1023,13}, { 575,12}, { 1151,13}, \ + { 639,12}, { 1279,13}, { 703,14}, { 383,13}, \ + { 831,12}, { 1663,13}, { 895,15}, { 255,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1151,14}, \ + { 639,13}, { 1343,12}, { 2687,14}, { 767,13}, \ + { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,14}, { 1279,13}, \ + { 2687,15}, { 767,14}, { 1663,13}, { 3327,16}, \ + { 511,15}, { 1023,14}, { 2175,13}, { 4351,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,15}, { 1535,14}, { 3455,13}, { 6911,15}, \ + { 1791,14}, { 3839,13}, { 7679,16}, { 
1023,15}, \ + { 2047,14}, { 4223,13}, { 8447,14}, { 4479,15}, \ + { 2303,14}, { 4863,15}, { 2559,14}, { 5247,15}, \ + { 2815,14}, { 5887,16}, { 1535,15}, { 3071,14}, \ + { 6271,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,17}, { 1023,16}, { 2047,15}, { 4095,14}, \ + { 8447,15}, { 4351,14}, { 8959,15}, { 4863,16}, \ + { 2559,15}, { 5887,14}, { 11775,16}, { 3071,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 201 +#define MUL_FFT_THRESHOLD 3200 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 12, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 63,10}, { 39, 9}, { 79,10}, { 47,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95, 6}, \ + { 1663, 7}, { 895, 9}, { 239, 8}, { 479,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 135, 9}, \ + { 271,11}, { 79, 9}, { 319,11}, { 95,10}, \ + { 191, 9}, { 383,10}, { 207,11}, { 111,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287, 9}, { 575,10}, \ + { 303, 9}, { 607,10}, { 319, 9}, { 639,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,10}, { 607,11}, { 319,10}, \ + { 639,11}, { 351,10}, { 703,11}, { 367,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ + { 223,11}, { 479,13}, { 127,12}, { 255,11}, \ + { 543,12}, { 287,11}, { 607,12}, { 319,11}, \ + { 639,12}, { 351,11}, { 703,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 479,14}, { 127,13}, { 255,12}, { 607,13}, \ + { 319,12}, { 735,13}, { 383,12}, { 831,13}, \ + { 447,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1023,13}, { 575,12}, { 1151,13}, { 703,14}, \ + { 383,13}, { 831,12}, { 1663,13}, { 895,15}, \ + { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \ + { 1151,14}, { 639,13}, { 1343,12}, { 2687,13}, \ + { 1407,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,14}, { 1279,13}, \ + { 2687,14}, { 1407,15}, { 767,14}, { 1535,13}, \ + { 3199,14}, { 1663,13}, { 3455,16}, { 511,15}, \ + { 1023,14}, { 2175,13}, { 4479,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \ + { 4479,15}, { 2303,14}, { 4991,15}, { 2815,14}, \ + { 5887,16}, { 1535,15}, { 3071,14}, { 6143,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4095,14}, { 8191,15}, \ + { 4351,14}, { 8959,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 208 +#define SQR_FFT_THRESHOLD 2880 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 63 +#define MULLO_MUL_N_THRESHOLD 6253 +#define SQRLO_BASECASE_THRESHOLD 8 +#define SQRLO_DC_THRESHOLD 54 +#define SQRLO_SQR_THRESHOLD 5558 + +#define DC_DIV_QR_THRESHOLD 72 +#define DC_DIVAPPR_Q_THRESHOLD 195 +#define 
DC_BDIV_QR_THRESHOLD 50 +#define DC_BDIV_Q_THRESHOLD 90 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define INV_NEWTON_THRESHOLD 195 +#define INV_APPR_THRESHOLD 197 + +#define BINV_NEWTON_THRESHOLD 230 +#define REDC_1_TO_REDC_2_THRESHOLD 67 +#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ + +#define MU_DIV_QR_THRESHOLD 1334 +#define MU_DIVAPPR_Q_THRESHOLD 1334 +#define MUPI_DIV_QR_THRESHOLD 104 +#define MU_BDIV_QR_THRESHOLD 1017 +#define MU_BDIV_Q_THRESHOLD 1187 + +#define POWM_SEC_TABLE 1,16,194,712,779,2387 + +#define GET_STR_DC_THRESHOLD 15 +#define GET_STR_PRECOMPUTE_THRESHOLD 29 +#define SET_STR_DC_THRESHOLD 216 +#define SET_STR_PRECOMPUTE_THRESHOLD 994 + +#define FAC_DSC_THRESHOLD 153 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD2_DIV1_METHOD 1 /* 9.38% faster than 3 */ +#define HGCD_THRESHOLD 77 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 2121 +#define GCD_DC_THRESHOLD 440 +#define GCDEXT_DC_THRESHOLD 273 +#define JACOBI_BASE_METHOD 1 /* 7.74% faster than 4 */ + +/* Tuneup completed successfully, took 495910 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/cnd_aors_n.asm b/gmp-6.3.0/mpn/x86_64/cnd_aors_n.asm new file mode 100644 index 0000000..13a2ab3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/cnd_aors_n.asm @@ -0,0 +1,183 @@ +dnl AMD64 mpn_cnd_add_n, mpn_cnd_sub_n + +dnl Copyright 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2 +C AMD K10 2 +C AMD bd1 2.32 +C AMD bobcat 3 +C Intel P4 13 +C Intel core2 2.9 +C Intel NHM 2.8 +C Intel SBR 2.4 +C Intel atom 5.33 +C VIA nano 3 + +C NOTES +C * It might seem natural to use the cmov insn here, but since this function +C is supposed to have the exact same execution pattern for cnd true and +C false, and since cmov's documentation is not clear about whether it +C actually reads both source operands and writes the register for a false +C condition, we cannot use it. +C * Two cases could be optimised: (1) cnd_add_n could use ADCSBB-from-memory +C to save one insn/limb, and (2) when up=rp cnd_add_n and cnd_sub_n could use +C ADCSBB-to-memory, again saving 1 insn/limb. +C * This runs optimally at decoder bandwidth on K10. It has not been tuned +C for any other processor. 
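The neg/sbb idiom the notes above lead to is worth spelling out: it expands cnd into an all-zeros or all-ones limb, every vp limb is ANDed with that mask, and the same add/adc sequence then runs whether the condition is true or false. A minimal C model of that data flow follows; it is a sketch only, with the name cnd_add_n_model illustrative rather than GMP's entry point, and mp_limb_t/mp_size_t taken from gmp.h.

    #include <gmp.h>  /* mp_limb_t, mp_size_t */

    /* Sketch of the masking trick: mask is 0 or ~0 (the asm's neg+sbb
       pair), so vp[i] & mask contributes nothing when cnd is false, yet
       the identical instruction stream executes in both cases. */
    static mp_limb_t
    cnd_add_n_model (mp_limb_t cnd, mp_limb_t *rp, const mp_limb_t *up,
                     const mp_limb_t *vp, mp_size_t n)
    {
      mp_limb_t mask = -(mp_limb_t) (cnd != 0);
      mp_limb_t cy = 0;
      mp_size_t i;
      for (i = 0; i < n; i++)
        {
          mp_limb_t v = vp[i] & mask;   /* masked operand */
          mp_limb_t s = up[i] + v;      /* ADDSUB */
          mp_limb_t c = s < v;          /* carry out of the add */
          s += cy;                      /* fold the saved carry back in */
          cy = c | (s < cy);
          rp[i] = s;
        }
      return cy;
    }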
+ +C INPUT PARAMETERS +define(`cnd', `%rdi') dnl rcx +define(`rp', `%rsi') dnl rdx +define(`up', `%rdx') dnl r8 +define(`vp', `%rcx') dnl r9 +define(`n', `%r8') dnl rsp+40 + +ifdef(`OPERATION_cnd_add_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_cnd_add_n)') +ifdef(`OPERATION_cnd_sub_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_cnd_sub_n)') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), R32(%r8)') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + neg cnd + sbb cnd, cnd C make cnd mask + + lea (vp,n,8), vp + lea (up,n,8), up + lea (rp,n,8), rp + + mov R32(n), R32(%rax) + neg n + and $3, R32(%rax) + jz L(top) C carry-save reg rax = 0 in this arc + cmp $2, R32(%rax) + jc L(b1) + jz L(b2) + +L(b3): mov (vp,n,8), %r12 + mov 8(vp,n,8), %r13 + mov 16(vp,n,8), %r14 + and cnd, %r12 + mov (up,n,8), %r10 + and cnd, %r13 + mov 8(up,n,8), %rbx + and cnd, %r14 + mov 16(up,n,8), %rbp + ADDSUB %r12, %r10 + mov %r10, (rp,n,8) + ADCSBB %r13, %rbx + mov %rbx, 8(rp,n,8) + ADCSBB %r14, %rbp + mov %rbp, 16(rp,n,8) + sbb R32(%rax), R32(%rax) C save carry + add $3, n + js L(top) + jmp L(end) + +L(b2): mov (vp,n,8), %r12 + mov 8(vp,n,8), %r13 + mov (up,n,8), %r10 + and cnd, %r12 + mov 8(up,n,8), %rbx + and cnd, %r13 + ADDSUB %r12, %r10 + mov %r10, (rp,n,8) + ADCSBB %r13, %rbx + mov %rbx, 8(rp,n,8) + sbb R32(%rax), R32(%rax) C save carry + add $2, n + js L(top) + jmp L(end) + +L(b1): mov (vp,n,8), %r12 + mov (up,n,8), %r10 + and cnd, %r12 + ADDSUB %r12, %r10 + mov %r10, (rp,n,8) + sbb R32(%rax), R32(%rax) C save carry + add $1, n + jns L(end) + + ALIGN(16) +L(top): mov (vp,n,8), %r12 + mov 8(vp,n,8), %r13 + mov 16(vp,n,8), %r14 + mov 24(vp,n,8), %r11 + and cnd, %r12 + mov (up,n,8), %r10 + and cnd, %r13 + mov 8(up,n,8), %rbx + and cnd, %r14 + mov 16(up,n,8), %rbp + and cnd, %r11 + mov 24(up,n,8), %r9 + add R32(%rax), R32(%rax) C restore carry + ADCSBB %r12, %r10 + mov %r10, (rp,n,8) + ADCSBB %r13, %rbx + mov %rbx, 8(rp,n,8) + ADCSBB %r14, %rbp + mov %rbp, 16(rp,n,8) + ADCSBB %r11, %r9 + mov %r9, 24(rp,n,8) + sbb R32(%rax), R32(%rax) C save carry + add $4, n + js L(top) + +L(end): neg R32(%rax) + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/com.asm b/gmp-6.3.0/mpn/x86_64/com.asm new file mode 100644 index 0000000..006acaf --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/com.asm @@ -0,0 +1,95 @@ +dnl AMD64 mpn_com. + +dnl Copyright 2004-2006, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 1.25 +C AMD K10 1.25 +C Intel P4 2.78 +C Intel core2 1.1 +C Intel corei 1.5 +C Intel atom ? +C VIA nano 2 + +C INPUT PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`n',`%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_com) + FUNC_ENTRY(3) + movq (up), %r8 + movl R32(%rdx), R32(%rax) + leaq (up,n,8), up + leaq (rp,n,8), rp + negq n + andl $3, R32(%rax) + je L(b00) + cmpl $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): notq %r8 + movq %r8, (rp,n,8) + decq n + jmp L(e11) +L(b10): addq $-2, n + jmp L(e10) + .byte 0x90,0x90,0x90,0x90,0x90,0x90 +L(b01): notq %r8 + movq %r8, (rp,n,8) + incq n + jz L(ret) + +L(oop): movq (up,n,8), %r8 +L(b00): movq 8(up,n,8), %r9 + notq %r8 + notq %r9 + movq %r8, (rp,n,8) + movq %r9, 8(rp,n,8) +L(e11): movq 16(up,n,8), %r8 +L(e10): movq 24(up,n,8), %r9 + notq %r8 + notq %r9 + movq %r8, 16(rp,n,8) + movq %r9, 24(rp,n,8) + addq $4, n + jnc L(oop) +L(ret): FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/copyd.asm b/gmp-6.3.0/mpn/x86_64/copyd.asm new file mode 100644 index 0000000..a5e6e59 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/copyd.asm @@ -0,0 +1,93 @@ +dnl AMD64 mpn_copyd -- copy limb vector, decrementing. + +dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
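For reference, the effect of mpn_com above is simply a limbwise one's complement; everything else in that file is the four-way unrolling with a computed entry point for n mod 4 (the branches to L(b11)/L(b10)/L(b01)). A C sketch of the semantics, with types from gmp.h and the model name illustrative:

    #include <gmp.h>

    /* What mpn_com computes; the asm merely unrolls this by four. */
    static void
    com_model (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
    {
      mp_size_t i;
      for (i = 0; i < n; i++)
        rp[i] = ~up[i];
    }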
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 1 +C AMD K10 1 +C AMD bd1 1.36 +C AMD bobcat 1.71 +C Intel P4 2-3 +C Intel core2 1 +C Intel NHM 1 +C Intel SBR 1 +C Intel atom 2 +C VIA nano 2 + + +IFSTD(`define(`rp',`%rdi')') +IFSTD(`define(`up',`%rsi')') +IFSTD(`define(`n', `%rdx')') + +IFDOS(`define(`rp',`%rcx')') +IFDOS(`define(`up',`%rdx')') +IFDOS(`define(`n', `%r8')') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_copyd) + lea -8(up,n,8), up + lea (rp,n,8), rp + sub $4, n + jc L(end) + nop + +L(top): mov (up), %rax + mov -8(up), %r9 + lea -32(rp), rp + mov -16(up), %r10 + mov -24(up), %r11 + lea -32(up), up + mov %rax, 24(rp) + mov %r9, 16(rp) + sub $4, n + mov %r10, 8(rp) + mov %r11, (rp) + jnc L(top) + +L(end): shr R32(n) + jnc 1f + mov (up), %rax + mov %rax, -8(rp) + lea -8(rp), rp + lea -8(up), up +1: shr R32(n) + jnc 1f + mov (up), %rax + mov -8(up), %r9 + mov %rax, -8(rp) + mov %r9, -16(rp) +1: ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/copyi.asm b/gmp-6.3.0/mpn/x86_64/copyi.asm new file mode 100644 index 0000000..bafce7a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/copyi.asm @@ -0,0 +1,92 @@ +dnl AMD64 mpn_copyi -- copy limb vector, incrementing. + +dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
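mpn_copyd above and mpn_copyi below differ only in direction: copying from the high end down is the safe order when the regions overlap with rp above up, and low-to-high when rp sits below up. The tail handling is the neat part: after the unrolled-by-4 loop, the two low bits of the counter still equal n mod 4, so each shr R32(n) pops one of them into the carry flag and jnc skips the corresponding one-limb or two-limb copy. A C sketch of the decrementing variant, types from gmp.h:

    #include <gmp.h>

    /* mpn_copyd semantics: copy n limbs high-to-low (overlap-safe when
       rp >= up); the asm adds 4-way unrolling plus the shr/jnc tail. */
    static void
    copyd_model (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
    {
      mp_size_t i;
      for (i = n; i-- > 0;)
        rp[i] = up[i];
    }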
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 1 +C AMD K10 1 +C AMD bd1 1.36 +C AMD bobcat 1.71 +C Intel P4 2-3 +C Intel core2 1 +C Intel NHM 1 +C Intel SBR 1 +C Intel atom 2 +C VIA nano 2 + + +IFSTD(`define(`rp',`%rdi')') +IFSTD(`define(`up',`%rsi')') +IFSTD(`define(`n', `%rdx')') + +IFDOS(`define(`rp',`%rcx')') +IFDOS(`define(`up',`%rdx')') +IFDOS(`define(`n', `%r8')') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) + .byte 0,0,0,0,0,0 +PROLOGUE(mpn_copyi) + lea -8(rp), rp + sub $4, n + jc L(end) + +L(top): mov (up), %rax + mov 8(up), %r9 + lea 32(rp), rp + mov 16(up), %r10 + mov 24(up), %r11 + lea 32(up), up + mov %rax, -24(rp) + mov %r9, -16(rp) + sub $4, n + mov %r10, -8(rp) + mov %r11, (rp) + jnc L(top) + +L(end): shr R32(n) + jnc 1f + mov (up), %rax + mov %rax, 8(rp) + lea 8(rp), rp + lea 8(up), up +1: shr R32(n) + jnc 1f + mov (up), %rax + mov 8(up), %r9 + mov %rax, 8(rp) + mov %r9, 16(rp) +1: ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm new file mode 100644 index 0000000..7066bb4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh1_n.asm @@ -0,0 +1,53 @@ +dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1) +dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[] + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 1) +define(RSH, 63) + +ifdef(`OPERATION_addlsh1_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_addlsh1_n)') +ifdef(`OPERATION_rsblsh1_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_rsblsh1_n)') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/aorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm new file mode 100644 index 0000000..5065120 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh2_n.asm @@ -0,0 +1,53 @@ +dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2) +dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[] + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 2) +define(RSH, 62) + +ifdef(`OPERATION_addlsh2_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_addlsh2_n)') +ifdef(`OPERATION_rsblsh2_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_rsblsh2_n)') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/aorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm new file mode 100644 index 0000000..57abf31 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/aorrlsh_n.asm @@ -0,0 +1,38 @@ +dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V2^k +- U. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/coreinhm/aorrlsh_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm new file mode 100644 index 0000000..3f875ae --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/aors_err1_n.asm @@ -0,0 +1,225 @@ +dnl Core 2 mpn_add_err1_n, mpn_sub_err1_n + +dnl Contributed by David Harvey. + +dnl Copyright 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 4.14 +C Intel corei ? +C Intel atom ? +C VIA nano ? + + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`ep', `%rcx') +define(`yp', `%r8') +define(`n', `%r9') +define(`cy_param', `8(%rsp)') + +define(`el', `%rbx') +define(`eh', `%rbp') +define(`t0', `%r10') +define(`t1', `%r11') +define(`t2', `%r12') +define(`t3', `%r13') +define(`w0', `%r14') +define(`w1', `%r15') + +ifdef(`OPERATION_add_err1_n', ` + define(ADCSBB, adc) + define(func, mpn_add_err1_n)') +ifdef(`OPERATION_sub_err1_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_err1_n)') + +MULFUNC_PROLOGUE(mpn_add_err1_n mpn_sub_err1_n) + + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + mov cy_param, %rax + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + lea (up,n,8), up + lea (vp,n,8), vp + lea (rp,n,8), rp + + mov R32(n), R32(%r10) + and $3, R32(%r10) + jz L(0mod4) + cmp $2, R32(%r10) + jc L(1mod4) + jz L(2mod4) +L(3mod4): + xor R32(el), R32(el) + xor R32(eh), R32(eh) + xor R32(t0), R32(t0) + xor R32(t1), R32(t1) + lea -24(yp,n,8), yp + neg n + + shr $1, %al C restore carry + mov (up,n,8), w0 + mov 8(up,n,8), w1 + ADCSBB (vp,n,8), w0 + mov w0, (rp,n,8) + cmovc 16(yp), el + ADCSBB 8(vp,n,8), w1 + mov w1, 8(rp,n,8) + cmovc 8(yp), t0 + mov 16(up,n,8), w0 + ADCSBB 16(vp,n,8), w0 + mov w0, 16(rp,n,8) + cmovc (yp), t1 + setc %al C save carry + add t0, el + adc $0, eh + add t1, el + adc $0, eh + + add $3, n + jnz L(loop) + jmp L(end) + + ALIGN(16) +L(0mod4): + xor R32(el), R32(el) + xor R32(eh), R32(eh) + lea (yp,n,8), yp + neg n + jmp L(loop) + + ALIGN(16) +L(1mod4): + xor R32(el), R32(el) + xor R32(eh), R32(eh) + lea -8(yp,n,8), yp + neg n + + shr $1, %al C restore carry + mov (up,n,8), w0 + ADCSBB (vp,n,8), w0 + mov w0, (rp,n,8) + cmovc (yp), el + setc %al C save carry + + add $1, n + jnz L(loop) + jmp L(end) + + ALIGN(16) +L(2mod4): + xor R32(el), R32(el) + xor R32(eh), R32(eh) + xor R32(t0), R32(t0) + lea -16(yp,n,8), yp + neg n + + shr $1, %al C restore carry + mov (up,n,8), w0 + mov 8(up,n,8), w1 + ADCSBB (vp,n,8), w0 + mov w0, (rp,n,8) + cmovc 8(yp), el + ADCSBB 8(vp,n,8), w1 + mov w1, 8(rp,n,8) + cmovc (yp), t0 + setc %al C save carry + add t0, el + adc $0, eh + + add $2, n + jnz L(loop) + jmp L(end) + + ALIGN(32) +L(loop): + mov (up,n,8), w0 + shr $1, %al C restore carry + mov -8(yp), t0 + mov $0, R32(t3) + ADCSBB (vp,n,8), w0 + cmovnc t3, t0 + mov w0, (rp,n,8) + mov 8(up,n,8), w1 + mov 
16(up,n,8), w0 + ADCSBB 8(vp,n,8), w1 + mov -16(yp), t1 + cmovnc t3, t1 + mov -24(yp), t2 + mov w1, 8(rp,n,8) + ADCSBB 16(vp,n,8), w0 + cmovnc t3, t2 + mov 24(up,n,8), w1 + ADCSBB 24(vp,n,8), w1 + cmovc -32(yp), t3 + setc %al C save carry + add t0, el + adc $0, eh + add t1, el + adc $0, eh + add t2, el + adc $0, eh + lea -32(yp), yp + mov w0, 16(rp,n,8) + add t3, el + adc $0, eh + add $4, n + mov w1, -8(rp,n,8) + jnz L(loop) + +L(end): + mov el, (ep) + mov eh, 8(ep) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/aors_n.asm b/gmp-6.3.0/mpn/x86_64/core2/aors_n.asm new file mode 100644 index 0000000..f9e0039 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/aors_n.asm @@ -0,0 +1,150 @@ +dnl Intel mpn_add_n/mpn_sub_n optimised for Conroe, Nehalem. + +dnl Copyright 2006, 2007, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
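The error-term loop above is easier to follow against GMP's generic add_err1_n: alongside rp = up + vp + cy, a two-limb accumulator picks up yp[n-1-i] for every limb position i that generates a carry, which is what the cmovc's from the descending yp pointer implement. A C sketch mirroring that generic version (types from gmp.h, model name illustrative):

    #include <gmp.h>

    /* rp = up + vp + cy; (ep[1],ep[0]) accumulates yp[n-1-i] for each
       position i whose addition carries.  Returns the final carry. */
    static mp_limb_t
    add_err1_n_model (mp_limb_t *rp, const mp_limb_t *up,
                      const mp_limb_t *vp, mp_limb_t *ep,
                      const mp_limb_t *yp, mp_size_t n, mp_limb_t cy)
    {
      mp_limb_t el = 0, eh = 0, s, c, z;
      mp_size_t i;
      for (i = 0; i < n; i++)
        {
          s = up[i] + vp[i];
          c = s < up[i];
          s += cy;
          cy = c | (s < cy);
          rp[i] = s;
          z = (-cy) & yp[n - 1 - i];  /* the asm's cmovc selection */
          el += z;
          eh += el < z;               /* 128-bit accumulate */
        }
      ep[0] = el;
      ep[1] = eh;
      return cy;
    }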
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2 +C AMD K10 1.93\2 +C AMD bull 1.62\2.1 +C AMD pile 1.6\1.7 +C AMD steam +C AMD excavator +C AMD bobcat 2.79 +C AMD jaguar 2.54 +C Intel P4 10 +C Intel core2 2 +C Intel NHM 2 +C Intel SBR 2 +C Intel IBR 1.95 +C Intel HWL 1.72 +C Intel BWL 1.54 +C Intel SKL 1.52 +C Intel atom 9 +C Intel SLM 6.5 +C VIA nano 3 + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cy', `%r8') + +ifdef(`OPERATION_add_n', ` + define(ADCSBB, adc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + xor %r8, %r8 +L(start): + mov (up), %r10 + mov (vp), %r11 + + lea (up,n,8), up + lea (vp,n,8), vp + lea (rp,n,8), rp + mov R32(n), R32(%rax) + neg n + and $3, R32(%rax) + je L(b00) + add %rax, n C clear low rcx bits for jrcxz + cmp $2, R32(%rax) + jl L(b01) + je L(b10) + +L(b11): neg %r8 C set cy + jmp L(e11) + +L(b00): neg %r8 C set cy + mov %r10, %r8 + mov %r11, %r9 + lea 4(n), n + jmp L(e00) + + nop + nop + nop +L(b01): neg %r8 C set cy + jmp L(top) + +L(b10): neg %r8 C set cy + mov %r10, %r8 + mov %r11, %r9 + jmp L(e10) + +L(end): ADCSBB %r11, %r10 + mov %r10, -8(rp) + mov R32(%rcx), R32(%rax) C clear eax, ecx contains 0 + adc R32(%rax), R32(%rax) + FUNC_EXIT() + ret + + ALIGN(16) +L(top): jrcxz L(end) + mov (up,n,8), %r8 + mov (vp,n,8), %r9 + lea 4(n), n + ADCSBB %r11, %r10 + mov %r10, -40(rp,n,8) +L(e00): mov -24(up,n,8), %r10 + mov -24(vp,n,8), %r11 + ADCSBB %r9, %r8 + mov %r8, -32(rp,n,8) +L(e11): mov -16(up,n,8), %r8 + mov -16(vp,n,8), %r9 + ADCSBB %r11, %r10 + mov %r10, -24(rp,n,8) +L(e10): mov -8(up,n,8), %r10 + mov -8(vp,n,8), %r11 + ADCSBB %r9, %r8 + mov %r8, -16(rp,n,8) + jmp L(top) +EPILOGUE() + +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + jmp L(start) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm new file mode 100644 index 0000000..a7a5d6e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/aorsmul_1.asm @@ -0,0 +1,188 @@ +dnl x86-64 mpn_addmul_1 and mpn_submul_1, optimized for "Core 2". + +dnl Copyright 2003-2005, 2007-2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
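One detail of the add_n/sub_n loop above deserves a note: the carry flag stays live from one iteration to the next, so the loop cannot use sub/jnz for control; instead it steps n with lea (which leaves flags alone) and tests for termination with jrcxz, which likewise preserves the carry. The semantics are plain limbwise addition with carry, mpn_add_n being mpn_add_nc with a zero carry-in (the xor %r8,%r8 entry). A C sketch, types from gmp.h:

    #include <gmp.h>

    /* mpn_add_nc semantics; mpn_add_n is the cy == 0 case. */
    static mp_limb_t
    add_nc_model (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
                  mp_size_t n, mp_limb_t cy)
    {
      mp_limb_t s, c;
      mp_size_t i;
      for (i = 0; i < n; i++)
        {
          s = up[i] + vp[i];
          c = s < up[i];
          s += cy;
          cy = c | (s < cy);   /* what adc tracks in hardware */
          rp[i] = s;
        }
      return cy;
    }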
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.52 +C AMD K10 4.01 +C AMD bull 4.98 +C AMD pile 4.83 +C AMD steam +C AMD excavator +C AMD bobcat 5.56 +C AMD jaguar 5.54 +C Intel P4 16.3 17.3 +C Intel core2 4.32 4.61 +C Intel NHM 5.08 +C Intel SBR 4.04 +C Intel IBR 3.95 +C Intel HWL 3.66 +C Intel BWL 2.87 +C Intel SKL 2.79 +C Intel atom 20.6 +C Intel SLM 7.6 +C VIA nano 5.25 + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`v0', `%rcx') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') + define(`func_1c', `mpn_addmul_1c') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') + define(`func_1c', `mpn_submul_1c') +') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + + C For DOS, on the stack we have four saved registers, return address, + C space for four register arguments, and finally the carry input. + +IFDOS(` define(`carry_in', `72(%rsp)')') dnl +IFSTD(` define(`carry_in', `%r8')') dnl + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_1c) + FUNC_ENTRY(4) + push %rbx + push %rbp + lea (%rdx), %rbx + neg %rbx + + mov (up), %rax + mov (rp), %r10 + + lea -16(rp,%rdx,8), rp + lea (up,%rdx,8), up + mul %rcx + add carry_in, %rax + adc $0, %rdx + jmp L(start_nc) +EPILOGUE() + + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + push %rbx + push %rbp + lea (%rdx), %rbx + neg %rbx + + mov (up), %rax + mov (rp), %r10 + + lea -16(rp,%rdx,8), rp + lea (up,%rdx,8), up + mul %rcx + +L(start_nc): + test $1, R8(%rbx) + jnz L(odd) + + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + lea (%rdx), %rbp + mul %rcx + add $2, %rbx + jz L(n2) + + lea (%rax), %r8 + mov (up,%rbx,8), %rax + lea (%rdx), %r9 + jmp L(mid) + + ALIGN(8) +L(odd): inc %rbx + jz L(n1) + + lea (%rax), %r8 + mov (up,%rbx,8), %rax + lea (%rdx), %r9 + mul %rcx + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + lea (%rdx), %rbp + jmp L(e) + + ALIGN(16) +L(top): mul %rcx + ADDSUB %r8, %r10 + lea (%rax), %r8 + mov (up,%rbx,8), %rax + adc %r9, %r11 + mov %r10, -8(rp,%rbx,8) + mov (rp,%rbx,8), %r10 + lea (%rdx), %r9 + adc $0, %rbp +L(mid): mul %rcx + ADDSUB %r11, %r10 + lea (%rax), %r11 + mov 8(up,%rbx,8), %rax + adc %rbp, %r8 + mov %r10, (rp,%rbx,8) + mov 8(rp,%rbx,8), %r10 + lea (%rdx), %rbp + adc $0, %r9 +L(e): add $2, %rbx + js L(top) + + mul %rcx + ADDSUB %r8, %r10 + adc %r9, %r11 + mov %r10, -8(rp) + adc %rbx, %rbp C rbx = 0 +L(n2): mov (rp), %r10 + ADDSUB %r11, %r10 + adc %rbp, %rax + mov %r10, (rp) + adc %rbx, %rdx C rbx = 0 +L(n1): mov 8(rp), %r10 + ADDSUB %rax, %r10 + mov %r10, 8(rp) + mov R32(%rbx), R32(%rax) C rbx = 0 + adc %rdx, %rax + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/com.asm b/gmp-6.3.0/mpn/x86_64/core2/com.asm new file mode 100644 index 0000000..d7d9f79 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/com.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_com. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_com) +include_mpn(`x86_64/fastsse/com-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/copyd.asm b/gmp-6.3.0/mpn/x86_64/core2/copyd.asm new file mode 100644 index 0000000..57ea0e5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/copyd.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyd. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyd) +include_mpn(`x86_64/fastsse/copyd-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/copyi.asm b/gmp-6.3.0/mpn/x86_64/core2/copyi.asm new file mode 100644 index 0000000..f0c7607 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/copyi.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyi. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyi) +include_mpn(`x86_64/fastsse/copyi-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm b/gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm new file mode 100644 index 0000000..1b3f139 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/divrem_1.asm @@ -0,0 +1,243 @@ +dnl x86-64 mpn_divrem_1 -- mpn by limb division. + +dnl Copyright 2004, 2005, 2007-2010, 2012, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C norm unorm frac +C AMD K8,K9 15 15 12 +C AMD K10 15 15 12 +C Intel P4 44 44 43 +C Intel core2 24 24 19.5 +C Intel corei 19 19 18 +C Intel atom 51 51 36 +C VIA nano 46 44 22.5 + +C mp_limb_t +C mpn_divrem_1 (mp_ptr qp, mp_size_t fn, +C mp_srcptr np, mp_size_t nn, mp_limb_t d) + +C mp_limb_t +C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn, +C mp_srcptr np, mp_size_t nn, mp_limb_t d, +C mp_limb_t dinv, int cnt) + +C INPUT PARAMETERS +define(`qp', `%rdi') +define(`fn_param', `%rsi') +define(`up_param', `%rdx') +define(`un_param', `%rcx') +define(`d', `%r8') +define(`dinv', `%r9') C only for mpn_preinv_divrem_1 +C shift passed on stack C only for mpn_preinv_divrem_1 + +define(`cnt', `%rcx') +define(`up', `%rsi') +define(`fn', `%r12') +define(`un', `%rbx') + + +C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15 +C cnt qp d dinv + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +IFSTD(`define(`CNTOFF', `40($1)')') +IFDOS(`define(`CNTOFF', `104($1)')') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_preinv_divrem_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +IFDOS(` mov 64(%rsp), %r9 ') + xor R32(%rax), R32(%rax) + push %r13 + push %r12 + push %rbp + push %rbx + + mov fn_param, fn + mov un_param, un + add fn_param, un_param + mov up_param, up + + lea -8(qp,un_param,8), qp + + mov CNTOFF(%rsp), R8(cnt) + shl R8(cnt), d + jmp L(ent) +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_divrem_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + xor R32(%rax), R32(%rax) + push %r13 + push %r12 + push %rbp + push %rbx + + mov fn_param, fn + mov un_param, un + add fn_param, un_param + mov up_param, up + je L(ret) + + lea -8(qp,un_param,8), qp + xor R32(%rbp), R32(%rbp) + +L(unnormalized): + test un, un + je L(44) + mov -8(up,un,8), %rax + cmp d, %rax + jae L(44) + mov %rbp, 
(qp) + mov %rax, %rbp + lea -8(qp), qp + je L(ret) + dec un +L(44): + bsr d, %rcx + not R32(%rcx) + sal R8(%rcx), d + sal R8(%rcx), %rbp + + push %rcx +IFSTD(` push %rdi ') +IFSTD(` push %rsi ') + push %r8 +IFSTD(` sub $8, %rsp ') +IFSTD(` mov d, %rdi ') +IFDOS(` sub $40, %rsp ') +IFDOS(` mov d, %rcx ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_invert_limb) +IFSTD(` add $8, %rsp ') +IFDOS(` add $40, %rsp ') + pop %r8 +IFSTD(` pop %rsi ') +IFSTD(` pop %rdi ') + pop %rcx + + mov %rax, dinv + mov %rbp, %rax + test un, un + je L(frac) + +L(ent): mov -8(up,un,8), %rbp + shr R8(%rcx), %rax + shld R8(%rcx), %rbp, %rax + sub $2, un + js L(end) + + ALIGN(16) +L(top): lea 1(%rax), %r11 + mul dinv + mov (up,un,8), %r10 + shld R8(%rcx), %r10, %rbp + mov %rbp, %r13 + add %rax, %r13 + adc %r11, %rdx + mov %rdx, %r11 + imul d, %rdx + sub %rdx, %rbp + lea (d,%rbp), %rax + sub $8, qp + cmp %r13, %rbp + cmovc %rbp, %rax + adc $-1, %r11 + cmp d, %rax + jae L(ufx) +L(uok): dec un + mov %r11, 8(qp) + mov %r10, %rbp + jns L(top) + +L(end): lea 1(%rax), %r11 + sal R8(%rcx), %rbp + mul dinv + add %rbp, %rax + adc %r11, %rdx + mov %rax, %r11 + mov %rdx, %r13 + imul d, %rdx + sub %rdx, %rbp + mov d, %rax + add %rbp, %rax + cmp %r11, %rbp + cmovc %rbp, %rax + adc $-1, %r13 + cmp d, %rax + jae L(efx) +L(eok): mov %r13, (qp) + sub $8, qp + jmp L(frac) + +L(ufx): sub d, %rax + inc %r11 + jmp L(uok) +L(efx): sub d, %rax + inc %r13 + jmp L(eok) + +L(frac):mov d, %rbp + neg %rbp + jmp L(fent) + + ALIGN(16) C K8-K10 P6-CNR P6-NHM P4 +L(ftop):mul dinv C 0,12 0,17 0,17 + add %r11, %rdx C 5 8 10 + mov %rax, %r11 C 4 8 3 + mov %rdx, %r13 C 6 9 11 + imul %rbp, %rdx C 6 9 11 + mov d, %rax C + add %rdx, %rax C 10 14 14 + cmp %r11, %rdx C 10 14 14 + cmovc %rdx, %rax C 11 15 15 + adc $-1, %r13 C + mov %r13, (qp) C + sub $8, qp C +L(fent):lea 1(%rax), %r11 C + dec fn C + jns L(ftop) C + + shr R8(%rcx), %rax +L(ret): pop %rbx + pop %rbp + pop %r12 + pop %r13 + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm new file mode 100644 index 0000000..b00451f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/gcd_11.asm @@ -0,0 +1,93 @@ +dnl AMD64 mpn_gcd_11 optimised for Intel CNR, PNR, SBR, IBR. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017, 2019 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C AMD K8,K9 ? 
+C AMD K10 ? +C AMD bd1 ? +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD bt1 ? +C AMD bt2 ? +C AMD zn1 ? +C AMD zn2 ? +C Intel P4 ? +C Intel CNR 4.22 * +C Intel PNR 4.22 * +C Intel NHM 4.97 +C Intel WSM 5.17 +C Intel SBR 4.83 * +C Intel IBR 4.16 * +C Intel HWL 3.84 +C Intel BWL 3.76 +C Intel SKL 3.83 +C Intel atom ? +C Intel SLM ? +C Intel GLM ? +C Intel GLM+ ? +C VIA nano ? + +define(`u0', `%rdi') +define(`v0', `%rsi') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_11) + FUNC_ENTRY(2) + jmp L(odd) + + ALIGN(16) +L(top): cmovc %rdx, u0 C u = |u - v| + cmovc %rax, v0 C v = min(u,v) + shr R8(%rcx), u0 +L(odd): mov v0, %rdx + sub u0, %rdx C v - u + bsf %rdx, %rcx + mov u0, %rax + sub v0, u0 C u - v + jnz L(top) + +L(end): C rax = result + C rdx = 0 for the benefit of internal gcd_22 call + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm new file mode 100644 index 0000000..b5aa73b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/gcd_22.asm @@ -0,0 +1,137 @@ +dnl AMD64 mpn_gcd_22. Assumes useful bsf, useful shrd, no tzcnt, no shlx. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 ? +C AMD K10 ? +C AMD bd1 ? +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD bt1 ? +C AMD bt2 ? +C AMD zn1 ? +C AMD zn2 ? +C Intel P4 ? +C Intel CNR 8.7 +C Intel PNR 8.7 +C Intel NHM 9.2 +C Intel WSM 9.2 +C Intel SBR 9.1 +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom ? +C Intel SLM ? +C Intel GLM ? +C Intel GLM+ ? +C VIA nano ? 
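The gcd_11 loop above is a branch-light binary GCD: with both operands odd and nonzero (GMP's internal precondition), u - v is even whenever u != v, so each round replaces u by |u - v| stripped of its trailing zeros (the bsf/shr pair) and v by min(u, v), with cmov's doing the selection instead of branches. mpn_gcd_22, whose preamble follows, extends the same iteration to two-limb operands. A C sketch, assuming a GCC-style __builtin_ctzl stands in for bsf:

    #include <gmp.h>

    /* Binary GCD for single odd nonzero limbs, as in the loop above. */
    static mp_limb_t
    gcd_11_model (mp_limb_t u, mp_limb_t v)
    {
      while (u != v)
        {
          mp_limb_t d = (u > v) ? u - v : v - u;  /* |u - v|, even */
          if (v > u)
            v = u;                                /* v = min(u, v) */
          u = d >> __builtin_ctzl (d);            /* bsf + shr */
        }
      return u;
    }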
+ + +define(`u1', `%rdi') +define(`u0', `%rsi') +define(`v1', `%rdx') +define(`v0_param', `%rcx') + +define(`v0', `%rax') +define(`cnt', `%rcx') + +define(`s0', `%r8') +define(`s1', `%r9') +define(`t0', `%r10') +define(`t1', `%r11') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_22) + FUNC_ENTRY(4) + mov v0_param, v0 + + ALIGN(16) +L(top): mov v0, t0 + sub u0, t0 + jz L(lowz) C jump when low limb result = 0 + mov v1, t1 + sbb u1, t1 + + mov u0, s0 + mov u1, s1 + + bsf t0, cnt + + sub v0, u0 + sbb v1, u1 + +L(bck): cmovc t0, u0 C u = |u - v| + cmovc t1, u1 C u = |u - v| + cmovc s0, v0 C v = min(u,v) + cmovc s1, v1 C v = min(u,v) + + shrd R8(cnt), u1, u0 + shr R8(cnt), u1 + + mov v1, t1 + or u1, t1 + jnz L(top) + +L(gcd_11): + mov v0, %rdi +C mov u0, %rsi + TCALL( mpn_gcd_11) + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + mov v1, t0 + sub u1, t0 + je L(end) + + xor t1, t1 + mov u0, s0 + mov u1, s1 + bsf t0, cnt + mov u1, u0 + xor u1, u1 + sub v1, u0 + jmp L(bck) + +L(end): C mov v0, %rax + C mov v1, %rdx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h new file mode 100644 index 0000000..44f1494 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/gmp-mparam.h @@ -0,0 +1,222 @@ +/* Core 2 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3000 MHz Penryn */ +/* FFT tuning limit = 116,220,984 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 3 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 16 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 26 + +#define DIV_1_VS_MUL_1_PERCENT 284 + +#define MUL_TOOM22_THRESHOLD 24 +#define MUL_TOOM33_THRESHOLD 65 +#define MUL_TOOM44_THRESHOLD 184 +#define MUL_TOOM6H_THRESHOLD 256 +#define MUL_TOOM8H_THRESHOLD 381 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 79 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 28 +#define SQR_TOOM3_THRESHOLD 102 +#define SQR_TOOM4_THRESHOLD 160 +#define SQR_TOOM6_THRESHOLD 366 +#define SQR_TOOM8_THRESHOLD 478 + +#define MULMID_TOOM42_THRESHOLD 32 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 368 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 368, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 10, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \ + { 25, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 19, 7}, { 39, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 83,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 79,11}, { 47,10}, { 95,12}, { 31, 9}, \ + { 255,10}, { 135,11}, { 79,10}, { 159, 9}, \ + { 319,11}, { 95,10}, { 191, 9}, { 383,11}, \ + { 111,12}, { 63,11}, { 127,10}, { 271,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319,12}, { 95,11}, { 191,10}, \ + { 383,11}, { 207,10}, { 415,13}, { 63,12}, \ + { 127,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 319,10}, { 639,11}, { 351,12}, \ + { 191,11}, { 415,12}, { 223,11}, { 479,13}, \ + { 127,12}, { 255,11}, { 543,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 639,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 575,13}, { 319,12}, { 703,13}, \ + { 383,12}, { 799,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \ + { 1151,13}, { 703,14}, { 383,13}, { 831,12}, \ + { 1663,13}, { 959,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,14}, { 767,13}, \ + { 1663,14}, { 895,13}, { 1791,15}, { 511,14}, \ + { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2559,14}, { 1407,13}, \ + { 2815,15}, { 767,14}, { 1663,13}, { 3455,12}, \ + { 6911,14}, { 1791,16}, { 511,15}, { 1023,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,12}, { 11775,15}, { 1535,14}, { 3455,13}, \ + { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \ + { 1023,15}, { 2047,14}, { 4223,15}, { 2303,14}, \ + { 4991,15}, { 2815,14}, { 5887,13}, { 11775,16}, \ + { 1535,15}, { 
3327,14}, { 6911,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 176 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 308, 5}, { 17, 6}, { 23, 7}, { 12, 6}, \ + { 25, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 33, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 63,10}, { 39, 9}, { 79,10}, { 47,11}, \ + { 31,10}, { 79,11}, { 47,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255,11}, { 79,10}, \ + { 159, 6}, { 2559, 7}, { 1343, 6}, { 2687, 7}, \ + { 1407, 9}, { 383,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,11}, \ + { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319,11}, { 175,12}, { 95,11}, { 191,10}, \ + { 383,11}, { 207,10}, { 415,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543,11}, { 287,10}, { 575,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 351,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,12}, { 223,11}, \ + { 479,13}, { 127,12}, { 255,11}, { 543,12}, \ + { 287,11}, { 575,12}, { 319,11}, { 639,12}, \ + { 351,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 479,14}, { 127,13}, { 255,12}, { 575,13}, \ + { 319,12}, { 703,13}, { 383,12}, { 799,13}, \ + { 447,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1023,13}, { 575,12}, { 1151,13}, { 639,12}, \ + { 1279,13}, { 703,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 959,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1407,14}, { 767,13}, \ + { 1599,12}, { 3199,13}, { 1663,14}, { 895,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2303,12}, { 4607,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \ + { 6911,16}, { 511,15}, { 1023,14}, { 2303,13}, \ + { 4607,14}, { 2431,13}, { 4863,15}, { 1279,14}, \ + { 2943,13}, { 5887,12}, { 11775,15}, { 1535,14}, \ + { 3455,15}, { 1791,14}, { 3583,13}, { 7167,14}, \ + { 3839,16}, { 1023,15}, { 2047,14}, { 4223,15}, \ + { 2303,14}, { 4863,15}, { 2815,14}, { 5887,13}, \ + { 11775,16}, { 1535,15}, { 3071,14}, { 6143,15}, \ + { 3327,14}, { 6911,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 183 +#define SQR_FFT_THRESHOLD 3520 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 67 +#define MULLO_MUL_N_THRESHOLD 9174 +#define SQRLO_BASECASE_THRESHOLD 10 +#define SQRLO_DC_THRESHOLD 11 +#define SQRLO_SQR_THRESHOLD 7035 + +#define DC_DIV_QR_THRESHOLD 53 +#define DC_DIVAPPR_Q_THRESHOLD 163 +#define DC_BDIV_QR_THRESHOLD 46 +#define DC_BDIV_Q_THRESHOLD 76 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define INV_NEWTON_THRESHOLD 158 +#define INV_APPR_THRESHOLD 167 + +#define BINV_NEWTON_THRESHOLD 248 +#define REDC_1_TO_REDC_N_THRESHOLD 44 + +#define MU_DIV_QR_THRESHOLD 1187 +#define MU_DIVAPPR_Q_THRESHOLD 1210 +#define MUPI_DIV_QR_THRESHOLD 73 +#define MU_BDIV_QR_THRESHOLD 1017 +#define MU_BDIV_Q_THRESHOLD 1187 + +#define POWM_SEC_TABLE 1,64,105,579,1486 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 17 +#define SET_STR_DC_THRESHOLD 134 +#define SET_STR_PRECOMPUTE_THRESHOLD 1752 + +#define FAC_DSC_THRESHOLD 351 
+#define FAC_ODD_THRESHOLD 27
+
+#define MATRIX22_STRASSEN_THRESHOLD 18
+#define HGCD2_DIV1_METHOD 3 /* 2.14% faster than 5 */
+#define HGCD_THRESHOLD 118
+#define HGCD_APPR_THRESHOLD 161
+#define HGCD_REDUCE_THRESHOLD 2121
+#define GCD_DC_THRESHOLD 416
+#define GCDEXT_DC_THRESHOLD 351
+#define JACOBI_BASE_METHOD 4 /* 3.56% faster than 1 */
+
+/* Tuneup completed successfully, took 132491 seconds */
diff --git a/gmp-6.3.0/mpn/x86_64/core2/hamdist.asm b/gmp-6.3.0/mpn/x86_64/core2/hamdist.asm
new file mode 100644
index 0000000..ded7b67
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/core2/hamdist.asm
@@ -0,0 +1,210 @@
+dnl AMD64 SSSE3 mpn_hamdist -- Hamming distance.
+
+dnl Copyright 2010-2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb good for cpu?
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bd1 ?
+C AMD bd2 ?
+C AMD bd3 ?
+C AMD bd4 ?
+C AMD zen ?
+C AMD bobcat ?
+C AMD jaguar ?
+C Intel P4 n/a
+C Intel CNR 4.50 y
+C Intel PNR 3.28 y
+C Intel NHM ?
+C Intel SBR ?
+C Intel IBR ?
+C Intel HWL ?
+C Intel BWL ?
+C Intel SKL ?
+C Intel atom ?
+C Intel SLM ?
+C VIA nano ?
+
+C TODO
+C * This was hand-written without too much thought about optimal insn
+C selection; check to see if it can be improved.
+C * Consider doing some instruction scheduling.
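The counting scheme here is the classic SSSE3 one: each byte of up[] ^ vp[] is split into nibbles, pshufb uses the nibbles to index a 16-entry popcount table (the first sixteen .byte values in L(cnsts) below: 0,1,1,2,...,4), and psadbw periodically widens the byte sums into 64-bit counts. A scalar C model of the same computation, with types and GMP_LIMB_BITS from gmp.h:

    #include <gmp.h>

    /* Hamming distance = sum of popcount(up[i] ^ vp[i]), counted a
       nibble at a time with the same table the pshufb lookup uses. */
    static const unsigned char nibble_pc[16] =
      { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 };

    static mp_limb_t
    hamdist_model (const mp_limb_t *up, const mp_limb_t *vp, mp_size_t n)
    {
      mp_limb_t sum = 0;
      mp_size_t i;
      int k;
      for (i = 0; i < n; i++)
        for (k = 0; k < GMP_LIMB_BITS; k += 4)
          sum += nibble_pc[((up[i] ^ vp[i]) >> k) & 0xf];
      return sum;
    }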
+ +define(`up', `%rdi') +define(`vp', `%rsi') +define(`n', `%rdx') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_hamdist) + lea L(cnsts)(%rip), %r9 + +ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48)', + `define(`OFF1',64) define(`OFF2',80)') + movdqa OFF1`'(%r9), %xmm7 + movdqa OFF2`'(%r9), %xmm6 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm8, %xmm8 + + mov R32(n), R32(%rax) + and $7, R32(%rax) +ifdef(`PIC',` + movslq (%r9,%rax,4), %rax + add %r9, %rax + jmp *%rax +',` + jmp *(%r9,%rax,8) +') + +L(1): movq (up), %xmm1 + add $8, up + movq (vp), %xmm10 + add $8, vp + pxor %xmm10, %xmm1 + jmp L(e1) + +L(2): add $-48, up + add $-48, vp + jmp L(e2) + +L(3): movq (up), %xmm1 + add $-40, up + movq (vp), %xmm10 + add $-40, vp + pxor %xmm10, %xmm1 + jmp L(e3) + +L(4): add $-32, up + add $-32, vp + jmp L(e4) + +L(5): movq (up), %xmm1 + add $-24, up + movq (vp), %xmm10 + add $-24, vp + pxor %xmm10, %xmm1 + jmp L(e5) + +L(6): add $-16, up + add $-16, vp + jmp L(e6) + +L(7): movq (up), %xmm1 + add $-8, up + movq (vp), %xmm10 + add $-8, vp + pxor %xmm10, %xmm1 + jmp L(e7) + + ALIGN(32) +L(top): lddqu (up), %xmm1 + lddqu (vp), %xmm10 + pxor %xmm10, %xmm1 +L(e7): movdqa %xmm6, %xmm0 C copy mask register + movdqa %xmm7, %xmm2 C copy count register + movdqa %xmm7, %xmm3 C copy count register + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e6): lddqu 16(up), %xmm1 + lddqu 16(vp), %xmm10 + pxor %xmm10, %xmm1 +L(e5): movdqa %xmm6, %xmm0 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e4): lddqu 32(up), %xmm1 + lddqu 32(vp), %xmm10 + pxor %xmm10, %xmm1 +L(e3): movdqa %xmm6, %xmm0 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e2): lddqu 48(up), %xmm1 + add $64, up + lddqu 48(vp), %xmm10 + add $64, vp + pxor %xmm10, %xmm1 +L(e1): movdqa %xmm6, %xmm0 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + psadbw %xmm5, %xmm4 C sum to 8 x 16-bit counts + paddb %xmm2, %xmm3 + paddq %xmm4, %xmm8 C sum to 2 x 64-bit counts + movdqa %xmm3, %xmm4 + sub $8, n + jg L(top) + + psadbw %xmm5, %xmm4 + paddq %xmm4, %xmm8 + pshufd $14, %xmm8, %xmm0 + paddq %xmm8, %xmm0 + movd %xmm0, %rax + ret +EPILOGUE() +DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') + JMPENT( L(top), L(cnsts)) + JMPENT( L(1), L(cnsts)) + JMPENT( L(2), L(cnsts)) + JMPENT( L(3), L(cnsts)) + JMPENT( L(4), L(cnsts)) + JMPENT( L(5), L(cnsts)) + JMPENT( L(6), L(cnsts)) + JMPENT( L(7), L(cnsts)) + .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03 + .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04 + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f +END_OBJECT(L(cnsts)) diff --git a/gmp-6.3.0/mpn/x86_64/core2/logops_n.asm b/gmp-6.3.0/mpn/x86_64/core2/logops_n.asm new file mode 100644 index 0000000..5ff174c --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/logops_n.asm @@ -0,0 +1,285 @@ +dnl AMD64 logops. + +dnl Copyright 2004-2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C c/l c/l c/l good +C var-1 var-2 var-3 for cpu? +C AMD K8,K9 +C AMD K10 1.52 1.75 1.75 n +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD bt1 2.67 ~2.79 ~2.79 = +C AMD bt2 2.15 2.65 2.65 n +C AMD zen 1.5 1.5 1.5 = +C Intel P4 +C Intel PNR 2.0 2.0 2.0 = +C Intel NHM 2.0 2.0 2.0 = +C Intel SBR 1.5 1.5 1.5 y +C Intel IBR 1.47 1.48 1.48 y +C Intel HWL 1.11 1.35 1.35 y +C Intel BWL 1.09 1.30 1.30 y +C Intel SKL 1.21 1.27 1.27 y +C Intel atom 3.31 3.57 3.57 y +C Intel SLM 3.0 3.0 3.0 = +C VIA nano + +ifdef(`OPERATION_and_n',` + define(`func',`mpn_and_n') + define(`VARIANT_1') + define(`LOGOP',`and')') +ifdef(`OPERATION_andn_n',` + define(`func',`mpn_andn_n') + define(`VARIANT_2') + define(`LOGOP',`and')') +ifdef(`OPERATION_nand_n',` + define(`func',`mpn_nand_n') + define(`VARIANT_3') + define(`LOGOP',`and')') +ifdef(`OPERATION_ior_n',` + define(`func',`mpn_ior_n') + define(`VARIANT_1') + define(`LOGOP',`or')') +ifdef(`OPERATION_iorn_n',` + define(`func',`mpn_iorn_n') + define(`VARIANT_2') + define(`LOGOP',`or')') +ifdef(`OPERATION_nior_n',` + define(`func',`mpn_nior_n') + define(`VARIANT_3') + define(`LOGOP',`or')') +ifdef(`OPERATION_xor_n',` + define(`func',`mpn_xor_n') + define(`VARIANT_1') + define(`LOGOP',`xor')') +ifdef(`OPERATION_xnor_n',` + define(`func',`mpn_xnor_n') + define(`VARIANT_2') + define(`LOGOP',`xor')') + +define(`addptr', `lea $1($2), $2') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +C INPUT PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`vp',`%rdx') +define(`n',`%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + +ifdef(`VARIANT_1',` + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + mov (vp), %r8 + mov R32(%rcx), R32(%rax) + and $3, R32(%rax) + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): LOGOP (up), %r8 + mov %r8, (rp) + inc n + addptr( -8, up) + addptr( -8, vp) + addptr( -8, rp) + jmp L(e11) +L(b10): add $2, n + addptr( -16, up) + addptr( -16, vp) + addptr( -16, rp) + jmp L(e10) +L(b01): LOGOP (up), %r8 + mov %r8, (rp) + dec n + jz L(ret) + addptr( 8, up) + addptr( 8, vp) + addptr( 8, rp) + + ALIGN(16) +L(top): mov (vp), %r8 +L(b00): mov 8(vp), %r9 + LOGOP (up), %r8 + LOGOP 8(up), %r9 + mov %r8, (rp) + mov %r9, 8(rp) +L(e11): mov 16(vp), %r8 +L(e10): mov 24(vp), %r9 + addptr( 32, vp) + LOGOP 16(up), %r8 + LOGOP 24(up), %r9 + addptr( 32, up) + mov %r8, 16(rp) + mov %r9, 24(rp) + addptr( 32, rp) + sub $4, n + jnz L(top) + +L(ret): FUNC_EXIT() + ret 
+EPILOGUE() +') + +ifdef(`VARIANT_2',` + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + mov (vp), %r8 + not %r8 + mov R32(%rcx), R32(%rax) + and $3, R32(%rax) + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): LOGOP (up), %r8 + mov %r8, (rp) + inc n + addptr( -8, up) + addptr( -8, vp) + addptr( -8, rp) + jmp L(e11) +L(b10): add $2, n + addptr( -16, up) + addptr( -16, vp) + addptr( -16, rp) + jmp L(e10) +L(b01): LOGOP (up), %r8 + mov %r8, (rp) + dec n + jz L(ret) + addptr( 8, up) + addptr( 8, vp) + addptr( 8, rp) + + ALIGN(16) +L(top): mov (vp), %r8 + not %r8 +L(b00): mov 8(vp), %r9 + not %r9 + LOGOP (up), %r8 + LOGOP 8(up), %r9 + mov %r8, (rp) + mov %r9, 8(rp) +L(e11): mov 16(vp), %r8 + not %r8 +L(e10): mov 24(vp), %r9 + not %r9 + addptr( 32, vp) + LOGOP 16(up), %r8 + LOGOP 24(up), %r9 + addptr( 32, up) + mov %r8, 16(rp) + mov %r9, 24(rp) + addptr( 32, rp) + sub $4, n + jnz L(top) + +L(ret): FUNC_EXIT() + ret +EPILOGUE() +') + +ifdef(`VARIANT_3',` + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + mov (vp), %r8 + mov R32(%rcx), R32(%rax) + and $3, R32(%rax) + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): LOGOP (up), %r8 + not %r8 + mov %r8, (rp) + inc n + addptr( -8, up) + addptr( -8, vp) + addptr( -8, rp) + jmp L(e11) +L(b10): add $2, n + addptr( -16, up) + addptr( -16, vp) + addptr( -16, rp) + jmp L(e10) +L(b01): LOGOP (up), %r8 + not %r8 + mov %r8, (rp) + dec n + jz L(ret) + addptr( 8, up) + addptr( 8, vp) + addptr( 8, rp) + + ALIGN(16) +L(top): mov (vp), %r8 +L(b00): mov 8(vp), %r9 + LOGOP (up), %r8 + not %r8 + LOGOP 8(up), %r9 + not %r9 + mov %r8, (rp) + mov %r9, 8(rp) +L(e11): mov 16(vp), %r8 +L(e10): mov 24(vp), %r9 + addptr( 32, vp) + LOGOP 16(up), %r8 + not %r8 + LOGOP 24(up), %r9 + addptr( 32, up) + not %r9 + mov %r8, 16(rp) + mov %r9, 24(rp) + addptr( 32, rp) + sub $4, n + jnz L(top) + +L(ret): FUNC_EXIT() + ret +EPILOGUE() +') diff --git a/gmp-6.3.0/mpn/x86_64/core2/lshift.asm b/gmp-6.3.0/mpn/x86_64/core2/lshift.asm new file mode 100644 index 0000000..9016a71 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/lshift.asm @@ -0,0 +1,145 @@ +dnl x86-64 mpn_lshift optimised for Conroe/Penryn and Nehalem. + +dnl Copyright 2007, 2009, 2011, 2012, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
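The left-shift code below returns the cnt bits shifted out of the top limb and walks from the most significant limb downwards, so rp may overlap up for an in-place shift towards higher addresses. A rough C model of the limb-level semantics, assuming 0 < cnt < 64 (ref_lshift is an illustrative name, not GMP's API):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    /* rp[] = up[] << cnt, returning the bits shifted out at the top. */
    mp_limb_t ref_lshift (mp_limb_t *rp, const mp_limb_t *up, size_t n,
                          unsigned cnt)
    {
      mp_limb_t ret = up[n - 1] >> (64 - cnt);   /* high bits that fall off */
      for (size_t i = n - 1; i > 0; i--)         /* high-to-low, like the asm */
        rp[i] = (up[i] << cnt) | (up[i - 1] >> (64 - cnt));
      rp[0] = up[0] << cnt;
      return ret;
    }

Each shld in the loop fuses one (up[i] << cnt) | (up[i-1] >> (64-cnt)) step into a single instruction, and the loop is unrolled four ways.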
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core2 1.32 +C Intel NHM 1.30 (drops to 2.5 for n > 256) +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel SKL +C Intel atom +C Intel SLM +C VIA nano + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_lshift) + FUNC_ENTRY(4) + + xor R32(%rax), R32(%rax) + + test $1, R8(n) + jnz L(bx1) +L(bx0): test $2, R8(n) + jnz L(b10) + +L(b00): lea -8(up,n,8), up + lea 16(rp,n,8), rp + mov (up), %r10 + mov -8(up), %r11 + shld R8(cnt), %r10, %rax + mov -16(up), %r8 + shr $2, n + jmp L(00) + +L(bx1): test $2, R8(n) + jnz L(b11) + +L(b01): lea -16(up,n,8), up + lea 8(rp,n,8), rp + mov 8(up), %r9 + shld R8(cnt), %r9, %rax + shr $2, n + jz L(1) + mov (up), %r10 + mov -8(up), %r11 + jmp L(01) + +L(b10): lea -24(up,n,8), up + lea (rp,n,8), rp + mov 16(up), %r8 + mov 8(up), %r9 + shld R8(cnt), %r8, %rax + shr $2, n + jz L(2) + mov (up), %r10 + jmp L(10) + + ALIGN(16) +L(b11): lea -32(up,n,8), up + lea -8(rp,n,8), rp + mov 24(up), %r11 + mov 16(up), %r8 + mov 8(up), %r9 + shld R8(cnt), %r11, %rax + shr $2, n + jz L(end) + + ALIGN(16) +L(top): shld R8(cnt), %r8, %r11 + mov (up), %r10 + mov %r11, (rp) +L(10): shld R8(cnt), %r9, %r8 + mov -8(up), %r11 + mov %r8, -8(rp) +L(01): shld R8(cnt), %r10, %r9 + mov -16(up), %r8 + mov %r9, -16(rp) +L(00): shld R8(cnt), %r11, %r10 + mov -24(up), %r9 + add $-32, up + mov %r10, -24(rp) + add $-32, rp + dec n + jnz L(top) + +L(end): shld R8(cnt), %r8, %r11 + mov %r11, (rp) +L(2): shld R8(cnt), %r9, %r8 + mov %r8, -8(rp) +L(1): shl R8(cnt), %r9 + mov %r9, -16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm new file mode 100644 index 0000000..c428f13 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/lshiftc.asm @@ -0,0 +1,159 @@ +dnl x86-64 mpn_lshiftc optimised for Conroe/Penryn and Nehalem. + +dnl Copyright 2007, 2009, 2011, 2012, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
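mpn_lshiftc, implemented below, is the same left shift with each result limb one's-complemented on the way out; the return value (the shifted-out bits) is not complemented. A sketch under the same assumptions as the previous one (0 < cnt < 64, illustrative name):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    /* rp[] = ~(up[] << cnt); returns the (uncomplemented) shifted-out bits. */
    mp_limb_t ref_lshiftc (mp_limb_t *rp, const mp_limb_t *up, size_t n,
                           unsigned cnt)
    {
      mp_limb_t ret = up[n - 1] >> (64 - cnt);
      for (size_t i = n - 1; i > 0; i--)
        rp[i] = ~((up[i] << cnt) | (up[i - 1] >> (64 - cnt)));
      rp[0] = ~(up[0] << cnt);
      return ret;
    }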
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core2 1.52 +C Intel NHM 1.78 (just 2.15 for n < 256) +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel SKL +C Intel atom +C Intel SLM +C VIA nano + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +C TODO +C * This runs poorly on Nehalem compared to plain lshift, in particular for +C n < 256. + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_lshiftc) + FUNC_ENTRY(4) + + xor R32(%rax), R32(%rax) + + test $1, R8(n) + jnz L(bx1) +L(bx0): test $2, R8(n) + jnz L(b10) + +L(b00): lea -8(up,n,8), up + lea 16(rp,n,8), rp + mov (up), %r10 + mov -8(up), %r11 + shld R8(cnt), %r10, %rax + mov -16(up), %r8 + shr $2, n + shld R8(cnt), %r11, %r10 + jmp L(00) + +L(bx1): test $2, R8(n) + jnz L(b11) + +L(b01): lea -16(up,n,8), up + lea 8(rp,n,8), rp + mov 8(up), %r9 + shld R8(cnt), %r9, %rax + shr $2, n + jz L(1) + mov (up), %r10 + mov -8(up), %r11 + shld R8(cnt), %r10, %r9 + jmp L(01) + +L(b10): lea -24(up,n,8), up + lea (rp,n,8), rp + mov 16(up), %r8 + mov 8(up), %r9 + shld R8(cnt), %r8, %rax + shr $2, n + jz L(2) + mov (up), %r10 + shld R8(cnt), %r9, %r8 + jmp L(10) + + ALIGN(16) +L(b11): lea -32(up,n,8), up + lea -8(rp,n,8), rp + mov 24(up), %r11 + mov 16(up), %r8 + mov 8(up), %r9 + shld R8(cnt), %r11, %rax + shr $2, n + jz L(end) + + ALIGN(16) +L(top): shld R8(cnt), %r8, %r11 + mov (up), %r10 + not %r11 + shld R8(cnt), %r9, %r8 + mov %r11, (rp) +L(10): mov -8(up), %r11 + not %r8 + shld R8(cnt), %r10, %r9 + mov %r8, -8(rp) +L(01): mov -16(up), %r8 + not %r9 + shld R8(cnt), %r11, %r10 + mov %r9, -16(rp) +L(00): mov -24(up), %r9 + not %r10 + add $-32, up + mov %r10, -24(rp) + add $-32, rp + dec n + jnz L(top) + +L(end): shld R8(cnt), %r8, %r11 + not %r11 + mov %r11, (rp) +L(2): shld R8(cnt), %r9, %r8 + not %r8 + mov %r8, -8(rp) +L(1): shl R8(cnt), %r9 + not %r9 + mov %r9, -16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm new file mode 100644 index 0000000..d16be85 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/mul_basecase.asm @@ -0,0 +1,975 @@ +dnl X86-64 mpn_mul_basecase optimised for Intel Nehalem/Westmere. +dnl It also seems good for Conroe/Wolfdale. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb      mul_1     mul_2     mul_3     addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core       4.0       4.0       -         4.18-4.25
+C Intel NHM        3.75      3.8       -         4.06-4.2
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C Code structure:
+C
+C
+C        m_1(0m4)        m_1(1m4)        m_1(2m4)        m_1(3m4)
+C           |               |               |               |
+C m_2(0m4)  |   m_2(1m4)    |   m_2(2m4)    |   m_2(3m4)    |
+C    |     /       |       /       |       /       |       /
+C    |    /        |      /        |      /        |      /
+C    |   /         |     /         |     /         |     /
+C   \|/ |/_       \|/  |/_        \|/  |/_        \|/  |/_
+C      _____          _____           _____           _____
+C     /     \        /     \         /     \         /     \
+C    \|/     |      \|/     |       \|/     |       \|/     |
+C  am_2(0m4) |    am_2(1m4) |     am_2(2m4) |     am_2(3m4) |
+C     \     /|\      \     /|\       \     /|\       \     /|\
+C      \_____/        \_____/         \_____/         \_____/
+
+C TODO
+C * Tune. None done so far.
+C * Currently 2687 bytes, making it smaller would be nice.
+C * Implement some basecases, say for un < 4.
+C * Try zeroing with xor in m2 loops.
+C * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
+C   between loop header and wind-down code.
+C * Consider adc reg,reg instead of adc $0,reg in m2 loops. This saves a byte.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+C Define this to $1 to use late loop index variable as zero, $2 to use an
+C explicit $0.
+define(`Z',`$1')
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param', `%rdx')
+define(`vp_param', `%rcx')   C FIXME reallocate vp to rcx but watch performance!
+define(`vn_param', `%r8')
+
+define(`un', `%r9')
+define(`vn', `(%rsp)')
+
+define(`v0', `%r10')
+define(`v1', `%r11')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r12')
+define(`i', `%r13')
+define(`vp', `%r14')
+
+define(`X0', `%r8')
+define(`X1', `%r15')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+define(`N', 85)
+ifdef(`N',,`define(`N',0)')
+define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')')
+
+ASM_START()
+        TEXT
+        ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+        FUNC_ENTRY(4)
+IFDOS(` mov     56(%rsp), %r8d  ')
+        mov     (up), %rax              C shared for mul_1 and mul_2
+        push    %rbx
+        push    %rbp
+        push    %r12
+        push    %r13
+        push    %r14
+
+        mov     (vp_param), v0          C shared for mul_1 and mul_2
+
+        xor     un, un
+        sub     un_param, un            C un = -un_param
+
+        lea     (up,un_param,8), up
+        lea     (rp,un_param,8), rp
+
+        mul     v0                      C shared for mul_1 and mul_2
+
+        test    $1, R8(vn_param)
+        jz      L(m2)
+
+        lea     8(vp_param), vp         C FIXME: delay until known needed
+
+        test    $1, R8(un)
+        jnz     L(m1x1)
+
+L(m1x0):test    $2, R8(un)
+        jnz     L(m1s2)
+
+L(m1s0):
+        lea     (un), i
+        mov     %rax, (rp,un,8)
+        mov     8(up,un,8), %rax
+        mov     %rdx, w0        C FIXME: Use lea?
+        lea     L(do_am0)(%rip), %rbp
+        jmp     L(m1e0)
+
+L(m1s2):
+        lea     2(un), i
+        mov     %rax, (rp,un,8)
+        mov     8(up,un,8), %rax
+        mov     %rdx, w0        C FIXME: Use lea?
+ mul v0 + lea L(do_am2)(%rip), %rbp + test i, i + jnz L(m1e2) + add %rax, w0 + adc $0, %rdx + mov w0, I(-8(rp),8(rp,un,8)) + mov %rdx, I((rp),16(rp,un,8)) + jmp L(ret2) + +L(m1x1):test $2, R8(un) + jz L(m1s3) + +L(m1s1): + lea 1(un), i + mov %rax, (rp,un,8) + test i, i + jz L(1) + mov 8(up,un,8), %rax + mov %rdx, w1 C FIXME: Use lea? + lea L(do_am1)(%rip), %rbp + jmp L(m1e1) +L(1): mov %rdx, I((rp),8(rp,un,8)) + jmp L(ret2) + +L(m1s3): + lea -1(un), i + mov %rax, (rp,un,8) + mov 8(up,un,8), %rax + mov %rdx, w1 C FIXME: Use lea? + lea L(do_am3)(%rip), %rbp + jmp L(m1e3) + + ALIGNx +L(m1top): + mul v0 + mov w1, -16(rp,i,8) +L(m1e2):xor R32(w1), R32(w1) + add %rax, w0 + mov (up,i,8), %rax + adc %rdx, w1 + mov w0, -8(rp,i,8) +L(m1e1):xor R32(w0), R32(w0) + mul v0 + add %rax, w1 + mov 8(up,i,8), %rax + adc %rdx, w0 + mov w1, (rp,i,8) +L(m1e0):xor R32(w1), R32(w1) + mul v0 + add %rax, w0 + mov 16(up,i,8), %rax + adc %rdx, w1 + mov w0, 8(rp,i,8) +L(m1e3):xor R32(w0), R32(w0) + mul v0 + add %rax, w1 + mov 24(up,i,8), %rax + adc %rdx, w0 + add $4, i + js L(m1top) + + mul v0 + mov w1, I(-16(rp),-16(rp,i,8)) + add %rax, w0 + adc $0, %rdx + mov w0, I(-8(rp),-8(rp,i,8)) + mov %rdx, I((rp),(rp,i,8)) + + dec vn_param + jz L(ret2) + lea -8(rp), rp + jmp *%rbp + +L(m2): + mov 8(vp_param), v1 + lea 16(vp_param), vp C FIXME: delay until known needed + + test $1, R8(un) + jnz L(bx1) + +L(bx0): test $2, R8(un) + jnz L(b10) + +L(b00): lea (un), i + mov %rax, (rp,un,8) + mov %rdx, w1 C FIXME: Use lea? + mov (up,un,8), %rax + mov $0, R32(w2) + jmp L(m2e0) + +L(b10): lea -2(un), i + mov %rax, w2 C FIXME: Use lea? + mov (up,un,8), %rax + mov %rdx, w3 C FIXME: Use lea? + mov $0, R32(w0) + jmp L(m2e2) + +L(bx1): test $2, R8(un) + jz L(b11) + +L(b01): lea 1(un), i + mov %rax, (rp,un,8) + mov (up,un,8), %rax + mov %rdx, w0 C FIXME: Use lea? + mov $0, R32(w1) + jmp L(m2e1) + +L(b11): lea -1(un), i + mov %rax, w1 C FIXME: Use lea? + mov (up,un,8), %rax + mov %rdx, w2 C FIXME: Use lea? 
+ mov $0, R32(w3) + jmp L(m2e3) + + ALIGNx +L(m2top0): + mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) +L(m2e0):mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax + mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top0) + + mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov w0, I((rp),(rp,i,8)) + mov w1, I(8(rp),8(rp,i,8)) + + add $-2, vn_param + jz L(ret2) + +L(do_am0): + push %r15 + push vn_param + +L(olo0): + mov (vp), v0 + mov 8(vp), v1 + lea 16(vp), vp + lea 16(rp), rp + mov (up,un,8), %rax +C lea 0(un), i + mov un, i + mul v0 + mov %rax, X0 + mov (up,un,8), %rax + MOV( %rdx, X1, 2) + mul v1 + MOV( %rdx, w0, 4) + mov (rp,un,8), w2 + mov %rax, w3 + jmp L(lo0) + + ALIGNx +L(am2top0): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 + mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 +L(lo0): mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax + mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 + mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top0) + + mul v1 + add w0, w1 + adc %rax, w2 + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add w2, X0 + mov X0, I((rp),(rp,i,8)) + adc Z(i,$0), %rdx + mov %rdx, I(8(rp),8(rp,i,8)) + + addl $-2, vn + jnz L(olo0) + +L(ret): pop %rax + pop %r15 +L(ret2):pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + + + ALIGNx +L(m2top1): + mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) +L(m2e1):mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) + mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax + mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top1) + + mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, 
I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov w0, I((rp),(rp,i,8)) + mov w1, I(8(rp),8(rp,i,8)) + + add $-2, vn_param + jz L(ret2) + +L(do_am1): + push %r15 + push vn_param + +L(olo1): + mov (vp), v0 + mov 8(vp), v1 + lea 16(vp), vp + lea 16(rp), rp + mov (up,un,8), %rax + lea 1(un), i + mul v0 + mov %rax, X1 + MOV( %rdx, X0, 128) + mov (up,un,8), %rax + mov (rp,un,8), w1 + mul v1 + mov %rax, w2 + mov 8(up,un,8), %rax + MOV( %rdx, w3, 1) + jmp L(lo1) + + ALIGNx +L(am2top1): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 +L(lo1): mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 + mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax + mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 + mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top1) + + mul v1 + add w0, w1 + adc %rax, w2 + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add w2, X0 + mov X0, I((rp),(rp,i,8)) + adc Z(i,$0), %rdx + mov %rdx, I(8(rp),8(rp,i,8)) + + addl $-2, vn + jnz L(olo1) + + pop %rax + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + + + ALIGNx +L(m2top2): + mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) + mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax + mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) +L(m2e2):mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top2) + + mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov w0, I((rp),(rp,i,8)) + mov w1, I(8(rp),8(rp,i,8)) + + add $-2, vn_param + jz L(ret2) + +L(do_am2): + push %r15 + push vn_param + +L(olo2): + mov (vp), v0 + mov 8(vp), v1 + lea 16(vp), vp + lea 16(rp), rp + mov (up,un,8), %rax + lea -2(un), i + mul v0 + mov %rax, X0 + MOV( %rdx, X1, 32) + mov (up,un,8), %rax + mov (rp,un,8), w0 + mul v1 + mov %rax, w1 + lea (%rdx), w2 + mov 8(up,un,8), %rax + jmp L(lo2) + + ALIGNx +L(am2top2): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 + mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 + mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov 
X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax + mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 +L(lo2): mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top2) + + mul v1 + add w0, w1 + adc %rax, w2 + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add w2, X0 + mov X0, I((rp),(rp,i,8)) + adc Z(i,$0), %rdx + mov %rdx, I(8(rp),8(rp,i,8)) + + addl $-2, vn + jnz L(olo2) + + pop %rax + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + + + ALIGNx +L(m2top3): + mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) + mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax +L(m2e3):mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top3) + + mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov w0, I((rp),(rp,i,8)) + mov w1, I(8(rp),8(rp,i,8)) + + add $-2, vn_param + jz L(ret2) + +L(do_am3): + push %r15 + push vn_param + +L(olo3): + mov (vp), v0 + mov 8(vp), v1 + lea 16(vp), vp + lea 16(rp), rp + mov (up,un,8), %rax + lea -1(un), i + mul v0 + mov %rax, X1 + MOV( %rdx, X0, 8) + mov (up,un,8), %rax + mov (rp,un,8), w3 + mul v1 + mov %rax, w0 + MOV( %rdx, w1, 16) + mov 8(up,un,8), %rax + jmp L(lo3) + + ALIGNx +L(am2top3): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 + mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 + mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax +L(lo3): mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 + mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top3) + + mul v1 + add w0, w1 + adc %rax, w2 + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add w2, X0 + mov X0, I((rp),(rp,i,8)) + adc Z(i,$0), %rdx + mov %rdx, I(8(rp),8(rp,i,8)) + + addl $-2, vn + jnz L(olo3) + + pop %rax + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + 
pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm new file mode 100644 index 0000000..0f03d86 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/mullo_basecase.asm @@ -0,0 +1,427 @@ +dnl AMD64 mpn_mullo_basecase optimised for Conroe/Wolfdale/Nehalem/Westmere. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_2 addmul_2 +C AMD K8,K9 +C AMD K10 +C AMD bull +C AMD pile +C AMD steam +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core 4.0 4.18-4.25 +C Intel NHM 3.75 4.06-4.2 +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel atom +C VIA nano + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Implement proper cor2, replacing current cor0. +C * Offset n by 2 in order to avoid the outer loop cmp. (And sqr_basecase?) +C * Micro-optimise. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. 
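mpn_mullo_basecase produces only the least significant n limbs of {up,n} times {vp,n}. Column i+j of the product needs the full widening u_i*v_j only while carries out of it can still matter; for the very top column only the low halves contribute, which is why the wind-down paths below can use plain imul where the loops use widening mul. A rough C reference (not GMP's algorithm; relies on the GCC/Clang unsigned __int128 extension for the 64x64->128 product, and ref_mullo is an illustrative name):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    /* rp[0..n-1] = low n limbs of {up,n} * {vp,n}.  Carries out of column
       n-1 are simply dropped. */
    void ref_mullo (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
                    size_t n)
    {
      for (size_t i = 0; i < n; i++)
        rp[i] = 0;
      for (size_t i = 0; i < n; i++)
        {
          mp_limb_t cy = 0;
          for (size_t j = 0; i + j < n; j++)   /* only columns < n matter */
            {
              unsigned __int128 t =
                (unsigned __int128) up[i] * vp[j] + rp[i + j] + cy;
              rp[i + j] = (mp_limb_t) t;
              cy = (mp_limb_t) (t >> 64);
            }
        }
    }

The full-product basecase (mul_basecase, earlier in this directory) is the same double loop without the i + j < n cutoff.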
+define(`I',`$1') + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp_param', `%rdx') +define(`n_param', `%rcx') + +define(`v0', `%r10') +define(`v1', `%r11') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r12') +define(`n', `%r9') +define(`i', `%r13') +define(`vp', `%r8') + +define(`X0', `%r14') +define(`X1', `%r15') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`ALIGNx', `ALIGN(16)') + +define(`N', 85) +ifdef(`N',,`define(`N',0)') +define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mullo_basecase) + FUNC_ENTRY(4) + + mov (up), %rax + mov vp_param, vp + + cmp $4, n_param + jb L(small) + + mov (vp_param), v0 + push %rbx + lea (rp,n_param,8), rp C point rp at R[un] + push %rbp + lea (up,n_param,8), up C point up right after U's end + push %r12 + mov $0, R32(n) C FIXME + sub n_param, n + push %r13 + mul v0 + mov 8(vp), v1 + + test $1, R8(n_param) + jnz L(m2x1) + +L(m2x0):test $2, R8(n_param) + jnz L(m2b2) + +L(m2b0):lea (n), i + mov %rax, (rp,n,8) + mov %rdx, w1 + mov (up,n,8), %rax + xor R32(w2), R32(w2) + jmp L(m2e0) + +L(m2b2):lea -2(n), i + mov %rax, w2 + mov (up,n,8), %rax + mov %rdx, w3 + xor R32(w0), R32(w0) + jmp L(m2e2) + +L(m2x1):test $2, R8(n_param) + jnz L(m2b3) + +L(m2b1):lea 1(n), i + mov %rax, (rp,n,8) + mov (up,n,8), %rax + mov %rdx, w0 + xor R32(w1), R32(w1) + jmp L(m2e1) + +L(m2b3):lea -1(n), i + xor R32(w3), R32(w3) + mov %rax, w1 + mov %rdx, w2 + mov (up,n,8), %rax + jmp L(m2e3) + + ALIGNx +L(m2tp):mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) +L(m2e1):mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) +L(m2e0):mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax +L(m2e3):mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) +L(m2e2):mul v1 + mov $0, R32(w1) C FIXME: dead in last iteration + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 C FIXME: dead in last iteration + add $4, i + js L(m2tp) + +L(m2ed):imul v0, %rax + add w3, %rax + mov %rax, I(-8(rp),-8(rp,i,8)) + + add $2, n + lea 16(vp), vp + lea -16(up), up + cmp $-2, n + jge L(cor1) + + push %r14 + push %r15 + +L(outer): + mov (vp), v0 + mov 8(vp), v1 + mov (up,n,8), %rax + mul v0 + test $1, R8(n) + jnz L(a1x1) + +L(a1x0):mov %rax, X1 + MOV( %rdx, X0, 8) + mov (up,n,8), %rax + mul v1 + test $2, R8(n) + jnz L(a110) + +L(a100):lea (n), i + mov (rp,n,8), w3 + mov %rax, w0 + MOV( %rdx, w1, 16) + jmp L(lo0) + +L(a110):lea 2(n), i + mov (rp,n,8), w1 + mov %rax, w2 + mov 8(up,n,8), %rax + MOV( %rdx, w3, 1) + jmp L(lo2) + +L(a1x1):mov %rax, X0 + MOV( %rdx, X1, 2) + mov (up,n,8), %rax + mul v1 + test $2, R8(n) + jz L(a111) + +L(a101):lea 1(n), i + MOV( %rdx, w0, 4) + mov (rp,n,8), w2 + mov %rax, w3 + jmp L(lo1) + +L(a111):lea -1(n), i + MOV( %rdx, w2, 64) + mov %rax, w1 + mov (rp,n,8), w0 + mov 8(up,n,8), %rax + jmp L(lo3) + + ALIGNx +L(top): mul v1 + add w0, w1 + adc %rax, w2 + mov -8(up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 +L(lo2): mul v0 + add w1, X1 + mov X1, -16(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + 
mov -8(up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov -8(rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 +L(lo1): mov (up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, -8(rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov (up,i,8), %rax + mov (rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 +L(lo0): mov 8(up,i,8), %rax + mul v0 + add w3, X1 + mov X1, (rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 8(rp,i,8), w3 + adc $0, X1 + mov 8(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 16(up,i,8), %rax + adc $0, w2 +L(lo3): mul v0 + add w0, X0 + mov X0, 8(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 16(up,i,8), %rax + mov 16(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(top) + +L(end): imul v1, %rax + add w0, w1 + adc %rax, w2 + mov I(-8(up),-8(up,i,8)), %rax + imul v0, %rax + add w1, X1 + mov X1, I(-16(rp),-16(rp,i,8)) + adc X0, %rax + mov I(-8(rp),-8(rp,i,8)), w1 + add w1, w2 + add w2, %rax + mov %rax, I(-8(rp),-8(rp,i,8)) + + add $2, n + lea 16(vp), vp + lea -16(up), up + cmp $-2, n + jl L(outer) + + pop %r15 + pop %r14 + + jnz L(cor0) + +L(cor1):mov (vp), v0 + mov 8(vp), v1 + mov -16(up), %rax + mul v0 C u0 x v2 + add -16(rp), %rax C FIXME: rp[0] still available in reg? + adc -8(rp), %rdx C FIXME: rp[1] still available in reg? + mov -8(up), %rbx + imul v0, %rbx + mov -16(up), %rcx + imul v1, %rcx + mov %rax, -16(rp) + add %rbx, %rcx + add %rdx, %rcx + mov %rcx, -8(rp) + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(cor0):mov (vp), %r11 + imul -8(up), %r11 + add %rax, %r11 + mov %r11, -8(rp) + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + + ALIGN(16) +L(small): + cmp $2, n_param + jae L(gt1) +L(n1): imul (vp_param), %rax + mov %rax, (rp) + FUNC_EXIT() + ret +L(gt1): ja L(gt2) +L(n2): mov (vp_param), %r9 + mul %r9 + mov %rax, (rp) + mov 8(up), %rax + imul %r9, %rax + add %rax, %rdx + mov 8(vp), %r9 + mov (up), %rcx + imul %r9, %rcx + add %rcx, %rdx + mov %rdx, 8(rp) + FUNC_EXIT() + ret +L(gt2): +L(n3): mov (vp_param), %r9 + mul %r9 C u0 x v0 + mov %rax, (rp) + mov %rdx, %r10 + mov 8(up), %rax + mul %r9 C u1 x v0 + imul 16(up), %r9 C u2 x v0 + add %rax, %r10 + adc %rdx, %r9 + mov 8(vp), %r11 + mov (up), %rax + mul %r11 C u0 x v1 + add %rax, %r10 + adc %rdx, %r9 + imul 8(up), %r11 C u1 x v1 + add %r11, %r9 + mov %r10, 8(rp) + mov 16(vp), %r10 + mov (up), %rax + imul %rax, %r10 C u0 x v2 + add %r10, %r9 + mov %r9, 16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/popcount.asm b/gmp-6.3.0/mpn/x86_64/core2/popcount.asm new file mode 100644 index 0000000..3de69d8 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/popcount.asm @@ -0,0 +1,185 @@ +dnl AMD64 SSSE3 mpn_popcount -- population count. + +dnl Copyright 2010-2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C                  cycles/limb   good for cpu?
+C AMD K8,K9            n/a
+C AMD K10              n/a
+C AMD bd1              1.79-1.91     n
+C AMD bd2              1.73-1.85     n
+C AMD bd3               ?
+C AMD bd4              1.73-1.85     n
+C AMD zen              1.47          n
+C AMD bobcat           8.0           n
+C AMD jaguar           4.78          n
+C Intel P4             n/a
+C Intel CNR            3.75
+C Intel PNR            2.61          y
+C Intel NHM            2.03          n
+C Intel SBR            1.87          n
+C Intel IBR            1.52-1.58     n
+C Intel HWL            1.52-1.58     n
+C Intel BWL            1.52-1.58     n
+C Intel SKL            1.51          n
+C Intel atom           12.3          n
+C Intel SLM            9.1           n
+C VIA nano              ?
+
+C TODO
+C * This was hand-written without too much thought about optimal insn
+C   selection; check to see if it can be improved.
+C * Consider doing some instruction scheduling.
+
+define(`up', `%rdi')
+define(`n', `%rsi')
+
+ASM_START()
+        TEXT
+        ALIGN(32)
+PROLOGUE(mpn_popcount)
+        lea     L(cnsts)(%rip), %r9
+
+ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48)',
+             `define(`OFF1',64) define(`OFF2',80)')
+        movdqa  OFF1`'(%r9), %xmm7
+        movdqa  OFF2`'(%r9), %xmm6
+        pxor    %xmm4, %xmm4
+        pxor    %xmm5, %xmm5
+        pxor    %xmm8, %xmm8
+
+        mov     R32(n), R32(%rax)
+        and     $7, R32(%rax)
+ifdef(`PIC',`
+        movslq  (%r9,%rax,4), %rax
+        add     %r9, %rax
+        jmp     *%rax
+',`
+        jmp     *(%r9,%rax,8)
+')
+
+L(1):   movq    (up), %xmm1
+        add     $8, up
+        jmp     L(e1)
+
+L(2):   add     $-48, up
+        jmp     L(e2)
+
+L(3):   movq    (up), %xmm1
+        add     $-40, up
+        jmp     L(e3)
+
+L(4):   add     $-32, up
+        jmp     L(e4)
+
+L(5):   movq    (up), %xmm1
+        add     $-24, up
+        jmp     L(e5)
+
+L(6):   add     $-16, up
+        jmp     L(e6)
+
+L(7):   movq    (up), %xmm1
+        add     $-8, up
+        jmp     L(e7)
+
+        ALIGN(32)
+L(top): lddqu   (up), %xmm1
+L(e7):  movdqa  %xmm6, %xmm0    C copy mask register
+        movdqa  %xmm7, %xmm2    C copy count register
+        movdqa  %xmm7, %xmm3    C copy count register
+        pand    %xmm1, %xmm0
+        psrlw   $4, %xmm1
+        pand    %xmm6, %xmm1
+        pshufb  %xmm0, %xmm2
+        pshufb  %xmm1, %xmm3
+        paddb   %xmm2, %xmm3
+        paddb   %xmm3, %xmm4
+L(e6):  lddqu   16(up), %xmm1
+L(e5):  movdqa  %xmm6, %xmm0
+        movdqa  %xmm7, %xmm2
+        movdqa  %xmm7, %xmm3
+        pand    %xmm1, %xmm0
+        psrlw   $4, %xmm1
+        pand    %xmm6, %xmm1
+        pshufb  %xmm0, %xmm2
+        pshufb  %xmm1, %xmm3
+        paddb   %xmm2, %xmm3
+        paddb   %xmm3, %xmm4
+L(e4):  lddqu   32(up), %xmm1
+L(e3):  movdqa  %xmm6, %xmm0
+        movdqa  %xmm7, %xmm2
+        movdqa  %xmm7, %xmm3
+        pand    %xmm1, %xmm0
+        psrlw   $4, %xmm1
+        pand    %xmm6, %xmm1
+        pshufb  %xmm0, %xmm2
+        pshufb  %xmm1, %xmm3
+        paddb   %xmm2, %xmm3
+        paddb   %xmm3, %xmm4
+L(e2):  lddqu   48(up), %xmm1
+        add     $64, up
+L(e1):  movdqa  %xmm6, %xmm0
+        movdqa  %xmm7, %xmm2
+        movdqa  %xmm7, %xmm3
+        pand    %xmm1, %xmm0
+        psrlw   $4, %xmm1
+        pand    %xmm6, %xmm1
+        pshufb  %xmm0, %xmm2
+        pshufb  %xmm1, %xmm3
+        psadbw  %xmm5, %xmm4    C sum to 8 x 16-bit counts
+        paddb   %xmm2, %xmm3
+        paddq   %xmm4, %xmm8    C sum to 2 x 64-bit counts
+        movdqa  %xmm3, %xmm4
+        sub     $8, n
+        jg      L(top)
+
+        psadbw  %xmm5, %xmm4
+        paddq   %xmm4, %xmm8
+        pshufd  $14, %xmm8, %xmm0
+        paddq   %xmm8, %xmm0
+        movd    %xmm0, %rax
+        ret
+EPILOGUE()
+DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
+        JMPENT( L(top), L(cnsts))
+        JMPENT( L(1), L(cnsts))
+        JMPENT( L(2), L(cnsts))
+        JMPENT( L(3), L(cnsts))
+        JMPENT( L(4), L(cnsts))
+        JMPENT( L(5), L(cnsts))
+        JMPENT( L(6), L(cnsts))
+        JMPENT( L(7), L(cnsts))
+        .byte   0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
+        .byte
0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04 + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f +END_OBJECT(L(cnsts)) diff --git a/gmp-6.3.0/mpn/x86_64/core2/redc_1.asm b/gmp-6.3.0/mpn/x86_64/core2/redc_1.asm new file mode 100644 index 0000000..8c296fd --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/redc_1.asm @@ -0,0 +1,430 @@ +dnl X86-64 mpn_redc_1 optimised for Intel Conroe and Wolfdale. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bull ? +C AMD pile ? +C AMD steam ? +C AMD bobcat ? +C AMD jaguar ? +C Intel P4 ? +C Intel core 4.5 (fluctuating) +C Intel NHM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel atom ? +C VIA nano ? + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Micro-optimise, none performed thus far. +C * Consider inlining mpn_add_n. +C * Single basecases out before the pushes. +C * Keep up[i] in registers for basecases (might require pushes). + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. 
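mpn_redc_1 is word-by-word Montgomery reduction: given a 2n-limb U, an n-limb odd modulus M and u0inv = -1/M mod B (with B = 2^64), it produces {rp,n} congruent to U*B^-n mod M, plus a carry that the caller folds in with one conditional subtract. Each round picks q so that the low limb of U is annihilated, adds q*M, and parks the addmul carry in the freed limb; the mpn_add_n called in the tail below then adds the n parked carries, which sit n positions below their true weight, onto the high half. A self-contained C sketch of that structure (illustrative names, __int128 assumed; it mirrors GMP's generic C version rather than this file's scheduling):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    /* rp[0..n-1] += q * m[0..n-1]; returns the carry limb (a helper
       standing in for mpn_addmul_1). */
    static mp_limb_t addmul_1 (mp_limb_t *rp, const mp_limb_t *m, size_t n,
                               mp_limb_t q)
    {
      mp_limb_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 t = (unsigned __int128) q * m[i] + rp[i] + cy;
          rp[i] = (mp_limb_t) t;
          cy = (mp_limb_t) (t >> 64);
        }
      return cy;
    }

    /* up has 2n limbs and is used as scratch; the n-limb result goes to rp.
       Returns the final carry; when it is 1 the caller subtracts m once. */
    mp_limb_t ref_redc_1 (mp_limb_t *rp, mp_limb_t *up, const mp_limb_t *m,
                          size_t n, mp_limb_t u0inv)
    {
      for (size_t j = 0; j < n; j++)
        {
          mp_limb_t q = up[0] * u0inv;   /* makes up[0] + q*m[0] == 0 mod B */
          mp_limb_t cy = addmul_1 (up, m, n, q);
          up[0] = cy;                    /* park the carry in the zeroed slot */
          up++;
        }
      /* up now points at the high half; up - n holds the parked carries. */
      const mp_limb_t *carries = up - n;
      mp_limb_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 t = (unsigned __int128) up[i] + carries[i] + cy;
          rp[i] = (mp_limb_t) t;
          cy = (mp_limb_t) (t >> 64);
        }
      return cy;
    }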
+define(`I',`$1') + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`mp_param', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`u0inv', `%r8') C stack + +define(`i', `%r14') +define(`j', `%r15') +define(`mp', `%r12') +define(`q0', `%r13') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 +C X q0' n X rp up u0i mp q0 i j + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`ALIGNx', `ALIGN(16)') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_redc_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov (up), q0 + mov n, j C outer loop induction var + lea (mp_param,n,8), mp + lea -16(up,n,8), up + neg n + imul u0inv, q0 C first iteration q0 + + test $1, R8(n) + jz L(b0) + +L(b1): cmp $-1, R32(n) + jz L(n1) + cmp $-3, R32(n) + jz L(n3) + + push rp + +L(otp1):lea 3(n), i + mov (mp,n,8), %rax + mul q0 + lea (%rax), %rbp + mov 8(mp,n,8), %rax + lea (%rdx), %r9 + mul q0 + lea (%rax), %r11 + mov 16(mp,n,8), %rax + mov 16(up,n,8), %r10 + lea (%rdx), %rdi + mul q0 + add %rbp, %r10 + lea (%rax), %rbp + mov 24(mp,n,8), %rax + adc %r9, %r11 + mov 24(up,n,8), %rbx + lea (%rdx), %r9 + adc $0, %rdi + mul q0 + add %r11, %rbx + lea (%rax), %r11 + mov 32(mp,n,8), %rax + adc %rdi, %rbp + mov %rbx, 24(up,n,8) + mov 32(up,n,8), %r10 + lea (%rdx), %rdi + adc $0, %r9 + imul u0inv, %rbx C next q limb + add $2, i + jns L(ed1) + + ALIGNx +L(tp1): mul q0 + add %rbp, %r10 + lea (%rax), %rbp + mov (mp,i,8), %rax + adc %r9, %r11 + mov %r10, -8(up,i,8) + mov (up,i,8), %r10 + lea (%rdx), %r9 + adc $0, %rdi + mul q0 + add %r11, %r10 + lea (%rax), %r11 + mov 8(mp,i,8), %rax + adc %rdi, %rbp + mov %r10, (up,i,8) + mov 8(up,i,8), %r10 + lea (%rdx), %rdi + adc $0, %r9 + add $2, i + js L(tp1) + +L(ed1): mul q0 + add %rbp, %r10 + adc %r9, %r11 + mov %r10, I(-8(up),-8(up,i,8)) + mov I((up),(up,i,8)), %r10 + adc $0, %rdi + add %r11, %r10 + adc %rdi, %rax + mov %r10, I((up),(up,i,8)) + mov I(8(up),8(up,i,8)), %r10 + adc $0, %rdx + add %rax, %r10 + mov %r10, I(8(up),8(up,i,8)) + adc $0, %rdx + mov %rdx, 16(up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp1) + jmp L(cj) + +L(b0): cmp $-2, R32(n) + jz L(n2) + cmp $-4, R32(n) + jz L(n4) + + push rp + +L(otp0):lea 4(n), i + mov (mp,n,8), %rax + mul q0 + lea (%rax), %r11 + mov 8(mp,n,8), %rax + lea (%rdx), %rdi + mul q0 + lea (%rax), %rbp + mov 16(mp,n,8), %rax + mov 16(up,n,8), %r10 + lea (%rdx), %r9 + mul q0 + add %r11, %r10 + lea (%rax), %r11 + mov 24(mp,n,8), %rax + adc %rdi, %rbp + mov 24(up,n,8), %rbx + lea (%rdx), %rdi + adc $0, %r9 + mul q0 + add %rbp, %rbx + lea (%rax), %rbp + mov 32(mp,n,8), %rax + adc %r9, %r11 + mov %rbx, 24(up,n,8) + mov 32(up,n,8), %r10 + lea (%rdx), %r9 + adc $0, %rdi + imul u0inv, %rbx C next q limb + jmp L(e0) + + ALIGNx +L(tp0): mul q0 + add %rbp, %r10 + lea (%rax), %rbp + mov (mp,i,8), %rax + adc %r9, %r11 + mov %r10, -8(up,i,8) + mov (up,i,8), %r10 + lea (%rdx), %r9 + adc $0, %rdi +L(e0): mul q0 + add %r11, %r10 + lea (%rax), %r11 + mov 8(mp,i,8), %rax + adc %rdi, %rbp + mov %r10, (up,i,8) + mov 8(up,i,8), %r10 + lea (%rdx), %rdi + adc $0, %r9 + add $2, i + js L(tp0) + +L(ed0): mul q0 + add %rbp, %r10 + adc %r9, %r11 + mov %r10, I(-8(up),-8(up,i,8)) + mov I((up),(up,i,8)), %r10 + adc $0, %rdi + add %r11, %r10 + adc %rdi, %rax + mov %r10, I((up),(up,i,8)) + mov I(8(up),8(up,i,8)), %r10 + adc $0, %rdx + add %rax, %r10 + mov %r10, I(8(up),8(up,i,8)) + adc $0, %rdx + mov %rdx, 16(up,n,8) C up[0] + 
mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp0) + +L(cj): lea 16(up), up C FIXME + pop rp +L(add_n): +IFSTD(` lea (up,n,8), up C param 2: up + lea (up,n,8), %rdx C param 3: up - n + neg R32(n) ') C param 4: n + +IFDOS(` lea (up,n,8), %rdx C param 2: up + lea (%rdx,n,8), %r8 C param 3: up - n + neg R32(n) + mov n, %r9 C param 4: n + mov rp, %rcx ') C param 1: rp + +IFSTD(` sub $8, %rsp ') +IFDOS(` sub $40, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_add_n) +IFSTD(` add $8, %rsp ') +IFDOS(` add $40, %rsp ') + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(n1): mov (mp_param), %rax + mul q0 + add 8(up), %rax + adc 16(up), %rdx + mov %rdx, (rp) + mov $0, R32(%rax) + adc R32(%rax), R32(%rax) + jmp L(ret) + +L(n2): mov (mp_param), %rax + mov (up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov 8(up), %r10 + mul q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + add %r9, %r10 + adc $0, %r11 + mov %r10, q0 + imul u0inv, q0 C next q0 + mov -16(mp), %rax + mul q0 + add %rax, %r10 + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov 16(up), %r14 + mul q0 + add %rax, %r14 + adc $0, %rdx + add %r9, %r14 + adc $0, %rdx + xor R32(%rax), R32(%rax) + add %r11, %r14 + adc 24(up), %rdx + mov %r14, (rp) + mov %rdx, 8(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + ALIGNx +L(n3): mov -24(mp), %rax + mov -8(up), %r10 + mul q0 + add %rax, %r10 + mov -16(mp), %rax + mov %rdx, %r11 + adc $0, %r11 + mov (up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + add %r11, %rbp + mov 8(up), %r10 + adc $0, %r9 + mul q0 + mov %rbp, q0 + imul u0inv, q0 C next q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + mov %rbp, (up) + add %r9, %r10 + adc $0, %r11 + mov %r10, 8(up) + mov %r11, -8(up) C up[0] + lea 8(up), up C up++ + dec j + jnz L(n3) + + mov -32(up), %rdx + mov -24(up), %rbx + xor R32(%rax), R32(%rax) + add %rbp, %rdx + adc %r10, %rbx + adc 8(up), %r11 + mov %rdx, (rp) + mov %rbx, 8(rp) + mov %r11, 16(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + ALIGNx +L(n4): mov -32(mp), %rax + mul q0 + lea (%rax), %r11 + mov -24(mp), %rax + lea (%rdx), %r14 + mul q0 + lea (%rax), %rbp + mov -16(mp), %rax + mov -16(up), %r10 + lea (%rdx), %r9 + mul q0 + add %r11, %r10 + lea (%rax), %r11 + mov -8(mp), %rax + adc %r14, %rbp + mov -8(up), %rbx + lea (%rdx), %r14 + adc $0, %r9 + mul q0 + add %rbp, %rbx + adc %r9, %r11 + mov %rbx, -8(up) + mov (up), %r10 + adc $0, %r14 + imul u0inv, %rbx C next q limb + add %r11, %r10 + adc %r14, %rax + mov %r10, (up) + mov 8(up), %r10 + adc $0, %rdx + add %rax, %r10 + mov %r10, 8(up) + adc $0, %rdx + mov %rdx, -16(up) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(n4) + lea 16(up), up + jmp L(add_n) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm b/gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm new file mode 100644 index 0000000..27eed37 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/rsh1aors_n.asm @@ -0,0 +1,169 @@ +dnl X86-64 mpn_rsh1add_n, mpn_rsh1sub_n optimised for Intel Conroe/Penryn. + +dnl Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 3.05 +C Intel NHM 3.3 +C Intel SBR 2.5 +C Intel atom ? +C VIA nano ? + +C TODO +C * Loopmix to approach 2.5 c/l on NHM. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') + +ifdef(`OPERATION_rsh1add_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_rsh1add_n) + define(func_nc, mpn_rsh1add_nc)') +ifdef(`OPERATION_rsh1sub_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsh1sub_n) + define(func_nc, mpn_rsh1sub_nc)') + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + push %rbp + + neg %r8 C set C flag from parameter + mov (up), %r8 + ADCSBB (vp), %r8 + jmp L(ent) +EPILOGUE() + + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (up), %r8 + ADDSUB (vp), %r8 +L(ent): sbb R32(%rbx), R32(%rbx) C save cy + mov %r8, %rax + and $1, R32(%rax) C return value + + lea (up,n,8), up + lea (vp,n,8), vp + lea (rp,n,8), rp + mov R32(n), R32(%rbp) + neg n + and $3, R32(%rbp) + jz L(b0) + cmp $2, R32(%rbp) + jae L(n1) + +L(b1): mov %r8, %rbp + inc n + js L(top) + jmp L(end) + +L(n1): jnz L(b3) + add R32(%rbx), R32(%rbx) C restore cy + mov 8(up,n,8), %r11 + ADCSBB 8(vp,n,8), %r11 + sbb R32(%rbx), R32(%rbx) C save cy + mov %r8, %r10 + add $-2, n + jmp L(2) + +L(b3): add R32(%rbx), R32(%rbx) C restore cy + mov 8(up,n,8), %r10 + mov 16(up,n,8), %r11 + ADCSBB 8(vp,n,8), %r10 + ADCSBB 16(vp,n,8), %r11 + sbb R32(%rbx), R32(%rbx) C save cy + mov %r8, %r9 + dec n + jmp L(3) + +L(b0): add R32(%rbx), R32(%rbx) C restore cy + mov 8(up,n,8), %r9 + mov 16(up,n,8), %r10 + mov 24(up,n,8), %r11 + ADCSBB 8(vp,n,8), %r9 + ADCSBB 16(vp,n,8), %r10 + ADCSBB 24(vp,n,8), %r11 + sbb R32(%rbx), R32(%rbx) C save cy + jmp L(4) + + ALIGN(16) + +L(top): add R32(%rbx), R32(%rbx) C restore cy + mov (up,n,8), %r8 + mov 8(up,n,8), %r9 + mov 16(up,n,8), %r10 + mov 24(up,n,8), %r11 + ADCSBB (vp,n,8), %r8 + ADCSBB 8(vp,n,8), %r9 + ADCSBB 16(vp,n,8), %r10 + ADCSBB 24(vp,n,8), %r11 + sbb R32(%rbx), R32(%rbx) C save cy + shrd $1, %r8, %rbp + mov %rbp, -8(rp,n,8) +L(4): shrd $1, %r9, %r8 + mov %r8, (rp,n,8) +L(3): shrd $1, %r10, %r9 + mov %r9, 8(rp,n,8) +L(2): shrd $1, %r11, %r10 + mov %r10, 16(rp,n,8) +L(1): add $4, n + mov %r11, %rbp + js L(top) + +L(end): shrd $1, %rbx, %rbp 
+ mov %rbp, -8(rp) + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/rshift.asm b/gmp-6.3.0/mpn/x86_64/core2/rshift.asm new file mode 100644 index 0000000..7578a53 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/rshift.asm @@ -0,0 +1,143 @@ +dnl x86-64 mpn_rshift optimised for Conroe/Penryn and Nehalem. + +dnl Copyright 2007, 2009, 2011, 2012, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core2 1.32 +C Intel NHM 1.30 (drops to 2.5 for n > 256) +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel SKL +C Intel atom +C Intel SLM +C VIA nano + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_rshift) + FUNC_ENTRY(4) + + xor R32(%rax), R32(%rax) + + test $1, R8(n) + jnz L(bx1) +L(bx0): test $2, R8(n) + jnz L(b10) + +L(b00): lea 8(up), up + lea -24(rp), rp + mov -8(up), %r10 + mov (up), %r11 + shrd R8(cnt), %r10, %rax + mov 8(up), %r8 + shr $2, n + jmp L(00) + +L(bx1): test $2, R8(n) + jnz L(b11) + +L(b01): lea 16(up), up + lea -16(rp), rp + mov -16(up), %r9 + shrd R8(cnt), %r9, %rax + shr $2, n + jz L(1) + mov -8(up), %r10 + mov (up), %r11 + jmp L(01) + +L(b10): lea 24(up), up + lea -8(rp), rp + mov -24(up), %r8 + mov -16(up), %r9 + shrd R8(cnt), %r8, %rax + shr $2, n + jz L(2) + mov -8(up), %r10 + jmp L(10) + +L(b11): lea 32(up), up + mov -32(up), %r11 + mov -24(up), %r8 + mov -16(up), %r9 + shrd R8(cnt), %r11, %rax + shr $2, n + jz L(end) + + ALIGN(16) +L(top): shrd R8(cnt), %r8, %r11 + mov -8(up), %r10 + mov %r11, (rp) +L(10): shrd R8(cnt), %r9, %r8 + mov (up), %r11 + mov %r8, 8(rp) +L(01): shrd R8(cnt), %r10, %r9 + mov 8(up), %r8 + mov %r9, 16(rp) +L(00): shrd R8(cnt), %r11, %r10 + mov 16(up), %r9 + add $32, up + mov %r10, 24(rp) + add $32, rp + dec n + jnz L(top) + +L(end): shrd R8(cnt), %r8, %r11 + mov %r11, (rp) +L(2): shrd R8(cnt), %r9, %r8 + mov %r8, 8(rp) +L(1): shr R8(cnt), %r9 + mov %r9, 16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm new file mode 100644 index 0000000..e436034 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/sec_tabselect.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sec_tabselect. 
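The file beginning here only dispatches to the shared fastsse implementation, but its contract is worth spelling out: copy one n-limb entry, selected by index, out of a table of nents entries, while reading every entry so that the memory access pattern and timing are independent of the secret index. A branch-free C sketch (ref_sec_tabselect is an illustrative name; a production version derives the mask with arithmetic rather than trusting == to compile branch-free):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    /* rp[] = entry `which` of the nents n-limb entries in tab[], with a
       data-independent access pattern. */
    void ref_sec_tabselect (mp_limb_t *rp, const mp_limb_t *tab,
                            size_t n, size_t nents, size_t which)
    {
      for (size_t i = 0; i < n; i++)
        rp[i] = 0;
      for (size_t k = 0; k < nents; k++)
        {
          mp_limb_t mask = -(mp_limb_t) (k == which);  /* all-ones on the hit */
          for (size_t i = 0; i < n; i++)
            rp[i] |= tab[k * n + i] & mask;
        }
    }

The sec_* routines in this tree follow the same pattern throughout: do identical work for every possible secret value and select the wanted result with masks.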
+ +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sec_tabselect) +include_mpn(`x86_64/fastsse/sec_tabselect.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm new file mode 100644 index 0000000..a112c1b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/sqr_basecase.asm @@ -0,0 +1,984 @@ +dnl X86-64 mpn_sqr_basecase optimised for Intel Nehalem/Westmere. +dnl It also seems good for Conroe/Wolfdale. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1 +C AMD K8,K9 +C AMD K10 +C AMD bull +C AMD pile +C AMD steam +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core 4.9 4.18-4.25 3.87 +C Intel NHM 3.8 4.06-4.2 3.5 +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel atom +C VIA nano + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. 
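+
+C In mpn terms the function computes the full square {rp,2n} = {up,n}^2.
+C A minimal C reference for those semantics (a sketch only; the actual
+C split into mul_2/addmul_2 passes plus sqr_diag_addlsh1 is charted below):
+C
+C	rp[n] = mpn_mul_1 (rp, up, n, up[0]);
+C	for (i = 1; i < n; i++)
+C	  rp[n + i] = mpn_addmul_1 (rp + i, up, n, up[i]);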
+
+C Code structure:
+C
+C
+C        m_2(0m4)        m_2(2m4)        m_2(1m4)        m_2(3m4)
+C           |               |               |               |
+C           |               |               |               |
+C           |               |               |               |
+C          \|/             \|/             \|/             \|/
+C              ____________                    ____________
+C             /            \                  /            \
+C            \|/            \                \|/            \
+C        am_2(3m4)       am_2(1m4)       am_2(0m4)       am_2(2m4)
+C            \            /|\                \            /|\
+C             \____________/                  \____________/
+C                   \                               /
+C                    \                             /
+C                     \                           /
+C                  tail(0m2)                 tail(1m2)
+C                        \                     /
+C                         \                   /
+C                        sqr_diag_addlsh1
+
+C TODO
+C * Tune.  None done so far.
+C * Currently 2761 bytes, making it smaller would be nice.
+C * Consider using a jumptab-based entry sequence.  One might even use a mask-
+C   less sequence, if the table is large enough to support tuneup's needs.
+C   The code would be, using non-PIC code,
+C     lea tab(%rip),%rax; jmp *(n,%rax)
+C   or,
+C     lea tab(%rip),%rax; lea (%rip),%rbx; add (n,%rax),%rbx; jmp *%rbx
+C   using PIC code.  The table entries would be Ln1,Ln2,Ln3,Lm0,Lm1,Lm2,Lm3,..
+C   with the last four entries repeated a safe number of times.
+C * Consider expanding feed-in code in order to avoid zeroing registers.
+C * Zero consistently with xor.
+C * Check if using "lea (reg),reg" should be done in more places; we have some
+C   explicit "mov %rax,reg" now.
+C * Try zeroing with xor in m2 loops.
+C * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
+C   between loop header and wind-down code.
+C * Consider adc reg,reg instead of adc $0,reg in m2 loops.  This saves a byte.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+C Define this to $1 to use late loop index variable as zero, $2 to use an
+C explicit $0.
+define(`Z',`$1')
+
+define(`rp',	  `%rdi')
+define(`up',	  `%rsi')
+define(`n_param', `%rdx')
+
+define(`n',	  `%r8')
+
+define(`v0',	  `%r10')
+define(`v1',	  `%r11')
+define(`w0',	  `%rbx')
+define(`w1',	  `%rcx')
+define(`w2',	  `%rbp')
+define(`w3',	  `%r9')
+define(`i',	  `%r13')
+
+define(`X0',	  `%r12')
+define(`X1',	  `%r14')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+define(`N', 85)
+ifdef(`N',,`define(`N',0)')
+define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')')
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+	FUNC_ENTRY(3)
+
+	cmp	$4, n_param
+	jl	L(small)
+
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	(up), v0
+	mov	8(up), %rax
+	mov	%rax, v1
+
+	mov	$1, R32(n)
+	sub	n_param, n		C n = -n_param+1
+	push	n
+
+	lea	(up,n_param,8), up
+	lea	(rp,n_param,8), rp
+
+	mul	v0
+
+	test	$1, R8(n)
+	jnz	L(bx1)
+
+L(bx0):	test	$2, R8(n)
+	mov	%rax, (rp,n,8)
+	jnz	L(b10)
+
+L(b00):	lea	(n), i			C n = 5, 9, ...
+	mov	%rdx, w1		C FIXME: Use lea?
+	xor	R32(w2), R32(w2)
+	jmp	L(m2e0)
+
+L(b10):	lea	2(n), i			C n = 7, 11, ...
+	mov	8(up,n,8), %rax
+	mov	%rdx, w3		C FIXME: Use lea?
+	xor	R32(w0), R32(w0)
+	xor	R32(w1), R32(w1)
+	jmp	L(m2e2)
+
+L(bx1):	test	$2, R8(n)
+	mov	%rax, (rp,n,8)
+	jz	L(b11)
+
+L(b01):	lea	1(n), i			C n = 6, 10, ...
+	mov	%rdx, w0		C FIXME: Use lea?
+	xor	R32(w1), R32(w1)
+	jmp	L(m2e1)
+
+L(b11):	lea	-1(n), i		C n = 4, 8, 12, ...
+	mov	%rdx, w2		C FIXME: Use lea?
+ xor R32(w3), R32(w3) + jmp L(m2e3) + + + ALIGNx +L(m2top1): + mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 +L(m2e1):mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) + mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax + mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top1) + + mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add w0, %rax + adc w1, %rdx + mov %rax, I((rp),(rp,i,8)) + mov %rdx, I(8(rp),8(rp,i,8)) + + lea 16(rp), rp + add $2, n C decrease |n| + jmp L(am2o3) + + ALIGNx +L(m2top3): + mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) + mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax + mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 +L(m2e3):mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top3) + + mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add w0, %rax + adc w1, %rdx + mov %rax, I((rp),(rp,i,8)) + mov %rdx, I(8(rp),8(rp,i,8)) + + lea 16(rp), rp + add $2, n C decrease |n| + cmp $-1, n + jz L(cor1) C jumps iff entry n = 4 + +L(am2o1): + mov -8(up,n,8), v0 + mov (up,n,8), %rax + mov %rax, v1 + lea 1(n), i + mul v0 + mov %rax, X1 + MOV( %rdx, X0, 128) + mov (rp,n,8), w1 + xor R32(w2), R32(w2) + mov 8(up,n,8), %rax + xor R32(w3), R32(w3) + jmp L(lo1) + + ALIGNx +L(am2top1): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 +L(lo1): mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 + mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax + mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 + mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top1) + + mul v1 + add w0, w1 + adc w2, %rax + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add X0, %rax + mov %rax, I((rp),(rp,i,8)) 
+ adc Z(i,$0), %rdx + mov %rdx, I(8(rp),8(rp,i,8)) + + lea 16(rp), rp + add $2, n + +L(am2o3): + mov -8(up,n,8), v0 + mov (up,n,8), %rax + mov %rax, v1 + lea -1(n), i + mul v0 + mov %rax, X1 + MOV( %rdx, X0, 8) + mov (rp,n,8), w3 + xor R32(w0), R32(w0) + xor R32(w1), R32(w1) + mov 8(up,n,8), %rax + jmp L(lo3) + + ALIGNx +L(am2top3): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 + mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 + mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax +L(lo3): mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 + mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top3) + + mul v1 + add w0, w1 + adc w2, %rax + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add X0, %rax + mov %rax, I((rp),(rp,i,8)) + adc Z(i,$0), %rdx + mov %rdx, I(8(rp),8(rp,i,8)) + + lea 16(rp), rp + add $2, n + cmp $-1, n + jnz L(am2o1) + +L(cor1):pop n + mov %rdx, w3 + mov -16(up), v0 + mov -8(up), %rax + mul v0 + add w3, %rax + adc $0, %rdx + mov %rax, -8(rp) + mov %rdx, (rp) + jmp L(sqr_diag_addlsh1) + + ALIGNx +L(m2top2): +L(m2e2):mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) + mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax + mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top2) + + mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add w0, %rax + adc w1, %rdx + mov %rax, I((rp),(rp,i,8)) + mov %rdx, I(8(rp),8(rp,i,8)) + + lea 16(rp), rp + add $2, n C decrease |n| + jmp L(am2o0) + + ALIGNx +L(m2top0): + mul v0 + add %rax, w3 + mov -8(up,i,8), %rax + mov w3, -8(rp,i,8) + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add %rax, w0 + adc %rdx, w1 + mov $0, R32(w2) + mov (up,i,8), %rax + mul v0 + add %rax, w0 + mov w0, (rp,i,8) + adc %rdx, w1 + mov (up,i,8), %rax + adc $0, R32(w2) + mul v1 + add %rax, w1 + adc %rdx, w2 +L(m2e0):mov 8(up,i,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 8(up,i,8), %rax + mul v1 + add %rax, w2 + mov w1, 8(rp,i,8) + adc %rdx, w3 + mov $0, R32(w0) + mov 16(up,i,8), %rax + mul v0 + add %rax, w2 + mov 16(up,i,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov $0, R32(w1) + add %rax, w3 + mov 24(up,i,8), %rax + mov w2, 16(rp,i,8) + adc %rdx, w0 + add $4, i + js L(m2top0) + 
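+C Wind-down for the mul_2 loop just above: the final up limb is multiplied
+C by v0 and v1, with the I() macro (defined earlier) choosing direct or
+C indexed addressing for the last loads and stores.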
+ mul v0 + add %rax, w3 + mov I(-8(up),-8(up,i,8)), %rax + mov w3, I(-8(rp),-8(rp,i,8)) + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add w0, %rax + adc w1, %rdx + mov %rax, I((rp),(rp,i,8)) + mov %rdx, I(8(rp),8(rp,i,8)) + + lea 16(rp), rp + add $2, n C decrease |n| + cmp $-2, n + jz L(cor2) C jumps iff entry n = 5 + +L(am2o2): + mov -8(up,n,8), v0 + mov (up,n,8), %rax + mov %rax, v1 + lea -2(n), i + mul v0 + mov %rax, X0 + MOV( %rdx, X1, 32) + mov (rp,n,8), w0 + xor R32(w1), R32(w1) + xor R32(w2), R32(w2) + mov 8(up,n,8), %rax + jmp L(lo2) + + ALIGNx +L(am2top2): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 + mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 + mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax + mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 +L(lo2): mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top2) + + mul v1 + add w0, w1 + adc w2, %rax + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add X0, %rax + mov %rax, I((rp),(rp,i,8)) + adc Z(i,$0), %rdx + mov %rdx, I(8(rp),8(rp,i,8)) + + lea 16(rp), rp + add $2, n + +L(am2o0): + mov -8(up,n,8), v0 + mov (up,n,8), %rax + mov %rax, v1 + lea 0(n), i + mul v0 + mov %rax, X0 + MOV( %rdx, X1, 2) + xor R32(w0), R32(w0) + mov (rp,n,8), w2 + xor R32(w3), R32(w3) + jmp L(lo0) + + ALIGNx +L(am2top0): + mul v1 + add w0, w1 + adc %rax, w2 + mov (up,i,8), %rax + MOV( %rdx, w3, 1) + adc $0, w3 + mul v0 + add w1, X1 + mov X1, -8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 2) + adc $0, X1 + mov (up,i,8), %rax + mul v1 + MOV( %rdx, w0, 4) + mov (rp,i,8), w1 + add w1, w2 + adc %rax, w3 + adc $0, w0 +L(lo0): mov 8(up,i,8), %rax + mul v0 + add w2, X0 + adc %rax, X1 + mov X0, (rp,i,8) + MOV( %rdx, X0, 8) + adc $0, X0 + mov 8(up,i,8), %rax + mov 8(rp,i,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + MOV( %rdx, w1, 16) + adc $0, w1 + mov 16(up,i,8), %rax + mul v0 + add w3, X1 + mov X1, 8(rp,i,8) + adc %rax, X0 + MOV( %rdx, X1, 32) + mov 16(rp,i,8), w3 + adc $0, X1 + mov 16(up,i,8), %rax + mul v1 + add w3, w0 + MOV( %rdx, w2, 64) + adc %rax, w1 + mov 24(up,i,8), %rax + adc $0, w2 + mul v0 + add w0, X0 + mov X0, 16(rp,i,8) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov 24(up,i,8), %rax + mov 24(rp,i,8), w0 + adc $0, X0 + add $4, i + jnc L(am2top0) + + mul v1 + add w0, w1 + adc w2, %rax + adc Z(i,$0), %rdx + add w1, X1 + adc Z(i,$0), X0 + mov X1, I(-8(rp),-8(rp,i,8)) + add X0, %rax + mov %rax, I((rp),(rp,i,8)) + adc Z(i,$0), %rdx + mov %rdx, I(8(rp),8(rp,i,8)) + + lea 16(rp), rp + add $2, n + cmp $-2, n + jnz L(am2o2) + +L(cor2):pop n + mov -24(up), v0 + mov %rax, w2 + mov %rdx, w0 + mov -16(up), %rax + mov %rax, v1 + mul v0 + mov %rax, X0 + MOV( %rdx, X1, 32) + mov -8(up), %rax + mul v0 + add w2, X0 + mov X0, -16(rp) + MOV( %rdx, X0, 128) + adc %rax, X1 + mov -8(up), %rax + adc $0, X0 + mul v1 + add w0, X1 + adc $0, X0 + mov X1, -8(rp) + add X0, %rax + mov 
%rax, (rp) + adc $0, %rdx + mov %rdx, 8(rp) + lea 8(rp), rp + +L(sqr_diag_addlsh1): + mov -8(up,n,8), %rax + shl n + xor R32(%rbx), R32(%rbx) + mul %rax + mov 8(rp,n,8), %r11 + lea (%rdx), %r10 + mov 16(rp,n,8), %r9 + add %r11, %r11 + jmp L(dm) + + ALIGNx +L(dtop):mul %rax + add %r11, %r10 + mov 8(rp,n,8), %r11 + mov %r10, -8(rp,n,8) + adc %r9, %rax + lea (%rdx,%rbx), %r10 + mov 16(rp,n,8), %r9 + adc %r11, %r11 +L(dm): mov %rax, (rp,n,8) + mov (up,n,4), %rax + adc %r9, %r9 + setc R8(%rbx) + add $2, n + js L(dtop) + + mul %rax + add %r11, %r10 + mov %r10, -8(rp) + adc %r9, %rax + lea (%rdx,%rbx), %r10 + mov %rax, (rp) + adc $0, %r10 + mov %r10, 8(rp) + + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + + ALIGN(16) +L(small): + mov (up), %rax + cmp $2, n_param + jae L(gt1) +L(n1): + mul %rax + mov %rax, (rp) + mov %rdx, 8(rp) + FUNC_EXIT() + ret + +L(gt1): jne L(gt2) +L(n2): mov %rax, %r8 + mul %rax + mov 8(up), %r11 + mov %rax, (rp) + mov %r11, %rax + mov %rdx, %r9 + mul %rax + mov %rax, %r10 + mov %r11, %rax + mov %rdx, %r11 + mul %r8 + xor %r8, %r8 + add %rax, %r9 + adc %rdx, %r10 + adc %r8, %r11 + add %rax, %r9 + mov %r9, 8(rp) + adc %rdx, %r10 + mov %r10, 16(rp) + adc %r8, %r11 + mov %r11, 24(rp) + FUNC_EXIT() + ret + +L(gt2): +L(n3): mov %rax, %r10 + mul %rax + mov 8(up), %r11 + mov %rax, (rp) + mov %r11, %rax + mov %rdx, 8(rp) + mul %rax + mov 16(up), %rcx + mov %rax, 16(rp) + mov %rcx, %rax + mov %rdx, 24(rp) + mul %rax + mov %rax, 32(rp) + mov %rdx, 40(rp) + + mov %r11, %rax + mul %r10 + mov %rax, %r8 + mov %rcx, %rax + mov %rdx, %r9 + mul %r10 + xor %r10, %r10 + add %rax, %r9 + mov %r11, %rax + mov %r10, %r11 + adc %rdx, %r10 + + mul %rcx + add %rax, %r10 + adc %r11, %rdx + add %r8, %r8 + adc %r9, %r9 + adc %r10, %r10 + adc %rdx, %rdx + adc %r11, %r11 + add %r8, 8(rp) + adc %r9, 16(rp) + adc %r10, 24(rp) + adc %rdx, 32(rp) + adc %r11, 40(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm b/gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm new file mode 100644 index 0000000..46488fc --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/sublsh1_n.asm @@ -0,0 +1,47 @@ +dnl AMD64 mpn_sublsh1_n optimised for Core 2 and Core iN. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
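+
+dnl  In mpn terms this computes {rp,n} = {up,n} - ({vp,n} << 1) and returns
+dnl  the borrow, which also absorbs the bit shifted out of vp's top limb.
+dnl  The limb loop itself is x86_64/core2/sublshC_n.asm, instantiated by the
+dnl  LSH/RSH and ADDSUB/ADCSBB definitions below.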
+ +include(`../config.m4') + +define(LSH, 1) +define(RSH, 63) + +define(ADDSUB, sub) +define(ADCSBB, sbb) +define(func, mpn_sublsh1_n) + +MULFUNC_PROLOGUE(mpn_sublsh1_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/core2/sublshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm b/gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm new file mode 100644 index 0000000..f3b1e28 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/sublsh2_n.asm @@ -0,0 +1,47 @@ +dnl AMD64 mpn_sublsh2_n optimised for Core 2 and Core iN. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 2) +define(RSH, 62) + +define(ADDSUB, sub) +define(ADCSBB, sbb) +define(func, mpn_sublsh2_n) + +MULFUNC_PROLOGUE(mpn_sublsh2_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/core2/sublshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm b/gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm new file mode 100644 index 0000000..272700d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/sublshC_n.asm @@ -0,0 +1,158 @@ +dnl AMD64 mpn_sublshC_n -- rp[] = up[] - (vp[] << C), optimised for Core 2 and +dnl Core iN. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +C cycles/limb +C AMD K8,K9 4.25 +C AMD K10 ? +C Intel P4 ? 
+C Intel core2 3 +C Intel NHM 3.1 +C Intel SBR 2.47 +C Intel atom ? +C VIA nano ? + +C INPUT PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`vp',`%rdx') +define(`n', `%rcx') + +ASM_START() + TEXT + ALIGN(8) +PROLOGUE(func) + FUNC_ENTRY(4) + push %rbx + push %r12 + + mov R32(%rcx), R32(%rax) + lea 24(up,n,8), up + lea 24(vp,n,8), vp + lea 24(rp,n,8), rp + neg n + + xor R32(%r11), R32(%r11) + + mov -24(vp,n,8), %r8 C do first limb early + shrd $RSH, %r8, %r11 + + and $3, R32(%rax) + je L(b0) + cmp $2, R32(%rax) + jc L(b1) + je L(b2) + +L(b3): mov -16(vp,n,8), %r9 + shrd $RSH, %r9, %r8 + mov -8(vp,n,8), %r10 + shrd $RSH, %r10, %r9 + mov -24(up,n,8), %r12 + ADDSUB %r11, %r12 + mov %r12, -24(rp,n,8) + mov -16(up,n,8), %r12 + ADCSBB %r8, %r12 + mov %r12, -16(rp,n,8) + mov -8(up,n,8), %r12 + ADCSBB %r9, %r12 + mov %r12, -8(rp,n,8) + mov %r10, %r11 + sbb R32(%rax), R32(%rax) C save cy + add $3, n + js L(top) + jmp L(end) + +L(b1): mov -24(up,n,8), %r12 + ADDSUB %r11, %r12 + mov %r12, -24(rp,n,8) + mov %r8, %r11 + sbb R32(%rax), R32(%rax) C save cy + inc n + js L(top) + jmp L(end) + +L(b2): mov -16(vp,n,8), %r9 + shrd $RSH, %r9, %r8 + mov -24(up,n,8), %r12 + ADDSUB %r11, %r12 + mov %r12, -24(rp,n,8) + mov -16(up,n,8), %r12 + ADCSBB %r8, %r12 + mov %r12, -16(rp,n,8) + mov %r9, %r11 + sbb R32(%rax), R32(%rax) C save cy + add $2, n + js L(top) + jmp L(end) + + ALIGN(16) +L(top): mov -24(vp,n,8), %r8 + shrd $RSH, %r8, %r11 +L(b0): mov -16(vp,n,8), %r9 + shrd $RSH, %r9, %r8 + mov -8(vp,n,8), %r10 + shrd $RSH, %r10, %r9 + mov (vp,n,8), %rbx + shrd $RSH, %rbx, %r10 + + add R32(%rax), R32(%rax) C restore cy + + mov -24(up,n,8), %r12 + ADCSBB %r11, %r12 + mov %r12, -24(rp,n,8) + + mov -16(up,n,8), %r12 + ADCSBB %r8, %r12 + mov %r12, -16(rp,n,8) + + mov -8(up,n,8), %r12 + ADCSBB %r9, %r12 + mov %r12, -8(rp,n,8) + + mov (up,n,8), %r12 + ADCSBB %r10, %r12 + mov %r12, (rp,n,8) + + mov %rbx, %r11 + sbb R32(%rax), R32(%rax) C save cy + + add $4, n + js L(top) + +L(end): shr $RSH, %r11 + pop %r12 + pop %rbx + sub R32(%r11), R32(%rax) + neg R32(%rax) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm new file mode 100644 index 0000000..8d3a44a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreibwl/addmul_1.asm @@ -0,0 +1,210 @@ +dnl AMD64 mpn_addmul_1 optimised for Intel Broadwell. + +dnl Copyright 2015, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
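+
+dnl  For reference, the function computes {rp,n} += {up,n} * v0 and returns
+dnl  the carry limb.  A plain-C sketch of those semantics (not this file's
+dnl  mulx/adcx/adox schedule), using umul_ppmm from longlong.h:
+dnl
+dnl	mp_limb_t cy = 0, hi, lo;
+dnl	for (mp_size_t i = 0; i < n; i++)
+dnl	  {
+dnl	    umul_ppmm (hi, lo, up[i], v0);	/* hi:lo = up[i] * v0 */
+dnl	    lo += cy;     hi += lo < cy;	/* add carry-in */
+dnl	    rp[i] += lo;  hi += rp[i] < lo;	/* accumulate into rp */
+dnl	    cy = hi;
+dnl	  }
+dnl	return cy;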
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 n/a +C AMD bd2 n/a +C AMD bd3 n/a +C AMD bd4 ? +C AMD zen1 ? +C AMD zen2 ? +C AMD zen3 1.5 +C AMD bt1 n/a +C AMD bt2 n/a +C Intel P4 n/a +C Intel PNR n/a +C Intel NHM n/a +C Intel SBR n/a +C Intel IBR n/a +C Intel HWL n/a +C Intel BWL 1.67 1.74 +C Intel SKL 1.63 1.71 +C Intel atom n/a +C Intel SLM n/a +C VIA nano n/a + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Put an initial mulx before switching, targeting some free registers. +C * Tune feed-in code. +C * Trim nop execution after L(f2). +C * For DOS64, fix nop execution. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0_param',`%rcx') C r9 + +define(`n', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +dnl IFDOS(` define(`up', ``%rsi'') ') dnl +dnl IFDOS(` define(`rp', ``%rcx'') ') dnl +dnl IFDOS(` define(`vl', ``%r9'') ') dnl +dnl IFDOS(` define(`r9', ``rdi'') ') dnl +dnl IFDOS(` define(`n', ``%r8'') ') dnl +dnl IFDOS(` define(`r8', ``r11'') ') dnl + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_addmul_1) + FUNC_ENTRY(4) + + mov v0_param, %r10 + mov n_param, n + mov R32(n_param), R32(%r8) + shr $3, n + and $7, R32(%r8) C clear OF, CF as side-effect + mov %r10, %rdx + lea L(tab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%r8,4), %r8 + lea (%r8, %r10), %r10 + jmp *%r10 +',` + jmp *(%r10,%r8,8) +') + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(f0), L(tab)) + JMPENT( L(f1), L(tab)) + JMPENT( L(f2), L(tab)) + JMPENT( L(f3), L(tab)) + JMPENT( L(f4), L(tab)) + JMPENT( L(f5), L(tab)) + JMPENT( L(f6), L(tab)) + JMPENT( L(f7), L(tab)) + TEXT + +L(f0): mulx( (up), %r10, %r8) + lea -8(up), up + lea -8(rp), rp + lea -1(n), n + jmp L(b0) + +L(f3): mulx( (up), %r9, %rax) + lea 16(up), up + lea -48(rp), rp + jmp L(b3) + +L(f4): mulx( (up), %r10, %r8) + lea 24(up), up + lea -40(rp), rp + jmp L(b4) + +L(f5): mulx( (up), %r9, %rax) + lea 32(up), up + lea -32(rp), rp + jmp L(b5) + +L(f6): mulx( (up), %r10, %r8) + lea 40(up), up + lea -24(rp), rp + jmp L(b6) + +L(f1): mulx( (up), %r9, %rax) + jrcxz L(1) + jmp L(b1) +L(1): add (rp), %r9 + mov %r9, (rp) + adc %rcx, %rax C relies on rcx = 0 + FUNC_EXIT() + ret + +L(end): adox( (rp), %r9) + mov %r9, (rp) + adox( %rcx, %rax) C relies on rcx = 0 + adc %rcx, %rax C relies on rcx = 0 + FUNC_EXIT() + ret + +ifdef(`PIC', +` nop;nop;nop;nop', +` nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop') + +L(f2): mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + mulx( (up), %r9, %rax) + + ALIGN(32) +L(top): adox( -8,(rp), %r10) + adcx( %r8, %r9) + mov %r10, -8(rp) + jrcxz L(end) +L(b1): mulx( 8,(up), %r10, %r8) + adox( (rp), %r9) + lea -1(n), n + mov %r9, (rp) + adcx( %rax, %r10) +L(b0): mulx( 16,(up), %r9, %rax) + adcx( %r8, %r9) + adox( 8,(rp), %r10) + mov %r10, 8(rp) +L(b7): mulx( 24,(up), %r10, %r8) + lea 64(up), up + adcx( %rax, %r10) + adox( 16,(rp), %r9) + mov %r9, 16(rp) +L(b6): mulx( -32,(up), %r9, %rax) + adox( 24,(rp), %r10) + adcx( %r8, %r9) + mov %r10, 24(rp) +L(b5): mulx( -24,(up), %r10, %r8) + adcx( %rax, %r10) + adox( 32,(rp), %r9) + mov %r9, 32(rp) +L(b4): mulx( -16,(up), %r9, %rax) + adox( 40,(rp), %r10) + adcx( %r8, %r9) + mov %r10, 40(rp) +L(b3): adox( 48,(rp), %r9) + mulx( -8,(up), %r10, %r8) + mov %r9, 48(rp) + lea 64(rp), rp + adcx( %rax, %r10) + mulx( (up), %r9, %rax) + jmp L(top) + +L(f7): mulx( (up), %r9, %rax) + lea -16(up), up + lea -16(rp), 
rp + jmp L(b7) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h new file mode 100644 index 0000000..91c91b5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreibwl/gmp-mparam.h @@ -0,0 +1,246 @@ +/* Broadwell gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. */ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 3400-3800 MHz Intel Xeon E3-1285Lv4 Broadwell */ +/* FFT tuning limit = 467,964,472 */ +/* Generated by tuneup.c, 2019-10-17, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 24 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 24 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 22 + +#define DIV_1_VS_MUL_1_PERCENT 455 + +#define MUL_TOOM22_THRESHOLD 26 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 202 +#define MUL_TOOM6H_THRESHOLD 303 +#define MUL_TOOM8H_THRESHOLD 406 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 141 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 152 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 137 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 151 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 198 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 34 +#define SQR_TOOM3_THRESHOLD 117 +#define SQR_TOOM4_THRESHOLD 336 +#define SQR_TOOM6_THRESHOLD 426 +#define SQR_TOOM8_THRESHOLD 547 + +#define MULMID_TOOM42_THRESHOLD 46 + +#define MULMOD_BNM1_THRESHOLD 16 +#define SQRMOD_BNM1_THRESHOLD 18 + +#define MUL_FFT_MODF_THRESHOLD 460 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 460, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 25, 7}, { 13, 6}, \ + { 28, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 49, 9}, { 27,10}, { 15, 9}, { 39, 8}, \ + { 79,10}, { 23, 9}, { 55,11}, { 15,10}, \ + { 31, 9}, { 71,10}, { 39, 9}, { 83,10}, \ + { 47, 9}, { 99,10}, { 55,11}, { 
31,10}, \ + { 87,11}, { 47,10}, { 103,12}, { 31,11}, \ + { 63,10}, { 135,11}, { 79,10}, { 167,11}, \ + { 95,10}, { 199,11}, { 111,12}, { 63, 8}, \ + { 1087,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,12}, { 95,11}, { 191,10}, { 383,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 303,10}, { 607,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 335,10}, { 671,11}, { 351,10}, \ + { 703,11}, { 367,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,11}, { 447,13}, \ + { 127,12}, { 255,11}, { 543,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 671,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,14}, { 127,13}, \ + { 255,12}, { 607,13}, { 319,12}, { 735,13}, \ + { 383,12}, { 831,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \ + { 1151,13}, { 639,12}, { 1279,13}, { 703,14}, \ + { 383,13}, { 831,12}, { 1663,13}, { 959,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1151,14}, \ + { 639,13}, { 1279,12}, { 2559,13}, { 1343,12}, \ + { 2687,13}, { 1407,14}, { 767,13}, { 1535,12}, \ + { 3071,13}, { 1599,12}, { 3199,13}, { 1663,14}, \ + { 895,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2687,14}, { 1407,15}, { 767,14}, { 1535,13}, \ + { 3199,14}, { 1663,13}, { 3455,12}, { 6911,16}, \ + { 511,15}, { 1023,14}, { 2175,13}, { 4479,14}, \ + { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2815,13}, { 5631,14}, { 2943,13}, \ + { 5887,15}, { 1535,14}, { 3455,13}, { 6911,15}, \ + { 1791,14}, { 3839,13}, { 7679,16}, { 1023,15}, \ + { 2047,14}, { 4479,15}, { 2303,14}, { 4863,15}, \ + { 2559,14}, { 5247,15}, { 2815,14}, { 5887,16}, \ + { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,17}, { 1023,16}, { 2047,15}, { 4351,14}, \ + { 8703,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7679,14}, { 15359,17}, { 2047,16}, { 4095,15}, \ + { 8703,16}, { 4607,15}, { 9983,14}, { 19967,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 219 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 400 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 400, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \ + { 28, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 127,11}, { 79,10}, \ + { 159,11}, { 95,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \ + { 575,10}, { 303,11}, { 159,10}, { 319,12}, \ + { 95, 8}, { 1599, 9}, { 831,11}, { 223,10}, \ + { 447,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 303,10}, { 607,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 335,10}, { 671,11}, { 351,10}, \ + { 703,11}, { 367,10}, { 735,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 543,12}, { 287,11}, { 607,12}, \ + { 319,11}, { 671,12}, { 351,11}, { 735,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,14}, { 127,13}, { 255,12}, { 607,13}, \ + { 319,12}, { 735,13}, { 383,12}, { 
799,13}, \ + { 447,12}, { 959,13}, { 511,12}, { 1023,13}, \ + { 575,12}, { 1151,13}, { 639,12}, { 1279,13}, \ + { 703,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 959,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1151,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \ + { 1471,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \ + { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \ + { 3455,12}, { 6911,14}, { 1791,16}, { 511,15}, \ + { 1023,14}, { 2047,13}, { 4095,14}, { 2175,13}, \ + { 4351,14}, { 2303,13}, { 4607,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \ + { 4351,15}, { 2303,14}, { 4863,15}, { 2559,14}, \ + { 5247,15}, { 2815,14}, { 5887,16}, { 1535,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,15}, { 7935,17}, \ + { 2047,16}, { 4095,15}, { 8447,16}, { 4607,15}, \ + { 9471,14}, { 18943,15}, { 9983,14}, { 19967,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 215 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 80 +#define MULLO_MUL_N_THRESHOLD 11025 +#define SQRLO_BASECASE_THRESHOLD 9 +#define SQRLO_DC_THRESHOLD 109 +#define SQRLO_SQR_THRESHOLD 7293 + +#define DC_DIV_QR_THRESHOLD 54 +#define DC_DIVAPPR_Q_THRESHOLD 183 +#define DC_BDIV_QR_THRESHOLD 86 +#define DC_BDIV_Q_THRESHOLD 160 + +#define INV_MULMOD_BNM1_THRESHOLD 58 +#define INV_NEWTON_THRESHOLD 171 +#define INV_APPR_THRESHOLD 171 + +#define BINV_NEWTON_THRESHOLD 292 +#define REDC_1_TO_REDC_2_THRESHOLD 33 +#define REDC_2_TO_REDC_N_THRESHOLD 63 + +#define MU_DIV_QR_THRESHOLD 1589 +#define MU_DIVAPPR_Q_THRESHOLD 1589 +#define MUPI_DIV_QR_THRESHOLD 67 +#define MU_BDIV_QR_THRESHOLD 1470 +#define MU_BDIV_Q_THRESHOLD 1866 + +#define POWM_SEC_TABLE 2,10,191,494,712,1378 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 20 +#define SET_STR_DC_THRESHOLD 644 +#define SET_STR_PRECOMPUTE_THRESHOLD 1658 + +#define FAC_DSC_THRESHOLD 562 +#define FAC_ODD_THRESHOLD 48 + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD2_DIV1_METHOD 5 /* 0.38% faster than 3 */ +#define HGCD_THRESHOLD 73 +#define HGCD_APPR_THRESHOLD 67 +#define HGCD_REDUCE_THRESHOLD 3014 +#define GCD_DC_THRESHOLD 630 +#define GCDEXT_DC_THRESHOLD 365 +#define JACOBI_BASE_METHOD 1 /* 29.65% faster than 4 */ + +/* Tuneup completed successfully, took 239050 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm new file mode 100644 index 0000000..b7fae2f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_1.asm @@ -0,0 +1,195 @@ +dnl AMD64 mpn_mul_1 optimised for Intel Broadwell. + +dnl Copyright 2015 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bull - +C AMD pile - +C AMD steam - +C AMD excavator - +C AMD bobcat - +C AMD jaguar - +C Intel P4 - +C Intel core2 - +C Intel NHM - +C Intel SBR - +C Intel IBR - +C Intel HWL 1.70 +C Intel BWL 1.51 +C Intel SKL 1.52 +C Intel atom - +C Intel SLM - +C VIA nano - + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Put an initial mulx before switching, targeting some free registers. +C * Tune feed-in code. +C * Trim nop execution after L(f2). +C * Port to DOS64, not forgetting nop execution. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0_param',`%rcx') C r9 + +define(`n', `%rcx') + +dnl ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +dnl IFDOS(` define(`up', ``%rsi'') ') dnl +dnl IFDOS(` define(`rp', ``%rcx'') ') dnl +dnl IFDOS(` define(`vl', ``%r9'') ') dnl +dnl IFDOS(` define(`r9', ``rdi'') ') dnl +dnl IFDOS(` define(`n', ``%r8'') ') dnl +dnl IFDOS(` define(`r8', ``r11'') ') dnl + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_1) + + mov v0_param, %r10 + mov n_param, n + mov R32(n_param), R32(%r8) + shr $3, n + and $7, R32(%r8) C clear OF, CF as side-effect + mov %r10, %rdx + lea L(tab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%r8,4), %r8 + lea (%r8, %r10), %r10 + jmp *%r10 +',` + jmp *(%r10,%r8,8) +') + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(f0), L(tab)) + JMPENT( L(f1), L(tab)) + JMPENT( L(f2), L(tab)) + JMPENT( L(f3), L(tab)) + JMPENT( L(f4), L(tab)) + JMPENT( L(f5), L(tab)) + JMPENT( L(f6), L(tab)) + JMPENT( L(f7), L(tab)) + TEXT + +L(f0): mulx( (up), %r10, %r8) + lea 56(up), up + lea -8(rp), rp + jmp L(b0) + +L(f3): mulx( (up), %r9, %rax) + lea 16(up), up + lea 16(rp), rp + inc n + jmp L(b3) + +L(f4): mulx( (up), %r10, %r8) + lea 24(up), up + lea 24(rp), rp + inc n + jmp L(b4) + +L(f5): mulx( (up), %r9, %rax) + lea 32(up), up + lea 32(rp), rp + inc n + jmp L(b5) + +L(f6): mulx( (up), %r10, %r8) + lea 40(up), up + lea 40(rp), rp + inc n + jmp L(b6) + +L(f7): mulx( (up), %r9, %rax) + lea 48(up), up + lea 48(rp), rp + inc n + jmp L(b7) + +L(f1): mulx( (up), %r9, %rax) + test n, n + jnz L(b1) +L(1): mov %r9, (rp) + ret + +L(f2): mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + mulx( (up), %r9, %rax) + test n, n + jz L(end) + + ALIGN(32) +L(top): mov %r10, -8(rp) + adc %r8, %r9 +L(b1): mulx( 8,(up), %r10, %r8) + adc %rax, %r10 + lea 64(up), up + mov %r9, (rp) +L(b0): mov 
%r10, 8(rp) + mulx( -48,(up), %r9, %rax) + lea 64(rp), rp + adc %r8, %r9 +L(b7): mulx( -40,(up), %r10, %r8) + mov %r9, -48(rp) + adc %rax, %r10 +L(b6): mov %r10, -40(rp) + mulx( -32,(up), %r9, %rax) + adc %r8, %r9 +L(b5): mulx( -24,(up), %r10, %r8) + mov %r9, -32(rp) + adc %rax, %r10 +L(b4): mulx( -16,(up), %r9, %rax) + mov %r10, -24(rp) + adc %r8, %r9 +L(b3): mulx( -8,(up), %r10, %r8) + adc %rax, %r10 + mov %r9, -16(rp) + dec n + mulx( (up), %r9, %rax) + jnz L(top) + +L(end): mov %r10, -8(rp) + adc %r8, %r9 + mov %r9, (rp) + adc %rcx, %rax + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm new file mode 100644 index 0000000..7ca5a9b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreibwl/mul_basecase.asm @@ -0,0 +1,368 @@ +dnl AMD64 mpn_mul_basecase optimised for Intel Broadwell. + +dnl Copyright 2015 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_1 addmul_1 +C AMD K8,K9 n/a n/a +C AMD K10 n/a n/a +C AMD bd1 n/a n/a +C AMD bd2 n/a n/a +C AMD bd3 n/a n/a +C AMD bd4 ? ? +C AMD zen ? ? +C AMD bt1 n/a n/a +C AMD bt2 n/a n/a +C Intel P4 n/a n/a +C Intel PNR n/a n/a +C Intel NHM n/a n/a +C Intel SBR n/a n/a +C Intel IBR n/a n/a +C Intel HWL 1.68 n/a +C Intel BWL 1.51 1.67-1.74 +C Intel SKL 1.52 1.63-1.71 +C Intel atom n/a n/a +C Intel SLM n/a n/a +C VIA nano n/a n/a + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Do overlapped software pipelining. +C * When changing this, make sure the code which falls into the inner loops +C does not execute too many no-ops (for both PIC and non-PIC). 
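+
+C For reference, the function computes the full product
+C {rp,un+vn} = {up,un} * {vp,vn}, un >= vn >= 1.  A schoolbook C sketch of
+C those semantics (this file inlines the equivalent loops with
+C mulx/adox/adcx):
+C
+C	rp[un] = mpn_mul_1 (rp, up, un, vp[0]);
+C	for (i = 1; i < vn; i++)
+C	  rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);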
+ +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') +define(`vp_param',`%rcx') +define(`vn', `%r8') + +define(`n', `%rcx') +define(`n_save', `%rbp') +define(`vp', `%r14') +define(`unneg', `%rbx') +define(`v0', `%rdx') +define(`jaddr', `%rax') + +define(`w0', `%r12') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + + cmp $2, un_param + ja L(gen) + mov (vp_param), %rdx + mulx( (up), %rax, %r9) C 0 1 + je L(s2x) + +L(s11): mov %rax, (rp) + mov %r9, 8(rp) + FUNC_EXIT() + ret + +L(s2x): cmp $2, vn + mulx( 8,(up), %r8, %r10) C 1 2 + je L(s22) + +L(s21): add %r8, %r9 + adc $0, %r10 + mov %rax, (rp) + mov %r9, 8(rp) + mov %r10, 16(rp) + FUNC_EXIT() + ret + +L(s22): add %r8, %r9 C 1 + adc $0, %r10 C 2 + mov 8(vp_param), %rdx + mov %rax, (rp) + mulx( (up), %r8, %r11) C 1 2 + mulx( 8,(up), %rax, %rdx) C 2 3 + add %r11, %rax C 2 + adc $0, %rdx C 3 + add %r8, %r9 C 1 + adc %rax, %r10 C 2 + adc $0, %rdx C 3 + mov %r9, 8(rp) + mov %r10, 16(rp) + mov %rdx, 24(rp) + FUNC_EXIT() + ret + + ALIGN(16) +L(gen): + push %rbx + push %rbp + push %r12 + push %r14 + + mov vp_param, vp + lea 1(un_param), unneg + mov un_param, n_save + mov R32(un_param), R32(%rax) + and $-8, unneg + shr $3, n_save C loop count + neg unneg + and $7, R32(%rax) C clear CF for adc as side-effect + C note that rax lives very long + mov n_save, n + mov (vp), v0 + lea 8(vp), vp + + lea L(mtab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%rax,4), %r11 + lea (%r11, %r10), %r10 + jmp *%r10 +',` + jmp *(%r10,%rax,8) +') + +L(mf0): mulx( (up), w2, w3) + lea 56(up), up + lea -8(rp), rp + jmp L(mb0) + +L(mf3): mulx( (up), w0, w1) + lea 16(up), up + lea 16(rp), rp + inc n + jmp L(mb3) + +L(mf4): mulx( (up), w2, w3) + lea 24(up), up + lea 24(rp), rp + inc n + jmp L(mb4) + +L(mf5): mulx( (up), w0, w1) + lea 32(up), up + lea 32(rp), rp + inc n + jmp L(mb5) + +L(mf6): mulx( (up), w2, w3) + lea 40(up), up + lea 40(rp), rp + inc n + jmp L(mb6) + +L(mf7): mulx( (up), w0, w1) + lea 48(up), up + lea 48(rp), rp + inc n + jmp L(mb7) + +L(mf1): mulx( (up), w0, w1) + jmp L(mb1) + +L(mf2): mulx( (up), w2, w3) + lea 8(up), up + lea 8(rp), rp + mulx( (up), w0, w1) + + ALIGN(16) +L(m1top): + mov w2, -8(rp) + adc w3, w0 +L(mb1): mulx( 8,(up), w2, w3) + adc w1, w2 + lea 64(up), up + mov w0, (rp) +L(mb0): mov w2, 8(rp) + mulx( -48,(up), w0, w1) + lea 64(rp), rp + adc w3, w0 +L(mb7): mulx( -40,(up), w2, w3) + mov w0, -48(rp) + adc w1, w2 +L(mb6): mov w2, -40(rp) + mulx( -32,(up), w0, w1) + adc w3, w0 +L(mb5): mulx( -24,(up), w2, w3) + mov w0, -32(rp) + adc w1, w2 +L(mb4): mulx( -16,(up), w0, w1) + mov w2, -24(rp) + adc w3, w0 +L(mb3): mulx( -8,(up), w2, w3) + adc w1, w2 + mov w0, -16(rp) + dec n + mulx( (up), w0, w1) + jnz L(m1top) + +L(m1end): + mov w2, -8(rp) + adc w3, w0 + mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + + dec vn + jz L(done) + + lea L(atab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%rax,4), %rax + lea (%rax, %r10), jaddr +',` + mov (%r10,%rax,8), jaddr +') + +L(outer): + lea (up,unneg,8), up + mov n_save, n + mov (vp), v0 + lea 8(vp), vp + jmp *jaddr + +L(f0): mulx( 8,(up), w2, w3) + lea 8(rp,unneg,8), rp + lea -1(n), n + jmp L(b0) + +L(f3): mulx( -16,(up), w0, w1) + lea -56(rp,unneg,8), rp + jmp L(b3) + +L(f4): mulx( -24,(up), w2, w3) + lea -56(rp,unneg,8), rp + jmp L(b4) + +L(f5): mulx( -32,(up), w0, w1) + lea -56(rp,unneg,8), rp + jmp L(b5) + 
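+C (Feed-in stubs continue: each offsets rp into the right residue class and
+C enters the addmul_1 loop below at the matching unrolled position.)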
+L(f6):	mulx(	-40,(up), w2, w3)
+	lea	-56(rp,unneg,8), rp
+	jmp	L(b6)
+
+L(f7):	mulx(	16,(up), w0, w1)
+	lea	8(rp,unneg,8), rp
+	jmp	L(b7)
+
+L(f1):	mulx(	(up), w0, w1)
+	lea	8(rp,unneg,8), rp
+	jmp	L(b1)
+
+L(am1end):
+	adox(	(rp), w0)
+	adox(	%rcx, w1)		C relies on rcx = 0
+	mov	w0, (rp)
+	adc	%rcx, w1		C relies on rcx = 0
+	mov	w1, 8(rp)
+
+	dec	vn			C clear OF as side-effect
+	jnz	L(outer)
+L(done):
+	pop	%r14
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(f2):	mulx(	-8,(up), w2, w3)
+	lea	8(rp,unneg,8), rp
+	mulx(	(up), w0, w1)
+
+	ALIGN(16)
+L(am1top):
+	adox(	-8,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, -8(rp)
+	jrcxz	L(am1end)
+L(b1):	mulx(	8,(up), w2, w3)
+	adox(	(rp), w0)
+	lea	-1(n), n
+	mov	w0, (rp)
+	adcx(	w1, w2)
+L(b0):	mulx(	16,(up), w0, w1)
+	adcx(	w3, w0)
+	adox(	8,(rp), w2)
+	mov	w2, 8(rp)
+L(b7):	mulx(	24,(up), w2, w3)
+	lea	64(up), up
+	adcx(	w1, w2)
+	adox(	16,(rp), w0)
+	mov	w0, 16(rp)
+L(b6):	mulx(	-32,(up), w0, w1)
+	adox(	24,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 24(rp)
+L(b5):	mulx(	-24,(up), w2, w3)
+	adcx(	w1, w2)
+	adox(	32,(rp), w0)
+	mov	w0, 32(rp)
+L(b4):	mulx(	-16,(up), w0, w1)
+	adox(	40,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 40(rp)
+L(b3):	adox(	48,(rp), w0)
+	mulx(	-8,(up), w2, w3)
+	mov	w0, 48(rp)
+	lea	64(rp), rp
+	adcx(	w1, w2)
+	mulx(	(up), w0, w1)
+	jmp	L(am1top)
+
+	JUMPTABSECT
+	ALIGN(8)
+L(mtab):JMPENT(	L(mf0), L(mtab))
+	JMPENT(	L(mf1), L(mtab))
+	JMPENT(	L(mf2), L(mtab))
+	JMPENT(	L(mf3), L(mtab))
+	JMPENT(	L(mf4), L(mtab))
+	JMPENT(	L(mf5), L(mtab))
+	JMPENT(	L(mf6), L(mtab))
+	JMPENT(	L(mf7), L(mtab))
+L(atab):JMPENT(	L(f0), L(atab))
+	JMPENT(	L(f1), L(atab))
+	JMPENT(	L(f2), L(atab))
+	JMPENT(	L(f3), L(atab))
+	JMPENT(	L(f4), L(atab))
+	JMPENT(	L(f5), L(atab))
+	JMPENT(	L(f6), L(atab))
+	JMPENT(	L(f7), L(atab))
+	TEXT
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm
new file mode 100644
index 0000000..5cdb209
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/coreibwl/mullo_basecase.asm
@@ -0,0 +1,395 @@
+dnl  AMD64 mpn_mullo_basecase optimised for Intel Broadwell.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
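+
+C For reference, mullo keeps just the low half of the product:
+C {rp,n} = ({up,n} * {vp,n}) mod B^n, with B = 2^64.  A C sketch of those
+C semantics; each pass shortens by one limb, since higher limbs cannot
+C affect the half that is kept:
+C
+C	mpn_mul_1 (rp, up, n, vp[0]);		C high limb discarded
+C	for (i = 1; i < n; i++)
+C	  mpn_addmul_1 (rp + i, up, n - i, vp[i]);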
+ +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp_param', `%rdx') +define(`n', `%rcx') + +define(`vp', `%r11') +define(`jmpreg',`%rbx') +define(`nn', `%rbp') + +C TODO +C * Suppress more rp[] rewrites in corner. +C * Rearrange feed-in jumps for short branch forms. +C * Perhaps roll out the heavy artillery and 8-way unroll outer loop. Since +C feed-in code implodes, the blow-up will not be more than perhaps 4x. +C * Micro-optimise critical lead-in code block around L(ent). +C * Write n < 4 code specifically for Broadwell (current code is for Haswell). + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mullo_basecase) + FUNC_ENTRY(4) + cmp $4, R32(n) + jae L(big) + + mov vp_param, vp + mov (up), %rdx + + cmp $2, R32(n) + jae L(gt1) +L(n1): imul (vp), %rdx + mov %rdx, (rp) + FUNC_EXIT() + ret +L(gt1): ja L(gt2) +L(n2): mov (vp), %r9 + mulx( %r9, %rax, %rdx) + mov %rax, (rp) + mov 8(up), %rax + imul %r9, %rax + add %rax, %rdx + mov 8(vp), %r9 + mov (up), %rcx + imul %r9, %rcx + add %rcx, %rdx + mov %rdx, 8(rp) + FUNC_EXIT() + ret +L(gt2): +L(n3): mov (vp), %r9 + mulx( %r9, %rax, %r10) C u0 x v0 + mov %rax, (rp) + mov 8(up), %rdx + mulx( %r9, %rax, %rdx) C u1 x v0 + imul 16(up), %r9 C u2 x v0 + add %rax, %r10 + adc %rdx, %r9 + mov 8(vp), %r8 + mov (up), %rdx + mulx( %r8, %rax, %rdx) C u0 x v1 + add %rax, %r10 + adc %rdx, %r9 + imul 8(up), %r8 C u1 x v1 + add %r8, %r9 + mov %r10, 8(rp) + mov 16(vp), %r10 + mov (up), %rax + imul %rax, %r10 C u0 x v2 + add %r10, %r9 + mov %r9, 16(rp) + FUNC_EXIT() + ret + + ALIGN(16) +L(big): push %r14 + push %r12 + push %rbx + push %rbp + mov -8(vp_param,n,8), %r14 C FIXME Put at absolute end + imul (up), %r14 C FIXME Put at absolute end + lea -3(n), R32(nn) + lea 8(vp_param), vp + mov (vp_param), %rdx + + mov R32(n), R32(%rax) + shr $3, R32(n) + and $7, R32(%rax) C clear OF, CF as side-effect + lea L(mtab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%rax,4), %rax + lea (%rax, %r10), %r10 + jmp *%r10 +',` + jmp *(%r10,%rax,8) +') + +L(mf0): mulx( (up), %r10, %r8) + lea 56(up), up + lea -8(rp), rp + lea L(f7)(%rip), jmpreg + jmp L(mb0) + +L(mf3): mulx( (up), %r9, %rax) + lea 16(up), up + lea 16(rp), rp + jrcxz L(mc) + inc R32(n) + lea L(f2)(%rip), jmpreg + jmp L(mb3) + +L(mc): mulx( -8,(up), %r10, %r8) + add %rax, %r10 + mov %r9, -16(rp) + mulx( (up), %r9, %rax) + mov %r10, -8(rp) + adc %r8, %r9 + mov %r9, (rp) + jmp L(c2) + +L(mf4): mulx( (up), %r10, %r8) + lea 24(up), up + lea 24(rp), rp + inc R32(n) + lea L(f3)(%rip), jmpreg + jmp L(mb4) + +L(mf5): mulx( (up), %r9, %rax) + lea 32(up), up + lea 32(rp), rp + inc R32(n) + lea L(f4)(%rip), jmpreg + jmp L(mb5) + +L(mf6): mulx( (up), %r10, %r8) + lea 40(up), up + lea 40(rp), rp + inc R32(n) + lea L(f5)(%rip), jmpreg + jmp L(mb6) + +L(mf7): mulx( (up), %r9, %rax) + lea 48(up), up + lea 48(rp), rp + lea L(f6)(%rip), jmpreg + jmp L(mb7) + +L(mf1): mulx( (up), %r9, %rax) + lea L(f0)(%rip), jmpreg + jmp L(mb1) + +L(mf2): mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + lea L(f1)(%rip), jmpreg + mulx( (up), %r9, %rax) + +C FIXME ugly fallthrough FIXME + ALIGN(32) +L(mtop):mov %r10, -8(rp) + adc %r8, %r9 +L(mb1): mulx( 8,(up), %r10, %r8) + adc %rax, %r10 + lea 64(up), up + mov %r9, (rp) +L(mb0): mov %r10, 8(rp) + mulx( -48,(up), %r9, %rax) + lea 64(rp), rp + adc %r8, %r9 +L(mb7): mulx( -40,(up), %r10, %r8) + mov %r9, -48(rp) + adc %rax, %r10 +L(mb6): mov %r10, -40(rp) + mulx( -32,(up), %r9, %rax) + adc %r8, %r9 +L(mb5): mulx( -24,(up), %r10, %r8) + mov %r9, -32(rp) + adc 
%rax, %r10 +L(mb4): mulx( -16,(up), %r9, %rax) + mov %r10, -24(rp) + adc %r8, %r9 +L(mb3): mulx( -8,(up), %r10, %r8) + adc %rax, %r10 + mov %r9, -16(rp) + dec R32(n) + mulx( (up), %r9, %rax) + jnz L(mtop) + +L(mend):mov %r10, -8(rp) + adc %r8, %r9 + mov %r9, (rp) + adc %rcx, %rax + + lea 8(,nn,8), %r12 + neg %r12 + shr $3, R32(nn) + jmp L(ent) + +L(f0): mulx( (up), %r10, %r8) + lea -8(up), up + lea -8(rp), rp + lea L(f7)(%rip), jmpreg + jmp L(b0) + +L(f1): mulx( (up), %r9, %rax) + lea -1(nn), R32(nn) + lea L(f0)(%rip), jmpreg + jmp L(b1) + +L(end): adox( (rp), %r9) + mov %r9, (rp) + adox( %rcx, %rax) C relies on rcx = 0 + adc %rcx, %rax C FIXME suppress, use adc below; reqs ent path edits + lea 8(%r12), %r12 +L(ent): mulx( 8,(up), %r10, %r8) C r8 unused (use imul?) + add %rax, %r14 + add %r10, %r14 C h + lea (up,%r12), up C reset up + lea 8(rp,%r12), rp C reset rp + mov (vp), %rdx + lea 8(vp), vp + or R32(nn), R32(n) C copy count, clear CF,OF (n = 0 prior) + jmp *jmpreg + +L(f7): mulx( (up), %r9, %rax) + lea -16(up), up + lea -16(rp), rp + lea L(f6)(%rip), jmpreg + jmp L(b7) + +L(f2): mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + mulx( (up), %r9, %rax) + lea L(f1)(%rip), jmpreg + +C FIXME ugly fallthrough FIXME + ALIGN(32) +L(top): adox( -8,(rp), %r10) + adcx( %r8, %r9) + mov %r10, -8(rp) + jrcxz L(end) +L(b1): mulx( 8,(up), %r10, %r8) + adox( (rp), %r9) + lea -1(n), R32(n) + mov %r9, (rp) + adcx( %rax, %r10) +L(b0): mulx( 16,(up), %r9, %rax) + adcx( %r8, %r9) + adox( 8,(rp), %r10) + mov %r10, 8(rp) +L(b7): mulx( 24,(up), %r10, %r8) + lea 64(up), up + adcx( %rax, %r10) + adox( 16,(rp), %r9) + mov %r9, 16(rp) +L(b6): mulx( -32,(up), %r9, %rax) + adox( 24,(rp), %r10) + adcx( %r8, %r9) + mov %r10, 24(rp) +L(b5): mulx( -24,(up), %r10, %r8) + adcx( %rax, %r10) + adox( 32,(rp), %r9) + mov %r9, 32(rp) +L(b4): mulx( -16,(up), %r9, %rax) + adox( 40,(rp), %r10) + adcx( %r8, %r9) + mov %r10, 40(rp) +L(b3): adox( 48,(rp), %r9) + mulx( -8,(up), %r10, %r8) + mov %r9, 48(rp) + lea 64(rp), rp + adcx( %rax, %r10) + mulx( (up), %r9, %rax) + jmp L(top) + +L(f6): mulx( (up), %r10, %r8) + lea 40(up), up + lea -24(rp), rp + lea L(f5)(%rip), jmpreg + jmp L(b6) + +L(f5): mulx( (up), %r9, %rax) + lea 32(up), up + lea -32(rp), rp + lea L(f4)(%rip), jmpreg + jmp L(b5) + +L(f4): mulx( (up), %r10, %r8) + lea 24(up), up + lea -40(rp), rp + lea L(f3)(%rip), jmpreg + jmp L(b4) + +L(f3): mulx( (up), %r9, %rax) + lea 16(up), up + lea -48(rp), rp + jrcxz L(cor) + lea L(f2)(%rip), jmpreg + jmp L(b3) + +L(cor): adox( 48,(rp), %r9) + mulx( -8,(up), %r10, %r8) + mov %r9, 48(rp) + lea 64(rp), rp + adcx( %rax, %r10) + mulx( (up), %r9, %rax) + adox( -8,(rp), %r10) + adcx( %r8, %r9) + mov %r10, -8(rp) C FIXME suppress + adox( (rp), %r9) + mov %r9, (rp) C FIXME suppress + adox( %rcx, %rax) +L(c2): + mulx( 8,(up), %r10, %r8) + adc %rax, %r14 + add %r10, %r14 + mov (vp), %rdx + test R32(%rcx), R32(%rcx) + mulx( -16,(up), %r10, %r8) + mulx( -8,(up), %r9, %rax) + adox( -8,(rp), %r10) + adcx( %r8, %r9) + mov %r10, -8(rp) + adox( (rp), %r9) + adox( %rcx, %rax) + adc %rcx, %rax + mulx( (up), %r10, %r8) + add %rax, %r14 + add %r10, %r14 + mov 8(vp), %rdx + mulx( -16,(up), %rcx, %rax) + add %r9, %rcx + mov %rcx, (rp) + adc $0, %rax + mulx( -8,(up), %r10, %r8) + add %rax, %r14 + add %r10, %r14 + mov %r14, 8(rp) + pop %rbp + pop %rbx + pop %r12 + pop %r14 + FUNC_EXIT() + ret +EPILOGUE() + JUMPTABSECT + ALIGN(8) +L(mtab):JMPENT( L(mf7), L(mtab)) + JMPENT( L(mf0), L(mtab)) + JMPENT( L(mf1), L(mtab)) + JMPENT( L(mf2), L(mtab)) + 
JMPENT( L(mf3), L(mtab)) + JMPENT( L(mf4), L(mtab)) + JMPENT( L(mf5), L(mtab)) + JMPENT( L(mf6), L(mtab)) diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm new file mode 100644 index 0000000..ff35124 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm @@ -0,0 +1,710 @@ +dnl AMD64 mpn_sbpi1_bdiv_r optimised for Intel Broadwell. + +dnl Copyright 2015, 2021 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_1 addmul_1 +C AMD K8,K9 n/a n/a +C AMD K10 n/a n/a +C AMD bd1 n/a n/a +C AMD bd2 n/a n/a +C AMD bd3 n/a n/a +C AMD bd4 ? ? +C AMD zn1 ? ? +C AMD zn2 ? ? +C AMD zn3 ? ? +C AMD bt1 n/a n/a +C AMD bt2 n/a n/a +C Intel P4 n/a n/a +C Intel PNR n/a n/a +C Intel NHM n/a n/a +C Intel SBR n/a n/a +C Intel IBR n/a n/a +C Intel HWL 1.68 n/a +C Intel BWL 1.51 1.67-1.74 +C Intel SKL 1.52 1.63-1.71 +C Intel atom n/a n/a +C Intel SLM n/a n/a +C VIA nano n/a n/a + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Do overlapped software pipelining. +C * Reduce register use, e.g., by combining n_neg and n_save. +C * Suppress the initial store through up; it is always zero. +C * Streamline up and dp setup. +C * When changing this, make sure the code which falls into the inner loops +C does not execute too many no-ops (for both PIC and non-PIC).
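+
+C Editor's note, a reading aid rather than GMP reference code: each outer
+C iteration of the unrolled loops below performs one step of schoolbook
+C Hensel (2-adic) reduction.  The C sketch here assumes dinv*dp[0] == -1
+C (mod B), B = 2^64, so that adding q*D clears the current low limb, which
+C matches the add-based (adcx/adox) inner loops in this file; the return
+C value is assumed to be the accumulated carry bit out of the top limb.
+C
+C   mp_limb_t
+C   sbpi1_bdiv_r_sketch (mp_ptr up, mp_size_t un,
+C                        mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
+C   {
+C     mp_limb_t cy = 0;
+C     mp_size_t i;
+C     for (i = 0; i < un - dn; i++)
+C       {
+C         mp_limb_t q = dinv * up[i];    /* next quotient limb, mod B */
+C         mp_limb_t hi = mpn_addmul_1 (up + i, dp, dn, q); /* up[i] -> 0 */
+C         mp_limb_t t = up[i + dn] + cy; /* fold in previous carry bit */
+C         cy = t < cy;
+C         t += hi;
+C         cy += t < hi;                  /* cy stays 0 or 1 */
+C         up[i + dn] = t;
+C       }
+C     return cy;
+C   }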
+ +dnl mp_limb_t +dnl mpn_sbpi1_bdiv_r (mp_ptr up, mp_size_t un, +dnl mp_srcptr dp, mp_size_t dn, mp_limb_t dinv) + +define(`up', `%rdi') +define(`un', `%rsi') +define(`dp_param',`%rdx') +define(`dn_param',`%rcx') +define(`dinv', `%r8') + +define(`n', `%rcx') +define(`n_save', `%rbp') +define(`dp', `%r14') +define(`n_neg', `%rbx') +define(`q', `%rdx') +define(`jaddr', `%rax') + +define(`w0', `%r12') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +ifdef(`MAX_SPECIAL',,` +define(`MAX_SPECIAL', 8)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sbpi1_bdiv_r) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + + lea L(atab)(%rip), %r10 + + cmp $MAX_SPECIAL, dn_param + jbe L(sma) + +ifelse(MAX_SPECIAL,8,,` +forloop(i,eval(MAX_SPECIAL+1),9,`L(i): +')') + +L(gen): push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + mov dp_param, dp C free up rdx + xor %r13, %r13 + + sub dn_param, un C outer loop count + + lea -8(,dn_param,8), n_neg + neg n_neg + mov dn_param, n_save + mov R32(dn_param), R32(%rax) + shr $3, n_save C loop count + and $7, R32(%rax) C clear CF and OF as side-effect + +ifdef(`PIC', +` movslq (%r10,%rax,4), %rax + lea (%rax,%r10), jaddr +',` + mov (%r10,%rax,8), jaddr +') + mov (up), q + imul dinv, q + jmp L(outer) + +L(f0): mulx( (dp), w2, w3) + lea -1(n), n + mulx( 8,(dp), w0, w1) + lea -8(dp), dp + adcx( w3, w0) + adox( (up), w2) + lea -8(up), up + jmp L(b0x) + +L(f3): mulx( (dp), w0, w1) + mulx( 8,(dp), w2, w3) + adox( (up), w0) + lea -48(up), up + lea 16(dp), dp + jmp L(b3x) + +L(f4): mulx( (dp), w2, w3) + mulx( 8,(dp), w0, w1) + lea 24(dp), dp + adox( (up), w2) + lea -40(up), up + adcx( w3, w0) + jmp L(b4x) + +L(f5): mulx( (dp), w0, w1) + mulx( 8,(dp), w2, w3) + lea 32(dp), dp + adcx( w1, w2) + adox( (up), w0) + lea -32(up), up + jmp L(b5x) + +L(f6): mulx( (dp), w2, w3) + mulx( 8,(dp), w0, w1) + lea 40(dp), dp + adox( (up), w2) + lea -24(up), up + adcx( w3, w0) + jmp L(b6x) + +L(f7): mulx( (dp), w0, w1) + mulx( 8,(dp), w2, w3) + lea 48(dp), dp + adcx( w1, w2) + adox( (up), w0) + lea -16(up), up + jmp L(b7x) + +L(f1): mulx( (dp), w0, w1) + mulx( 8,(dp), w2, w3) + adox( (up), w0) + lea -1(n), n + jmp L(b1x) + +L(f2): mulx( (dp), w2, w3) + mulx( 8,(dp), w0, w1) + lea 8(dp), dp + adox( (up), w2) + lea 8(up), up + adcx( w3, w0) + jmp L(b2x) + +L(end): adox( (up), w0) + adox( %rcx, w1) C relies on rcx = 0 + mov w0, (up) + adc %rcx, w1 C relies on rcx = 0 + mov 8(up,n_neg), q C Compute next quotient early... 
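+C Editor's note: the next outer iteration's quotient limb is computed here,
+C before the current iteration's final carry has been applied to the top
+C limb, so that the mulx latency overlaps the carry fix-up and loop-control
+C code that follows.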
+ mulx( dinv, q, %r12) C ...(unused in last iteration) + bt $0, R32(%r13) + adc w1, 8(up) + setc R8(%r13) + dec un C clear OF as side-effect + jz L(done) + + lea (dp,n_neg), dp C reset dp to D[]'s beginning + lea 8(up,n_neg), up C point up to U[]'s current beginning +L(outer): + mov n_save, n + test %eax, %eax C clear CF and OF + jmp *jaddr + + ALIGN(16) +L(top): adox( -8,(up), w2) + adcx( w3, w0) + mov w2, -8(up) + jrcxz L(end) +L(b2x): mulx( 8,(dp), w2, w3) + adox( (up), w0) + lea -1(n), n + mov w0, (up) +L(b1x): adcx( w1, w2) + mulx( 16,(dp), w0, w1) + adcx( w3, w0) + adox( 8,(up), w2) + mov w2, 8(up) +L(b0x): mulx( 24,(dp), w2, w3) + lea 64(dp), dp + adcx( w1, w2) + adox( 16,(up), w0) + mov w0, 16(up) +L(b7x): mulx( -32,(dp), w0, w1) + adox( 24,(up), w2) + adcx( w3, w0) + mov w2, 24(up) +L(b6x): mulx( -24,(dp), w2, w3) + adcx( w1, w2) + adox( 32,(up), w0) + mov w0, 32(up) +L(b5x): mulx( -16,(dp), w0, w1) + adox( 40,(up), w2) + adcx( w3, w0) + mov w2, 40(up) +L(b4x): adox( 48,(up), w0) + mulx( -8,(dp), w2, w3) + mov w0, 48(up) +L(b3x): lea 64(up), up + adcx( w1, w2) + mulx( (dp), w0, w1) + jmp L(top) + +L(done):mov %r13, %rax + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(sma): +ifdef(`PIC', +` movslq 28(%r10,dn_param,4), %rax + lea (%rax,%r10), jaddr +',` + mov 56(%r10,dn_param,8), jaddr +') + jmp *jaddr + +L(1): mov (dp_param), %r10 + xor R32(%rax), R32(%rax) + mov (up), %rdx + dec un + mov %rdx, %r9 +L(o1): mulx( dinv, %rdx, %r11) C next quotient + lea 8(up), up + mulx( %r10, %rcx, %rdx) C 0 1 + add %r9, %rcx C 0 + adc %rax, %rdx C 1 + add (up), %rdx C 1 + setc R8(%rax) C 2 + mov %rdx, %r9 C 1 + dec un + jnz L(o1) + mov %r9, (up) + + FUNC_EXIT() + ret + +ifdef(`VER',,`define(`VER',1)') +L(2): push %r12 + push %r14 + + mov dp_param, dp C free up rdx + sub dn_param, un C loop count + mov (up), q + imul dinv, q + +ifelse(VER,0,` + xor R32(%rax), R32(%rax) +L(o2): test %eax, %eax C clear CF and OF + mulx( (dp), w2, w3) C 0 1 + mulx( 8,(dp), %rdx, w1) C 1 2 + add (up), w2 C 0 + adc 8(up), %rdx C 1 + adc $0, w1 C 2 cannot carry further + add w3, %rdx C 1 + mov %rdx, 8(up) C 1 + adc $0, w1 C 2 + imul dinv, q C + bt $0, R32(%rax) + adc 16(up), w1 C 2 + mov w1, 16(up) + setc R8(%rax) + lea 8(up), up + dec un + jnz L(o2) +') +ifelse(VER,1,` + push %rbx + push %r13 + xor R32(%r13), R32(%r13) + mov (up), %rax + mov 8(up), %rbx +L(o2): xor R32(%rcx), R32(%rcx) + mulx( (dp), w2, w3) C 0 1 + mulx( 8,(dp), %rdx, w1) C 1 2 + adox( %rax, w2) C 0 + adcx( w3, %rdx) C 1 + adox( %rbx, %rdx) C 1 + adox( %rcx, w1) C 2 cannot carry further + mov %rdx, %rax C 1 + adc %rcx, w1 C 2 + imul dinv, q C + bt $0, R32(%r13) + adc 16(up), w1 C 2 + mov w1, %rbx + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o2) + + mov %rax, (up) + mov %rbx, 8(up) + mov %r13, %rax + pop %r13 + pop %rbx +') +ifelse(VER,2,` + xor R32(%rax), R32(%rax) + mov (up), %r10 + mov 8(up), %r9 +L(o2): mulx( (dp), %r12, %r11) + mulx( 8,(dp), %rdx, %rcx) + add %r11, %rdx C 1 + adc $0, %rcx C 2 + add %r10, %r12 C 0 add just to produce carry + adc %r9, %rdx C 1 + mov %rdx, %r10 C 1 + mulx( dinv, %rdx, %r12) C next quotient + adc %rax, %rcx C 2 + setc R8(%rax) C 3 + mov 16(up), %r9 C 2 + add %rcx, %r9 C 2 + adc $0, R32(%rax) C 3 + lea 8(up), up + dec un + jnz L(o2) + + mov %r10, (up) + mov %r9, 8(up) +') +ifelse(VER,3,` + xor R32(%rax), R32(%rax) + mov (up), %r10 + mov 8(up), %r9 +L(o2): mulx( (dp), %r12, %r11) + add %r10, %r12 C 0 add just to produce carry + mulx( 8,(dp), %rdx, %rcx) + adc %r11, %rdx C 1 + adc $0, 
%rcx C 2 + add %r9, %rdx C 1 + mov %rdx, %r10 C 1 + mulx( dinv, %rdx, %r12) C next quotient + adc %rax, %rcx C 2 + setc R8(%rax) C 3 + mov 16(up), %r9 C 2 + add %rcx, %r9 C 2 + adc $0, R32(%rax) C 3 + lea 8(up), up + dec un + jnz L(o2) + + mov %r10, (up) + mov %r9, 8(up) +') + pop %r14 + pop %r12 + FUNC_EXIT() + ret + +ifelse(eval(MAX_SPECIAL>=3),1,` +L(3): push %rbx + push %r12 + push %r13 + push %r14 + + mov dp_param, dp C free up rdx + xor %r13, %r13 + sub dn_param, un C outer loop count + mov (up), %rax + mov 8(up), %rbx + mov %rax, q + imul dinv, q +L(o3): xor R32(%rcx), R32(%rcx) C clear rcx, CF, and OF + mulx( (dp), w0, w1) C 0 1 + adox( %rax, w0) C 0 + mulx( 8,(dp), %rax, w3) C 1 2 + adcx( w1, %rax) C 1 + adox( %rbx, %rax) C 1 + mulx( 16,(dp), %rbx, w1) C 2 3 + mov dinv, q C 1 + mulx( %rax, q, w0) + adcx( w3, %rbx) C 2 + adox( 16,(up), %rbx) C 2 + adox( %rcx, w1) C 3 + adc $0, w1 C 3 + bt $0, R32(%r13) + adc w1, 24(up) + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o3) + jmp L(esma) +') + +ifelse(eval(MAX_SPECIAL>=4),1,` +L(4): push %rbx + push %r12 + push %r13 + push %r14 + + mov dp_param, dp C free up rdx + xor %r13, %r13 + sub dn_param, un C outer loop count + mov (up), %rax + mov 8(up), %rbx + mov %rax, q + imul dinv, q +L(o4): xor R32(%rcx), R32(%rcx) + mulx( (dp), w2, w3) + adox( %rax, w2) + mulx( 8,(dp), %rax, w1) + adcx( w3, %rax) + adox( %rbx, %rax) + mulx( 16,(dp), %rbx, w3) + adcx( w1, %rbx) + mulx( 24,(dp), w0, w1) + mov dinv, q + mulx( %rax, q, w2) + adox( 16,(up), %rbx) + adcx( w3, w0) + adox( 24,(up), w0) + adox( %rcx, w1) + mov w0, 24(up) + adc %rcx, w1 + bt $0, R32(%r13) + adc w1, 32(up) + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o4) + jmp L(esma) +') + +ifelse(eval(MAX_SPECIAL>=5),1,` +L(5): push %rbx + push %r12 + push %r13 + push %r14 + + mov dp_param, dp C free up rdx + xor %r13, %r13 + sub dn_param, un C outer loop count + mov (up), %rax + mov 8(up), %rbx + mov %rax, q + imul dinv, q +L(o5): xor R32(%rcx), R32(%rcx) + mulx( (dp), w0, w1) + adox( %rax, w0) + mulx( 8,(dp), %rax, w3) + adcx( w1, %rax) + adox( %rbx, %rax) + mulx( 16,(dp), %rbx, w1) + adcx( w3, %rbx) + adox( 16,(up), %rbx) + mulx( 24,(dp), w2, w3) + adcx( w1, w2) + mulx( 32,(dp), w0, w1) + adox( 24,(up), w2) + adcx( w3, w0) + mov dinv, q + mulx( %rax, q, w3) + mov w2, 24(up) + adox( 32,(up), w0) + adox( %rcx, w1) + mov w0, 32(up) + adc %rcx, w1 + bt $0, R32(%r13) + adc w1, 40(up) + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o5) + jmp L(esma) +') + +ifelse(eval(MAX_SPECIAL>=6),1,` +L(6): push %rbx + push %r12 + push %r13 + push %r14 + + mov dp_param, dp C free up rdx + xor %r13, %r13 + sub dn_param, un C outer loop count + mov (up), %rax + mov 8(up), %rbx + mov %rax, q + imul dinv, q +L(o6): xor R32(%rcx), R32(%rcx) + mulx( (dp), w2, w3) + adox( %rax, w2) + mulx( 8,(dp), %rax, w1) + adcx( w3, %rax) + adox( %rbx, %rax) + mulx( 16,(dp), %rbx, w3) + adcx( w1, %rbx) + mulx( 24,(dp), w0, w1) + adox( 16,(up), %rbx) + adcx( w3, w0) + adox( 24,(up), w0) + mulx( 32,(dp), w2, w3) + mov w0, 24(up) + adcx( w1, w2) + mulx( 40,(dp), w0, w1) + adox( 32,(up), w2) + adcx( w3, w0) + mov dinv, q + mulx( %rax, q, w3) + mov w2, 32(up) + adox( 40,(up), w0) + adox( %rcx, w1) + mov w0, 40(up) + adc %rcx, w1 + bt $0, R32(%r13) + adc w1, 48(up) + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o6) + jmp L(esma) +') + +ifelse(eval(MAX_SPECIAL>=7),1,` +L(7): push %rbx + push %r12 + push %r13 + push %r14 + + mov dp_param, dp + xor %r13, %r13 + sub dn_param, un + mov (up), %rax + mov 8(up), %rbx + mov %rax, q + imul 
dinv, q +L(o7): xor R32(%rcx), R32(%rcx) + mulx( (dp), w0, w1) + adox( %rax, w0) + mulx( 8,(dp), %rax, w3) + adcx( w1, %rax) + adox( %rbx, %rax) + mulx( 16,(dp), %rbx, w1) + adcx( w3, %rbx) + mulx( 24,(dp), w2, w3) + adcx( w1, w2) + adox( 16,(up), %rbx) + mulx( 32,(dp), w0, w1) + adox( 24,(up), w2) + adcx( w3, w0) + mov w2, 24(up) + adox( 32,(up), w0) + mulx( 40,(dp), w2, w3) + mov w0, 32(up) + adcx( w1, w2) + mulx( 48,(dp), w0, w1) + adox( 40,(up), w2) + adcx( w3, w0) + mov w2, 40(up) + mov %rax, q + mulx( dinv, q, w2) + adox( 48,(up), w0) + adox( %rcx, w1) + mov w0, 48(up) + adc %rcx, w1 + bt $0, R32(%r13) + adc w1, 56(up) + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o7) + jmp L(esma) +') + +ifelse(eval(MAX_SPECIAL>=8),1,` +L(8): push %rbx + push %r12 + push %r13 + push %r14 + + mov dp_param, dp + xor %r13, %r13 + sub dn_param, un + mov (up), %rax + mov 8(up), %rbx + mov %rax, q + imul dinv, q +L(o8): xor R32(%rcx), R32(%rcx) + mulx( (dp), w2, w3) + adox( %rax, w2) + mulx( 8,(dp), %rax, w1) + adcx( w3, %rax) + adox( %rbx, %rax) + mulx( 16,(dp), %rbx, w3) + adcx( w1, %rbx) + mulx( 24,(dp), w0, w1) + adox( 16,(up), %rbx) + adcx( w3, w0) + mulx( 32,(dp), w2, w3) + adcx( w1, w2) + adox( 24,(up), w0) + mov w0, 24(up) + mulx( 40,(dp), w0, w1) + adox( 32,(up), w2) + adcx( w3, w0) + mov w2, 32(up) + adox( 40,(up), w0) + mulx( 48,(dp), w2, w3) + mov w0, 40(up) + adcx( w1, w2) + mulx( 56,(dp), w0, w1) + adox( 48,(up), w2) + adcx( w3, w0) + mov dinv, q + mulx( %rax, q, w3) + mov w2, 48(up) + adox( 56,(up), w0) + adox( %rcx, w1) + mov w0, 56(up) + adc %rcx, w1 + bt $0, R32(%r13) + adc w1, 64(up) + setc R8(%r13) + lea 8(up), up + dec un + jnz L(o8) + jmp L(esma) +') + +L(esma):mov %rax, (up) + mov %rbx, 8(up) + mov %r13, %rax + pop %r14 + pop %r13 + pop %r12 + pop %rbx + FUNC_EXIT() + ret + + + JUMPTABSECT + ALIGN(8) +L(atab):JMPENT( L(f0), L(atab)) + JMPENT( L(f1), L(atab)) + JMPENT( L(f2), L(atab)) + JMPENT( L(f3), L(atab)) + JMPENT( L(f4), L(atab)) + JMPENT( L(f5), L(atab)) + JMPENT( L(f6), L(atab)) + JMPENT( L(f7), L(atab)) + JMPENT( L(1), L(atab)) + JMPENT( L(2), L(atab)) + JMPENT( L(3), L(atab)) + JMPENT( L(4), L(atab)) + JMPENT( L(5), L(atab)) + JMPENT( L(6), L(atab)) + JMPENT( L(7), L(atab)) + JMPENT( L(8), L(atab)) + TEXT +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm new file mode 100644 index 0000000..e81b01b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreibwl/sqr_basecase.asm @@ -0,0 +1,839 @@ +dnl AMD64 mpn_sqr_basecase optimised for Intel Broadwell. + +dnl Copyright 2015, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_1 addmul_1 +C AMD K8,K9 n/a n/a +C AMD K10 n/a n/a +C AMD bd1 n/a n/a +C AMD bd2 n/a n/a +C AMD bd3 n/a n/a +C AMD bd4 ? ? +C AMD zen ? ? +C AMD bt1 n/a n/a +C AMD bt2 n/a n/a +C Intel P4 n/a n/a +C Intel PNR n/a n/a +C Intel NHM n/a n/a +C Intel SBR n/a n/a +C Intel IBR n/a n/a +C Intel HWL 1.68 n/a +C Intel BWL 1.51 1.67-1.74 +C Intel SKL 1.52 1.63-1.71 +C Intel atom n/a n/a +C Intel SLM n/a n/a +C VIA nano n/a n/a + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * We have 8 addmul_1 loops which fall into each other. The idea is to save +C on switching code, since a circularly updated computed goto target will +C hardly allow correct branch prediction. On second thought, this might make +C each of the 8 loop branches poorly predicted, since each branch is +C executed fewer times. With just one addmul_1 loop, the loop +C count would change only once every 8th time. +C * Do overlapped software pipelining. +C * Perhaps load in shrx/sarx, eliminating the separate load insn. +C * Schedule add+store in small n code. +C * Try swapping adox and adcx insns, giving mulx more time to run. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') + +define(`n', `%rcx') +define(`un_save', `%rbx') +define(`u0', `%rdx') + +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sqr_basecase) + FUNC_ENTRY(3) + + cmp $2, un_param + jae L(gt1) + + mov (up), %rdx + mulx( %rdx, %rax, %rdx) + mov %rax, (rp) + mov %rdx, 8(rp) + FUNC_EXIT() + ret + +L(gt1): jne L(gt2) + + mov (up), %rdx + mov 8(up), %rcx + mulx( %rcx, %r9, %r10) C v0 * v1 W 1 2 + mulx( %rdx, %rax, %r8) C v0 * v0 W 0 1 + mov %rcx, %rdx + mulx( %rdx, %r11, %rdx) C v1 * v1 W 2 3 + add %r9, %r9 C W 1 + adc %r10, %r10 C W 2 + adc $0, %rdx C W 3 + add %r9, %r8 C W 1 + adc %r11, %r10 C W 2 + adc $0, %rdx C W 3 + mov %rax, (rp) + mov %r8, 8(rp) + mov %r10, 16(rp) + mov %rdx, 24(rp) + FUNC_EXIT() + ret + +L(gt2): cmp $4, un_param + jae L(gt3) + + push %rbx + mov (up), %rdx + mulx( 8,(up), w2, w3) + mulx( 16,(up), w0, w1) + add w3, w0 + mov 8(up), %rdx + mulx( 16,(up), %rax, w3) + adc %rax, w1 + adc $0, w3 + test R32(%rbx), R32(%rbx) + mov (up), %rdx + mulx( %rdx, %rbx, %rcx) + mov %rbx, (rp) + mov 8(up), %rdx + mulx( %rdx, %rax, %rbx) + mov 16(up), %rdx + mulx( %rdx, %rsi, %rdx) + adcx( w2, w2) + adcx( w0, w0) + adcx( w1, w1) + adcx( w3, w3) + adox( w2, %rcx) + adox( w0, %rax) + adox( w1, %rbx) + adox( w3, %rsi) + mov $0, R32(%r8) + adox( %r8, %rdx) + adcx( %r8, %rdx) + mov %rcx, 8(rp) + mov %rax, 16(rp) + mov %rbx, 24(rp) + mov %rsi, 32(rp) + mov %rdx, 40(rp) + pop %rbx + FUNC_EXIT() + ret + +L(gt3): push %rbx + + lea -3(un_param), R32(un_save) + lea 5(un_param), R32(n) + mov R32(un_param), R32(%rax) + and $-8, R32(un_save) + shr $3, R32(n) C count for mul_1 loop + neg un_save C 8*count and offset for addmul_1 loops + and $7, R32(%rax) C clear CF for adc as side-effect + + mov (up), u0 + + lea L(mtab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%rax,4), %r8 + lea (%r8, %r10), %r10 + jmp *%r10 +',` + jmp *(%r10,%rax,8) +') + +L(mf0): mulx( u0, w0, 
w1) C up[0]^2 + add u0, u0 + mulx( 8,(up), w2, w3) + lea 64(up), up + add w1, w2 + jmp L(mb0) + +L(mf3): mulx( u0, w2, w3) C up[0]^2 + add u0, u0 + mov w2, (rp) + mulx( 8,(up), w0, w1) + lea 24(up), up + lea 24(rp), rp + add w3, w0 + jmp L(mb3) + +L(mf4): mulx( u0, w0, w1) C up[0]^2 + add u0, u0 + mulx( 8,(up), w2, w3) + mov w0, (rp) + lea 32(up), up + lea 32(rp), rp + add w1, w2 + jmp L(mb4) + +L(mf5): mulx( u0, w2, w3) C up[0]^2 + add u0, u0 + mulx( 8,(up), w0, w1) + mov w2, (rp) + lea 40(up), up + lea 40(rp), rp + add w3, w0 + jmp L(mb5) + +L(mf6): mulx( u0, w0, w1) C up[0]^2 + add u0, u0 + mulx( 8,(up), w2, w3) + mov w0, (rp) + lea 48(up), up + lea 48(rp), rp + add w1, w2 + jmp L(mb6) + +L(mf7): mulx( u0, w2, w3) C up[0]^2 + add u0, u0 + mulx( 8,(up), w0, w1) + mov w2, (rp) + lea 56(up), up + lea 56(rp), rp + add w3, w0 + jmp L(mb7) + +L(mf1): mulx( u0, w2, w3) C up[0]^2 + add u0, u0 + mulx( 8,(up), w0, w1) + mov w2, (rp) + lea 8(up), up + lea 8(rp), rp + add w3, w0 + jmp L(mb1) + +L(mf2): mulx( u0, w0, w1) C up[0]^2 + add u0, u0 + mulx( 8,(up), w2, w3) + mov w0, (rp) + lea 16(up), up + lea 16(rp), rp + dec R32(n) + add w1, w2 + mulx( (up), w0, w1) + + ALIGN(16) +L(top): mov w2, -8(rp) + adc w3, w0 +L(mb1): mulx( 8,(up), w2, w3) + adc w1, w2 + lea 64(up), up +L(mb0): mov w0, (rp) + mov w2, 8(rp) + mulx( -48,(up), w0, w1) + lea 64(rp), rp + adc w3, w0 +L(mb7): mulx( -40,(up), w2, w3) + mov w0, -48(rp) + adc w1, w2 +L(mb6): mov w2, -40(rp) + mulx( -32,(up), w0, w1) + adc w3, w0 +L(mb5): mulx( -24,(up), w2, w3) + mov w0, -32(rp) + adc w1, w2 +L(mb4): mulx( -16,(up), w0, w1) + mov w2, -24(rp) + adc w3, w0 +L(mb3): mulx( -8,(up), w2, w3) + adc w1, w2 + mov w0, -16(rp) + dec R32(n) + mulx( (up), w0, w1) + jnz L(top) + +L(end): mov w2, -8(rp) + adc w3, w0 +C mov w0, (rp) +C adc %rcx, w1 +C mov w1, 8(rp) + + lea L(atab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%rax,4), %r11 + lea (%r11, %r10), %r11 +',` + mov (%r10,%rax,8), %r11 +') + mov $63, R32(%rax) + jmp *%r11 + +L(ed0): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f7): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea -64(up,un_save,8), up + mov R32(un_save), R32(n) + lea -56(rp,un_save,8), rp + mov (up), w1 C up[-1] + mov 8(up), u0 C up[0] + shrx( %rax, w1, w0) + sarx( %rax, w1, w1) + and u0, w1 C "ci" in C code + mulx( u0, w2, w3) C up[0]^2 + lea (w0,u0,2), u0 C "u0" arg in C code + jmp L(b7) + + ALIGN(16) +L(tp0): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed0) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) +L(b0): mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) + mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) + mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) + mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) + adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp0) + +L(ed1): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f0): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea -64(up,un_save,8), up + mov R32(un_save), R32(n) + lea -56(rp,un_save,8), rp + mov -8(up), w3 C up[-1] + mov (up), u0 C up[0] + shrx( %rax, w3, w2) + sarx( %rax, w3, w3) + and u0, w3 C "ci" in C code + mulx( u0, w0, w1) C up[0]^2 + lea (w2,u0,2), u0 C "u0" 
arg in C code + adcx( w3, w0) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + jmp L(b0) + + ALIGN(16) +L(tp1): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed1) +L(b1): mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) + mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) + mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) + mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) + mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) + adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp1) + +L(ed2): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f1): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up + mov R32(un_save), R32(n) + lea 8(un_save), un_save + lea -56(rp,un_save,8), rp + mov -16(up), w1 C up[-1] + mov -8(up), u0 C up[0] + shrx( %rax, w1, w0) + sarx( %rax, w1, w1) + and u0, w1 C "ci" in C code + mulx( u0, w2, w3) C up[0]^2 + lea (w0,u0,2), u0 C "u0" arg in C code + adcx( w1, w2) C FIXME: crossjump? + mulx( (up), w0, w1) + adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jmp L(b1) + + ALIGN(16) +L(tp2): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed2) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) + mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) + mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) + mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) + mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) +L(b2): adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp2) + +L(ed3): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f2): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up + or R32(un_save), R32(n) + jz L(cor3) + lea -56(rp,un_save,8), rp + mov -24(up), w3 C up[-1] + mov -16(up), u0 C up[0] + shrx( %rax, w3, w2) + sarx( %rax, w3, w3) + and u0, w3 C "ci" in C code + mulx( u0, w0, w1) C up[0]^2 + lea (w2,u0,2), u0 C "u0" arg in C code + adcx( w3, w0) + jmp L(b2) + + ALIGN(16) +L(tp3): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed3) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) + mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) + mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) + mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) +L(b3): mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) + adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp3) + +L(ed4): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f3): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up + mov R32(un_save), R32(n) + lea -56(rp,un_save,8), rp + mov -32(up), w1 C up[-1] + mov 
-24(up), u0 C up[0] + shrx( %rax, w1, w0) + sarx( %rax, w1, w1) + and u0, w1 C "ci" in C code + mulx( u0, w2, w3) C up[0]^2 + lea (w0,u0,2), u0 C "u0" arg in C code + adcx( w1, w2) + jmp L(b3) + + ALIGN(16) +L(tp4): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed4) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) + mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) + mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) +L(b4): mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) + mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) + adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp4) + +L(ed5): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f4): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up + mov R32(un_save), R32(n) + lea -56(rp,un_save,8), rp + mov -40(up), w3 C up[-1] + mov -32(up), u0 C up[0] + shrx( %rax, w3, w2) + sarx( %rax, w3, w3) + and u0, w3 C "ci" in C code + mulx( u0, w0, w1) C up[0]^2 + lea (w2,u0,2), u0 C "u0" arg in C code + adcx( w3, w0) + jmp L(b4) + + ALIGN(16) +L(tp5): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed5) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) + mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) +L(b5): mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) + mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) + mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) + adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp5) + +L(ed6): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f5): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up + mov R32(un_save), R32(n) + lea -56(rp,un_save,8), rp + mov -48(up), w1 C up[-1] + mov -40(up), u0 C up[0] + shrx( %rax, w1, w0) + sarx( %rax, w1, w1) + and u0, w1 C "ci" in C code + mulx( u0, w2, w3) C up[0]^2 + lea (w0,u0,2), u0 C "u0" arg in C code + adcx( w1, w2) + jmp L(b5) + + ALIGN(16) +L(tp6): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed6) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) + mov w0, (rp) + adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up +L(b6): adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) + mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) + mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) + mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) + adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp6) + +L(ed7): adox( (rp), w0) + adox( %rcx, w1) C relies on rcx = 0 +L(f6): mov w0, (rp) + adc %rcx, w1 C relies on rcx = 0 + mov w1, 8(rp) + lea (up,un_save,8), up + mov R32(un_save), R32(n) + lea -56(rp,un_save,8), rp + mov -56(up), w3 C up[-1] + mov -48(up), u0 C up[0] + shrx( 
%rax, w3, w2) + sarx( %rax, w3, w3) + and u0, w3 C "ci" in C code + mulx( u0, w0, w1) C up[0]^2 + lea (w2,u0,2), u0 C "u0" arg in C code + adcx( w3, w0) + mulx( -40,(up), w2, w3) + jmp L(b6) + + ALIGN(16) +L(tp7): adox( -8,(rp), w2) + adcx( w3, w0) + mov w2, -8(rp) + jrcxz L(ed7) + mulx( 8,(up), w2, w3) + adox( (rp), w0) + lea 8(n), R32(n) + mov w0, (rp) +L(b7): adcx( w1, w2) + mulx( 16,(up), w0, w1) + adcx( w3, w0) + adox( 8,(rp), w2) + mov w2, 8(rp) + mulx( 24,(up), w2, w3) + lea 64(up), up + adcx( w1, w2) + adox( 16,(rp), w0) + mov w0, 16(rp) + mulx( -32,(up), w0, w1) + adox( 24,(rp), w2) + adcx( w3, w0) + mov w2, 24(rp) + mulx( -24,(up), w2, w3) + adcx( w1, w2) + adox( 32,(rp), w0) + mov w0, 32(rp) + mulx( -16,(up), w0, w1) + adox( 40,(rp), w2) + adcx( w3, w0) + mov w2, 40(rp) + adox( 48,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 48(rp) + lea 64(rp), rp + adcx( w1, w2) + mulx( (up), w0, w1) + jmp L(tp7) + +L(cor3):lea -64(rp), rp + mov -24(up), w3 C up[-1] + mov -16(up), u0 C up[0] + shrx( %rax, w3, w2) + sarx( %rax, w3, w3) + and u0, w3 C "ci" in C code + mulx( u0, w0, w1) C up[0]^2 + lea (w2,u0,2), u0 C "u0" arg in C code + adcx( w3, w0) + adox( 56,(rp), w0) + mulx( -8,(up), w2, w3) + mov w0, 56(rp) + adcx( w1, w2) + mulx( (up), %rbx, w1) + adox( 64,(rp), w2) + adcx( w3, %rbx) + mov w2, 64(rp) + adox( 72,(rp), %rbx) + adox( %rcx, w1) C relies on rcx = 0 + adc %rcx, w1 C relies on rcx = 0 + mov w1, 80(rp) C FIXME +C wd2 + mov -16(up), w1 C up[-1] + mov -8(up), u0 C up[0] + shrx( %rax, w1, w0) + sarx( %rax, w1, w1) + and u0, w1 C "ci" in C code + mulx( u0, w2, w3) C up[0]^2 + lea (w0,u0,2), u0 C "u0" arg in C code + adcx( w1, w2) + mulx( (up), w0, %rax) + adox( %rbx, w2) + adcx( w3, w0) + mov w2, 72(rp) + adox( 80,(rp), w0) + adox( %rcx, %rax) C relies on rcx = 0 + mov w0, 80(rp) + adc %rcx, %rax C relies on rcx = 0 +C wd1 + mov -8(up), w3 C up[-1] + mov (up), u0 C up[0] + sar $63, w3 + and u0, w3 C "ci" in C code + mulx( u0, w0, w1) C up[0]^2 + adcx( w3, w0) + adox( %rax, w0) + mov w0, 88(rp) + adcx( %rcx, w1) + adox( %rcx, w1) + mov w1, 96(rp) + + pop %rbx + FUNC_EXIT() + ret + + JUMPTABSECT + ALIGN(8) +L(mtab):JMPENT( L(mf7), L(mtab)) + JMPENT( L(mf0), L(mtab)) + JMPENT( L(mf1), L(mtab)) + JMPENT( L(mf2), L(mtab)) + JMPENT( L(mf3), L(mtab)) + JMPENT( L(mf4), L(mtab)) + JMPENT( L(mf5), L(mtab)) + JMPENT( L(mf6), L(mtab)) +L(atab):JMPENT( L(f6), L(atab)) + JMPENT( L(f7), L(atab)) + JMPENT( L(f0), L(atab)) + JMPENT( L(f1), L(atab)) + JMPENT( L(f2), L(atab)) + JMPENT( L(f3), L(atab)) + JMPENT( L(f4), L(atab)) + JMPENT( L(f5), L(atab)) + TEXT +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/addmul_2.asm new file mode 100644 index 0000000..9d1c405 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/addmul_2.asm @@ -0,0 +1,241 @@ +dnl AMD64 mpn_addmul_2 optimised for Intel Haswell. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bull n/a +C AMD pile n/a +C AMD steam n/a +C AMD excavator ? +C AMD bobcat n/a +C AMD jaguar n/a +C Intel P4 n/a +C Intel core n/a +C Intel NHM n/a +C Intel SBR n/a +C Intel IBR n/a +C Intel HWL 2.15 +C Intel BWL 2.33 +C Intel SKL 2.22 +C Intel atom n/a +C Intel SLM n/a +C VIA nano n/a + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n_param',`%rdx') +define(`vp', `%rcx') + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') +define(`n', `%r11') +define(`X0', `%r12') +define(`X1', `%r13') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_addmul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + push %r12 + push %r13 + + mov (vp), v0 + mov 8(vp), v1 + + mov n_param, n + shr $2, n + + test $1, R8(n_param) + jnz L(bx1) + +L(bx0): mov (rp), X0 + mov 8(rp), X1 + test $2, R8(n_param) + jnz L(b10) + +L(b00): mov (up), %rdx + lea 16(up), up + mulx( v0, %rax, w1) + add %rax, X0 + mulx( v1, %rax, w2) + adc $0, w1 + mov X0, (rp) + add %rax, X1 + adc $0, w2 + mov -8(up), %rdx + lea 16(rp), rp + jmp L(lo0) + +L(b10): mov (up), %rdx + inc n + mulx( v0, %rax, w1) + add %rax, X0 + adc $0, w1 + mulx( v1, %rax, w2) + mov X0, (rp) + mov 16(rp), X0 + add %rax, X1 + adc $0, w2 + xor w0, w0 + jmp L(lo2) + +L(bx1): mov (rp), X1 + mov 8(rp), X0 + test $2, R8(n_param) + jnz L(b11) + +L(b01): mov (up), %rdx + mulx( v0, %rax, w3) + add %rax, X1 + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + mov 8(up), %rdx + mov X1, (rp) + mov 16(rp), X1 + mulx( v0, %rax, w1) + lea 24(rp), rp + lea 24(up), up + jmp L(lo1) + +L(b11): mov (up), %rdx + inc n + mulx( v0, %rax, w3) + add %rax, X1 + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + mov X1, (rp) + mov 8(up), %rdx + mulx( v0, %rax, w1) + lea 8(rp), rp + lea 8(up), up + jmp L(lo3) + + ALIGN(16) +L(top): mulx( v0, %rax, w3) + add w0, X1 + adc $0, w2 + add %rax, X1 + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + lea 32(rp), rp + add w1, X1 + mov -16(up), %rdx + mov X1, -24(rp) + adc $0, w3 + add w2, X0 + mov -8(rp), X1 + mulx( v0, %rax, w1) + adc $0, w0 +L(lo1): add %rax, X0 + mulx( v1, %rax, w2) + adc $0, w1 + add w3, X0 + mov X0, -16(rp) + adc $0, w1 + add %rax, X1 + adc $0, w2 + add w0, X1 + mov -8(up), %rdx + adc $0, w2 +L(lo0): mulx( v0, %rax, w3) + add %rax, X1 + adc $0, w3 + mov (rp), X0 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + add w1, X1 + mov X1, -8(rp) + adc $0, w3 + mov (up), %rdx + add w2, X0 + mulx( v0, %rax, w1) + adc $0, w0 +L(lo3): add %rax, X0 + adc $0, w1 + mulx( v1, %rax, w2) + add w3, X0 + mov 8(rp), X1 + mov X0, (rp) + mov 16(rp), X0 + adc $0, w1 + add %rax, X1 + adc $0, w2 +L(lo2): mov 8(up), %rdx + lea 32(up), up + dec n + jnz L(top) + +L(end): mulx( v0, 
%rax, w3) + add w0, X1 + adc $0, w2 + add %rax, X1 + adc $0, w3 + mulx( v1, %rdx, %rax) + add w1, X1 + mov X1, 8(rp) + adc $0, w3 + add w2, %rdx + adc $0, %rax + add w3, %rdx + mov %rdx, 16(rp) + adc $0, %rax + + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/aorrlsh_n.asm new file mode 100644 index 0000000..ff0d27b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/aorrlsh_n.asm @@ -0,0 +1,38 @@ +dnl X86-64 mpn_addlsh_n and mpn_rsblsh_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) +include_mpn(`x86_64/zen/aorrlsh_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/aors_n.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/aors_n.asm new file mode 100644 index 0000000..fc99627 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/aors_n.asm @@ -0,0 +1,261 @@ +dnl AMD64 mpn_add_n, mpn_sub_n + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
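+
+dnl  Editor's note, a reading aid rather than GMP reference code: the pair of
+dnl  functions below has the usual mpn_add_n/mpn_sub_n semantics, a limb-wise
+dnl  add/subtract with carry/borrow propagation that returns the final
+dnl  carry/borrow.  A portable C sketch of the add case:
+dnl
+dnl    mp_limb_t
+dnl    add_n_sketch (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+dnl    {
+dnl      mp_limb_t cy = 0;
+dnl      mp_size_t i;
+dnl      for (i = 0; i < n; i++)
+dnl        {
+dnl          mp_limb_t s = up[i] + vp[i];
+dnl          mp_limb_t c = s < up[i];   /* carry out of the first add */
+dnl          rp[i] = s + cy;
+dnl          cy = c | (rp[i] < s);      /* the two carries are exclusive */
+dnl        }
+dnl      return cy;
+dnl    }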
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 1.5 with fluctuations +C AMD bd2 1.5 with fluctuations +C AMD bd3 +C AMD bd4 1.6 +C AMD zen +C AMD bt1 +C AMD bt2 +C Intel P4 +C Intel PNR +C Intel NHM +C Intel SBR +C Intel IBR +C Intel HWL 1.21 +C Intel BWL 1.04 +C Intel SKL +C Intel atom +C Intel SLM +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C INPUT PARAMETERS +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`vp', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc) + +ifdef(`OPERATION_add_n', ` + define(ADCSBB, adc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + + mov R32(n), R32(%rax) + shr $3, n + and $7, R32(%rax) + + lea L(tab)(%rip), %r9 + neg %r8 C set carry +ifdef(`PIC',` + movslq (%r9,%rax,4), %rax + lea (%r9,%rax), %rax C lea not add to preserve carry + jmp *%rax +',` + jmp *(%r9,%rax,8) +') +EPILOGUE() + + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + + mov R32(n), R32(%rax) + shr $3, n + and $7, R32(%rax) C clear cy as side-effect + + lea L(tab)(%rip), %r9 +ifdef(`PIC',` + movslq (%r9,%rax,4), %rax + lea (%r9,%rax), %rax C lea not add to preserve carry + jmp *%rax +',` + jmp *(%r9,%rax,8) +') + +L(0): mov (up), %r8 + mov 8(up), %r9 + ADCSBB (vp), %r8 + jmp L(e0) + +L(4): mov (up), %r8 + mov 8(up), %r9 + ADCSBB (vp), %r8 + lea -32(up), up + lea -32(vp), vp + lea -32(rp), rp + inc n + jmp L(e4) + +L(5): mov (up), %r11 + mov 8(up), %r8 + mov 16(up), %r9 + ADCSBB (vp), %r11 + lea -24(up), up + lea -24(vp), vp + lea -24(rp), rp + inc n + jmp L(e5) + +L(6): mov (up), %r10 + ADCSBB (vp), %r10 + mov 8(up), %r11 + lea -16(up), up + lea -16(vp), vp + lea -16(rp), rp + inc n + jmp L(e6) + +L(7): mov (up), %r9 + mov 8(up), %r10 + ADCSBB (vp), %r9 + ADCSBB 8(vp), %r10 + lea -8(up), up + lea -8(vp), vp + lea -8(rp), rp + inc n + jmp L(e7) + + ALIGN(16) +L(top): +L(e3): mov %r9, 40(rp) +L(e2): mov %r10, 48(rp) +L(e1): mov (up), %r8 + mov 8(up), %r9 + ADCSBB (vp), %r8 + mov %r11, 56(rp) + lea 64(rp), rp +L(e0): mov 16(up), %r10 + ADCSBB 8(vp), %r9 + ADCSBB 16(vp), %r10 + mov %r8, (rp) +L(e7): mov 24(up), %r11 + mov %r9, 8(rp) +L(e6): mov 32(up), %r8 + mov 40(up), %r9 + ADCSBB 24(vp), %r11 + mov %r10, 16(rp) +L(e5): ADCSBB 32(vp), %r8 + mov %r11, 24(rp) +L(e4): mov 48(up), %r10 + mov 56(up), %r11 + mov %r8, 32(rp) + lea 64(up), up + ADCSBB 40(vp), %r9 + ADCSBB 48(vp), %r10 + ADCSBB 56(vp), %r11 + lea 64(vp), vp + dec n + jnz L(top) + +L(end): mov %r9, 40(rp) + mov %r10, 48(rp) + mov %r11, 56(rp) + mov R32(n), R32(%rax) + adc R32(n), R32(%rax) + FUNC_EXIT() + ret + + ALIGN(16) +L(3): mov (up), %r9 + mov 8(up), %r10 + mov 16(up), %r11 + ADCSBB (vp), %r9 + ADCSBB 8(vp), %r10 + ADCSBB 16(vp), %r11 + jrcxz L(x3) + lea 24(up), up + lea 24(vp), vp + lea -40(rp), rp + jmp L(e3) +L(x3): mov %r9, (rp) + mov %r10, 8(rp) + mov %r11, 16(rp) + mov R32(n), R32(%rax) + adc R32(n), R32(%rax) + FUNC_EXIT() + ret + + ALIGN(16) +L(1): mov (up), %r11 + ADCSBB (vp), %r11 + jrcxz L(x1) + lea 8(up), up + lea 8(vp), vp + lea -56(rp), rp + jmp L(e1) +L(x1): mov %r11, (rp) + mov R32(n), R32(%rax) 
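+C n is zero here (this path is reached via jrcxz) and mov does not affect
+C flags, so the mov above and the adc below together return the carry from
+C the ADCSBB chain as 0 or 1 in %rax.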
+ adc R32(n), R32(%rax) + FUNC_EXIT() + ret + + ALIGN(16) +L(2): mov (up), %r10 + mov 8(up), %r11 + ADCSBB (vp), %r10 + ADCSBB 8(vp), %r11 + jrcxz L(x2) + lea 16(up), up + lea 16(vp), vp + lea -48(rp), rp + jmp L(e2) +L(x2): mov %r10, (rp) + mov %r11, 8(rp) + mov R32(n), R32(%rax) + adc R32(n), R32(%rax) + FUNC_EXIT() + ret +EPILOGUE() + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(1), L(tab)) + JMPENT( L(2), L(tab)) + JMPENT( L(3), L(tab)) + JMPENT( L(4), L(tab)) + JMPENT( L(5), L(tab)) + JMPENT( L(6), L(tab)) + JMPENT( L(7), L(tab)) diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/aorsmul_1.asm new file mode 100644 index 0000000..3f43afa --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/aorsmul_1.asm @@ -0,0 +1,201 @@ +dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Haswell. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bull - +C AMD pile - +C AMD steam - +C AMD excavator - +C AMD bobcat - +C AMD jaguar - +C Intel P4 - +C Intel core2 - +C Intel NHM - +C Intel SBR - +C Intel IBR - +C Intel HWL 2.32 +C Intel BWL 2.04 +C Intel SKL 1.95 +C Intel atom - +C Intel SLM - +C VIA nano - + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Handle small n separately, for lower overhead. 
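+
+C Editor's note, a reading aid rather than GMP reference code: the loop below
+C computes rp[] +/-= up[] * v0 and returns the carry (or borrow) limb.  A C
+C sketch of the add case, assuming 64-bit limbs and a compiler providing
+C unsigned __int128:
+C
+C   mp_limb_t
+C   addmul_1_sketch (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
+C   {
+C     mp_limb_t cy = 0;
+C     mp_size_t i;
+C     for (i = 0; i < n; i++)
+C       {
+C         unsigned __int128 p = (unsigned __int128) up[i] * v0;
+C         mp_limb_t lo = (mp_limb_t) p;
+C         mp_limb_t hi = (mp_limb_t) (p >> 64);
+C         mp_limb_t s = rp[i] + lo;
+C         hi += s < rp[i];       /* carry from rp[i] + lo */
+C         rp[i] = s + cy;
+C         cy = hi + (rp[i] < s); /* total carry fits in one limb */
+C       }
+C     return cy;
+C   }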
+ +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0_param',`%rcx') C r9 + +define(`n', `%rbp') +define(`v0', `%rdx') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`ADCSBB', `adc') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`ADCSBB', `sbb') + define(`func', `mpn_submul_1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + push %rbx + push %rbp + push %r12 + push %r13 + + mov n_param, n + mov v0_param, v0 + + test $1, R8(n) + jnz L(bx1) + +L(bx0): shr $2, n + jc L(b10) + +L(b00): mulx( (up), %r13, %r12) + mulx( 8,(up), %rbx, %rax) + add %r12, %rbx + adc $0, %rax + mov (rp), %r12 + mov 8(rp), %rcx + mulx( 16,(up), %r9, %r8) + lea -16(rp), rp + lea 16(up), up + ADDSUB %r13, %r12 + jmp L(lo0) + +L(bx1): shr $2, n + jc L(b11) + +L(b01): mulx( (up), %r11, %r10) + jnz L(gt1) +L(n1): ADDSUB %r11, (rp) + mov $0, R32(%rax) + adc %r10, %rax + jmp L(ret) + +L(gt1): mulx( 8,(up), %r13, %r12) + mulx( 16,(up), %rbx, %rax) + lea 24(up), up + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov (rp), %r10 + mov 8(rp), %r12 + mov 16(rp), %rcx + lea -8(rp), rp + ADDSUB %r11, %r10 + jmp L(lo1) + +L(b11): mulx( (up), %rbx, %rax) + mov (rp), %rcx + mulx( 8,(up), %r9, %r8) + lea 8(up), up + lea -24(rp), rp + inc n C adjust n + ADDSUB %rbx, %rcx + jmp L(lo3) + +L(b10): mulx( (up), %r9, %r8) + mulx( 8,(up), %r11, %r10) + lea -32(rp), rp + mov $0, R32(%rax) + clc C clear cf + jz L(end) C depends on old shift + + ALIGN(16) +L(top): adc %rax, %r9 + lea 32(rp), rp + adc %r8, %r11 + mulx( 16,(up), %r13, %r12) + mov (rp), %r8 + mulx( 24,(up), %rbx, %rax) + lea 32(up), up + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov 8(rp), %r10 + mov 16(rp), %r12 + ADDSUB %r9, %r8 + mov 24(rp), %rcx + mov %r8, (rp) + ADCSBB %r11, %r10 +L(lo1): mulx( (up), %r9, %r8) + mov %r10, 8(rp) + ADCSBB %r13, %r12 +L(lo0): mov %r12, 16(rp) + ADCSBB %rbx, %rcx +L(lo3): mulx( 8,(up), %r11, %r10) + mov %rcx, 24(rp) + dec n + jnz L(top) + +L(end): adc %rax, %r9 + adc %r8, %r11 + mov 32(rp), %r8 + mov %r10, %rax + adc $0, %rax + mov 40(rp), %r10 + ADDSUB %r9, %r8 + mov %r8, 32(rp) + ADCSBB %r11, %r10 + mov %r10, 40(rp) + adc $0, %rax + +L(ret): pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/gcd_22.asm new file mode 100644 index 0000000..b5863b6 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/gcd_22.asm @@ -0,0 +1,138 @@ +dnl AMD64 mpn_gcd_22. Assumes useless bsf, useless shrd, useful tzcnt, shlx. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 - +C AMD K10 - +C AMD bd1 - +C AMD bd2 - +C AMD bd3 - +C AMD bd4 6.7 +C AMD bt1 - +C AMD bt2 - +C AMD zn1 5.4 +C AMD zn2 5.5 +C Intel P4 - +C Intel CNR - +C Intel PNR - +C Intel NHM - +C Intel WSM - +C Intel SBR - +C Intel IBR - +C Intel HWL 7.1 +C Intel BWL 5.5 +C Intel SKL 5.6 +C Intel atom - +C Intel SLM - +C Intel GLM - +C Intel GLM+ - +C VIA nano - + + +define(`u1', `%rdi') +define(`u0', `%rsi') +define(`v1', `%rdx') +define(`v0', `%rcx') + +define(`s0', `%r8') +define(`s1', `%r9') +define(`t0', `%r10') +define(`t1', `%r11') +define(`cnt', `%rax') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_22) + FUNC_ENTRY(4) + + ALIGN(16) +L(top): mov v0, t0 + sub u0, t0 + jz L(lowz) C jump when low limb result = 0 + mov v1, t1 + sbb u1, t1 + + rep;bsf t0, cnt C tzcnt! + + mov u0, s0 + sub v0, u0 + mov u1, s1 + sbb v1, u1 + +L(bck): cmovc t0, u0 C u = |u - v| + cmovc t1, u1 C u = |u - v| + cmovc s0, v0 C v = min(u,v) + cmovc s1, v1 C v = min(u,v) + + xor R32(t0), R32(t0) + sub cnt, t0 + shlx( t0, u1, s1) + shrx( cnt, u0, u0) + shrx( cnt, u1, u1) + or s1, u0 + + test v1, v1 + jnz L(top) + test u1, u1 + jnz L(top) + +L(gcd_11): + mov v0, %rdi +C mov u0, %rsi + TCALL( mpn_gcd_11) + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + mov v1, t0 + sub u1, t0 + je L(end) + + xor t1, t1 + mov u0, s0 + mov u1, s1 + rep;bsf t0, cnt C tzcnt! + mov u1, u0 + xor u1, u1 + sub v1, u0 + jmp L(bck) + +L(end): mov v0, %rax + C mov v1, %rdx +L(ret): FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/coreihwl/gmp-mparam.h new file mode 100644 index 0000000..c11aeec --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/gmp-mparam.h @@ -0,0 +1,253 @@ +/* Haswell gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3600-4000 MHz Intel Xeon E3-1271v3 Haswell */ +/* FFT tuning limit = 467,964,359 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 26 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 9 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 25 + +#define DIV_1_VS_MUL_1_PERCENT 427 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 74 +#define MUL_TOOM44_THRESHOLD 195 +#define MUL_TOOM6H_THRESHOLD 276 +#define MUL_TOOM8H_THRESHOLD 381 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 120 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 139 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 128 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 129 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 170 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 117 +#define SQR_TOOM4_THRESHOLD 315 +#define SQR_TOOM6_THRESHOLD 414 +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 42 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 376, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 83,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 79,11}, { 47,10}, { 95,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 135,11}, \ + { 79,10}, { 167,11}, { 95,10}, { 191, 9}, \ + { 383,11}, { 111,12}, { 63, 8}, { 1023,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319,12}, { 95,11}, { 191,10}, \ + { 383,11}, { 207,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 303,10}, { 607,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 335,10}, { 671,11}, { 351,10}, \ + { 703,11}, { 367,10}, { 735,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,11}, { 447,10}, \ + { 895,11}, { 479,13}, { 127,11}, { 543,10}, \ + { 1087,12}, { 287,11}, { 607,10}, { 1215,12}, \ + { 319,11}, { 671,12}, { 351,11}, { 735,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,12}, { 447,11}, { 895,12}, { 479,11}, \ + { 959,14}, { 127,12}, { 543,11}, { 1087,12}, \ + { 607,11}, { 1215,10}, { 2431,12}, { 671,11}, \ + { 1343,12}, { 703,11}, { 1407,12}, { 735,13}, \ + { 383,12}, { 831,13}, { 447,12}, { 959,13}, \ + { 511,12}, { 1087,11}, { 2175,13}, { 575,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1343,13}, \ + { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1727,13}, { 959,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \ + { 1471,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1727,14}, { 895,13}, { 1791,12}, { 3583,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 
2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2687,14}, { 1407,13}, { 2815,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \ + { 6911,14}, { 1791,13}, { 3583,14}, { 1919,16}, \ + { 511,15}, { 1023,14}, { 2175,13}, { 4351,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,12}, { 11775,15}, { 1535,14}, { 3455,13}, \ + { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \ + { 1023,15}, { 2047,14}, { 4351,15}, { 2303,14}, \ + { 4863,15}, { 2815,14}, { 5887,13}, { 11775,16}, \ + { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,17}, { 1023,16}, { 2047,15}, { 4863,16}, \ + { 2559,15}, { 5887,14}, { 11775,16}, { 3071,15}, \ + { 6911,16}, { 3583,15}, { 7679,14}, { 15359,15}, \ + { 7935,17}, { 2047,16}, { 4095,15}, { 8447,16}, \ + { 4607,15}, { 9983,14}, { 19967,16}, { 5631,15}, \ + { 11775,17}, { 3071,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 238 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 368 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 368, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,11}, { 79,10}, { 159, 9}, \ + { 319,11}, { 95,10}, { 191,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303, 9}, \ + { 607,11}, { 159,10}, { 319, 6}, { 5631, 7}, \ + { 2943, 6}, { 5887, 8}, { 1535,11}, { 207,10}, \ + { 415,11}, { 223,10}, { 447,11}, { 239,10}, \ + { 479,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 303,10}, { 607,11}, { 319,10}, { 639,11}, \ + { 335,10}, { 671,11}, { 351,10}, { 703,11}, \ + { 367,10}, { 735,11}, { 383,10}, { 767,11}, \ + { 415,10}, { 831,11}, { 447,10}, { 895,11}, \ + { 479,13}, { 127,11}, { 511,10}, { 1023,11}, \ + { 543,10}, { 1087,12}, { 287,11}, { 607,10}, \ + { 1215,11}, { 671,12}, { 351,11}, { 735,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,11}, { 895,12}, { 479,11}, { 959,14}, \ + { 127,12}, { 511,11}, { 1023,12}, { 543,11}, \ + { 1087,12}, { 607,11}, { 1215,12}, { 735,13}, \ + { 383,12}, { 831,13}, { 447,12}, { 959,13}, \ + { 511,12}, { 1087,13}, { 575,12}, { 1151,13}, \ + { 639,12}, { 1279,13}, { 703,12}, { 1407,11}, \ + { 2815,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1727,11}, { 3455,13}, { 959,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1215,14}, \ + { 639,13}, { 1279,12}, { 2559,13}, { 1343,12}, \ + { 2687,13}, { 1407,12}, { 2815,13}, { 1471,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1727,14}, \ + { 895,13}, { 1791,12}, { 3583,13}, { 1919,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2303,12}, { 4607,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \ + { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \ + { 3455,12}, { 6911,14}, { 1791,13}, { 3583,14}, \ + { 1919,16}, { 511,15}, { 1023,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \ + { 4351,15}, { 2303,14}, { 4863,15}, { 
2815,14}, \ + { 5887,16}, { 1535,15}, { 3071,14}, { 6143,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,15}, { 7935,17}, \ + { 2047,16}, { 4095,15}, { 8191,16}, { 4607,15}, \ + { 9983,14}, { 19967,16}, { 5631,15}, { 11775,17}, \ + { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 237 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 68 +#define MULLO_MUL_N_THRESHOLD 8967 +#define SQRLO_BASECASE_THRESHOLD 11 +#define SQRLO_DC_THRESHOLD 80 +#define SQRLO_SQR_THRESHOLD 6481 + +#define DC_DIV_QR_THRESHOLD 58 +#define DC_DIVAPPR_Q_THRESHOLD 182 +#define DC_BDIV_QR_THRESHOLD 60 +#define DC_BDIV_Q_THRESHOLD 123 + +#define INV_MULMOD_BNM1_THRESHOLD 38 +#define INV_NEWTON_THRESHOLD 179 +#define INV_APPR_THRESHOLD 182 + +#define BINV_NEWTON_THRESHOLD 230 +#define REDC_1_TO_REDC_2_THRESHOLD 48 +#define REDC_2_TO_REDC_N_THRESHOLD 63 + +#define MU_DIV_QR_THRESHOLD 1470 +#define MU_DIVAPPR_Q_THRESHOLD 1528 +#define MUPI_DIV_QR_THRESHOLD 82 +#define MU_BDIV_QR_THRESHOLD 1334 +#define MU_BDIV_Q_THRESHOLD 1506 + +#define POWM_SEC_TABLE 1,22,194,473,1297,2698 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 19 +#define SET_STR_DC_THRESHOLD 1391 +#define SET_STR_PRECOMPUTE_THRESHOLD 2654 + +#define FAC_DSC_THRESHOLD 562 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD2_DIV1_METHOD 5 /* 3.49% faster than 3 */ +#define HGCD_THRESHOLD 96 +#define HGCD_APPR_THRESHOLD 92 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 501 +#define GCDEXT_DC_THRESHOLD 365 +#define JACOBI_BASE_METHOD 1 /* 23.87% faster than 4 */ + +/* Tuneup completed successfully, took 238360 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/mul_1.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/mul_1.asm new file mode 100644 index 0000000..5e649e8 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/mul_1.asm @@ -0,0 +1,159 @@ +dnl AMD64 mpn_mul_1 using mulx optimised for Intel Haswell. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
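+
+dnl  For orientation, a minimal C sketch of the operation this file
+dnl  implements: mpn_mul_1 writes the low limbs of up[0..n-1] * v0 to
+dnl  rp[0..n-1] and returns the final high limb as carry.  The helper name
+dnl  mul_1_ref is illustrative only; unsigned __int128 is assumed available
+dnl  (limbs are 64-bit on x86-64), and mp_limb_t/mp_size_t come from gmp.h:
+dnl
+dnl	mp_limb_t
+dnl	mul_1_ref (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
+dnl	           mp_limb_t v0)
+dnl	{
+dnl	  mp_limb_t cy = 0;
+dnl	  for (mp_size_t i = 0; i < n; i++)
+dnl	    {
+dnl	      unsigned __int128 p = (unsigned __int128) up[i] * v0 + cy;
+dnl	      rp[i] = (mp_limb_t) p;       /* low 64 bits to result */
+dnl	      cy = (mp_limb_t) (p >> 64);  /* high 64 bits carry onward */
+dnl	    }
+dnl	  return cy;
+dnl	}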
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bull - +C AMD pile - +C AMD steam - +C AMD excavator - +C AMD bobcat - +C AMD jaguar - +C Intel P4 - +C Intel core2 - +C Intel NHM - +C Intel SBR - +C Intel IBR - +C Intel HWL 1.59 +C Intel BWL 1.76 +C Intel SKL 1.54 +C Intel atom - +C Intel SLM - +C VIA nano - + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0_param',`%rcx') C r9 + +define(`n', `%rbp') +define(`v0', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_1) + FUNC_ENTRY(4) + push %rbx + push %rbp + push %r12 + + mov n_param, n + shr $2, n + + test $1, R8(n_param) + jnz L(bx1) + +L(bx0): test $2, R8(n_param) + mov v0_param, v0 + jnz L(b10) + +L(b00): mulx( (up), %r9, %r8) + mulx( 8,(up), %r11, %r10) + mulx( 16,(up), %rcx, %r12) + lea -32(rp), rp + jmp L(lo0) + +L(b10): mulx( (up), %rcx, %r12) + mulx( 8,(up), %rbx, %rax) + lea -16(rp), rp + test n, n + jz L(cj2) + mulx( 16,(up), %r9, %r8) + lea 16(up), up + jmp L(lo2) + +L(bx1): test $2, R8(n_param) + mov v0_param, v0 + jnz L(b11) + +L(b01): mulx( (up), %rbx, %rax) + lea -24(rp), rp + test n, n + jz L(cj1) + mulx( 8,(up), %r9, %r8) + lea 8(up), up + jmp L(lo1) + +L(b11): mulx( (up), %r11, %r10) + mulx( 8,(up), %rcx, %r12) + mulx( 16,(up), %rbx, %rax) + lea -8(rp), rp + test n, n + jz L(cj3) + lea 24(up), up + jmp L(lo3) + + ALIGN(32) +L(top): lea 32(rp), rp + mov %r9, (rp) + adc %r8, %r11 +L(lo3): mulx( (up), %r9, %r8) + mov %r11, 8(rp) + adc %r10, %rcx +L(lo2): mov %rcx, 16(rp) + adc %r12, %rbx +L(lo1): mulx( 8,(up), %r11, %r10) + adc %rax, %r9 + mulx( 16,(up), %rcx, %r12) + mov %rbx, 24(rp) +L(lo0): mulx( 24,(up), %rbx, %rax) + lea 32(up), up + dec n + jnz L(top) + +L(end): lea 32(rp), rp + mov %r9, (rp) + adc %r8, %r11 +L(cj3): mov %r11, 8(rp) + adc %r10, %rcx +L(cj2): mov %rcx, 16(rp) + adc %r12, %rbx +L(cj1): mov %rbx, 24(rp) + adc $0, %rax + + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/mul_2.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/mul_2.asm new file mode 100644 index 0000000..f1f044f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/mul_2.asm @@ -0,0 +1,176 @@ +dnl AMD64 mpn_mul_2 optimised for Intel Haswell. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bull - +C AMD pile - +C AMD steam - +C AMD excavator - +C AMD bobcat - +C AMD jaguar - +C Intel P4 - +C Intel core - +C Intel NHM - +C Intel SBR - +C Intel IBR - +C Intel HWL 3.74 +C Intel BWL 4.21 +C Intel SKL 4.20 +C Intel atom - +C Intel SLM - +C VIA nano - + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Move test and jcc together, for insn fusion. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n_param',`%rdx') +define(`vp', `%rcx') + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (vp), v0 + mov 8(vp), v1 + + lea 3(n_param), n + shr $2, n + + test $1, R8(n_param) + jnz L(bx1) + +L(bx0): xor w0, w0 + test $2, R8(n_param) + mov (up), %rdx + mulx( v0, w2, w1) + jz L(lo0) + +L(b10): lea -16(rp), rp + lea -16(up), up + jmp L(lo2) + +L(bx1): xor w2, w2 + test $2, R8(n_param) + mov (up), %rdx + mulx( v0, w0, w3) + jnz L(b11) + +L(b01): lea -24(rp), rp + lea 8(up), up + jmp L(lo1) + +L(b11): lea -8(rp), rp + lea -8(up), up + jmp L(lo3) + + ALIGN(16) +L(top): mulx( v1, %rax, w0) + add %rax, w2 C 0 + mov (up), %rdx + mulx( v0, %rax, w1) + adc $0, w0 C 1 + add %rax, w2 C 0 + adc $0, w1 C 1 + add w3, w2 C 0 +L(lo0): mov w2, (rp) C 0 + adc $0, w1 C 1 + mulx( v1, %rax, w2) + add %rax, w0 C 1 + mov 8(up), %rdx + adc $0, w2 C 2 + mulx( v0, %rax, w3) + add %rax, w0 C 1 + adc $0, w3 C 2 + add w1, w0 C 1 +L(lo3): mov w0, 8(rp) C 1 + adc $0, w3 C 2 + mulx( v1, %rax, w0) + add %rax, w2 C 2 + mov 16(up), %rdx + mulx( v0, %rax, w1) + adc $0, w0 C 3 + add %rax, w2 C 2 + adc $0, w1 C 3 + add w3, w2 C 2 +L(lo2): mov w2, 16(rp) C 2 + adc $0, w1 C 3 + mulx( v1, %rax, w2) + add %rax, w0 C 3 + mov 24(up), %rdx + adc $0, w2 C 4 + mulx( v0, %rax, w3) + add %rax, w0 C 3 + adc $0, w3 C 4 + add w1, w0 C 3 + lea 32(up), up +L(lo1): mov w0, 24(rp) C 3 + adc $0, w3 C 4 + dec n + lea 32(rp), rp + jnz L(top) + +L(end): mulx( v1, %rdx, %rax) + add %rdx, w2 + adc $0, %rax + add w3, w2 + mov w2, (rp) + adc $0, %rax + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/mul_basecase.asm new file mode 100644 index 0000000..b2656c8 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/mul_basecase.asm @@ -0,0 +1,441 @@ +dnl AMD64 mpn_mul_basecase optimised for Intel Haswell. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_1 mul_2 mul_3 addmul_2 +C AMD K8,K9 n/a n/a - n/a +C AMD K10 n/a n/a - n/a +C AMD bull n/a n/a - n/a +C AMD pile n/a n/a - n/a +C AMD steam ? ? - ? +C AMD bobcat n/a n/a - n/a +C AMD jaguar ? ? - ? +C Intel P4 n/a n/a - n/a +C Intel core n/a n/a - n/a +C Intel NHM n/a n/a - n/a +C Intel SBR n/a n/a - n/a +C Intel IBR n/a n/a - n/a +C Intel HWL 1.77 1.86 - 2.15 +C Intel BWL ? ? - ? +C Intel atom n/a n/a - n/a +C VIA nano n/a n/a - n/a + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Adjoin a mul_3. +C * Further micro-optimise. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') +define(`vp', `%rcx') +define(`vn', `%r8') + +define(`un', `%rbx') + +define(`w0', `%r10') +define(`w1', `%r11') +define(`w2', `%r12') +define(`w3', `%r13') +define(`n', `%rbp') +define(`v0', `%r9') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + mov un_param, un C free up rdx + neg un + + mov un_param, n C FIXME: share + sar $2, n C FIXME: share + + test $1, R8(vn) + jz L(do_mul_2) + +define(`w4', `%r9') +define(`w5', `%r14') + + mov (vp), %rdx + +L(do_mul_1): + test $1, R8(un) + jnz L(m1x1) + +L(m1x0):test $2, R8(un) + jnz L(m110) + +L(m100): + mulx( (up), w5, w2) + mulx( 8,(up), w1, w3) + lea -24(rp), rp + jmp L(m1l0) + +L(m110): + mulx( (up), w3, w4) + mulx( 8,(up), w1, w5) + lea -8(rp), rp + test n, n + jz L(cj2) + mulx( 16,(up), w0, w2) + lea 16(up), up + jmp L(m1l2) + +L(m1x1):test $2, R8(un) + jz L(m111) + +L(m101): + mulx( (up), w4, w5) + lea -16(rp), rp + test n, n + jz L(cj1) + mulx( 8,(up), w0, w2) + lea 8(up), up + jmp L(m1l1) + +L(m111): + mulx( (up), w2, w3) + mulx( 8,(up), w0, w4) + mulx( 16,(up), w1, w5) + lea 24(up), up + test n, n + jnz L(gt3) + add w0, w3 + jmp L(cj3) +L(gt3): add w0, w3 + jmp L(m1l3) + + ALIGN(32) +L(m1tp):lea 32(rp), rp +L(m1l3):mov w2, (rp) + mulx( (up), w0, w2) +L(m1l2):mov w3, 8(rp) + adc w1, w4 +L(m1l1):adc w0, w5 + mov w4, 16(rp) + mulx( 8,(up), w1, w3) +L(m1l0):mov w5, 24(rp) + mulx( 16,(up), w0, w4) + adc w1, w2 + mulx( 24,(up), w1, w5) + adc w0, w3 + lea 32(up), up + dec n + jnz L(m1tp) + +L(m1ed):lea 32(rp), rp +L(cj3): mov w2, (rp) +L(cj2): mov w3, 8(rp) + adc w1, w4 +L(cj1): mov w4, 16(rp) + adc $0, w5 + mov w5, 24(rp) + + dec R32(vn) + jz L(ret5) + + lea 8(vp), vp + lea 32(rp), rp +C push %r12 +C push %r13 +C push %r14 + jmp L(do_addmul) + +L(do_mul_2): +define(`v1', `%r14') +C push %r12 +C push %r13 +C push %r14 + + mov (vp), v0 + mov 8(vp), v1 + + lea (un), n + sar $2, n + + test $1, R8(un) + jnz L(m2x1) + +L(m2x0):xor w0, w0 + test $2, 
R8(un) + mov (up), %rdx + mulx( v0, w2, w1) + jz L(m2l0) + +L(m210):lea -16(rp), rp + lea -16(up), up + jmp L(m2l2) + +L(m2x1):xor w2, w2 + test $2, R8(un) + mov (up), %rdx + mulx( v0, w0, w3) + jz L(m211) + +L(m201):lea -24(rp), rp + lea 8(up), up + jmp L(m2l1) + +L(m211):lea -8(rp), rp + lea -8(up), up + jmp L(m2l3) + + ALIGN(16) +L(m2tp):mulx( v1, %rax, w0) + add %rax, w2 + mov (up), %rdx + mulx( v0, %rax, w1) + adc $0, w0 + add %rax, w2 + adc $0, w1 + add w3, w2 +L(m2l0):mov w2, (rp) + adc $0, w1 + mulx( v1, %rax, w2) + add %rax, w0 + mov 8(up), %rdx + adc $0, w2 + mulx( v0, %rax, w3) + add %rax, w0 + adc $0, w3 + add w1, w0 +L(m2l3):mov w0, 8(rp) + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, w2 + mov 16(up), %rdx + mulx( v0, %rax, w1) + adc $0, w0 + add %rax, w2 + adc $0, w1 + add w3, w2 +L(m2l2):mov w2, 16(rp) + adc $0, w1 + mulx( v1, %rax, w2) + add %rax, w0 + mov 24(up), %rdx + adc $0, w2 + mulx( v0, %rax, w3) + add %rax, w0 + adc $0, w3 + add w1, w0 + lea 32(up), up +L(m2l1):mov w0, 24(rp) + adc $0, w3 + inc n + lea 32(rp), rp + jnz L(m2tp) + +L(m2ed):mulx( v1, %rdx, %rax) + add %rdx, w2 + adc $0, %rax + add w3, w2 + mov w2, (rp) + adc $0, %rax + mov %rax, 8(rp) + + add $-2, R32(vn) + jz L(ret5) + lea 16(vp), vp + lea 16(rp), rp + + +L(do_addmul): + push %r15 + push vn C save vn in new stack slot +define(`vn', `(%rsp)') +define(`X0', `%r14') +define(`X1', `%r15') +define(`v1', `%r8') + + lea (rp,un,8), rp + lea (up,un,8), up + +L(outer): + mov (vp), v0 + mov 8(vp), v1 + + lea 2(un), n + sar $2, n + + mov (up), %rdx + test $1, R8(un) + jnz L(bx1) + +L(bx0): mov (rp), X0 + mov 8(rp), X1 + mulx( v0, %rax, w1) + add %rax, X0 + mulx( v1, %rax, w2) + adc $0, w1 + mov X0, (rp) + add %rax, X1 + adc $0, w2 + mov 8(up), %rdx + test $2, R8(un) + jnz L(b10) + +L(b00): lea 16(up), up + lea 16(rp), rp + jmp L(lo0) + +L(b10): mov 16(rp), X0 + lea 32(up), up + mulx( v0, %rax, w3) + jmp L(lo2) + +L(bx1): mov (rp), X1 + mov 8(rp), X0 + mulx( v0, %rax, w3) + add %rax, X1 + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + mov 8(up), %rdx + mov X1, (rp) + mulx( v0, %rax, w1) + test $2, R8(un) + jz L(b11) + +L(b01): mov 16(rp), X1 + lea 24(rp), rp + lea 24(up), up + jmp L(lo1) + +L(b11): lea 8(rp), rp + lea 8(up), up + jmp L(lo3) + + ALIGN(16) +L(top): mulx( v0, %rax, w3) + add w0, X1 + adc $0, w2 +L(lo2): add %rax, X1 + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + lea 32(rp), rp + add w1, X1 + mov -16(up), %rdx + mov X1, -24(rp) + adc $0, w3 + add w2, X0 + mov -8(rp), X1 + mulx( v0, %rax, w1) + adc $0, w0 +L(lo1): add %rax, X0 + mulx( v1, %rax, w2) + adc $0, w1 + add w3, X0 + mov X0, -16(rp) + adc $0, w1 + add %rax, X1 + adc $0, w2 + add w0, X1 + mov -8(up), %rdx + adc $0, w2 +L(lo0): mulx( v0, %rax, w3) + add %rax, X1 + adc $0, w3 + mov (rp), X0 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + add w1, X1 + mov X1, -8(rp) + adc $0, w3 + mov (up), %rdx + add w2, X0 + mulx( v0, %rax, w1) + adc $0, w0 +L(lo3): add %rax, X0 + adc $0, w1 + mulx( v1, %rax, w2) + add w3, X0 + mov 8(rp), X1 + mov X0, (rp) + mov 16(rp), X0 + adc $0, w1 + add %rax, X1 + adc $0, w2 + mov 8(up), %rdx + lea 32(up), up + inc n + jnz L(top) + +L(end): mulx( v0, %rax, w3) + add w0, X1 + adc $0, w2 + add %rax, X1 + adc $0, w3 + mulx( v1, %rdx, %rax) + add w1, X1 + mov X1, 8(rp) + adc $0, w3 + add w2, %rdx + adc $0, %rax + add w3, %rdx + mov %rdx, 16(rp) + adc $0, %rax + mov %rax, 24(rp) + + addl $-2, vn + lea 16(vp), vp + lea -16(up,un,8), up + lea 32(rp,un,8), rp + jnz L(outer) + + pop %rax C 
deallocate vn slot + pop %r15 +L(ret5):pop %r14 +L(ret4):pop %r13 +L(ret3):pop %r12 +L(ret2):pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/mullo_basecase.asm new file mode 100644 index 0000000..e65559b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/mullo_basecase.asm @@ -0,0 +1,422 @@ +dnl AMD64 mpn_mullo_basecase optimised for Intel Haswell. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_2 addmul_2 +C AMD K8,K9 n/a n/a +C AMD K10 n/a n/a +C AMD bull n/a n/a +C AMD pile n/a n/a +C AMD steam ? ? +C AMD bobcat n/a n/a +C AMD jaguar ? ? +C Intel P4 n/a n/a +C Intel core n/a n/a +C Intel NHM n/a n/a +C Intel SBR n/a n/a +C Intel IBR n/a n/a +C Intel HWL 1.86 2.15 +C Intel BWL ? ? +C Intel atom n/a n/a +C VIA nano n/a n/a + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Implement proper cor2, replacing current cor0. +C * Micro-optimise. 
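+
+C For reference, mullo returns only the least significant n limbs of the
+C 2n-limb product {up,n} * {vp,n}.  A hedged C sketch of those semantics
+C (the name mullo_ref is illustrative; it assumes 64-bit limbs via
+C unsigned __int128 and that rp does not alias up or vp):
+C
+C	void
+C	mullo_ref (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
+C	           mp_size_t n)
+C	{
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    rp[i] = 0;
+C	  for (mp_size_t j = 0; j < n; j++)     /* schoolbook, truncated */
+C	    {
+C	      mp_limb_t cy = 0;
+C	      for (mp_size_t i = 0; i + j < n; i++)
+C	        {
+C	          unsigned __int128 p =
+C	            (unsigned __int128) up[i] * vp[j] + rp[i + j] + cy;
+C	          rp[i + j] = (mp_limb_t) p;
+C	          cy = (mp_limb_t) (p >> 64); /* carries past limb n-1 drop */
+C	        }
+C	    }
+C	}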
+ +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp_param', `%rdx') +define(`n', `%rcx') + +define(`vp', `%r8') +define(`X0', `%r14') +define(`X1', `%r15') + +define(`w0', `%r10') +define(`w1', `%r11') +define(`w2', `%r12') +define(`w3', `%r13') +define(`i', `%rbp') +define(`v0', `%r9') +define(`v1', `%rbx') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mullo_basecase) + FUNC_ENTRY(4) + + mov vp_param, vp + mov (up), %rdx + + cmp $4, n + jb L(small) + + push %rbx + push %rbp + push %r12 + push %r13 + + mov (vp), v0 + mov 8(vp), v1 + + lea 2(n), i + shr $2, i + neg n + add $2, n + + push up C put entry `up' on stack + + test $1, R8(n) + jnz L(m2x1) + +L(m2x0):mulx( v0, w0, w3) + xor R32(w2), R32(w2) + test $2, R8(n) + jz L(m2b2) + +L(m2b0):lea -8(rp), rp + lea -8(up), up + jmp L(m2e0) + +L(m2b2):lea -24(rp), rp + lea 8(up), up + jmp L(m2e2) + +L(m2x1):mulx( v0, w2, w1) + xor R32(w0), R32(w0) + test $2, R8(n) + jnz L(m2b3) + +L(m2b1):jmp L(m2e1) + +L(m2b3):lea -16(rp), rp + lea -16(up), up + jmp L(m2e3) + + ALIGN(16) +L(m2tp):mulx( v1, %rax, w0) + add %rax, w2 + mov (up), %rdx + mulx( v0, %rax, w1) + adc $0, w0 + add %rax, w2 + adc $0, w1 + add w3, w2 +L(m2e1):mov w2, (rp) + adc $0, w1 + mulx( v1, %rax, w2) + add %rax, w0 + mov 8(up), %rdx + adc $0, w2 + mulx( v0, %rax, w3) + add %rax, w0 + adc $0, w3 + add w1, w0 +L(m2e0):mov w0, 8(rp) + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, w2 + mov 16(up), %rdx + mulx( v0, %rax, w1) + adc $0, w0 + add %rax, w2 + adc $0, w1 + add w3, w2 +L(m2e3):mov w2, 16(rp) + adc $0, w1 + mulx( v1, %rax, w2) + add %rax, w0 + mov 24(up), %rdx + adc $0, w2 + mulx( v0, %rax, w3) + add %rax, w0 + adc $0, w3 + add w1, w0 + lea 32(up), up +L(m2e2):mov w0, 24(rp) + adc $0, w3 + dec i + lea 32(rp), rp + jnz L(m2tp) + +L(m2ed):mulx( v1, %rax, w0) + add %rax, w2 + mov (up), %rdx + mulx( v0, %rax, w1) + add w2, %rax + add w3, %rax + mov %rax, (rp) + + mov (%rsp), up C restore `up' to beginning + lea 16(vp), vp + lea 8(rp,n,8), rp C put back rp to old rp + 2 + add $2, n + jge L(cor1) + + push %r14 + push %r15 + +L(outer): + mov (vp), v0 + mov 8(vp), v1 + + lea (n), i + sar $2, i + + mov (up), %rdx + test $1, R8(n) + jnz L(bx1) + +L(bx0): mov (rp), X1 + mov 8(rp), X0 + mulx( v0, %rax, w3) + add %rax, X1 + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + mov 8(up), %rdx + mov X1, (rp) + mulx( v0, %rax, w1) + test $2, R8(n) + jz L(b2) + +L(b0): lea 8(rp), rp + lea 8(up), up + jmp L(lo0) + +L(b2): mov 16(rp), X1 + lea 24(rp), rp + lea 24(up), up + jmp L(lo2) + +L(bx1): mov (rp), X0 + mov 8(rp), X1 + mulx( v0, %rax, w1) + add %rax, X0 + mulx( v1, %rax, w2) + adc $0, w1 + mov X0, (rp) + add %rax, X1 + adc $0, w2 + mov 8(up), %rdx + test $2, R8(n) + jnz L(b3) + +L(b1): lea 16(up), up + lea 16(rp), rp + jmp L(lo1) + +L(b3): mov 16(rp), X0 + lea 32(up), up + mulx( v0, %rax, w3) + inc i + jz L(cj3) + jmp L(lo3) + + ALIGN(16) +L(top): mulx( v0, %rax, w3) + add w0, X1 + adc $0, w2 +L(lo3): add %rax, X1 + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + lea 32(rp), rp + add w1, X1 + mov -16(up), %rdx + mov X1, -24(rp) + adc $0, w3 + add w2, X0 + mov -8(rp), X1 + mulx( v0, %rax, w1) + adc $0, w0 +L(lo2): add %rax, X0 + mulx( v1, %rax, w2) + adc $0, w1 + add w3, X0 + mov X0, -16(rp) + adc $0, w1 + add %rax, X1 + adc $0, w2 + add w0, X1 + mov -8(up), %rdx + adc $0, w2 +L(lo1): mulx( v0, %rax, w3) + add %rax, X1 + adc $0, w3 + mov (rp), X0 + mulx( v1, 
%rax, w0) + add %rax, X0 + adc $0, w0 + add w1, X1 + mov X1, -8(rp) + adc $0, w3 + mov (up), %rdx + add w2, X0 + mulx( v0, %rax, w1) + adc $0, w0 +L(lo0): add %rax, X0 + adc $0, w1 + mulx( v1, %rax, w2) + add w3, X0 + mov 8(rp), X1 + mov X0, (rp) + mov 16(rp), X0 + adc $0, w1 + add %rax, X1 + adc $0, w2 + mov 8(up), %rdx + lea 32(up), up + inc i + jnz L(top) + +L(end): mulx( v0, %rax, w3) + add w0, X1 + adc $0, w2 +L(cj3): add %rax, X1 + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, X0 + add w1, X1 + mov -16(up), %rdx + mov X1, 8(rp) + adc $0, w3 + add w2, X0 + mulx( v0, %rax, w1) + add X0, %rax + add w3, %rax + mov %rax, 16(rp) + + mov 16(%rsp), up C restore `up' to beginning + lea 16(vp), vp + lea 24(rp,n,8), rp C put back rp to old rp + 2 + add $2, n + jl L(outer) + + pop %r15 + pop %r14 + + jnz L(cor0) + +L(cor1):mov (vp), v0 + mov 8(vp), v1 + mov (up), %rdx + mulx( v0, %r12, %rbp) C u0 x v2 + add (rp), %r12 C FIXME: rp[0] still available in reg? + adc %rax, %rbp + mov 8(up), %r10 + imul v0, %r10 + imul v1, %rdx + mov %r12, (rp) + add %r10, %rdx + add %rbp, %rdx + mov %rdx, 8(rp) + pop %rax C deallocate `up' copy + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(cor0):mov (vp), %r11 + imul (up), %r11 + add %rax, %r11 + mov %r11, (rp) + pop %rax C deallocate `up' copy + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + + ALIGN(16) +L(small): + cmp $2, n + jae L(gt1) +L(n1): imul (vp), %rdx + mov %rdx, (rp) + FUNC_EXIT() + ret +L(gt1): ja L(gt2) +L(n2): mov (vp), %r9 + mulx( %r9, %rax, %rdx) + mov %rax, (rp) + mov 8(up), %rax + imul %r9, %rax + add %rax, %rdx + mov 8(vp), %r9 + mov (up), %rcx + imul %r9, %rcx + add %rcx, %rdx + mov %rdx, 8(rp) + FUNC_EXIT() + ret +L(gt2): +L(n3): mov (vp), %r9 + mulx( %r9, %rax, %r10) C u0 x v0 + mov %rax, (rp) + mov 8(up), %rdx + mulx( %r9, %rax, %rdx) C u1 x v0 + imul 16(up), %r9 C u2 x v0 + add %rax, %r10 + adc %rdx, %r9 + mov 8(vp), %r11 + mov (up), %rdx + mulx( %r11, %rax, %rdx) C u0 x v1 + add %rax, %r10 + adc %rdx, %r9 + imul 8(up), %r11 C u1 x v1 + add %r11, %r9 + mov %r10, 8(rp) + mov 16(vp), %r10 + mov (up), %rax + imul %rax, %r10 C u0 x v2 + add %r10, %r9 + mov %r9, 16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/redc_1.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/redc_1.asm new file mode 100644 index 0000000..b1d6c0a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/redc_1.asm @@ -0,0 +1,437 @@ +dnl AMD64 mpn_redc_1 optimised for Intel Haswell. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bull n/a +C AMD pile n/a +C AMD steam ? +C AMD bobcat n/a +C AMD jaguar ? +C Intel P4 n/a +C Intel core n/a +C Intel NHM n/a +C Intel SBR n/a +C Intel IBR n/a +C Intel HWL 2.32 +C Intel BWL ? +C Intel atom n/a +C VIA nano n/a + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Micro-optimise. +C * Consider inlining mpn_add_n. Tests indicate that this saves just 1-2 +C cycles, though. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`mp_param', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`u0inv_param', `%r8') C stack + +define(`i', `%r14') +define(`j', `%r15') +define(`mp', `%rdi') +define(`u0inv', `(%rsp)') C stack + +ABI_SUPPORT(DOS64) C FIXME: needs verification +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_redc_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + push rp + mov mp_param, mp C note that rp and mp shares register + mov (up), %rdx + + neg n + push %r8 C put u0inv on stack + imul u0inv_param, %rdx C first iteration q0 + mov n, j C outer loop induction var + + test $1, R8(n) + jnz L(bx1) + +L(bx0): test $2, R8(n) + jz L(o0b) + + cmp $-2, R32(n) + jnz L(o2) + +C Special code for n = 2 since general code cannot handle it + mov 8(%rsp), %rbx C rp + lea 16(%rsp), %rsp C deallocate two slots + mulx( (mp), %r9, %r12) + mulx( 8,(mp), %r11, %r10) + add %r12, %r11 + adc $0, %r10 + add (up), %r9 C = 0 + adc 8(up), %r11 C r11 = up[1] + adc $0, %r10 C -> up[0] + mov %r11, %rdx + imul u0inv_param, %rdx + mulx( (mp), %r13, %r12) + mulx( 8,(mp), %r14, %r15) + xor R32(%rax), R32(%rax) + add %r12, %r14 + adc $0, %r15 + add %r11, %r13 C = 0 + adc 16(up), %r14 C rp[2] + adc $0, %r15 C -> up[1] + add %r14, %r10 + adc 24(up), %r15 + mov %r10, (%rbx) + mov %r15, 8(%rbx) + setc R8(%rax) + jmp L(ret) + +L(o2): lea 2(n), i C inner loop induction var + mulx( (mp), %r9, %r8) + mulx( 8,(mp), %r11, %r10) + sar $2, i + add %r8, %r11 + jmp L(lo2) + + ALIGN(16) +L(tp2): adc %rax, %r9 + lea 32(up), up + adc %r8, %r11 +L(lo2): mulx( 16,(mp), %r13, %r12) + mov (up), %r8 + mulx( 24,(mp), %rbx, %rax) + lea 32(mp), mp + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov 8(up), %r10 + mov 16(up), %r12 + add %r9, %r8 + mov 24(up), %rbp + mov %r8, (up) + adc %r11, %r10 + mulx( (mp), %r9, %r8) + mov %r10, 8(up) + adc %r13, %r12 + mov %r12, 16(up) + adc %rbx, %rbp + mulx( 8,(mp), %r11, %r10) + mov %rbp, 24(up) + inc i + jnz L(tp2) + +L(ed2): mov 56(up,n,8), %rdx C next iteration up[0] + lea 16(mp,n,8), mp C mp = (last starting mp) + adc %rax, %r9 + adc %r8, %r11 + mov 32(up), %r8 + adc $0, %r10 + imul u0inv, %rdx C next iteration q0 + mov 40(up), %rax + add %r9, %r8 + mov %r8, 32(up) + adc %r11, %rax + mov %rax, 40(up) + lea 56(up,n,8), up C up = (last starting up) + 1 + adc $0, %r10 + mov %r10, -8(up) + inc j + jnz L(o2) + + jmp L(cj) + + +L(bx1): test $2, R8(n) + jz L(o3a) + +L(o1a): cmp $-1, R32(n) + jnz L(o1b) + +C Special code for n = 1 since general code cannot handle it + mov 8(%rsp), %rbx C rp + lea 16(%rsp), %rsp C deallocate two slots + mulx( (mp), %r11, %r10) + add (up), %r11 + adc 8(up), %r10 + mov 
%r10, (%rbx) + mov $0, R32(%rax) + setc R8(%rax) + jmp L(ret) + +L(o1b): lea 24(mp), mp +L(o1): lea 1(n), i C inner loop induction var + mulx( -24,(mp), %r11, %r10) + mulx( -16,(mp), %r13, %r12) + mulx( -8,(mp), %rbx, %rax) + sar $2, i + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov (up), %r10 + mov 8(up), %r12 + mov 16(up), %rbp + add %r11, %r10 + jmp L(lo1) + + ALIGN(16) +L(tp1): adc %rax, %r9 + lea 32(up), up + adc %r8, %r11 + mulx( 16,(mp), %r13, %r12) + mov -8(up), %r8 + mulx( 24,(mp), %rbx, %rax) + lea 32(mp), mp + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov (up), %r10 + mov 8(up), %r12 + add %r9, %r8 + mov 16(up), %rbp + mov %r8, -8(up) + adc %r11, %r10 +L(lo1): mulx( (mp), %r9, %r8) + mov %r10, (up) + adc %r13, %r12 + mov %r12, 8(up) + adc %rbx, %rbp + mulx( 8,(mp), %r11, %r10) + mov %rbp, 16(up) + inc i + jnz L(tp1) + +L(ed1): mov 48(up,n,8), %rdx C next iteration up[0] + lea 40(mp,n,8), mp C mp = (last starting mp) + adc %rax, %r9 + adc %r8, %r11 + mov 24(up), %r8 + adc $0, %r10 + imul u0inv, %rdx C next iteration q0 + mov 32(up), %rax + add %r9, %r8 + mov %r8, 24(up) + adc %r11, %rax + mov %rax, 32(up) + lea 48(up,n,8), up C up = (last starting up) + 1 + adc $0, %r10 + mov %r10, -8(up) + inc j + jnz L(o1) + + jmp L(cj) + +L(o3a): cmp $-3, R32(n) + jnz L(o3b) + +C Special code for n = 3 since general code cannot handle it +L(n3): mulx( (mp), %rbx, %rax) + mulx( 8,(mp), %r9, %r14) + add (up), %rbx + mulx( 16,(mp), %r11, %r10) + adc %rax, %r9 C W 1 + adc %r14, %r11 C W 2 + mov 8(up), %r14 + mov u0inv_param, %rdx + adc $0, %r10 C W 3 + mov 16(up), %rax + add %r9, %r14 C W 1 + mov %r14, 8(up) + mulx( %r14, %rdx, %r13) C next iteration q0 + adc %r11, %rax C W 2 + mov %rax, 16(up) + adc $0, %r10 C W 3 + mov %r10, (up) + lea 8(up), up C up = (last starting up) + 1 + inc j + jnz L(n3) + + jmp L(cj) + +L(o3b): lea 8(mp), mp +L(o3): lea 4(n), i C inner loop induction var + mulx( -8,(mp), %rbx, %rax) + mulx( (mp), %r9, %r8) + mov (up), %rbp + mulx( 8,(mp), %r11, %r10) + sar $2, i + add %rbx, %rbp + nop + adc %rax, %r9 + jmp L(lo3) + + ALIGN(16) +L(tp3): adc %rax, %r9 + lea 32(up), up +L(lo3): adc %r8, %r11 + mulx( 16,(mp), %r13, %r12) + mov 8(up), %r8 + mulx( 24,(mp), %rbx, %rax) + lea 32(mp), mp + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov 16(up), %r10 + mov 24(up), %r12 + add %r9, %r8 + mov 32(up), %rbp + mov %r8, 8(up) + adc %r11, %r10 + mulx( (mp), %r9, %r8) + mov %r10, 16(up) + adc %r13, %r12 + mov %r12, 24(up) + adc %rbx, %rbp + mulx( 8,(mp), %r11, %r10) + mov %rbp, 32(up) + inc i + jnz L(tp3) + +L(ed3): mov 64(up,n,8), %rdx C next iteration up[0] + lea 24(mp,n,8), mp C mp = (last starting mp) + adc %rax, %r9 + adc %r8, %r11 + mov 40(up), %r8 + adc $0, %r10 + imul u0inv, %rdx C next iteration q0 + mov 48(up), %rax + add %r9, %r8 + mov %r8, 40(up) + adc %r11, %rax + mov %rax, 48(up) + lea 64(up,n,8), up C up = (last starting up) + 1 + adc $0, %r10 + mov %r10, -8(up) + inc j + jnz L(o3) + + jmp L(cj) + +L(o0b): lea 16(mp), mp +L(o0): mov n, i C inner loop induction var + mulx( -16,(mp), %r13, %r12) + mulx( -8,(mp), %rbx, %rax) + sar $2, i + add %r12, %rbx + adc $0, %rax + mov (up), %r12 + mov 8(up), %rbp + mulx( (mp), %r9, %r8) + add %r13, %r12 + jmp L(lo0) + + ALIGN(16) +L(tp0): adc %rax, %r9 + lea 32(up), up + adc %r8, %r11 + mulx( 16,(mp), %r13, %r12) + mov -16(up), %r8 + mulx( 24,(mp), %rbx, %rax) + lea 32(mp), mp + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov -8(up), %r10 + mov (up), %r12 + add %r9, %r8 + mov 8(up), %rbp + mov %r8, -16(up) + adc 
%r11, %r10 + mulx( (mp), %r9, %r8) + mov %r10, -8(up) + adc %r13, %r12 + mov %r12, (up) +L(lo0): adc %rbx, %rbp + mulx( 8,(mp), %r11, %r10) + mov %rbp, 8(up) + inc i + jnz L(tp0) + +L(ed0): mov 40(up,n,8), %rdx C next iteration up[0] + lea 32(mp,n,8), mp C mp = (last starting mp) + adc %rax, %r9 + adc %r8, %r11 + mov 16(up), %r8 + adc $0, %r10 + imul u0inv, %rdx C next iteration q0 + mov 24(up), %rax + add %r9, %r8 + mov %r8, 16(up) + adc %r11, %rax + mov %rax, 24(up) + lea 40(up,n,8), up C up = (last starting up) + 1 + adc $0, %r10 + mov %r10, -8(up) + inc j + jnz L(o0) + +L(cj): +IFSTD(` mov 8(%rsp), %rdi C param 1: rp + lea 16-8(%rsp), %rsp C deallocate 2, add back for alignment + lea (up,n,8), %rdx C param 3: up - n + neg R32(n) ') C param 4: n + +IFDOS(` mov up, %rdx C param 2: up + lea (up,n,8), %r8 C param 3: up - n + neg R32(n) + mov n, %r9 C param 4: n + mov 8(%rsp), %rcx C param 1: rp + lea 16-32-8(%rsp), %rsp') C deallocate 2, allocate shadow, align + + ASSERT(nz, `test $15, %rsp') + CALL( mpn_add_n) + +IFSTD(` lea 8(%rsp), %rsp ') +IFDOS(` lea 32+8(%rsp), %rsp') + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreihwl/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreihwl/sqr_basecase.asm new file mode 100644 index 0000000..641cdf3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreihwl/sqr_basecase.asm @@ -0,0 +1,506 @@ +dnl AMD64 mpn_sqr_basecase optimised for Intel Haswell. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1 +C AMD K8,K9 n/a n/a n/a +C AMD K10 n/a n/a n/a +C AMD bull n/a n/a n/a +C AMD pile n/a n/a n/a +C AMD steam ? ? ? +C AMD bobcat n/a n/a n/a +C AMD jaguar ? ? ? +C Intel P4 n/a n/a n/a +C Intel core n/a n/a n/a +C Intel NHM n/a n/a n/a +C Intel SBR n/a n/a n/a +C Intel IBR n/a n/a n/a +C Intel HWL 1.86 2.15 ~2.5 +C Intel BWL ? ? ? +C Intel atom n/a n/a n/a +C VIA nano n/a n/a n/a + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund, except +C that the sqr_diag_addlsh1 loop was manually written. + +C TODO +C * Replace current unoptimised sqr_diag_addlsh1 loop; 1.75 c/l might be +C possible. +C * Consider splitting outer loop into 2, one for n = 1 (mod 2) and one for +C n = 0 (mod 2). 
These loops could fall into specific "corner" code. +C * Consider splitting outer loop into 4. +C * Streamline pointer updates. +C * Perhaps suppress a few more xor insns in feed-in code. +C * Make sure we write no dead registers in feed-in code. +C * We might use 32-bit size ops, since n >= 2^32 is non-terminating. Watch +C out for negative sizes being zero-extended, though. +C * Provide straight-line code for n = 4; then look for simplifications in +C main code. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') + + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_sqr_basecase) + FUNC_ENTRY(3) + + cmp $2, un_param + jae L(gt1) + + mov (up), %rdx + mulx( %rdx, %rax, %rdx) + mov %rax, (rp) + mov %rdx, 8(rp) + FUNC_EXIT() + ret + +L(gt1): jne L(gt2) + + mov (up), %rdx + mov 8(up), %rcx + mulx( %rcx, %r9, %r10) C v0 * v1 W 1 2 + mulx( %rdx, %rax, %r8) C v0 * v0 W 0 1 + mov %rcx, %rdx + mulx( %rdx, %r11, %rdx) C v1 * v1 W 2 3 + add %r9, %r9 C W 1 + adc %r10, %r10 C W 2 + adc $0, %rdx C W 3 + add %r9, %r8 C W 1 + adc %r11, %r10 C W 2 + adc $0, %rdx C W 3 + mov %rax, (rp) + mov %r8, 8(rp) + mov %r10, 16(rp) + mov %rdx, 24(rp) + FUNC_EXIT() + ret + +L(gt2): cmp $4, un_param + jae L(gt3) +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%r10') +define(`w2', `%r11') + + mov (up), v0 + mov 8(up), %rdx + mov %rdx, v1 + mulx( v0, w2, %rax) + mov 16(up), %rdx + mulx( v0, w0, %rcx) + mov w2, %r8 + add %rax, w0 + adc $0, %rcx + mulx( v1, %rdx, %rax) + add %rcx, %rdx + mov %rdx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + xor R32(%rcx), R32(%rcx) + mov (up), %rdx + mulx( %rdx, %rax, w2) + mov %rax, (rp) + add %r8, %r8 + adc w0, w0 + setc R8(%rcx) + mov 8(up), %rdx + mulx( %rdx, %rax, %rdx) + add w2, %r8 + adc %rax, w0 + mov %r8, 8(rp) + mov w0, 16(rp) + mov 24(rp), %r8 + mov 32(rp), w0 + lea (%rdx,%rcx), w2 + adc %r8, %r8 + adc w0, w0 + setc R8(%rcx) + mov 16(up), %rdx + mulx( %rdx, %rax, %rdx) + add w2, %r8 + adc %rax, w0 + mov %r8, 24(rp) + mov w0, 32(rp) + adc %rcx, %rdx + mov %rdx, 40(rp) + FUNC_EXIT() + ret + +L(gt3): + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%r10') +define(`w1', `%r11') +define(`w2', `%rbx') +define(`w3', `%rbp') +define(`un', `%r12') +define(`n', `%rcx') + +define(`X0', `%r13') +define(`X1', `%r14') + +L(do_mul_2): + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + mov $0, R32(un) + sub un_param, un C free up rdx + push un + mov (up), v0 + mov 8(up), %rdx + lea 2(un), n + sar $2, n C FIXME: suppress, change loop? 
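+	C At this point un = -un_param and n = (un + 2) >> 2, the
+	C (negative) trip count of the 4-way mul_2 loop below.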
+ inc un C decrement |un| + mov %rdx, v1 + + test $1, R8(un) + jnz L(mx1) + +L(mx0): mulx( v0, w2, w1) + mov 16(up), %rdx + mov w2, 8(rp) + xor w2, w2 + mulx( v0, w0, w3) + test $2, R8(un) + jz L(m00) + +L(m10): lea -8(rp), rp + lea -8(up), up + jmp L(mlo2) + +L(m00): lea 8(up), up + lea 8(rp), rp + jmp L(mlo0) + +L(mx1): mulx( v0, w0, w3) + mov 16(up), %rdx + mov w0, 8(rp) + xor w0, w0 + mulx( v0, w2, w1) + test $2, R8(un) + jz L(mlo3) + +L(m01): lea 16(rp), rp + lea 16(up), up + jmp L(mlo1) + + ALIGN(32) +L(mtop):mulx( v1, %rax, w0) + add %rax, w2 C 0 + mov (up), %rdx + mulx( v0, %rax, w1) + adc $0, w0 C 1 + add %rax, w2 C 0 +L(mlo1):adc $0, w1 C 1 + add w3, w2 C 0 + mov w2, (rp) C 0 + adc $0, w1 C 1 + mulx( v1, %rax, w2) + add %rax, w0 C 1 + mov 8(up), %rdx + adc $0, w2 C 2 + mulx( v0, %rax, w3) + add %rax, w0 C 1 + adc $0, w3 C 2 +L(mlo0):add w1, w0 C 1 + mov w0, 8(rp) C 1 + adc $0, w3 C 2 + mulx( v1, %rax, w0) + add %rax, w2 C 2 + mov 16(up), %rdx + mulx( v0, %rax, w1) + adc $0, w0 C 3 + add %rax, w2 C 2 + adc $0, w1 C 3 +L(mlo3):add w3, w2 C 2 + mov w2, 16(rp) C 2 + adc $0, w1 C 3 + mulx( v1, %rax, w2) + add %rax, w0 C 3 + mov 24(up), %rdx + adc $0, w2 C 4 + mulx( v0, %rax, w3) + add %rax, w0 C 3 + adc $0, w3 C 4 +L(mlo2):add w1, w0 C 3 + lea 32(up), up + mov w0, 24(rp) C 3 + adc $0, w3 C 4 + inc n + lea 32(rp), rp + jnz L(mtop) + +L(mend):mulx( v1, %rdx, %rax) + add %rdx, w2 + adc $0, %rax + add w3, w2 + mov w2, (rp) + adc $0, %rax + mov %rax, 8(rp) + + lea 16(up), up + lea -16(rp), rp + +L(do_addmul_2): +L(outer): + lea (up,un,8), up C put back up to 2 positions above last time + lea 48(rp,un,8), rp C put back rp to 4 positions above last time + + mov -8(up), v0 C shared between addmul_2 and corner + + add $2, un C decrease |un| + cmp $-2, un + jge L(corner) + + mov (up), v1 + + lea 1(un), n + sar $2, n C FIXME: suppress, change loop? 
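+	C n = (un + 1) >> 2 here: the negative trip count of the 4-way
+	C addmul_2 loop below, counted up towards zero.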
+ + mov v1, %rdx + test $1, R8(un) + jnz L(bx1) + +L(bx0): mov (rp), X0 + mov 8(rp), X1 + mulx( v0, %rax, w1) + add %rax, X0 + adc $0, w1 + mov X0, (rp) + xor w2, w2 + test $2, R8(un) + jnz L(b10) + +L(b00): mov 8(up), %rdx + lea 16(rp), rp + lea 16(up), up + jmp L(lo0) + +L(b10): mov 8(up), %rdx + mov 16(rp), X0 + lea 32(up), up + inc n + mulx( v0, %rax, w3) + jz L(ex) + jmp L(lo2) + +L(bx1): mov (rp), X1 + mov 8(rp), X0 + mulx( v0, %rax, w3) + mov 8(up), %rdx + add %rax, X1 + adc $0, w3 + xor w0, w0 + mov X1, (rp) + mulx( v0, %rax, w1) + test $2, R8(un) + jz L(b11) + +L(b01): mov 16(rp), X1 + lea 24(rp), rp + lea 24(up), up + jmp L(lo1) + +L(b11): lea 8(rp), rp + lea 8(up), up + jmp L(lo3) + + ALIGN(32) +L(top): mulx( v0, %rax, w3) + add w0, X1 + adc $0, w2 +L(lo2): add %rax, X1 + adc $0, w3 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + lea 32(rp), rp + add w1, X1 + mov -16(up), %rdx + mov X1, -24(rp) + adc $0, w3 + add w2, X0 + mov -8(rp), X1 + mulx( v0, %rax, w1) + adc $0, w0 +L(lo1): add %rax, X0 + mulx( v1, %rax, w2) + adc $0, w1 + add w3, X0 + mov X0, -16(rp) + adc $0, w1 + add %rax, X1 + adc $0, w2 + add w0, X1 + mov -8(up), %rdx + adc $0, w2 +L(lo0): mulx( v0, %rax, w3) + add %rax, X1 + adc $0, w3 + mov (rp), X0 + mulx( v1, %rax, w0) + add %rax, X0 + adc $0, w0 + add w1, X1 + mov X1, -8(rp) + adc $0, w3 + mov (up), %rdx + add w2, X0 + mulx( v0, %rax, w1) + adc $0, w0 +L(lo3): add %rax, X0 + adc $0, w1 + mulx( v1, %rax, w2) + add w3, X0 + mov 8(rp), X1 + mov X0, (rp) + mov 16(rp), X0 + adc $0, w1 + add %rax, X1 + adc $0, w2 + mov 8(up), %rdx + lea 32(up), up + inc n + jnz L(top) + +L(end): mulx( v0, %rax, w3) + add w0, X1 + adc $0, w2 +L(ex): add %rax, X1 + adc $0, w3 + mulx( v1, %rdx, %rax) + add w1, X1 + mov X1, 8(rp) + adc $0, w3 + add w2, %rdx + adc $0, %rax + add %rdx, w3 + mov w3, 16(rp) + adc $0, %rax + mov %rax, 24(rp) + + jmp L(outer) C loop until a small corner remains + +L(corner): + pop un + mov (up), %rdx + jg L(small_corner) + + mov %rdx, v1 + mov (rp), X0 + mov %rax, X1 C Tricky rax reuse of last iteration + mulx( v0, %rax, w1) + add %rax, X0 + adc $0, w1 + mov X0, (rp) + mov 8(up), %rdx + mulx( v0, %rax, w3) + add %rax, X1 + adc $0, w3 + mulx( v1, %rdx, %rax) + add w1, X1 + mov X1, 8(rp) + adc $0, w3 + add w3, %rdx + mov %rdx, 16(rp) + adc $0, %rax + mov %rax, 24(rp) + lea 32(rp), rp + lea 16(up), up + jmp L(com) + +L(small_corner): + mulx( v0, X1, w3) + add %rax, X1 C Tricky rax reuse of last iteration + adc $0, w3 + mov X1, (rp) + mov w3, 8(rp) + lea 16(rp), rp + lea 8(up), up + +L(com): + +L(sqr_diag_addlsh1): + lea 8(up,un,8), up C put back up at its very beginning + lea (rp,un,8), rp + lea (rp,un,8), rp C put back rp at its very beginning + inc un + + mov -8(up), %rdx + xor R32(%rbx), R32(%rbx) C clear CF as side effect + mulx( %rdx, %rax, %r10) + mov %rax, 8(rp) + mov 16(rp), %r8 + mov 24(rp), %r9 + jmp L(dm) + + ALIGN(16) +L(dtop):mov 32(rp), %r8 + mov 40(rp), %r9 + lea 16(rp), rp + lea (%rdx,%rbx), %r10 +L(dm): adc %r8, %r8 + adc %r9, %r9 + setc R8(%rbx) + mov (up), %rdx + lea 8(up), up + mulx( %rdx, %rax, %rdx) + add %r10, %r8 + adc %rax, %r9 + mov %r8, 16(rp) + mov %r9, 24(rp) + inc un + jnz L(dtop) + +L(dend):adc %rbx, %rdx + mov %rdx, 32(rp) + + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/aorrlsh_n.asm new file mode 100644 index 0000000..eed64e7 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreinhm/aorrlsh_n.asm @@ -0,0 
+1,200 @@ +dnl AMD64 mpn_addlsh_n -- rp[] = up[] + (vp[] << k) +dnl AMD64 mpn_rsblsh_n -- rp[] = (vp[] << k) - up[] +dnl Optimised for Nehalem. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 4.75 +C Intel P4 ? +C Intel core2 2.8-3 +C Intel NHM 2.8 +C Intel SBR 3.55 +C Intel atom ? +C VIA nano ? + +C The inner-loop probably runs close to optimally on Nehalem (using 4-way +C unrolling). The rest of the code is quite crude, and could perhaps be made +C both smaller and faster. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cnt', `%r8') +define(`cy', `%r9') C for _nc variant + +ifdef(`OPERATION_addlsh_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(IFRSB, ) + define(func_n, mpn_addlsh_n) + define(func_nc, mpn_addlsh_nc)') +ifdef(`OPERATION_rsblsh_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(IFRSB, `$1') + define(func_n, mpn_rsblsh_n) + define(func_nc, mpn_rsblsh_nc)') + +C mpn_rsblsh_nc removed below, its idea of carry-in is inconsistent with +C refmpn_rsblsh_nc +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(func_n) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') C cnt + push %rbx + xor R32(%rbx), R32(%rbx) C clear CF save register +L(ent): push %rbp + mov R32(n), R32(%rbp) + mov n, %rax + + mov R32(cnt), R32(%rcx) + neg R32(%rcx) + + lea -8(up,%rax,8), up + lea -8(vp,%rax,8), vp + lea -40(rp,%rax,8), rp + neg %rax + + and $3, R32(%rbp) + jz L(b0) + cmp $2, R32(%rbp) + jc L(b1) + jz L(b2) + +L(b3): xor R32(%r9), R32(%r9) + mov 8(vp,%rax,8), %r10 + mov 16(vp,%rax,8), %r11 + shrd %cl, %r10, %r9 + shrd %cl, %r11, %r10 + add R32(%rbx), R32(%rbx) + ADCSBB 8(up,%rax,8), %r9 + mov 24(vp,%rax,8), %r8 + ADCSBB 16(up,%rax,8), %r10 + sbb R32(%rbx), R32(%rbx) + add $3, %rax + jmp L(lo3) + +L(b0): mov 8(vp,%rax,8), %r9 + xor R32(%r8), R32(%r8) + shrd %cl, %r9, %r8 + mov 16(vp,%rax,8), %r10 + mov 24(vp,%rax,8), %r11 + shrd %cl, %r10, %r9 + shrd %cl, %r11, %r10 + add R32(%rbx), R32(%rbx) + ADCSBB 8(up,%rax,8), %r8 + mov %r8, 40(rp,%rax,8) C offset 40 + ADCSBB 16(up,%rax,8), %r9 + mov 32(vp,%rax,8), %r8 + ADCSBB 24(up,%rax,8), %r10 + sbb R32(%rbx), R32(%rbx) + add $4, %rax + jmp L(lo0) + +L(b1): mov 8(vp,%rax,8), %r8 + add $1, %rax + jz L(1) + mov 8(vp,%rax,8), %r9 
+ xor R32(%rbp), R32(%rbp) + jmp L(lo1) +L(1): xor R32(%r11), R32(%r11) + jmp L(wd1) + +L(b2): xor %r10, %r10 + mov 8(vp,%rax,8), %r11 + shrd %cl, %r11, %r10 + add R32(%rbx), R32(%rbx) + mov 16(vp,%rax,8), %r8 + ADCSBB 8(up,%rax,8), %r10 + sbb R32(%rbx), R32(%rbx) + add $2, %rax + jz L(end) + + ALIGN(16) +L(top): mov 8(vp,%rax,8), %r9 + mov %r11, %rbp +L(lo2): mov %r10, 24(rp,%rax,8) C offset 24 +L(lo1): shrd %cl, %r8, %rbp + shrd %cl, %r9, %r8 + mov 16(vp,%rax,8), %r10 + mov 24(vp,%rax,8), %r11 + shrd %cl, %r10, %r9 + shrd %cl, %r11, %r10 + add R32(%rbx), R32(%rbx) + ADCSBB (up,%rax,8), %rbp + ADCSBB 8(up,%rax,8), %r8 + mov %r8, 40(rp,%rax,8) C offset 40 + ADCSBB 16(up,%rax,8), %r9 + mov 32(vp,%rax,8), %r8 + ADCSBB 24(up,%rax,8), %r10 + sbb R32(%rbx), R32(%rbx) + add $4, %rax + mov %rbp, (rp,%rax,8) C offset 32 +L(lo0): +L(lo3): mov %r9, 16(rp,%rax,8) C offset 48 + jnz L(top) + +L(end): mov %r10, 24(rp,%rax,8) +L(wd1): shrd %cl, %r8, %r11 + add R32(%rbx), R32(%rbx) + ADCSBB (up,%rax,8), %r11 + mov %r11, 32(rp,%rax,8) C offset 32 + adc R32(%rax), R32(%rax) C rax is zero after loop + shr R8(%rcx), %r8 + ADDSUB %r8, %rax +IFRSB( neg %rax) + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') C cnt +IFDOS(` mov 64(%rsp), %r9 ') C cy + push %rbx + neg cy + sbb R32(%rbx), R32(%rbx) C initialise CF save register + jmp L(ent) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/aorsmul_1.asm new file mode 100644 index 0000000..1be829f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreinhm/aorsmul_1.asm @@ -0,0 +1,190 @@ +dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Nehalem. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.0 +C AMD K10 4.0 +C AMD bull 5.0 +C AMD pile 4.84 5.39 +C AMD steam +C AMD excavator +C AMD bobcat 5.56 +C AMD jaguar 5.30 +C Intel P4 15.7 17.2 +C Intel core2 5.15 +C Intel NHM 4.56 +C Intel SBR 3.44 +C Intel HWL 3.03 +C Intel BWL 2.77 +C Intel SKL 2.76 +C Intel atom 21 +C Intel SLM 11 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C N.B.: Be careful if editing, making sure the loop alignment padding does not +C become large, as we currently fall into it. 
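+
+C For reference, mpn_addmul_1 computes rp[0..n-1] += up[0..n-1] * v0 and
+C returns the limb carried out at the top; mpn_submul_1 subtracts the
+C product instead and returns the borrow.  A hedged C sketch of the add
+C flavour (the name addmul_1_ref is illustrative; 64-bit limbs assumed):
+C
+C	mp_limb_t
+C	addmul_1_ref (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
+C	              mp_limb_t v0)
+C	{
+C	  mp_limb_t cy = 0;
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      unsigned __int128 p =
+C	        (unsigned __int128) up[i] * v0 + rp[i] + cy;
+C	      rp[i] = (mp_limb_t) p;
+C	      cy = (mp_limb_t) (p >> 64);
+C	    }
+C	  return cy;
+C	}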
+ +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 + +define(`n', `%rbx') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + push %rbx + + mov (up), %rax + lea -8(up,n_param,8), up + mov (rp), %r8 + lea -8(rp,n_param,8), rp + + test $1, R8(n_param) + jnz L(bx1) + +L(bx0): test $2, R8(n_param) + jnz L(b10) + +L(b00): mov $3, R32(n) + sub n_param, n + mul v0 + mov $0, R32(%r11) + mov %r8, %r10 + ADDSUB %rax, %r10 + mov -8(up,n,8), %rax + adc %rdx, %r11 + jmp L(lo0) + +L(b10): mov $1, R32(n) + sub n_param, n + mul v0 + mov %r8, %r10 + mov $0, R32(%r11) + ADDSUB %rax, %r10 + mov 8(up,n,8), %rax + adc %rdx, %r11 + jmp L(lo2) + +L(bx1): test $2, R8(n_param) + jz L(b01) + +L(b11): mov $2, R32(n) + sub n_param, n + mul v0 + ADDSUB %rax, %r8 + mov $0, R32(%r9) + mov (up,n,8), %rax + adc %rdx, %r9 + jmp L(lo3) + +L(b01): mov $0, R32(n) + sub n_param, n + xor %r11, %r11 + add $4, n + jc L(end) + + ALIGN(32) +L(top): mul v0 + ADDSUB %rax, %r8 + mov $0, R32(%r9) + mov -16(up,n,8), %rax + adc %rdx, %r9 +L(lo1): mul v0 + ADDSUB %r11, %r8 + mov $0, R32(%r11) + mov -16(rp,n,8), %r10 + adc $0, %r9 + ADDSUB %rax, %r10 + mov -8(up,n,8), %rax + adc %rdx, %r11 + mov %r8, -24(rp,n,8) + ADDSUB %r9, %r10 + adc $0, %r11 +L(lo0): mov -8(rp,n,8), %r8 + mul v0 + ADDSUB %rax, %r8 + mov $0, R32(%r9) + mov (up,n,8), %rax + adc %rdx, %r9 + mov %r10, -16(rp,n,8) + ADDSUB %r11, %r8 + adc $0, %r9 +L(lo3): mul v0 + mov (rp,n,8), %r10 + mov $0, R32(%r11) + ADDSUB %rax, %r10 + mov 8(up,n,8), %rax + adc %rdx, %r11 + mov %r8, -8(rp,n,8) + ADDSUB %r9, %r10 + adc $0, %r11 +L(lo2): mov 8(rp,n,8), %r8 + mov %r10, (rp,n,8) + add $4, n + jnc L(top) + +L(end): mul v0 + ADDSUB %rax, %r8 + mov $0, R32(%rax) + adc %rdx, %rax + ADDSUB %r11, %r8 + adc $0, %rax + mov %r8, (rp) + + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/coreinhm/gmp-mparam.h new file mode 100644 index 0000000..f56c128 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreinhm/gmp-mparam.h @@ -0,0 +1,238 @@ +/* Nehalem gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 2933-3200 MHz Intel Xeon X3470 Nehalem */ +/* FFT tuning limit = 468,424,931 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 16 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 10 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 17 + +#define DIV_1_VS_MUL_1_PERCENT 301 + +#define MUL_TOOM22_THRESHOLD 18 +#define MUL_TOOM33_THRESHOLD 59 +#define MUL_TOOM44_THRESHOLD 169 +#define MUL_TOOM6H_THRESHOLD 230 +#define MUL_TOOM8H_THRESHOLD 333 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 110 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 104 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 101 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 147 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 28 +#define SQR_TOOM3_THRESHOLD 98 +#define SQR_TOOM4_THRESHOLD 250 +#define SQR_TOOM6_THRESHOLD 351 +#define SQR_TOOM8_THRESHOLD 478 + +#define MULMID_TOOM42_THRESHOLD 28 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 13 + +#define MUL_FFT_MODF_THRESHOLD 372 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 372, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 10, 5}, { 21, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31, 8}, { 511,10}, \ + { 135,11}, { 79,10}, { 159, 9}, { 319,11}, \ + { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \ + { 319,12}, { 95,11}, { 191,10}, { 383,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 303,10}, { 607,11}, { 319,10}, { 639,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,12}, { 223,11}, \ + { 447,10}, { 895,13}, { 127,12}, { 255,11}, \ + { 543,12}, { 287,11}, { 607,12}, { 319,11}, \ + { 639,12}, { 351,11}, { 703,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,11}, { 895,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 543,11}, { 1087,12}, { 607,13}, \ + { 319,12}, { 703,13}, { 383,12}, { 831,13}, \ + { 447,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1087,13}, { 575,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1279,13}, { 703,12}, { 1407,14}, \ + { 383,13}, { 831,12}, { 1663,13}, { 959,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1215,12}, \ + { 2431,14}, { 639,13}, { 1343,12}, { 2687,13}, \ + { 1407,12}, { 2815,13}, { 1471,14}, { 767,13}, \ + { 1663,14}, { 895,13}, { 1791,15}, { 511,14}, \ + { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2815,15}, { 767,14}, { 1663,13}, { 3455,14}, \ + { 1919,16}, { 511,15}, { 1023,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \ + { 
1535,14}, { 3455,15}, { 1791,14}, { 3839,16}, \ + { 1023,15}, { 2047,14}, { 4223,15}, { 2303,14}, \ + { 4863,15}, { 2815,14}, { 5887,16}, { 1535,15}, \ + { 3327,14}, { 6911,15}, { 3839,17}, { 1023,16}, \ + { 2047,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7679,14}, { 15359,17}, { 2047,16}, { 4607,15}, \ + { 9983,16}, { 5631,15}, { 11775,17}, { 3071,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 204 +#define MUL_FFT_THRESHOLD 4224 + +#define SQR_FFT_MODF_THRESHOLD 336 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 336, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 135,11}, \ + { 79, 9}, { 319, 6}, { 2687, 7}, { 1407, 9}, \ + { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303, 9}, { 607,10}, \ + { 319,12}, { 95,11}, { 191,10}, { 383,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 303,10}, { 607,11}, { 319,10}, { 639,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,12}, { 223,11}, \ + { 447,10}, { 895,11}, { 479,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,12}, \ + { 287,11}, { 607,12}, { 319,11}, { 671,12}, \ + { 351,11}, { 703,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 415,11}, { 831,12}, { 447,11}, \ + { 895,12}, { 479,14}, { 127,13}, { 255,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \ + { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \ + { 671,11}, { 1343,12}, { 703,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 831,13}, { 447,12}, \ + { 959,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1343,13}, \ + { 703,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 959,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1215,12}, { 2431,14}, \ + { 639,13}, { 1343,12}, { 2687,13}, { 1407,12}, \ + { 2815,13}, { 1471,14}, { 767,13}, { 1663,14}, \ + { 895,13}, { 1791,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \ + { 767,14}, { 1535,13}, { 3071,14}, { 1663,13}, \ + { 3455,14}, { 1919,16}, { 511,15}, { 1023,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,15}, { 1535,14}, { 3455,15}, { 1791,14}, \ + { 3839,16}, { 1023,15}, { 2047,14}, { 4223,15}, \ + { 2303,14}, { 4863,15}, { 2815,14}, { 5887,16}, \ + { 1535,15}, { 3327,14}, { 6911,15}, { 3839,17}, \ + { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6655,16}, \ + { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \ + { 4607,15}, { 9983,14}, { 19967,16}, { 5631,15}, \ + { 11775,17}, { 3071,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 218 +#define SQR_FFT_THRESHOLD 3520 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 49 +#define MULLO_MUL_N_THRESHOLD 8397 +#define 
SQRLO_BASECASE_THRESHOLD 10 +#define SQRLO_DC_THRESHOLD 11 +#define SQRLO_SQR_THRESHOLD 7035 + +#define DC_DIV_QR_THRESHOLD 47 +#define DC_DIVAPPR_Q_THRESHOLD 151 +#define DC_BDIV_QR_THRESHOLD 40 +#define DC_BDIV_Q_THRESHOLD 30 + +#define INV_MULMOD_BNM1_THRESHOLD 34 +#define INV_NEWTON_THRESHOLD 199 +#define INV_APPR_THRESHOLD 157 + +#define BINV_NEWTON_THRESHOLD 254 +#define REDC_1_TO_REDC_N_THRESHOLD 48 + +#define MU_DIV_QR_THRESHOLD 1334 +#define MU_DIVAPPR_Q_THRESHOLD 1334 +#define MUPI_DIV_QR_THRESHOLD 83 +#define MU_BDIV_QR_THRESHOLD 1142 +#define MU_BDIV_Q_THRESHOLD 1308 + +#define POWM_SEC_TABLE 1,64,66,452,1486 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 18 +#define SET_STR_DC_THRESHOLD 141 +#define SET_STR_PRECOMPUTE_THRESHOLD 1023 + +#define FAC_DSC_THRESHOLD 182 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 19 +#define HGCD2_DIV1_METHOD 5 /* 2.91% faster than 3 */ +#define HGCD_THRESHOLD 116 +#define HGCD_APPR_THRESHOLD 164 +#define HGCD_REDUCE_THRESHOLD 2205 +#define GCD_DC_THRESHOLD 321 +#define GCDEXT_DC_THRESHOLD 358 +#define JACOBI_BASE_METHOD 4 /* 0.12% faster than 1 */ + +/* Tuneup completed successfully, took 452116 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/hamdist.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/hamdist.asm new file mode 100644 index 0000000..a5a63e4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreinhm/hamdist.asm @@ -0,0 +1,196 @@ +dnl AMD64 mpn_hamdist -- hamming distance. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 n/a +C AMD K10 3.26 +C AMD bd1 4.2 +C AMD bd2 4.2 +C AMD bd3 ? +C AMD bd4 ? +C AMD zen 1.15 +C AMD bobcat 7.29 +C AMD jaguar 2.53 +C Intel P4 n/a +C Intel core2 n/a +C Intel NHM 2.03 +C Intel SBR 1.66 +C Intel IBR 1.62 +C Intel HWL 1.50 +C Intel BWL 1.50 +C Intel SKL 1.50 +C Intel atom n/a +C Intel SLM 2.55 +C VIA nano n/a + +C TODO +C * An AVX pshufb based variant should approach 0.5 c/l on Haswell and later +C Intel hardware. Perhaps mix such a loop with popcnt instructions. +C * The random placement of the L0, L1, L2, etc blocks are due to branch +C shortening. More work could be done there. +C * Combine the accumulators rax and rcx into one register to save some +C bookkeeping and a push/pop pair. Unfortunately this cause a slight +C slowdown for at leat NHM and SBR. 
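For reference while reading the unrolled popcnt blocks that follow, here is a hedged C sketch of what mpn_hamdist computes: the Hamming distance of two n-limb operands, i.e. the population count of their XOR. __builtin_popcountll is a gcc/clang builtin used purely for illustration; the ref_ name and limb_t alias are not GMP's.

/* Hamming distance of {up,n} and {vp,n}: popcount of the per-limb XOR,
   summed over all limbs.  The asm below does exactly this, split over
   two accumulators (rax, rcx), with a jump table dispatching on n mod 4
   into the 4-way unrolled loop. */
typedef unsigned long long limb_t;

unsigned long ref_hamdist(const limb_t *up, const limb_t *vp, long n)
{
    unsigned long sum = 0;
    for (long i = 0; i < n; i++)
        sum += (unsigned long)__builtin_popcountll(up[i] ^ vp[i]);
    return sum;
}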
+ +define(`up', `%rdi') +define(`vp', `%rsi') +define(`n', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`sum', `lea ($1,$2), $2') +define(`sum', `add $1, $2') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_hamdist) + FUNC_ENTRY(3) + push %rbx + push %rbp + + mov (up), %r10 + xor (vp), %r10 + + mov R32(n), R32(%r8) + and $3, R32(%r8) + + xor R32(%rcx), R32(%rcx) + .byte 0xf3,0x49,0x0f,0xb8,0xc2 C popcnt %r10,%rax + + lea L(tab)(%rip), %r9 +ifdef(`PIC',` + movslq (%r9,%r8,4), %r8 + add %r9, %r8 + jmp *%r8 +',` + jmp *(%r9,%r8,8) +') + +L(3): mov 8(up), %r10 + mov 16(up), %r11 + xor 8(vp), %r10 + xor 16(vp), %r11 + xor R32(%rbp), R32(%rbp) + sub $4, n + jle L(x3) + mov 24(up), %r8 + mov 32(up), %r9 + add $24, up + add $24, vp + jmp L(e3) + +L(0): mov 8(up), %r9 + xor 8(vp), %r9 + mov 16(up), %r10 + mov 24(up), %r11 + xor R32(%rbx), R32(%rbx) + xor 16(vp), %r10 + xor 24(vp), %r11 + add $32, up + add $32, vp + sub $4, n + jle L(x4) + + ALIGN(16) +L(top): +L(e0): .byte 0xf3,0x49,0x0f,0xb8,0xe9 C popcnt %r9,%rbp + mov (up), %r8 + mov 8(up), %r9 + sum( %rbx, %rax) +L(e3): .byte 0xf3,0x49,0x0f,0xb8,0xda C popcnt %r10,%rbx + xor (vp), %r8 + xor 8(vp), %r9 + sum( %rbp, %rcx) +L(e2): .byte 0xf3,0x49,0x0f,0xb8,0xeb C popcnt %r11,%rbp + mov 16(up), %r10 + mov 24(up), %r11 + add $32, up + sum( %rbx, %rax) +L(e1): .byte 0xf3,0x49,0x0f,0xb8,0xd8 C popcnt %r8,%rbx + xor 16(vp), %r10 + xor 24(vp), %r11 + add $32, vp + sum( %rbp, %rcx) + sub $4, n + jg L(top) + +L(x4): .byte 0xf3,0x49,0x0f,0xb8,0xe9 C popcnt %r9,%rbp + sum( %rbx, %rax) +L(x3): .byte 0xf3,0x49,0x0f,0xb8,0xda C popcnt %r10,%rbx + sum( %rbp, %rcx) + .byte 0xf3,0x49,0x0f,0xb8,0xeb C popcnt %r11,%rbp + sum( %rbx, %rax) + sum( %rbp, %rcx) +L(x2): add %rcx, %rax +L(x1): pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(2): mov 8(up), %r11 + xor 8(vp), %r11 + sub $2, n + jle L(n2) + mov 16(up), %r8 + mov 24(up), %r9 + xor R32(%rbx), R32(%rbx) + xor 16(vp), %r8 + xor 24(vp), %r9 + add $16, up + add $16, vp + jmp L(e2) +L(n2): .byte 0xf3,0x49,0x0f,0xb8,0xcb C popcnt %r11,%rcx + jmp L(x2) + +L(1): dec n + jle L(x1) + mov 8(up), %r8 + mov 16(up), %r9 + xor 8(vp), %r8 + xor 16(vp), %r9 + xor R32(%rbp), R32(%rbp) + mov 24(up), %r10 + mov 32(up), %r11 + add $40, up + add $8, vp + jmp L(e1) + +EPILOGUE() + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(1), L(tab)) + JMPENT( L(2), L(tab)) + JMPENT( L(3), L(tab)) diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/popcount.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/popcount.asm new file mode 100644 index 0000000..0a3c867 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreinhm/popcount.asm @@ -0,0 +1,182 @@ +dnl AMD64 mpn_popcount -- population count. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 n/a +C AMD K10 1.39 +C AMD bd1 4 +C AMD bd2 4 +C AMD bd3 ? +C AMD bd4 ? +C AMD zen 0.72 +C AMD bobcat 5.78 +C AMD jaguar 1.27 +C Intel P4 n/a +C Intel core2 n/a +C Intel NHM 1.04 +C Intel SBR 1.02 +C Intel IBR 1.0 +C Intel HWL 1.0 +C Intel BWL 1.0 +C Intel SKL 1.0 +C Intel atom n/a +C Intel SLM 1.34 +C VIA nano n/a + +C TODO +C * We could approach 0.5 c/l for AMD Zen with more unrolling. That would +C not cause any additional feed-in overhead as we already use a jump table. +C * An AVX pshufb based variant should approach 0.5 c/l on Haswell and later +C Intel hardware. Perhaps mix such a loop with popcnt instructions. +C * The random placement of the L0, L1, L2, etc blocks are due to branch +C shortening. + +define(`up', `%rdi') +define(`n', `%rsi') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_popcount) + FUNC_ENTRY(2) + + mov R32(n), R32(%r8) + and $7, R32(%r8) + + .byte 0xf3,0x48,0x0f,0xb8,0x07 C popcnt (up), %rax + xor R32(%rcx), R32(%rcx) + + lea L(tab)(%rip), %r9 +ifdef(`PIC',` + movslq (%r9,%r8,4), %r8 + add %r9, %r8 + jmp *%r8 +',` + jmp *(%r9,%r8,8) +') + +L(3): .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x08 C popcnt 8(up), %r10 + .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x10 C popcnt 16(up), %r11 + add $24, up + sub $8, n + jg L(e34) + add %r10, %rax + add %r11, %rax +L(s1): FUNC_EXIT() + ret + +L(1): sub $8, n + jle L(s1) + .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x08 C popcnt 8(up), %r8 + .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x10 C popcnt 16(up), %r9 + add $8, up + jmp L(e12) + +L(7): .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x08 C popcnt 0x8(%rdi),%r10 + .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x10 C popcnt 0x10(%rdi),%r11 + add $-8, up + jmp L(e07) + +L(0): .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%rcx + .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 C popcnt 0x10(%rdi),%r10 + .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 C popcnt 0x18(%rdi),%r11 + jmp L(e07) + +L(4): .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%rcx + .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 C popcnt 0x10(%rdi),%r10 + .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 C popcnt 0x18(%rdi),%r11 + add $32, up + sub $8, n + jle L(x4) + + ALIGN(16) +L(top): +L(e34): .byte 0xf3,0x4c,0x0f,0xb8,0x07 C popcnt (%rdi),%r8 + .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%r9 + add %r10, %rcx + add %r11, %rax +L(e12): .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 C popcnt 0x10(%rdi),%r10 + .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 C popcnt 0x18(%rdi),%r11 + add %r8, %rcx + add %r9, %rax +L(e07): .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x20 C popcnt 0x20(%rdi),%r8 + .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x28 C popcnt 0x28(%rdi),%r9 + add %r10, %rcx + add %r11, %rax +L(e56): .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x30 C popcnt 0x30(%rdi),%r10 + .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x38 C popcnt 0x38(%rdi),%r11 + add $64, up + add %r8, %rcx + add %r9, %rax + sub $8, n + jg L(top) + +L(x4): add %r10, %rcx + add %r11, %rax +L(x2): add %rcx, %rax + + FUNC_EXIT() + ret + +L(2): .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%rcx + sub $8, n + jle L(x2) + .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x10 C popcnt 0x10(%rdi),%r8 + .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x18 C popcnt 0x18(%rdi),%r9 + add $16, up + jmp L(e12) + +L(5): .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x08 C popcnt 0x8(%rdi),%r8 + 
.byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x10 C popcnt 0x10(%rdi),%r9 + add $-24, up + jmp L(e56) + +L(6): .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 C popcnt 0x8(%rdi),%rcx + .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x10 C popcnt 0x10(%rdi),%r8 + .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x18 C popcnt 0x18(%rdi),%r9 + add $-16, up + jmp L(e56) +EPILOGUE() + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(1), L(tab)) + JMPENT( L(2), L(tab)) + JMPENT( L(3), L(tab)) + JMPENT( L(4), L(tab)) + JMPENT( L(5), L(tab)) + JMPENT( L(6), L(tab)) + JMPENT( L(7), L(tab)) diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/redc_1.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/redc_1.asm new file mode 100644 index 0000000..fc71c1b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreinhm/redc_1.asm @@ -0,0 +1,549 @@ +dnl X86-64 mpn_redc_1 optimised for Intel Nehalem and Westmere. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bull ? +C AMD pile ? +C AMD steam ? +C AMD bobcat ? +C AMD jaguar ? +C Intel P4 ? +C Intel core ? +C Intel NHM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel atom ? +C VIA nano ? + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Micro-optimise, none performed thus far. +C * Consider inlining mpn_add_n. +C * Single basecases out before the pushes. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. 
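For orientation before the register assignments below, a hedged C sketch of the word-by-word Montgomery reduction (REDC) that mpn_redc_1 performs: for each of the n low limbs a quotient limb q = up[i] * u0inv (mod B) is chosen so that adding q*m zeroes that limb, and the per-pass carry is stashed in the limb that was just zeroed. This mirrors GMP's generic structure (an addmul pass per limb, then one mpn_add_n), but the names and the final-subtraction convention here are simplified, not GMP's exact contract.

/* Word-by-word Montgomery reduction (REDC): up[] holds 2n limbs, m[] is
   the n-limb odd modulus, u0inv = -1/m[0] mod B with B = 2^64.  Each
   pass adds q*m*B^i so that up[i] becomes zero; that now-dead slot then
   stores the pass's carry-out.  Assumes 64-bit limbs and unsigned
   __int128; the return/final-subtraction policy is simplified. */
typedef unsigned long long limb_t;

limb_t ref_redc_1(limb_t *rp, limb_t *up, const limb_t *m, long n,
                  limb_t u0inv)
{
    for (long i = 0; i < n; i++) {
        limb_t q = up[i] * u0inv;         /* quotient limb, mod B */
        limb_t cy = 0;
        for (long j = 0; j < n; j++) {    /* up[i..i+n-1] += q * m[j] */
            unsigned __int128 t = (unsigned __int128)q * m[j] + up[i+j] + cy;
            up[i+j] = (limb_t)t;
            cy = (limb_t)(t >> 64);
        }
        up[i] = cy;        /* up[i] is zero by construction: stash cy */
    }
    /* Result = high half of up[] plus the stashed carries; the asm does
       this with the single CALL(mpn_add_n) in L(cj) and returns the
       carry, leaving any final subtraction of m to the caller. */
    limb_t cy = 0;
    for (long i = 0; i < n; i++) {
        unsigned __int128 t = (unsigned __int128)up[n+i] + up[i] + cy;
        rp[i] = (limb_t)t;
        cy = (limb_t)(t >> 64);
    }
    return cy;
}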
+define(`I',`$1') + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`mp_param', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`u0inv', `%r8') C stack + +define(`i', `%r14') +define(`j', `%r15') +define(`mp', `%r12') +define(`q0', `%r13') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`ALIGNx', `ALIGN(16)') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_redc_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov (up), q0 + mov n, j C outer loop induction var + lea (mp_param,n,8), mp + lea (up,n,8), up + neg n + imul u0inv, q0 C first iteration q0 + + test $1, R8(n) + jz L(bx0) + +L(bx1): test $2, R8(n) + jz L(b3) + +L(b1): cmp $-1, R32(n) + jz L(n1) + +L(otp1):lea 3(n), i + mov (mp,n,8), %rax + mov (up,n,8), %rbp + mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov 8(mp,n,8), %rax + adc %rdx, %r9 + mul q0 + mov $0, R32(%r11) + mov 8(up,n,8), %rbx + add %rax, %rbx + mov 16(mp,n,8), %rax + adc %rdx, %r11 + add %r9, %rbx + adc $0, %r11 + mov 16(up,n,8), %rbp + mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov 24(mp,n,8), %rax + adc %rdx, %r9 + mov %rbx, 8(up,n,8) + imul u0inv, %rbx C next q limb + jmp L(e1) + + ALIGNx +L(tp1): mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov -16(mp,i,8), %rax + adc %rdx, %r9 + mul q0 + add %r11, %rbp + mov $0, R32(%r11) + mov -16(up,i,8), %r10 + adc $0, %r9 + add %rax, %r10 + mov -8(mp,i,8), %rax + adc %rdx, %r11 + mov %rbp, -24(up,i,8) + add %r9, %r10 + adc $0, %r11 + mov -8(up,i,8), %rbp + mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov (mp,i,8), %rax + adc %rdx, %r9 + mov %r10, -16(up,i,8) +L(e1): add %r11, %rbp + adc $0, %r9 + mul q0 + mov (up,i,8), %r10 + mov $0, R32(%r11) + add %rax, %r10 + mov 8(mp,i,8), %rax + adc %rdx, %r11 + mov %rbp, -8(up,i,8) + add %r9, %r10 + adc $0, %r11 + mov 8(up,i,8), %rbp + mov %r10, (up,i,8) + add $4, i + jnc L(tp1) + +L(ed1): mul q0 + add %rax, %rbp + adc $0, %rdx + add %r11, %rbp + adc $0, %rdx + mov %rbp, I(-8(up),-24(up,i,8)) + mov %rdx, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp1) + jmp L(cj) + +L(b3): cmp $-3, R32(n) + jz L(n3) + +L(otp3):lea 5(n), i + mov (mp,n,8), %rax + mov (up,n,8), %rbp + mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov 8(mp,n,8), %rax + adc %rdx, %r9 + mul q0 + mov 8(up,n,8), %rbx + mov $0, R32(%r11) + add %rax, %rbx + mov 16(mp,n,8), %rax + adc %rdx, %r11 + add %r9, %rbx + adc $0, %r11 + mov 16(up,n,8), %rbp + mov %rbx, 8(up,n,8) + imul u0inv, %rbx C next q limb +C jmp L(tp3) + + ALIGNx +L(tp3): mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov -16(mp,i,8), %rax + adc %rdx, %r9 + mul q0 + add %r11, %rbp + mov $0, R32(%r11) + mov -16(up,i,8), %r10 + adc $0, %r9 + add %rax, %r10 + mov -8(mp,i,8), %rax + adc %rdx, %r11 + mov %rbp, -24(up,i,8) + add %r9, %r10 + adc $0, %r11 + mov -8(up,i,8), %rbp + mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov (mp,i,8), %rax + adc %rdx, %r9 + mov %r10, -16(up,i,8) + add %r11, %rbp + adc $0, %r9 + mul q0 + mov (up,i,8), %r10 + mov $0, R32(%r11) + add %rax, %r10 + mov 8(mp,i,8), %rax + adc %rdx, %r11 + mov %rbp, -8(up,i,8) + add %r9, %r10 + adc $0, %r11 + mov 8(up,i,8), %rbp + mov %r10, (up,i,8) + add $4, i + jnc L(tp3) + +L(ed3): mul q0 + add %rax, %rbp + adc $0, %rdx + add %r11, %rbp + adc $0, %rdx + mov %rbp, I(-8(up),-24(up,i,8)) + mov %rdx, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + 
jnz L(otp3) +C jmp L(cj) + +L(cj): +IFSTD(` lea (up,n,8), up C param 2: up + lea (up,n,8), %rdx C param 3: up - n + neg R32(n) ') C param 4: n + +IFDOS(` lea (up,n,8), %rdx C param 2: up + lea (%rdx,n,8), %r8 C param 3: up - n + neg R32(n) + mov n, %r9 C param 4: n + mov rp, %rcx ') C param 1: rp + +IFSTD(` sub $8, %rsp ') +IFDOS(` sub $40, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_add_n) +IFSTD(` add $8, %rsp ') +IFDOS(` add $40, %rsp ') + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(bx0): test $2, R8(n) + jnz L(b2) + +L(b0): +L(otp0):lea 2(n), i + mov (mp,n,8), %rax + mul q0 + mov $0, R32(%r11) + mov (up,n,8), %r10 + add %rax, %r10 + mov 8(mp,n,8), %rax + adc %rdx, %r11 + mov 8(up,n,8), %rbx + mul q0 + add %rax, %rbx + mov $0, R32(%r9) + mov 16(mp,n,8), %rax + adc %rdx, %r9 + add %r11, %rbx + adc $0, %r9 + mul q0 + mov 16(up,n,8), %r10 + mov $0, R32(%r11) + add %rax, %r10 + mov 24(mp,n,8), %rax + adc %rdx, %r11 + mov %rbx, 8(up,n,8) + imul u0inv, %rbx C next q limb + jmp L(e0) + + ALIGNx +L(tp0): mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov -16(mp,i,8), %rax + adc %rdx, %r9 + mul q0 + add %r11, %rbp + mov $0, R32(%r11) + mov -16(up,i,8), %r10 + adc $0, %r9 + add %rax, %r10 + mov -8(mp,i,8), %rax + adc %rdx, %r11 + mov %rbp, -24(up,i,8) + add %r9, %r10 + adc $0, %r11 + mov -8(up,i,8), %rbp + mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov (mp,i,8), %rax + adc %rdx, %r9 + mov %r10, -16(up,i,8) + add %r11, %rbp + adc $0, %r9 + mul q0 + mov (up,i,8), %r10 + mov $0, R32(%r11) + add %rax, %r10 + mov 8(mp,i,8), %rax + adc %rdx, %r11 + mov %rbp, -8(up,i,8) +L(e0): add %r9, %r10 + adc $0, %r11 + mov 8(up,i,8), %rbp + mov %r10, (up,i,8) + add $4, i + jnc L(tp0) + +L(ed0): mul q0 + add %rax, %rbp + adc $0, %rdx + add %r11, %rbp + adc $0, %rdx + mov %rbp, I(-8(up),-24(up,i,8)) + mov %rdx, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp0) + jmp L(cj) + +L(b2): cmp $-2, R32(n) + jz L(n2) + +L(otp2):lea 4(n), i + mov (mp,n,8), %rax + mul q0 + mov (up,n,8), %r10 + mov $0, R32(%r11) + add %rax, %r10 + mov 8(mp,n,8), %rax + adc %rdx, %r11 + mov 8(up,n,8), %rbx + mul q0 + add %rax, %rbx + mov $0, R32(%r9) + mov 16(mp,n,8), %rax + adc %rdx, %r9 + mul q0 + add %r11, %rbx + mov $0, R32(%r11) + mov 16(up,n,8), %r10 + adc $0, %r9 + add %rax, %r10 + mov 24(mp,n,8), %rax + adc %rdx, %r11 + mov %rbx, 8(up,n,8) + imul u0inv, %rbx C next q limb + jmp L(e2) + + ALIGNx +L(tp2): mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov -16(mp,i,8), %rax + adc %rdx, %r9 + mul q0 + add %r11, %rbp + mov $0, R32(%r11) + mov -16(up,i,8), %r10 + adc $0, %r9 + add %rax, %r10 + mov -8(mp,i,8), %rax + adc %rdx, %r11 + mov %rbp, -24(up,i,8) +L(e2): add %r9, %r10 + adc $0, %r11 + mov -8(up,i,8), %rbp + mul q0 + add %rax, %rbp + mov $0, R32(%r9) + mov (mp,i,8), %rax + adc %rdx, %r9 + mov %r10, -16(up,i,8) + add %r11, %rbp + adc $0, %r9 + mul q0 + mov (up,i,8), %r10 + mov $0, R32(%r11) + add %rax, %r10 + mov 8(mp,i,8), %rax + adc %rdx, %r11 + mov %rbp, -8(up,i,8) + add %r9, %r10 + adc $0, %r11 + mov 8(up,i,8), %rbp + mov %r10, (up,i,8) + add $4, i + jnc L(tp2) + +L(ed2): mul q0 + add %rax, %rbp + adc $0, %rdx + add %r11, %rbp + adc $0, %rdx + mov %rbp, I(-8(up),-24(up,i,8)) + mov %rdx, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp2) + jmp L(cj) + +L(n1): mov (mp_param), %rax + mul q0 + add -8(up), %rax + adc (up), %rdx + mov %rdx, (rp) + mov $0, 
R32(%rax) + adc R32(%rax), R32(%rax) + jmp L(ret) + +L(n2): mov (mp_param), %rax + mov -16(up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov -8(up), %r10 + mul q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + add %r9, %r10 + adc $0, %r11 + mov %r10, q0 + imul u0inv, q0 C next q0 + mov -16(mp), %rax + mul q0 + add %rax, %r10 + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov (up), %r14 + mul q0 + add %rax, %r14 + adc $0, %rdx + add %r9, %r14 + adc $0, %rdx + xor R32(%rax), R32(%rax) + add %r11, %r14 + adc 8(up), %rdx + mov %r14, (rp) + mov %rdx, 8(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + ALIGNx +L(n3): mov -24(mp), %rax + mov -24(up), %r10 + mul q0 + add %rax, %r10 + mov -16(mp), %rax + mov %rdx, %r11 + adc $0, %r11 + mov -16(up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + add %r11, %rbp + mov -8(up), %r10 + adc $0, %r9 + mul q0 + mov %rbp, q0 + imul u0inv, q0 C next q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + mov %rbp, -16(up) + add %r9, %r10 + adc $0, %r11 + mov %r10, -8(up) + mov %r11, -24(up) C up[0] + lea 8(up), up C up++ + dec j + jnz L(n3) + + mov -48(up), %rdx + mov -40(up), %rbx + xor R32(%rax), R32(%rax) + add %rbp, %rdx + adc %r10, %rbx + adc -8(up), %r11 + mov %rdx, (rp) + mov %rbx, 8(rp) + mov %r11, 16(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/coreinhm/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/coreinhm/sec_tabselect.asm new file mode 100644 index 0000000..e436034 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreinhm/sec_tabselect.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sec_tabselect. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sec_tabselect) +include_mpn(`x86_64/fastsse/sec_tabselect.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm new file mode 100644 index 0000000..21f0bf4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/addmul_2.asm @@ -0,0 +1,224 @@ +dnl AMD64 mpn_addmul_2 optimised for Intel Sandy Bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb best +C AMD K8,K9 +C AMD K10 +C AMD bull +C AMD pile +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core +C Intel NHM +C Intel SBR 2.93 this +C Intel IBR 2.66 this +C Intel HWL 2.5 2.15 +C Intel BWL +C Intel atom +C VIA nano + +C This code is the result of running a code generation and optimisation tool +C suite written by David Harvey and Torbjorn Granlund. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vp', `%rcx') C r9 + +define(`n', `%rcx') +define(`v0', `%rbx') +define(`v1', `%rbp') +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') +define(`X0', `%r12') +define(`X1', `%r13') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_addmul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + push %r12 + push %r13 + + mov (vp), v0 + mov 8(vp), v1 + + mov (up), %rax + + mov n_param, n + neg n + + lea (up,n_param,8), up + lea 8(rp,n_param,8), rp + mul v0 + + test $1, R8(n) + jnz L(bx1) + +L(bx0): mov -8(rp,n,8), X0 + mov %rdx, w1 + add %rax, X0 + adc $0, w1 + mov (up,n,8), %rax + xor w0, w0 + xor w3, w3 + test $2, R8(n) + jnz L(b10) + +L(b00): nop C this nop make loop go faster on SBR! 
+ mul v1 + mov (rp,n,8), X1 + jmp L(lo0) + +L(b10): lea -2(n), n + jmp L(lo2) + +L(bx1): mov -8(rp,n,8), X1 + mov %rdx, w3 + add %rax, X1 + adc $0, w3 + mov (up,n,8), %rax + xor w1, w1 + xor w2, w2 + test $2, R8(n) + jz L(b11) + +L(b01): mov (rp,n,8), X0 + inc n + jmp L(lo1) + +L(b11): dec n + jmp L(lo3) + + ALIGN(32) +L(top): +L(lo1): mul v1 + mov %rdx, w0 C 1 + add %rax, X0 C 0 + adc $0, w0 C 1 + add w1, X1 C 3 + adc $0, w3 C 0 + add w2, X0 C 0 + adc $0, w0 C 1 + mov (up,n,8), %rax + mul v0 + add %rax, X0 C 0 + mov %rdx, w1 C 1 + adc $0, w1 C 1 + mov (up,n,8), %rax + mul v1 + mov X1, -16(rp,n,8) C 3 + mov (rp,n,8), X1 C 1 + add w3, X0 C 0 + adc $0, w1 C 1 +L(lo0): mov %rdx, w2 C 2 + mov X0, -8(rp,n,8) C 0 + add %rax, X1 C 1 + adc $0, w2 C 2 + mov 8(up,n,8), %rax + add w0, X1 C 1 + adc $0, w2 C 2 + mul v0 + add %rax, X1 C 1 + mov %rdx, w3 C 2 + adc $0, w3 C 2 + mov 8(up,n,8), %rax +L(lo3): mul v1 + add w1, X1 C 1 + mov 8(rp,n,8), X0 C 2 + adc $0, w3 C 2 + mov %rdx, w0 C 3 + add %rax, X0 C 2 + adc $0, w0 C 3 + mov 16(up,n,8), %rax + mul v0 + add w2, X0 C 2 + mov X1, (rp,n,8) C 1 + mov %rdx, w1 C 3 + adc $0, w0 C 3 + add %rax, X0 C 2 + adc $0, w1 C 3 + mov 16(up,n,8), %rax + add w3, X0 C 2 + adc $0, w1 C 3 +L(lo2): mul v1 + mov 16(rp,n,8), X1 C 3 + add %rax, X1 C 3 + mov %rdx, w2 C 4 + adc $0, w2 C 4 + mov 24(up,n,8), %rax + mov X0, 8(rp,n,8) C 2 + mul v0 + add w0, X1 C 3 + mov %rdx, w3 C 4 + adc $0, w2 C 4 + add %rax, X1 C 3 + mov 24(up,n,8), %rax + mov 24(rp,n,8), X0 C 0 useless but harmless final read + adc $0, w3 C 4 + add $4, n + jnc L(top) + +L(end): mul v1 + add w1, X1 + adc $0, w3 + add w2, %rax + adc $0, %rdx + mov X1, I(-16(rp),-16(rp,n,8)) + add w3, %rax + adc $0, %rdx + mov %rax, I(-8(rp),-8(rp,n,8)) + mov %rdx, %rax + + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm new file mode 100644 index 0000000..2319a80 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh1_n.asm @@ -0,0 +1,54 @@ +dnl AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1) +dnl AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[] + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
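The one-line formulas in the header above compress the carry handling; here is a hedged C sketch of the add flavour, mpn_addlsh1_n, showing how the bit shifted out of each vp limb chains into the next limb alongside the ordinary addition carry. It hard-codes the 64-bit limb (so the 1/63 pair matches the LSH/RSH defines below); the ref_ name is illustrative.

/* rp[] = up[] + (vp[] << 1) over n limbs.  Each vp limb is doubled by
   taking in the previous limb's lost top bit (the shift chain) while
   the addition keeps its own carry chain.  The total carry-out is the
   final shifted-out bit plus the addition carry, so it can be 0..2. */
typedef unsigned long long limb_t;

limb_t ref_addlsh1_n(limb_t *rp, const limb_t *up, const limb_t *vp, long n)
{
    limb_t shift_in = 0;   /* bit shifted out of the previous vp limb */
    limb_t cy = 0;         /* carry of the addition proper */
    for (long i = 0; i < n; i++) {
        limb_t sv = (vp[i] << 1) | shift_in;      /* vp limb doubled */
        shift_in = vp[i] >> 63;                   /* its lost top bit */
        unsigned __int128 t = (unsigned __int128)up[i] + sv + cy;
        rp[i] = (limb_t)t;
        cy = (limb_t)(t >> 64);
    }
    return cy + shift_in;  /* combined carry-out, 0, 1 or 2 */
}

The shared body in aorrlshC_n.asm generalises this with the LSH/RSH constants, so the same loop also serves the lsh2 variants.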
+ +include(`../config.m4') + +define(LSH, 1) +define(RSH, 63) + +ifdef(`OPERATION_addlsh1_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_addlsh1_n) + define(func_nc, mpn_addlsh1_nc)') +ifdef(`OPERATION_rsblsh1_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsblsh1_n) + define(func_nc, mpn_rsblsh1_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc) +include_mpn(`x86_64/coreisbr/aorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm new file mode 100644 index 0000000..3b7bb22 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh2_n.asm @@ -0,0 +1,56 @@ +dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2) +dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[] + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 2) +define(RSH, 62) + +ifdef(`OPERATION_addlsh2_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_addlsh2_n) + define(func_nc, mpn_addlsh2_nc)') +ifdef(`OPERATION_rsblsh2_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsblsh2_n) + define(func_nc, mpn_rsblsh2_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +C mpn_rsblsh2_nc removed below, its idea of carry-in is inconsistent with +C refmpn_rsblsh2_nc +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_addlsh2_nc mpn_rsblsh2_n) +include_mpn(`x86_64/coreisbr/aorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm new file mode 100644 index 0000000..23ace41 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlshC_n.asm @@ -0,0 +1,173 @@ +dnl AMD64 mpn_addlshC_n -- rp[] = up[] + (vp[] << C) +dnl AMD64 mpn_rsblshC_n -- rp[] = (vp[] << C) - up[] + +dnl Copyright 2009-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 3.25 +C Intel NHM 4 +C Intel SBR 2 C (or 1.95 when L(top)'s alignment = 16 (mod 32)) +C Intel atom ? +C VIA nano ? + +C This code probably runs close to optimally on Sandy Bridge (using 4-way +C unrolling). It also runs reasonably well on Core 2, but it runs poorly on +C all other processors, including Nehalem. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cy', `%r8') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbp + mov cy, %rax + neg %rax C set msb on carry + xor R32(%rbp), R32(%rbp) C limb carry + mov (vp), %r8 + shrd $RSH, %r8, %rbp + mov R32(n), R32(%r9) + and $3, R32(%r9) + je L(b00) + cmp $2, R32(%r9) + jc L(b01) + je L(b10) + jmp L(b11) +EPILOGUE() + + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + push %rbp + xor R32(%rbp), R32(%rbp) C limb carry + mov (vp), %r8 + shrd $RSH, %r8, %rbp + mov R32(n), R32(%rax) + and $3, R32(%rax) + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): mov 8(vp), %r9 + shrd $RSH, %r9, %r8 + mov 16(vp), %r10 + shrd $RSH, %r10, %r9 + add R32(%rax), R32(%rax) C init carry flag + ADCSBB (up), %rbp + ADCSBB 8(up), %r8 + ADCSBB 16(up), %r9 + mov %rbp, (rp) + mov %r8, 8(rp) + mov %r9, 16(rp) + mov %r10, %rbp + lea 24(up), up + lea 24(vp), vp + lea 24(rp), rp + sbb R32(%rax), R32(%rax) C save carry flag + sub $3, n + ja L(top) + jmp L(end) + +L(b01): add R32(%rax), R32(%rax) C init carry flag + ADCSBB (up), %rbp + mov %rbp, (rp) + mov %r8, %rbp + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp + sbb R32(%rax), R32(%rax) C save carry flag + sub $1, n + ja L(top) + jmp L(end) + +L(b10): mov 8(vp), %r9 + shrd $RSH, %r9, %r8 + add R32(%rax), R32(%rax) C init carry flag + ADCSBB (up), %rbp + ADCSBB 8(up), %r8 + mov %rbp, (rp) + mov %r8, 8(rp) + mov %r9, %rbp + lea 16(up), up + lea 16(vp), vp + lea 16(rp), rp + sbb R32(%rax), R32(%rax) C save carry flag + sub $2, n + ja L(top) + jmp L(end) + + ALIGN(16) +L(top): mov (vp), %r8 + shrd $RSH, %r8, %rbp +L(b00): mov 8(vp), %r9 + shrd $RSH, %r9, %r8 + mov 16(vp), %r10 + shrd $RSH, %r10, %r9 + mov 24(vp), %r11 + shrd $RSH, %r11, %r10 + lea 32(vp), vp + add R32(%rax), R32(%rax) C restore carry flag + ADCSBB (up), %rbp + ADCSBB 8(up), %r8 + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + lea 32(up), up + mov %rbp, (rp) + mov %r8, 8(rp) + mov %r9, 16(rp) + mov %r10, 24(rp) + mov %r11, %rbp + lea 32(rp), rp + sbb R32(%rax), R32(%rax) C save carry flag + sub $4, n + jnz L(top) + +L(end): shr $RSH, %rbp + add R32(%rax), R32(%rax) C restore carry flag + ADCSBB $0, %rbp + mov %rbp, %rax + pop %rbp + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm new file mode 100644 index 0000000..db8ee68 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorrlsh_n.asm @@ -0,0 +1,215 @@ +dnl AMD64 mpn_addlsh_n -- 
rp[] = up[] + (vp[] << k) +dnl AMD64 mpn_rsblsh_n -- rp[] = (vp[] << k) - up[] +dnl Optimised for Sandy Bridge. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 5.25 +C Intel P4 ? +C Intel core2 3.1 +C Intel NHM 3.95 +C Intel SBR 2.75 +C Intel atom ? +C VIA nano ? + +C The inner-loop probably runs close to optimally on Sandy Bridge (using 4-way +C unrolling). The rest of the code is quite crude, and could perhaps be made +C both smaller and faster. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cnt', `%r8') +define(`cy', `%r9') C for _nc variant + +ifdef(`OPERATION_addlsh_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(IFRSB, ) + define(func_n, mpn_addlsh_n) + define(func_nc, mpn_addlsh_nc)') +ifdef(`OPERATION_rsblsh_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(IFRSB, `$1') + define(func_n, mpn_rsblsh_n) + define(func_nc, mpn_rsblsh_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +C mpn_rsblsh_nc removed below, its idea of carry-in is inconsistent with +C refmpn_rsblsh_nc +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(func_n) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') C cnt + push %rbx + xor R32(%rbx), R32(%rbx) C clear CF save register +L(ent): push %rbp + mov R32(n), R32(%rbp) + mov n, %rax + mov R32(cnt), R32(%rcx) + neg R32(%rcx) + and $3, R32(%rbp) + jz L(b0) + lea -32(vp,%rbp,8), vp + lea -32(up,%rbp,8), up + lea -32(rp,%rbp,8), rp + cmp $2, R32(%rbp) + jc L(b1) + jz L(b2) + +L(b3): xor %r8, %r8 + mov 8(vp), %r9 + mov 16(vp), %r10 + shrd R8(%rcx), %r9, %r8 + shrd R8(%rcx), %r10, %r9 + mov 24(vp), %r11 + shrd R8(%rcx), %r11, %r10 + sub $3, %rax + jz L(3) + add R32(%rbx), R32(%rbx) + lea 32(vp), vp + ADCSBB 8(up), %r8 + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + lea 32(up), up + jmp L(lo3) +L(3): add R32(%rbx), R32(%rbx) + lea 32(vp), vp + ADCSBB 8(up), %r8 + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + jmp L(wd3) + +L(b0): mov (vp), %r8 + mov 8(vp), %r9 + xor R32(%rbp), R32(%rbp) + jmp L(lo0) + +L(b1): xor %r10, %r10 + mov 24(vp), %r11 + shrd R8(%rcx), %r11, %r10 + sub $1, %rax + jz L(1) + add R32(%rbx), R32(%rbx) + lea 32(vp), vp + ADCSBB 24(up), %r10 + lea 32(up), up + mov (vp), %r8 + jmp L(lo1) +L(1): add R32(%rbx), R32(%rbx) + ADCSBB 24(up), %r10 + jmp L(wd1) + +L(b2): xor 
%r9, %r9 + mov 16(vp), %r10 + shrd R8(%rcx), %r10, %r9 + mov 24(vp), %r11 + shrd R8(%rcx), %r11, %r10 + sub $2, %rax + jz L(2) + add R32(%rbx), R32(%rbx) + lea 32(vp), vp + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + lea 32(up), up + jmp L(lo2) +L(2): add R32(%rbx), R32(%rbx) + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + jmp L(wd2) + + ALIGN(32) C 16-byte alignment is not enough! +L(top): shrd R8(%rcx), %r11, %r10 + add R32(%rbx), R32(%rbx) + lea 32(vp), vp + ADCSBB (up), %rbp + ADCSBB 8(up), %r8 + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + mov %rbp, (rp) + lea 32(up), up +L(lo3): mov %r8, 8(rp) +L(lo2): mov %r9, 16(rp) + mov (vp), %r8 +L(lo1): mov %r10, 24(rp) + mov 8(vp), %r9 + mov %r11, %rbp + lea 32(rp), rp + sbb R32(%rbx), R32(%rbx) +L(lo0): shrd R8(%rcx), %r8, %rbp + mov 16(vp), %r10 + shrd R8(%rcx), %r9, %r8 + shrd R8(%rcx), %r10, %r9 + mov 24(vp), %r11 + sub $4, %rax + jg L(top) + + shrd R8(%rcx), %r11, %r10 + add R32(%rbx), R32(%rbx) + ADCSBB (up), %rbp + ADCSBB 8(up), %r8 + ADCSBB 16(up), %r9 + ADCSBB 24(up), %r10 + mov %rbp, (rp) +L(wd3): mov %r8, 8(rp) +L(wd2): mov %r9, 16(rp) +L(wd1): mov %r10, 24(rp) + adc R32(%rax), R32(%rax) C rax is zero after loop + shr R8(%rcx), %r11 + ADDSUB %r11, %rax +IFRSB( neg %rax) + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') C cnt +IFDOS(` mov 64(%rsp), %r9 ') C cy + push %rbx + neg cy + sbb R32(%rbx), R32(%rbx) C initialise CF save register + jmp L(ent) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm new file mode 100644 index 0000000..61fee3e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aors_n.asm @@ -0,0 +1,203 @@ +dnl AMD64 mpn_add_n, mpn_sub_n optimised for Sandy bridge, Ivy bridge, and +dnl Haswell. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 1.75\2.52 +C AMD K10 1.5 +C AMD bd1 1.69\2.25 +C AMD bd2 1.65 +C AMD bd3 ? +C AMD bd4 ? +C AMD zen 1.5 +C AMD bt1 2.67 +C AMD bt2 2.16 +C Intel P4 11.54 +C Intel PNR 5 +C Intel NHM 5.5 +C Intel SBR 1.54 +C Intel IBR 1.5 +C Intel HWL 1.32 +C Intel BWL 1.07 +C Intel SKL 1.21 +C Intel atom 4.3 +C Intel SLM 3 +C VIA nano ? + +C The loop of this code was manually written. It runs close to optimally on +C Intel SBR, IBR, and HWL far as we know, except for the fluctuation problems. 
+C It also runs slightly faster on average on AMD bd1 and bd2. +C +C No micro-optimisation has been done. +C +C N.B.! The loop alignment padding insns are executed. If editing the code, +C make sure the padding does not become excessive. It is now a 4-byte nop. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`vp', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc) + +ifdef(`OPERATION_add_n', ` + define(ADCSBB, adc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + xor %r8, %r8 + +L(ent): mov R32(n), R32(%rax) + shr $2, n + + test $1, R8(%rax) + jnz L(bx1) + +L(bx0): test $2, R8(%rax) + jnz L(b10) + +L(b00): neg %r8 + mov (up), %r8 + mov 8(up), %r9 + ADCSBB (vp), %r8 + ADCSBB 8(vp), %r9 + mov 16(up), %r10 + mov 24(up), %r11 + lea 32(up), up + ADCSBB 16(vp), %r10 + ADCSBB 24(vp), %r11 + lea 32(vp), vp + lea -16(rp), rp + jmp L(lo0) + +L(b10): neg %r8 + mov (up), %r10 + mov 8(up), %r11 + ADCSBB 0(vp), %r10 + ADCSBB 8(vp), %r11 + jrcxz L(e2) + mov 16(up), %r8 + mov 24(up), %r9 + lea 16(up), up + ADCSBB 16(vp), %r8 + ADCSBB 24(vp), %r9 + lea 16(vp), vp +C lea (rp), rp + jmp L(lo2) + +L(e2): mov %r10, (rp) + mov %r11, 8(rp) + setc R8(%rax) + FUNC_EXIT() + ret + +L(bx1): test $2, R8(%rax) + jnz L(b11) + +L(b01): neg %r8 + mov (up), %r11 + ADCSBB (vp), %r11 + jrcxz L(e1) + mov 8(up), %r8 + mov 16(up), %r9 + lea 8(up), up + lea -8(rp), rp + ADCSBB 8(vp), %r8 + ADCSBB 16(vp), %r9 + lea 8(vp), vp + jmp L(lo1) + +L(e1): mov %r11, (rp) + setc R8(%rax) + FUNC_EXIT() + ret + +L(b11): neg %r8 + mov (up), %r9 + ADCSBB (vp), %r9 + mov 8(up), %r10 + mov 16(up), %r11 + lea 24(up), up + ADCSBB 8(vp), %r10 + ADCSBB 16(vp), %r11 + lea 24(vp), vp + mov %r9, (rp) + lea 8(rp), rp + jrcxz L(end) + + ALIGN(32) +L(top): mov (up), %r8 + mov 8(up), %r9 + ADCSBB (vp), %r8 + ADCSBB 8(vp), %r9 +L(lo2): mov %r10, (rp) +L(lo1): mov %r11, 8(rp) + mov 16(up), %r10 + mov 24(up), %r11 + lea 32(up), up + ADCSBB 16(vp), %r10 + ADCSBB 24(vp), %r11 + lea 32(vp), vp +L(lo0): mov %r8, 16(rp) +L(lo3): mov %r9, 24(rp) + lea 32(rp), rp + dec n + jnz L(top) + +L(end): mov R32(n), R32(%rax) C zero rax + mov %r10, (rp) + mov %r11, 8(rp) + setc R8(%rax) + FUNC_EXIT() + ret +EPILOGUE() + ALIGN(16) +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + jmp L(ent) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm new file mode 100644 index 0000000..b4c1572 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/aorsmul_1.asm @@ -0,0 +1,212 @@ +dnl X86-64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Sandy Bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.27 +C AMD K10 4.27 4.54 +C AMD bull 4.76 +C AMD pile 4.55 +C AMD steam +C AMD excavator +C AMD bobcat 5.30 +C AMD jaguar 5.28 +C Intel P4 16.2 17.1 +C Intel core2 5.26 +C Intel NHM 5.09 +C Intel SBR 3.21 +C Intel IBR 2.96 +C Intel HWL 2.81 +C Intel BWL 2.76 +C Intel SKL 2.76 +C Intel atom 21.5 +C Intel SLM 9.5 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjörn Granlund. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 + +define(`n', `%rbx') + +define(`I',`$1') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +IFDOS(` define(`up', ``%rsi'')') dnl +IFDOS(` define(`rp', ``%rcx'')') dnl +IFDOS(` define(`v0', ``%r9'')') dnl +IFDOS(` define(`r9', ``rdi'')') dnl +IFDOS(` define(`n_param',``%r8'')') dnl + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(func) + +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + + mov (up), %rax + push %rbx + lea (up,n_param,8), up + lea (rp,n_param,8), rp + + test $1, R8(n_param) + jnz L(b13) + +L(b02): xor R32(%r11), R32(%r11) + test $2, R8(n_param) + jnz L(b2) + +L(b0): mov $1, R32(n) + sub n_param, n + mul v0 + mov %rdx, %r9 + mov -8(rp,n,8), %r8 + jmp L(e0) + + ALIGN(16) +L(b2): mov $-1, n + sub n_param, n + mul v0 + mov 8(rp,n,8), %r8 + mov %rdx, %r9 + jmp L(e2) + + ALIGN(16) +L(b13): xor R32(%r9), R32(%r9) + test $2, R8(n_param) + jnz L(b3) + +L(b1): mov $2, R32(n) + sub n_param, n + jns L(1) + mul v0 + mov -16(rp,n,8), %r10 + mov %rdx, %r11 + jmp L(e1) + + ALIGN(16) +L(b3): xor R32(n), R32(n) + sub n_param, n + mul v0 + mov (rp,n,8), %r10 + jmp L(e3) + + ALIGN(32) +L(top): mul v0 + mov -16(rp,n,8), %r10 + ADDSUB %r11, %r8 + mov %rdx, %r11 + adc $0, %r9 + mov %r8, -24(rp,n,8) +L(e1): ADDSUB %rax, %r10 + mov -8(up,n,8), %rax + adc $0, %r11 + mul v0 + ADDSUB %r9, %r10 + mov %rdx, %r9 + mov -8(rp,n,8), %r8 + adc $0, %r11 + mov %r10, -16(rp,n,8) +L(e0): ADDSUB %rax, %r8 + adc $0, %r9 + mov (up,n,8), %rax + mul v0 + mov (rp,n,8), %r10 + ADDSUB %r11, %r8 + mov %r8, -8(rp,n,8) + adc $0, %r9 +L(e3): mov %rdx, %r11 + ADDSUB %rax, %r10 + mov 8(up,n,8), %rax + adc $0, %r11 + mul v0 + mov 8(rp,n,8), %r8 + ADDSUB %r9, %r10 + mov %rdx, %r9 + mov %r10, (rp,n,8) + adc $0, %r11 +L(e2): ADDSUB %rax, %r8 + adc $0, %r9 + mov 16(up,n,8), %rax + add $4, n + jnc L(top) + +L(end): mul v0 + mov I(-8(rp),-16(rp,n,8)), %r10 + ADDSUB %r11, %r8 + mov %rdx, %r11 + adc $0, %r9 + mov %r8, I(-16(rp),-24(rp,n,8)) + ADDSUB %rax, %r10 + adc $0, %r11 + 
ADDSUB %r9, %r10 + adc $0, %r11 + mov %r10, I(-8(rp),-16(rp,n,8)) + mov %r11, %rax + + pop %rbx +IFDOS(``pop %rdi '') +IFDOS(``pop %rsi '') + ret + + ALIGN(16) +L(1): mul v0 + ADDSUB %rax, -8(rp) + mov %rdx, %rax + adc $0, %rax + pop %rbx +IFDOS(``pop %rdi '') +IFDOS(``pop %rsi '') + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm new file mode 100644 index 0000000..43abcc8 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_add_n.asm @@ -0,0 +1,174 @@ +dnl AMD64 mpn_cnd_add_n. + +dnl Copyright 2011-2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel PNR 3.0 +C Intel NHM 3.75 +C Intel SBR 1.93 +C Intel IBR 1.89 +C Intel HWL 1.78 +C Intel BWL 1.50 +C Intel SKL 1.50 +C Intel atom +C Intel SLM 4.0 +C VIA nano + +C NOTES +C * It might seem natural to use the cmov insn here, but since this function +C is supposed to have the exact same execution pattern for cnd true and +C false, and since cmov's documentation is not clear about whether it +C actually reads both source operands and writes the register for a false +C condition, we cannot use it. 
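+C * A portable C model of the branch-free technique below may help reading
+C   the code; this is an illustrative sketch only (not GMP's public API),
+C   assuming 64-bit limbs:
+C
+C     mp_limb_t cy = 0, mask = -(mp_limb_t) (cnd != 0);
+C     for (i = 0; i < n; i++)
+C       {
+C         mp_limb_t v = vp[i] & mask;   /* vp[i] or 0, without a branch  */
+C         mp_limb_t s = up[i] + v;
+C         mp_limb_t c = s < v;          /* carry out of the first add    */
+C         rp[i] = s + cy;
+C         cy = c | (rp[i] < s);         /* carry out of either add      */
+C       }
+C     return cy;
+C
+C   The same loads, stores, and ALU work happen whether cnd is true or
+C   false, which is the point of the masking.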
+ +C INPUT PARAMETERS +define(`cnd_arg', `%rdi') dnl rcx +define(`rp', `%rsi') dnl rdx +define(`up', `%rdx') dnl r8 +define(`vp', `%rcx') dnl r9 +define(`n', `%r8') dnl rsp+40 + +define(`cnd', `%rbx') + +define(ADDSUB, add) +define(ADCSBB, adc) +define(func, mpn_cnd_add_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_cnd_add_n) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), R32(%r8)') + push %rbx + + neg cnd_arg + sbb cnd, cnd C make cnd mask + + test $1, R8(n) + jz L(x0) +L(x1): test $2, R8(n) + jz L(b1) + +L(b3): mov (vp), %rdi + mov 8(vp), %r9 + mov 16(vp), %r10 + and cnd, %rdi + and cnd, %r9 + and cnd, %r10 + ADDSUB (up), %rdi + mov %rdi, (rp) + ADCSBB 8(up), %r9 + mov %r9, 8(rp) + ADCSBB 16(up), %r10 + mov %r10, 16(rp) + sbb R32(%rax), R32(%rax) C save carry + lea 24(up), up + lea 24(vp), vp + lea 24(rp), rp + sub $3, n + jnz L(top) + jmp L(end) + +L(x0): xor R32(%rax), R32(%rax) + test $2, R8(n) + jz L(top) + +L(b2): mov (vp), %rdi + mov 8(vp), %r9 + and cnd, %rdi + and cnd, %r9 + ADDSUB (up), %rdi + mov %rdi, (rp) + ADCSBB 8(up), %r9 + mov %r9, 8(rp) + sbb R32(%rax), R32(%rax) C save carry + lea 16(up), up + lea 16(vp), vp + lea 16(rp), rp + sub $2, n + jnz L(top) + jmp L(end) + +L(b1): mov (vp), %rdi + and cnd, %rdi + ADDSUB (up), %rdi + mov %rdi, (rp) + sbb R32(%rax), R32(%rax) C save carry + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp + dec n + jz L(end) + + ALIGN(16) +L(top): mov (vp), %rdi + mov 8(vp), %r9 + mov 16(vp), %r10 + mov 24(vp), %r11 + lea 32(vp), vp + and cnd, %rdi + and cnd, %r9 + and cnd, %r10 + and cnd, %r11 + add R32(%rax), R32(%rax) C restore carry + ADCSBB (up), %rdi + mov %rdi, (rp) + ADCSBB 8(up), %r9 + mov %r9, 8(rp) + ADCSBB 16(up), %r10 + mov %r10, 16(rp) + ADCSBB 24(up), %r11 + lea 32(up), up + mov %r11, 24(rp) + lea 32(rp), rp + sbb R32(%rax), R32(%rax) C save carry + sub $4, n + jnz L(top) + +L(end): neg R32(%rax) + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm new file mode 100644 index 0000000..f55492b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/cnd_sub_n.asm @@ -0,0 +1,200 @@ +dnl AMD64 mpn_cnd_add_n, mpn_cnd_sub_n + +dnl Copyright 2011-2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
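+dnl A note on the mask construction shared with cnd_add_n.asm (explanatory
+dnl only, not a code change): the instruction pair
+dnl
+dnl     neg  cnd_arg        (sets CF iff cnd_arg != 0)
+dnl     sbb  cnd, cnd       (cnd = cnd - cnd - CF = -CF)
+dnl
+dnl leaves cnd all ones when the condition is non-zero and all zeros when it
+dnl is zero, so the later "and cnd, ..." either keeps or clears each vp limb
+dnl with identical instruction flow in both cases.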
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel PNR 3.0 +C Intel NHM 2.75 +C Intel SBR 2.15 +C Intel IBR 1.96 +C Intel HWL 2.0 +C Intel BWL 1.65 +C Intel SKL 1.65 +C Intel atom +C Intel SLM 4.5 +C VIA nano + +C NOTES +C * It might seem natural to use the cmov insn here, but since this function +C is supposed to have the exact same execution pattern for cnd true and +C false, and since cmov's documentation is not clear about whether it +C actually reads both source operands and writes the register for a false +C condition, we cannot use it. +C * Given that we have a dedicated cnd_add_n, it might look strange that this +C file provides cnd_add_n and not just cnd_sub_n. But that's harmless, and +C this file's generality might come in handy for some pipeline. + +C INPUT PARAMETERS +define(`cnd_arg', `%rdi') dnl rcx +define(`rp', `%rsi') dnl rdx +define(`up', `%rdx') dnl r8 +define(`vp', `%rcx') dnl r9 +define(`n', `%r8') dnl rsp+40 + +define(`cnd', `%rbx') + +ifdef(`OPERATION_cnd_add_n',` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_cnd_add_n)') +ifdef(`OPERATION_cnd_sub_n',` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_cnd_sub_n)') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), R32(%r8)') + push %rbx + push %rbp + push %r12 + push %r13 + + neg cnd_arg + sbb cnd, cnd C make cnd mask + + test $1, R8(n) + jz L(x0) +L(x1): test $2, R8(n) + jz L(b1) + +L(b3): mov (vp), %rdi + mov 8(vp), %r9 + mov 16(vp), %r10 + and cnd, %rdi + mov (up), %r12 + and cnd, %r9 + mov 8(up), %r13 + and cnd, %r10 + mov 16(up), %rbp + ADDSUB %rdi, %r12 + mov %r12, (rp) + ADCSBB %r9, %r13 + mov %r13, 8(rp) + ADCSBB %r10, %rbp + mov %rbp, 16(rp) + sbb R32(%rax), R32(%rax) C save carry + lea 24(up), up + lea 24(vp), vp + lea 24(rp), rp + sub $3, n + jnz L(top) + jmp L(end) + +L(x0): xor R32(%rax), R32(%rax) + test $2, R8(n) + jz L(top) + +L(b2): mov (vp), %rdi + mov 8(vp), %r9 + mov (up), %r12 + and cnd, %rdi + mov 8(up), %r13 + and cnd, %r9 + ADDSUB %rdi, %r12 + mov %r12, (rp) + ADCSBB %r9, %r13 + mov %r13, 8(rp) + sbb R32(%rax), R32(%rax) C save carry + lea 16(up), up + lea 16(vp), vp + lea 16(rp), rp + sub $2, n + jnz L(top) + jmp L(end) + +L(b1): mov (vp), %rdi + mov (up), %r12 + and cnd, %rdi + ADDSUB %rdi, %r12 + mov %r12, (rp) + sbb R32(%rax), R32(%rax) C save carry + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp + dec n + jz L(end) + + ALIGN(16) +L(top): mov (vp), %rdi + mov 8(vp), %r9 + mov 16(vp), %r10 + mov 24(vp), %r11 + lea 32(vp), vp + and cnd, %rdi + mov (up), %r12 + and cnd, %r9 + mov 8(up), %r13 + and cnd, %r10 + mov 16(up), %rbp + and cnd, %r11 + add R32(%rax), R32(%rax) C restore carry + mov 24(up), %rax + lea 32(up), up + ADCSBB %rdi, %r12 + mov %r12, (rp) + ADCSBB %r9, %r13 + mov %r13, 8(rp) + ADCSBB %r10, %rbp + mov %rbp, 16(rp) + ADCSBB %r11, %rax + mov %rax, 24(rp) + lea 32(rp), rp + sbb R32(%rax), R32(%rax) C save carry + sub $4, n + jnz L(top) + +L(end): neg R32(%rax) + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm new file mode 100644 index 0000000..d9f371f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/divrem_1.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_divrem_1 + +dnl Copyright 2013 
Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_divrem_1 mpn_preinv_divrem_1) +include_mpn(`x86_64/divrem_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm new file mode 100644 index 0000000..4723093 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/gcd_11.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_11. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_11) +include_mpn(`x86_64/core2/gcd_11.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h new file mode 100644 index 0000000..36f4512 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/gmp-mparam.h @@ -0,0 +1,241 @@ +/* Sandy Bridge gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. 
+ +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3400-3800 MHz Intel Xeon E3-1270 Sandy Bridge */ +/* FFT tuning limit = 468,152,320 */ +/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 9 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 24 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 30 + +#define DIV_1_VS_MUL_1_PERCENT 298 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 65 +#define MUL_TOOM44_THRESHOLD 154 +#define MUL_TOOM6H_THRESHOLD 254 +#define MUL_TOOM8H_THRESHOLD 333 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 105 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 148 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 28 +#define SQR_TOOM3_THRESHOLD 93 +#define SQR_TOOM4_THRESHOLD 248 +#define SQR_TOOM6_THRESHOLD 342 +#define SQR_TOOM8_THRESHOLD 462 + +#define MULMID_TOOM42_THRESHOLD 36 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 15 + +#define MUL_FFT_MODF_THRESHOLD 396 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 396, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 79,11}, { 47,10}, \ + { 95,12}, { 31,11}, { 63,10}, { 135,11}, \ + { 79,10}, { 159, 9}, { 319,10}, { 167,11}, \ + { 95, 7}, { 1535, 8}, { 831,10}, { 223, 9}, \ + { 447,11}, { 127,10}, { 255, 9}, { 511,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319,12}, { 95,11}, { 191,10}, \ + { 383,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 351,10}, { 703,11}, { 367,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,10}, { 895,11}, \ + { 479,13}, { 127,12}, { 255,11}, { 543,12}, \ + { 287,11}, { 607,12}, { 319,11}, { 639,12}, \ + { 351,11}, { 703,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,11}, { 895,12}, \ + { 479,14}, { 127,13}, { 255,12}, { 543,11}, \ + { 1087,12}, { 607,13}, { 319,12}, { 735,13}, \ + { 383,12}, { 831,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1215,13}, { 639,12}, 
{ 1279,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1215,14}, \ + { 639,13}, { 1343,12}, { 2687,13}, { 1407,12}, \ + { 2815,13}, { 1471,14}, { 767,13}, { 1663,14}, \ + { 895,13}, { 1919,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \ + { 767,14}, { 1535,13}, { 3071,14}, { 1663,13}, \ + { 3455,12}, { 6911,14}, { 1919,16}, { 511,15}, \ + { 1023,14}, { 2431,13}, { 4863,15}, { 1279,14}, \ + { 2943,13}, { 5887,15}, { 1535,14}, { 3455,13}, \ + { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \ + { 1023,15}, { 2047,14}, { 4223,15}, { 2303,14}, \ + { 4863,15}, { 2815,14}, { 5887,16}, { 1535,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \ + { 4095,15}, { 8191,16}, { 4607,15}, { 9983,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 219 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 336 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 336, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383,12}, { 63,11}, { 127,10}, { 255, 6}, \ + { 4351, 7}, { 2303, 8}, { 1215,12}, { 95,11}, \ + { 191,10}, { 383,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543,11}, \ + { 287,10}, { 575,11}, { 303,10}, { 607,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 335,10}, \ + { 671,11}, { 351,10}, { 703,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,10}, { 831,12}, \ + { 223,11}, { 447,10}, { 895,11}, { 479,13}, \ + { 127,12}, { 255,11}, { 543,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 671,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,11}, { 895,12}, \ + { 479,14}, { 127,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,12}, { 607,13}, \ + { 319,12}, { 703,13}, { 383,12}, { 831,13}, \ + { 447,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1087,13}, { 575,12}, { 1215,13}, { 639,12}, \ + { 1279,13}, { 703,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1663,13}, { 959,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1215,14}, \ + { 639,13}, { 1343,12}, { 2687,13}, { 1407,12}, \ + { 2815,13}, { 1471,14}, { 767,13}, { 1599,12}, \ + { 3199,13}, { 1663,14}, { 895,13}, { 1791,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \ + { 1407,13}, { 2815,15}, { 767,14}, { 1535,13}, \ + { 3199,14}, { 1663,13}, { 3455,12}, { 6911,14}, \ + { 1791,16}, { 511,15}, { 1023,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,16}, { 1023,15}, { 2047,14}, { 4223,15}, \ + { 2303,14}, { 4863,15}, { 2815,14}, { 5887,16}, \ + { 1535,15}, { 3327,14}, 
{ 6911,15}, { 3839,17}, \ + { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \ + { 4607,15}, { 9983,14}, { 19967,16}, { 5631,15}, \ + { 11775,17}, { 3071,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 210 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 62 +#define MULLO_MUL_N_THRESHOLD 8907 +#define SQRLO_BASECASE_THRESHOLD 9 +#define SQRLO_DC_THRESHOLD 66 +#define SQRLO_SQR_THRESHOLD 6440 + +#define DC_DIV_QR_THRESHOLD 52 +#define DC_DIVAPPR_Q_THRESHOLD 172 +#define DC_BDIV_QR_THRESHOLD 46 +#define DC_BDIV_Q_THRESHOLD 92 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define INV_NEWTON_THRESHOLD 170 +#define INV_APPR_THRESHOLD 167 + +#define BINV_NEWTON_THRESHOLD 228 +#define REDC_1_TO_REDC_2_THRESHOLD 36 +#define REDC_2_TO_REDC_N_THRESHOLD 55 + +#define MU_DIV_QR_THRESHOLD 1387 +#define MU_DIVAPPR_Q_THRESHOLD 1387 +#define MUPI_DIV_QR_THRESHOLD 77 +#define MU_BDIV_QR_THRESHOLD 1187 +#define MU_BDIV_Q_THRESHOLD 1442 + +#define POWM_SEC_TABLE 1,16,191,452,1297 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 21 +#define SET_STR_DC_THRESHOLD 1160 +#define SET_STR_PRECOMPUTE_THRESHOLD 2043 + +#define FAC_DSC_THRESHOLD 426 +#define FAC_ODD_THRESHOLD 24 + +#define MATRIX22_STRASSEN_THRESHOLD 14 +#define HGCD2_DIV1_METHOD 5 /* 0.74% faster than 3 */ +#define HGCD_THRESHOLD 96 +#define HGCD_APPR_THRESHOLD 60 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 465 +#define GCDEXT_DC_THRESHOLD 345 +#define JACOBI_BASE_METHOD 1 /* 32.22% faster than 4 */ + +/* Tuneup completed successfully, took 276198 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm new file mode 100644 index 0000000..a1cbc31 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/lshift.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshift optimised for Intel Sandy Bridge. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
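+dnl Like the divrem_1 and gcd_11 stubs above, this file contains no loop of
+dnl its own: MULFUNC_PROLOGUE declares the entry point and include_mpn pulls
+dnl in a shared implementation (the fast-SSE movdqu2 left shift) chosen for
+dnl this CPU family.  The whole stub pattern is just:
+dnl
+dnl     include(`../config.m4')
+dnl     ABI_SUPPORT(DOS64)
+dnl     ABI_SUPPORT(STD64)
+dnl     MULFUNC_PROLOGUE(mpn_lshift)
+dnl     include_mpn(`x86_64/fastsse/lshift-movdqu2.asm')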
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshift) +include_mpn(`x86_64/fastsse/lshift-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm new file mode 100644 index 0000000..ac90edb --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/lshiftc.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshiftc optimised for Intel Sandy Bridge. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshiftc) +include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm new file mode 100644 index 0000000..a43a117 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_1.asm @@ -0,0 +1,199 @@ +dnl X86-64 mpn_mul_1 optimised for Intel Sandy Bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013, 2017 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
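+dnl For orientation, the function implemented below has this portable C
+dnl model (an illustrative sketch with a hypothetical name, assuming 64-bit
+dnl limbs and a compiler providing a 128-bit integer type):
+dnl
+dnl     mp_limb_t
+dnl     mul_1_model (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
+dnl                  mp_limb_t v0)
+dnl     {
+dnl       mp_limb_t cy = 0;
+dnl       for (mp_size_t i = 0; i < n; i++)
+dnl         {
+dnl           unsigned __int128 p = (unsigned __int128) up[i] * v0 + cy;
+dnl           rp[i] = (mp_limb_t) p;       /* low limb of the product  */
+dnl           cy = (mp_limb_t) (p >> 64);  /* high limb carries onward */
+dnl         }
+dnl       return cy;                       /* most significant limb    */
+dnl     }
+dnl
+dnl mpn_mul_1c, also provided below, differs only in that cy starts from the
+dnl caller-supplied carry-in limb cin.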
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bull +C AMD pile +C AMD steam +C AMD excavator +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core2 +C Intel NHM +C Intel SBR 2.49 +C Intel IBR 2.32 +C Intel HWL 2.44 +C Intel BWL 2.43 +C Intel SKL 2.47 +C Intel atom +C Intel SLM +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') C rcx +define(`up_param',`%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 +define(`cin', `%r8') C stack + +define(`up', `%rsi') C same as rp_param +define(`n', `%r9') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +IFDOS(` define(`rp', `%rcx')') +IFDOS(` define(`up_param',`%rdx')') +IFDOS(` define(`n_param', `%r8')') +IFDOS(` define(`v0', `%r9')') +IFDOS(` define(`cin', `48(%rsp)')') + +IFDOS(` define(`up', `%rsi')') +IFDOS(` define(`n', `%r8')') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_1) +IFDOS(` push %rsi ') + mov (up_param), %rax +IFSTD(` mov n_param, n ') + lea (up_param,n_param,8), up + lea -8(rp,n_param,8), rp + neg n + mul v0 + + test $1, R8(n) + jz L(x0) +L(x1): mov %rax, %r11 + mov %rdx, %r10 + test $2, R8(n) + jnz L(01) + +L(11): mov 8(up,n,8), %rax + dec n + jmp L(L3) + +L(01): inc n + jnz L(L1) + mov %rax, (rp) + mov %rdx, %rax +IFDOS(` pop %rsi ') + ret + +L(x0): mov %rax, %r10 + mov %rdx, %r11 + mov 8(up,n,8), %rax + test $2, R8(n) + jz L(L0) + +L(10): add $-2, n + jmp L(L2) + + ALIGN(8) +L(top): mov %rdx, %r10 + add %rax, %r11 +L(L1): mov 0(up,n,8), %rax + adc $0, %r10 + mul v0 + add %rax, %r10 + mov %r11, 0(rp,n,8) + mov 8(up,n,8), %rax + mov %rdx, %r11 +L(L0c): adc $0, %r11 +L(L0): mul v0 + mov %r10, 8(rp,n,8) + add %rax, %r11 + mov %rdx, %r10 +L(L3c): mov 16(up,n,8), %rax + adc $0, %r10 +L(L3): mul v0 + mov %r11, 16(rp,n,8) + mov %rdx, %r11 + add %rax, %r10 +L(L2c): mov 24(up,n,8), %rax + adc $0, %r11 +L(L2): mul v0 + mov %r10, 24(rp,n,8) + add $4, n + jnc L(top) + +L(end): add %rax, %r11 + mov %rdx, %rax + adc $0, %rax + mov %r11, (rp) + +IFDOS(` pop %rsi ') + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_mul_1c) +IFDOS(` push %rsi ') + mov (up_param), %rax +IFSTD(` mov n_param, n ') + lea (up_param,n_param,8), up + lea -8(rp,n_param,8), rp + neg n + mul v0 + + test $1, R8(n) + jz L(x0c) +L(x1c): mov %rax, %r11 + mov %rdx, %r10 + test $2, R8(n) + jnz L(01c) + +L(11c): add cin, %r11 + dec n + jmp L(L3c) + +L(01c): add cin, %r11 + inc n + jnz L(L1) + mov %r11, (rp) + mov %rdx, %rax + adc $0, %rax +IFDOS(` pop %rsi ') + ret + +L(x0c): mov %rax, %r10 + mov %rdx, %r11 + test $2, R8(n) + jz L(00c) + +L(10c): add $-2, n + add cin, %r10 + jmp L(L2c) + +L(00c): add cin, %r10 + mov 8(up,n,8), %rax + jmp L(L0c) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm new file mode 100644 index 0000000..781534d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_2.asm @@ -0,0 +1,167 @@ +dnl AMD64 mpn_mul_2 optimised for Intel Sandy Bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb best +C AMD K8,K9 8.03 +C AMD K10 8.03 +C AMD bull 9.19 +C AMD pile 9.16 +C AMD steam +C AMD excavator +C AMD bobcat 10.6 +C AMD jaguar 11.0 +C Intel P4 26.0 +C Intel core2 8.73 +C Intel NHM 8.55 +C Intel SBR 5.15 +C Intel IBR 4.57 +C Intel HWL 4.08 +C Intel BWL 4.10 +C Intel SKL 4.14 +C Intel atom 39.5 +C Intel SLM 26.3 +C VIA nano + +C This code is the result of running a code generation and optimisation tool +C suite written by David Harvey and Torbjorn Granlund. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vp', `%rcx') C r9 + +define(`n', `%rcx') +define(`v0', `%rbx') +define(`v1', `%rbp') + +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (vp), v0 + mov 8(vp), v1 + + mov (up), %rax + lea (up,n_param,8), up + lea (rp,n_param,8), rp + + test $1, R8(n_param) + jnz L(b1) + +L(b0): mov $0, R32(n) + sub n_param, n + xor w0, w0 + mul v0 + mov %rax, w2 + mov %rdx, w1 + mov (up,n,8), %rax + jmp L(lo0) + +L(b1): mov $1, R32(n) + sub n_param, n + xor w2, w2 + mul v0 + mov %rax, w0 + mov %rdx, w3 + mov -8(up,n,8), %rax + mul v1 + jmp L(lo1) + + ALIGN(32) +L(top): mul v0 + add %rax, w0 C 1 + mov %rdx, w3 C 2 + adc $0, w3 C 2 + mov -8(up,n,8), %rax + mul v1 + add w1, w0 C 1 + adc $0, w3 C 2 +L(lo1): add %rax, w2 C 2 + mov w0, -8(rp,n,8) C 1 + mov %rdx, w0 C 3 + adc $0, w0 C 3 + mov (up,n,8), %rax + mul v0 + add %rax, w2 C 2 + mov %rdx, w1 C 3 + adc $0, w1 C 3 + add w3, w2 C 2 + mov (up,n,8), %rax + adc $0, w1 C 1 +L(lo0): mul v1 + mov w2, (rp,n,8) C 2 + add %rax, w0 C 3 + mov %rdx, w2 C 4 + mov 8(up,n,8), %rax + adc $0, w2 C 4 + add $2, n + jnc L(top) + +L(end): mul v0 + add %rax, w0 + mov %rdx, w3 + adc $0, w3 + mov I(-8(up),-8(up,n,8)), %rax + mul v1 + add w1, w0 + adc $0, w3 + add %rax, w2 + mov w0, I(-8(rp),-8(rp,n,8)) + adc $0, %rdx + add w3, w2 + mov w2, I((rp),(rp,n,8)) + adc $0, %rdx + mov %rdx, %rax + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm new file mode 100644 index 0000000..35fd1cc --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/mul_basecase.asm @@ -0,0 +1,407 @@ +dnl AMD64 mpn_mul_basecase optimised for Intel Sandy bridge and Ivy bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_1 mul_2 mul_3 addmul_2 +C AMD K8,K9 +C AMD K10 +C AMD bull +C AMD pile +C AMD steam +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core +C Intel NHM +C Intel SBR 2.5 2.5 - 2.95 +C Intel IBR 2.4 2.3 - 2.68 +C Intel HWL 2.35 2.0 - 2.5 +C Intel BWL +C Intel atom +C VIA nano + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Fix the addmul_2 fluctuation affecting SBR. +C * Improve feed-in code, avoiding zeroing of many registers and dummy adds in +C the loops at the expense of code size. +C * Adjoin a mul_3, avoiding slow mul_1 for odd vn. +C * Consider replacing the 2-way mul_2 code with 4-way code, for a very slight +C speedup. +C * Further micro-optimise. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') +define(`vp', `%rcx') +define(`vn', `%r8') + +define(`un', `%rbx') + +define(`w0', `%r10') +define(`w1', `%r11') +define(`w2', `%r12') +define(`w3', `%r13') +define(`n', `%rbp') +define(`v0', `%r9') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + push %rbx + push %rbp + mov un_param, un C free up rdx + neg un + + mov (up), %rax C shared for mul_1 and mul_2 + lea (up,un_param,8), up C point at operand end + lea (rp,un_param,8), rp C point at rp[un-1] + + mov (vp), v0 C shared for mul_1 and mul_2 + mul v0 C shared for mul_1 and mul_2 + + test $1, R8(vn) + jz L(do_mul_2) + +L(do_mul_1): + test $1, R8(un) + jnz L(m1x1) + +L(m1x0):mov %rax, w0 C un = 2, 4, 6, 8, ... + mov %rdx, w1 + mov 8(up,un,8), %rax + test $2, R8(un) + jnz L(m110) + +L(m100):lea 2(un), n C un = 4, 8, 12, ... + jmp L(m1l0) + +L(m110):lea (un), n C un = 2, 6, 10, ... + jmp L(m1l2) + +L(m1x1):mov %rax, w1 C un = 1, 3, 5, 7, ... + mov %rdx, w0 + test $2, R8(un) + jz L(m111) + +L(m101):lea 3(un), n C un = 1, 5, 9, ... + test n, n + js L(m1l1) + mov %rax, -8(rp) + mov %rdx, (rp) + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(m111):lea 1(un), n C un = 3, 7, 11, ... 
+ mov 8(up,un,8), %rax + jmp L(m1l3) + + ALIGN(16) C FIXME +L(m1tp):mov %rdx, w0 + add %rax, w1 +L(m1l1):mov -16(up,n,8), %rax + adc $0, w0 + mul v0 + add %rax, w0 + mov w1, -24(rp,n,8) + mov -8(up,n,8), %rax + mov %rdx, w1 + adc $0, w1 +L(m1l0):mul v0 + mov w0, -16(rp,n,8) + add %rax, w1 + mov %rdx, w0 + mov (up,n,8), %rax + adc $0, w0 +L(m1l3):mul v0 + mov w1, -8(rp,n,8) + mov %rdx, w1 + add %rax, w0 + mov 8(up,n,8), %rax + adc $0, w1 +L(m1l2):mul v0 + mov w0, (rp,n,8) + add $4, n + jnc L(m1tp) + +L(m1ed):add %rax, w1 + adc $0, %rdx + mov w1, I(-8(rp),-24(rp,n,8)) + mov %rdx, I((rp),-16(rp,n,8)) + + dec R32(vn) + jz L(ret2) + + lea 8(vp), vp + lea 8(rp), rp + push %r12 + push %r13 + push %r14 + jmp L(do_addmul) + +L(do_mul_2): +define(`v1', `%r14') + push %r12 + push %r13 + push %r14 + + mov 8(vp), v1 + + test $1, R8(un) + jnz L(m2b1) + +L(m2b0):lea (un), n + xor w0, w0 + mov %rax, w2 + mov %rdx, w1 + jmp L(m2l0) + +L(m2b1):lea 1(un), n + xor w1, w1 + xor w2, w2 + mov %rax, w0 + mov %rdx, w3 + jmp L(m2l1) + + ALIGN(32) +L(m2tp):mul v0 + add %rax, w0 + mov %rdx, w3 + adc $0, w3 +L(m2l1):mov -8(up,n,8), %rax + mul v1 + add w1, w0 + adc $0, w3 + add %rax, w2 + mov w0, -8(rp,n,8) + mov %rdx, w0 + adc $0, w0 + mov (up,n,8), %rax + mul v0 + add %rax, w2 + mov %rdx, w1 + adc $0, w1 + add w3, w2 +L(m2l0):mov (up,n,8), %rax + adc $0, w1 + mul v1 + mov w2, (rp,n,8) + add %rax, w0 + mov %rdx, w2 + mov 8(up,n,8), %rax + adc $0, w2 + add $2, n + jnc L(m2tp) + +L(m2ed):mul v0 + add %rax, w0 + mov %rdx, w3 + adc $0, w3 + mov I(-8(up),-8(up,n,8)), %rax + mul v1 + add w1, w0 + adc $0, w3 + add %rax, w2 + mov w0, I(-8(rp),-8(rp,n,8)) + adc $0, %rdx + add w3, w2 + mov w2, I((rp),(rp,n,8)) + adc $0, %rdx + mov %rdx, I(8(rp),8(rp,n,8)) + + add $-2, R32(vn) + jz L(ret5) + lea 16(vp), vp + lea 16(rp), rp + + +L(do_addmul): + push %r15 + push vn C save vn in new stack slot +define(`vn', `(%rsp)') +define(`X0', `%r14') +define(`X1', `%r15') +define(`v1', `%r8') + +L(outer): + mov (vp), v0 + mov 8(vp), v1 + mov (up,un,8), %rax + mul v0 + test $1, R8(un) + jnz L(a1x1) + +L(a1x0):mov (rp,un,8), X0 + xor w0, w0 + mov %rdx, w1 + test $2, R8(un) + jnz L(a110) + +L(a100):lea 2(un), n C un = 4, 8, 12, ... + add %rax, X0 + adc $0, w1 + mov (up,un,8), %rax + mul v1 + mov 8(rp,un,8), X1 + jmp L(lo0) + +L(a110):lea (un), n C un = 2, 6, 10, ... + xor w3, w3 + jmp L(lo2) + +L(a1x1):mov (rp,un,8), X1 + xor w2, w2 + xor w1, w1 + test $2, R8(un) + jz L(a111) + +L(a101):lea 3(un), n C un = 1, 5, 9, ... + mov %rdx, w3 + add %rax, X1 + mov (up,un,8), %rax + mov 8(rp,un,8), X0 + adc $0, w3 + jmp L(top) + +L(a111):lea 1(un), n C un = 3, 7, 11, ... 
+ jmp L(lo3) + + ALIGN(32) +L(top): mul v1 + mov %rdx, w0 + add %rax, X0 + adc $0, w0 + add w1, X1 + adc $0, w3 + add w2, X0 + adc $0, w0 + mov -16(up,n,8), %rax + mul v0 + add %rax, X0 + mov %rdx, w1 + adc $0, w1 + mov -16(up,n,8), %rax + mul v1 + mov X1, -24(rp,n,8) + mov -8(rp,n,8), X1 + add w3, X0 + adc $0, w1 +L(lo0): mov %rdx, w2 + mov X0, -16(rp,n,8) + add %rax, X1 + adc $0, w2 + mov -8(up,n,8), %rax + add w0, X1 + adc $0, w2 + mul v0 +L(lo3): add %rax, X1 + mov %rdx, w3 + adc $0, w3 + mov -8(up,n,8), %rax + mul v1 + add w1, X1 + mov (rp,n,8), X0 + adc $0, w3 + mov %rdx, w0 + add %rax, X0 + adc $0, w0 + mov (up,n,8), %rax + mul v0 + add w2, X0 + mov X1, -8(rp,n,8) + mov %rdx, w1 + adc $0, w0 +L(lo2): add %rax, X0 + adc $0, w1 + mov (up,n,8), %rax + add w3, X0 + adc $0, w1 + mul v1 + mov 8(rp,n,8), X1 + add %rax, X1 + mov %rdx, w2 + adc $0, w2 + mov 8(up,n,8), %rax + mov X0, (rp,n,8) + mul v0 + add w0, X1 + mov %rdx, w3 + adc $0, w2 + add %rax, X1 + mov 8(up,n,8), %rax + mov 16(rp,n,8), X0 C useless but harmless in final iter + adc $0, w3 + add $4, n + jnc L(top) + +L(end): mul v1 + add w1, X1 + adc $0, w3 + add w2, %rax + adc $0, %rdx + mov X1, I(-8(rp),-24(rp,n,8)) + add w3, %rax + adc $0, %rdx + mov %rax, I((rp),-16(rp,n,8)) + mov %rdx, I(8(rp),-8(rp,n,8)) + + addl $-2, vn + lea 16(vp), vp + lea 16(rp), rp + jnz L(outer) + + pop %rax C deallocate vn slot + pop %r15 +L(ret5):pop %r14 + pop %r13 + pop %r12 +L(ret2):pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm new file mode 100644 index 0000000..a41a8ac --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/mullo_basecase.asm @@ -0,0 +1,384 @@ +dnl AMD64 mpn_mullo_basecase optimised for Intel Sandy bridge and Ivy bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_2 addmul_2 +C AMD K8,K9 +C AMD K10 +C AMD bull +C AMD pile +C AMD steam +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core +C Intel NHM +C Intel SBR 2.5 2.95 +C Intel IBR 2.3 2.68 +C Intel HWL 2.0 2.5 +C Intel BWL +C Intel atom +C VIA nano + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Implement proper cor2, replacing current cor0. +C * Offset n by 2 in order to avoid the outer loop cmp. 
(And sqr_basecase?) +C * Micro-optimise. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp_param', `%rdx') +define(`n', `%rcx') + +define(`vp', `%r8') +define(`X0', `%r14') +define(`X1', `%r15') + +define(`w0', `%r10') +define(`w1', `%r11') +define(`w2', `%r12') +define(`w3', `%r13') +define(`i', `%rbp') +define(`v0', `%r9') +define(`v1', `%rbx') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mullo_basecase) + FUNC_ENTRY(4) + + mov (up), %rax + mov vp_param, vp + + cmp $4, n + jb L(small) + + mov (vp_param), v0 + push %rbx + lea (rp,n,8), rp C point rp at R[un] + push %rbp + lea (up,n,8), up C point up right after U's end + push %r12 + neg n + push %r13 + mul v0 + mov 8(vp), v1 + + test $1, R8(n) + jnz L(m2b1) + +L(m2b0):lea (n), i + xor w0, w0 + mov %rax, w2 + mov %rdx, w1 + jmp L(m2l0) + +L(m2b1):lea 1(n), i + xor w1, w1 + xor w2, w2 + mov %rax, w0 + mov %rdx, w3 + jmp L(m2l1) + + ALIGN(32) +L(m2tp):mul v0 + add %rax, w0 + mov %rdx, w3 + adc $0, w3 +L(m2l1):mov -8(up,i,8), %rax + mul v1 + add w1, w0 + adc $0, w3 + add %rax, w2 + mov w0, -8(rp,i,8) + mov %rdx, w0 + adc $0, w0 + mov (up,i,8), %rax + mul v0 + add %rax, w2 + mov %rdx, w1 + adc $0, w1 + add w3, w2 +L(m2l0):mov (up,i,8), %rax + adc $0, w1 + mul v1 + mov w2, (rp,i,8) + add %rax, w0 + mov %rdx, w2 C FIXME: dead in last iteration + mov 8(up,i,8), %rax + adc $0, w2 C FIXME: dead in last iteration + add $2, i + jnc L(m2tp) + +L(m2ed):imul v0, %rax + add w0, %rax + add w1, %rax + mov %rax, I(-8(rp),-8(rp,i,8)) + + add $2, n + lea 16(vp), vp + lea -16(up), up + cmp $-2, n + jge L(cor1) + + push %r14 + push %r15 + +L(outer): + mov (vp), v0 + mov 8(vp), v1 + mov (up,n,8), %rax + mul v0 + test $1, R8(n) + jnz L(a1x1) + +L(a1x0):mov (rp,n,8), X1 + xor w2, w2 + xor w1, w1 + test $2, R8(n) + jnz L(a110) + +L(a100):lea 1(n), i + jmp L(lo0) + +L(a110):lea 3(n), i + mov %rdx, w3 + add %rax, X1 + mov (up,n,8), %rax + mov 8(rp,n,8), X0 + adc $0, w3 + jmp L(lo2) + +L(a1x1):mov (rp,n,8), X0 + xor w0, w0 + mov %rdx, w1 + test $2, R8(n) + jz L(a111) + +L(a101):lea 2(n), i + add %rax, X0 + adc $0, w1 + mov (up,n,8), %rax + mul v1 + mov 8(rp,n,8), X1 + jmp L(lo1) + +L(a111):lea (n), i + xor w3, w3 + jmp L(lo3) + + ALIGN(32) +L(top): +L(lo2): mul v1 + mov %rdx, w0 + add %rax, X0 + adc $0, w0 + add w1, X1 + adc $0, w3 + add w2, X0 + adc $0, w0 + mov -16(up,i,8), %rax + mul v0 + add %rax, X0 + mov %rdx, w1 + adc $0, w1 + mov -16(up,i,8), %rax + mul v1 + mov X1, -24(rp,i,8) + mov -8(rp,i,8), X1 + add w3, X0 + adc $0, w1 +L(lo1): mov %rdx, w2 + mov X0, -16(rp,i,8) + add %rax, X1 + adc $0, w2 + mov -8(up,i,8), %rax + add w0, X1 + adc $0, w2 + mul v0 +L(lo0): add %rax, X1 + mov %rdx, w3 + adc $0, w3 + mov -8(up,i,8), %rax + mul v1 + add w1, X1 + mov (rp,i,8), X0 + adc $0, w3 + mov %rdx, w0 + add %rax, X0 + adc $0, w0 + mov (up,i,8), %rax + mul v0 + add w2, X0 + mov X1, -8(rp,i,8) + mov %rdx, w1 + adc $0, w0 +L(lo3): add %rax, X0 + adc $0, w1 + mov (up,i,8), %rax + add w3, X0 + adc $0, w1 + mul v1 + mov 8(rp,i,8), X1 + add %rax, X1 + mov %rdx, w2 + adc $0, w2 + mov 8(up,i,8), %rax + mov X0, (rp,i,8) + mul v0 + add w0, X1 + mov %rdx, w3 + adc $0, w2 + add %rax, X1 + mov 8(up,i,8), %rax + mov 16(rp,i,8), X0 + adc $0, w3 + add $4, i + jnc L(top) + +L(end): imul v1, %rax + add %rax, X0 + add w1, X1 + adc $0, w3 + add w2, X0 + 
mov I(-8(up),-16(up,i,8)), %rax + imul v0, %rax + add X0, %rax + mov X1, I(-16(rp),-24(rp,i,8)) + add w3, %rax + mov %rax, I(-8(rp),-16(rp,i,8)) + + add $2, n + lea 16(vp), vp + lea -16(up), up + cmp $-2, n + jl L(outer) + + pop %r15 + pop %r14 + + jnz L(cor0) + +L(cor1):mov (vp), v0 + mov 8(vp), v1 + mov -16(up), %rax + mul v0 C u0 x v2 + add -16(rp), %rax C FIXME: rp[0] still available in reg? + adc -8(rp), %rdx C FIXME: rp[1] still available in reg? + mov -8(up), %r10 + imul v0, %r10 + mov -16(up), %r11 + imul v1, %r11 + mov %rax, -16(rp) + add %r10, %r11 + add %rdx, %r11 + mov %r11, -8(rp) + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(cor0):mov (vp), %r11 + imul -8(up), %r11 + add %rax, %r11 + mov %r11, -8(rp) + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + + ALIGN(16) +L(small): + cmp $2, n + jae L(gt1) +L(n1): imul (vp_param), %rax + mov %rax, (rp) + FUNC_EXIT() + ret +L(gt1): ja L(gt2) +L(n2): mov (vp_param), %r9 + mul %r9 + mov %rax, (rp) + mov 8(up), %rax + imul %r9, %rax + add %rax, %rdx + mov 8(vp), %r9 + mov (up), %rcx + imul %r9, %rcx + add %rcx, %rdx + mov %rdx, 8(rp) + FUNC_EXIT() + ret +L(gt2): +L(n3): mov (vp_param), %r9 + mul %r9 C u0 x v0 + mov %rax, (rp) + mov %rdx, %r10 + mov 8(up), %rax + mul %r9 C u1 x v0 + imul 16(up), %r9 C u2 x v0 + add %rax, %r10 + adc %rdx, %r9 + mov 8(vp), %r11 + mov (up), %rax + mul %r11 C u0 x v1 + add %rax, %r10 + adc %rdx, %r9 + imul 8(up), %r11 C u1 x v1 + add %r11, %r9 + mov %r10, 8(rp) + mov 16(vp), %r10 + mov (up), %rax + imul %rax, %r10 C u0 x v2 + add %r10, %r9 + mov %r9, 16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm new file mode 100644 index 0000000..f0dbe07 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/redc_1.asm @@ -0,0 +1,546 @@ +dnl X86-64 mpn_redc_1 optimised for Intel Sandy Bridge and Ivy Bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bull ? +C AMD pile ? +C AMD steam ? +C AMD bobcat ? +C AMD jaguar ? +C Intel P4 ? +C Intel core ? +C Intel NHM ? +C Intel SBR 3.24 +C Intel IBR 3.04 +C Intel HWL ? +C Intel BWL ? +C Intel atom ? +C VIA nano ? + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. 
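+C
+C The algorithm is one-limb-at-a-time Montgomery reduction (REDC).  A rough
+C C outline (an illustrative sketch, not this file's register allocation),
+C assuming the usual precomputed invariant u0inv * mp[0] == -1 mod B with
+C B = 2^64:
+C
+C     for (j = n; j > 0; j--)
+C       {
+C         mp_limb_t q = up[0] * u0inv;        /* up[0]+q*mp[0] == 0 mod B */
+C         mp_limb_t cy = mpn_addmul_1 (up, mp, n, q);
+C         up[0] = cy;                         /* park carry in the now-   */
+C         up++;                               /* zero low limb, then step */
+C       }                                     /* the window by one limb   */
+C     return mpn_add_n (rp, up, up - n, n);   /* fold parked carries back */
+C
+C Each step clears the lowest remaining limb, so after n steps the window
+C holds u * B^-n mod m, up to one final conditional subtraction performed by
+C the caller.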
+ +C TODO +C * Micro-optimise, none performed thus far. +C * Consider inlining mpn_add_n. +C * Single basecases out before the pushes. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`mp_param', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`u0inv', `%r8') C stack + +define(`i', `%r14') +define(`j', `%r15') +define(`mp', `%r12') +define(`q0', `%r13') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`ALIGNx', `ALIGN(16)') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_redc_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov (up), q0 + mov n, j C outer loop induction var + lea 8(mp_param,n,8), mp + lea 8(up,n,8), up + neg n + imul u0inv, q0 C first iteration q0 + + test $1, R8(n) + jz L(bx0) + +L(bx1): test $2, R8(n) + jz L(b3) + +L(b1): cmp $-1, R32(n) + jz L(n1) + +L(otp1):lea 1(n), i + mov -8(mp,n,8), %rax + mul q0 + mov -8(up,n,8), %r10 + mov %rdx, %r11 + add %rax, %r10 + mov (mp,n,8), %rax + adc $0, %r11 + mul q0 + mov %rdx, %r9 + mov (up,n,8), %rbx + add %rax, %rbx + adc $0, %r9 + mov (mp,i,8), %rax + mul q0 + mov (up,i,8), %r10 + add %r11, %rbx + mov %rbx, -8(up,i,8) C next low remainder limb + adc $0, %r9 + imul u0inv, %rbx C next q limb + jmp L(e1) + + ALIGNx +L(tp1): mul q0 + mov -16(up,i,8), %r10 + add %r11, %rbp + mov %rdx, %r11 + adc $0, %r9 + mov %rbp, -24(up,i,8) + add %rax, %r10 + mov -8(mp,i,8), %rax + adc $0, %r11 + mul q0 + add %r9, %r10 + mov %rdx, %r9 + mov -8(up,i,8), %rbp + adc $0, %r11 + mov %r10, -16(up,i,8) + add %rax, %rbp + adc $0, %r9 + mov (mp,i,8), %rax + mul q0 + mov (up,i,8), %r10 + add %r11, %rbp + mov %rbp, -8(up,i,8) + adc $0, %r9 +L(e1): mov %rdx, %r11 + add %rax, %r10 + mov 8(mp,i,8), %rax + adc $0, %r11 + mul q0 + mov 8(up,i,8), %rbp + add %r9, %r10 + mov %rdx, %r9 + mov %r10, (up,i,8) + adc $0, %r11 + add %rax, %rbp + adc $0, %r9 + mov 16(mp,i,8), %rax + add $4, i + jnc L(tp1) + +L(ed1): mul q0 + mov I(-16(up),-16(up,i,8)), %r10 + add %r11, %rbp + adc $0, %r9 + mov %rbp, I(-24(up),-24(up,i,8)) + add %rax, %r10 + adc $0, %rdx + add %r9, %r10 + adc $0, %rdx + mov %r10, I(-16(up),-16(up,i,8)) + mov %rdx, -8(up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp1) + jmp L(cj) + +L(b3): cmp $-3, R32(n) + jz L(n3) + +L(otp3):lea 3(n), i + mov -8(mp,n,8), %rax + mul q0 + mov -8(up,n,8), %r10 + mov %rdx, %r11 + add %rax, %r10 + mov (mp,n,8), %rax + adc $0, %r11 + mul q0 + mov (up,n,8), %rbx + mov %rdx, %r9 + add %rax, %rbx + adc $0, %r9 + mov 8(mp,n,8), %rax + mul q0 + mov 8(up,n,8), %r10 + add %r11, %rbx + mov %rdx, %r11 + adc $0, %r9 + mov %rbx, (up,n,8) + imul u0inv, %rbx C next q limb + jmp L(e3) + + ALIGNx +L(tp3): mul q0 + mov -16(up,i,8), %r10 + add %r11, %rbp + mov %rdx, %r11 + adc $0, %r9 + mov %rbp, -24(up,i,8) +L(e3): add %rax, %r10 + mov -8(mp,i,8), %rax + adc $0, %r11 + mul q0 + add %r9, %r10 + mov %rdx, %r9 + mov -8(up,i,8), %rbp + adc $0, %r11 + mov %r10, -16(up,i,8) + add %rax, %rbp + adc $0, %r9 + mov (mp,i,8), %rax + mul q0 + mov (up,i,8), %r10 + add %r11, %rbp + mov %rbp, -8(up,i,8) + adc $0, %r9 + mov %rdx, %r11 + add %rax, %r10 + mov 8(mp,i,8), %rax + adc $0, %r11 + mul q0 + mov 8(up,i,8), %rbp + add %r9, %r10 + mov %rdx, %r9 + mov %r10, (up,i,8) + adc $0, %r11 + add %rax, %rbp + adc $0, %r9 + mov 
16(mp,i,8), %rax + add $4, i + jnc L(tp3) + +L(ed3): mul q0 + mov I(-16(up),-16(up,i,8)), %r10 + add %r11, %rbp + adc $0, %r9 + mov %rbp, I(-24(up),-24(up,i,8)) + add %rax, %r10 + adc $0, %rdx + add %r9, %r10 + adc $0, %rdx + mov %r10, I(-16(up),-16(up,i,8)) + mov %rdx, -8(up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp3) +C jmp L(cj) + +L(cj): +IFSTD(` lea -8(up,n,8), up C param 2: up + lea (up,n,8), %rdx C param 3: up - n + neg R32(n) ') C param 4: n + +IFDOS(` lea -8(up,n,8), %rdx C param 2: up + lea (%rdx,n,8), %r8 C param 3: up - n + neg R32(n) + mov n, %r9 C param 4: n + mov rp, %rcx ') C param 1: rp + +IFSTD(` sub $8, %rsp ') +IFDOS(` sub $40, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_add_n) +IFSTD(` add $8, %rsp ') +IFDOS(` add $40, %rsp ') + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(bx0): test $2, R8(n) + jnz L(b2) + +L(b0): +L(otp0):lea (n), i + mov -8(mp,n,8), %rax + mul q0 + mov %rdx, %r9 + mov -8(up,n,8), %rbp + add %rax, %rbp + adc $0, %r9 + mov (mp,n,8), %rax + mul q0 + mov (up,n,8), %rbx + mov %rdx, %r11 + add %rax, %rbx + mov 8(mp,n,8), %rax + adc $0, %r11 + mul q0 + mov 8(up,n,8), %rbp + add %r9, %rbx + mov %rdx, %r9 + mov %rbx, (up,n,8) + adc $0, %r11 + imul u0inv, %rbx C next q limb + jmp L(e0) + + ALIGNx +L(tp0): mul q0 + mov -16(up,i,8), %r10 + add %r11, %rbp + mov %rdx, %r11 + adc $0, %r9 + mov %rbp, -24(up,i,8) + add %rax, %r10 + mov -8(mp,i,8), %rax + adc $0, %r11 + mul q0 + add %r9, %r10 + mov %rdx, %r9 + mov -8(up,i,8), %rbp + adc $0, %r11 + mov %r10, -16(up,i,8) + add %rax, %rbp + adc $0, %r9 + mov (mp,i,8), %rax + mul q0 + mov (up,i,8), %r10 + add %r11, %rbp + mov %rbp, -8(up,i,8) + adc $0, %r9 + mov %rdx, %r11 + add %rax, %r10 + mov 8(mp,i,8), %rax + adc $0, %r11 + mul q0 + mov 8(up,i,8), %rbp + add %r9, %r10 + mov %rdx, %r9 + mov %r10, (up,i,8) + adc $0, %r11 +L(e0): add %rax, %rbp + adc $0, %r9 + mov 16(mp,i,8), %rax + add $4, i + jnc L(tp0) + +L(ed0): mul q0 + mov I(-16(up),-16(up,i,8)), %r10 + add %r11, %rbp + adc $0, %r9 + mov %rbp, I(-24(up),-24(up,i,8)) + add %rax, %r10 + adc $0, %rdx + add %r9, %r10 + adc $0, %rdx + mov %r10, I(-16(up),-16(up,i,8)) + mov %rdx, -8(up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp0) + jmp L(cj) + +L(b2): cmp $-2, R32(n) + jz L(n2) + +L(otp2):lea 2(n), i + mov -8(mp,n,8), %rax + mul q0 + mov -8(up,n,8), %rbp + mov %rdx, %r9 + add %rax, %rbp + adc $0, %r9 + mov (mp,n,8), %rax + mul q0 + mov (up,n,8), %rbx + mov %rdx, %r11 + add %rax, %rbx + mov 8(mp,n,8), %rax + adc $0, %r11 + mul q0 + add %r9, %rbx + mov %rdx, %r9 + mov 8(up,n,8), %rbp + adc $0, %r11 + mov %rbx, (up,n,8) + imul u0inv, %rbx C next q limb + jmp L(e2) + + ALIGNx +L(tp2): mul q0 + mov -16(up,i,8), %r10 + add %r11, %rbp + mov %rdx, %r11 + adc $0, %r9 + mov %rbp, -24(up,i,8) + add %rax, %r10 + mov -8(mp,i,8), %rax + adc $0, %r11 + mul q0 + add %r9, %r10 + mov %rdx, %r9 + mov -8(up,i,8), %rbp + adc $0, %r11 + mov %r10, -16(up,i,8) +L(e2): add %rax, %rbp + adc $0, %r9 + mov (mp,i,8), %rax + mul q0 + mov (up,i,8), %r10 + add %r11, %rbp + mov %rbp, -8(up,i,8) + adc $0, %r9 + mov %rdx, %r11 + add %rax, %r10 + mov 8(mp,i,8), %rax + adc $0, %r11 + mul q0 + mov 8(up,i,8), %rbp + add %r9, %r10 + mov %rdx, %r9 + mov %r10, (up,i,8) + adc $0, %r11 + add %rax, %rbp + adc $0, %r9 + mov 16(mp,i,8), %rax + add $4, i + jnc L(tp2) + +L(ed2): mul q0 + mov I(-16(up),-16(up,i,8)), %r10 + add %r11, 
%rbp + adc $0, %r9 + mov %rbp, I(-24(up),-24(up,i,8)) + add %rax, %r10 + adc $0, %rdx + add %r9, %r10 + adc $0, %rdx + mov %r10, I(-16(up),-16(up,i,8)) + mov %rdx, -8(up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp2) + jmp L(cj) + +L(n1): mov (mp_param), %rax + mul q0 + add -16(up), %rax + adc -8(up), %rdx + mov %rdx, (rp) + mov $0, R32(%rax) + adc R32(%rax), R32(%rax) + jmp L(ret) + +L(n2): mov (mp_param), %rax + mov -24(up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -16(mp), %rax + mov -16(up), %r10 + mul q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + add %r9, %r10 + adc $0, %r11 + mov %r10, q0 + imul u0inv, q0 C next q0 + mov -24(mp), %rax + mul q0 + add %rax, %r10 + mov %rdx, %r9 + adc $0, %r9 + mov -16(mp), %rax + mov -8(up), %r14 + mul q0 + add %rax, %r14 + adc $0, %rdx + add %r9, %r14 + adc $0, %rdx + xor R32(%rax), R32(%rax) + add %r11, %r14 + adc (up), %rdx + mov %r14, (rp) + mov %rdx, 8(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + ALIGNx +L(n3): mov -32(mp), %rax + mov -32(up), %r10 + mul q0 + add %rax, %r10 + mov -24(mp), %rax + mov %rdx, %r11 + adc $0, %r11 + mov -24(up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -16(mp), %rax + add %r11, %rbp + mov -16(up), %r10 + adc $0, %r9 + mul q0 + mov %rbp, q0 + imul u0inv, q0 C next q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + mov %rbp, -24(up) + add %r9, %r10 + adc $0, %r11 + mov %r10, -16(up) + mov %r11, -32(up) C up[0] + lea 8(up), up C up++ + dec j + jnz L(n3) + jmp L(cj) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm new file mode 100644 index 0000000..fd2eaea --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/rsh1aors_n.asm @@ -0,0 +1,193 @@ +dnl X86-64 mpn_rsh1add_n, mpn_rsh1sub_n optimised for Intel Sandy Bridge. + +dnl Copyright 2003, 2005, 2009-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 4.25 +C Intel P4 21.5 +C Intel core2 3.2 +C Intel NHM 3.87 +C Intel SBR 2.05 +C Intel atom ? 
+C VIA nano 44.9 + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') + +ifdef(`OPERATION_rsh1add_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_rsh1add_n) + define(func_nc, mpn_rsh1add_nc)') +ifdef(`OPERATION_rsh1sub_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsh1sub_n) + define(func_nc, mpn_rsh1sub_nc)') + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + + ALIGN(16) +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + push %rbp + + neg %r8 C set C flag from parameter + mov (up), %rbp + ADCSBB (vp), %rbp + + jmp L(ent) +EPILOGUE() + + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (up), %rbp + ADDSUB (vp), %rbp +L(ent): + sbb R32(%rbx), R32(%rbx) C save cy + mov R32(%rbp), R32(%rax) + and $1, R32(%rax) C return value + + mov R32(n), R32(%r11) + and $3, R32(%r11) + + cmp $1, R32(%r11) + je L(do) C jump if n = 1 5 9 ... + +L(n1): cmp $2, R32(%r11) + jne L(n2) C jump unless n = 2 6 10 ... + add R32(%rbx), R32(%rbx) C restore cy + mov 8(up), %r10 + ADCSBB 8(vp), %r10 + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp + sbb R32(%rbx), R32(%rbx) C save cy + + shrd $1, %r10, %rbp + mov %rbp, -8(rp) + jmp L(cj1) + +L(n2): cmp $3, R32(%r11) + jne L(n3) C jump unless n = 3 7 11 ... + add R32(%rbx), R32(%rbx) C restore cy + mov 8(up), %r9 + mov 16(up), %r10 + ADCSBB 8(vp), %r9 + ADCSBB 16(vp), %r10 + lea 16(up), up + lea 16(vp), vp + lea 16(rp), rp + sbb R32(%rbx), R32(%rbx) C save cy + + shrd $1, %r9, %rbp + mov %rbp, -16(rp) + jmp L(cj2) + +L(n3): dec n C come here for n = 4 8 12 ... + add R32(%rbx), R32(%rbx) C restore cy + mov 8(up), %r8 + mov 16(up), %r9 + ADCSBB 8(vp), %r8 + ADCSBB 16(vp), %r9 + mov 24(up), %r10 + ADCSBB 24(vp), %r10 + lea 24(up), up + lea 24(vp), vp + lea 24(rp), rp + sbb R32(%rbx), R32(%rbx) C save cy + + shrd $1, %r8, %rbp + mov %rbp, -24(rp) + shrd $1, %r9, %r8 + mov %r8, -16(rp) +L(cj2): shrd $1, %r10, %r9 + mov %r9, -8(rp) +L(cj1): mov %r10, %rbp + +L(do): + shr $2, n C 4 + je L(end) C 2 + ALIGN(16) +L(top): add R32(%rbx), R32(%rbx) C restore cy + + mov 8(up), %r8 + mov 16(up), %r9 + ADCSBB 8(vp), %r8 + ADCSBB 16(vp), %r9 + mov 24(up), %r10 + mov 32(up), %r11 + ADCSBB 24(vp), %r10 + ADCSBB 32(vp), %r11 + + lea 32(up), up + lea 32(vp), vp + + sbb R32(%rbx), R32(%rbx) C save cy + + shrd $1, %r8, %rbp + mov %rbp, (rp) + shrd $1, %r9, %r8 + mov %r8, 8(rp) + shrd $1, %r10, %r9 + mov %r9, 16(rp) + shrd $1, %r11, %r10 + mov %r10, 24(rp) + + dec n + mov %r11, %rbp + lea 32(rp), rp + jne L(top) + +L(end): shrd $1, %rbx, %rbp + mov %rbp, (rp) + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm new file mode 100644 index 0000000..4c1c0d4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/rshift.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_rshift optimised for Intel Sandy Bridge. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_rshift) +include_mpn(`x86_64/fastsse/rshift-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm new file mode 100644 index 0000000..e436034 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/sec_tabselect.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sec_tabselect. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sec_tabselect) +include_mpn(`x86_64/fastsse/sec_tabselect.asm') diff --git a/gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm new file mode 100644 index 0000000..46a3612 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/coreisbr/sqr_basecase.asm @@ -0,0 +1,484 @@ +dnl AMD64 mpn_sqr_basecase optimised for Intel Sandy bridge and Ivy bridge. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1 +C AMD K8,K9 ? ? ? +C AMD K10 ? ? ? +C AMD bull ? ? ? +C AMD pile ? ? ? +C AMD steam ? ? ? +C AMD bobcat ? ? ? +C AMD jaguar ? ? ? +C Intel P4 ? ? ? +C Intel core ? ? ? +C Intel NHM ? ? ? +C Intel SBR 2.57 2.93 3.0 +C Intel IBR 2.35 2.66 3.0 +C Intel HWL 2.02 2.5 2.5 +C Intel BWL ? ? ? +C Intel atom ? ? ? +C VIA nano ? ? ? + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund, except +C that the sqr_diag_addlsh1 loop was manually written. + +C TODO +C * Replace current unoptimised sqr_diag_addlsh1 loop, 2.5 c/l should be easy. +C * Streamline pointer updates. +C * Perhaps suppress a few more xor insns in feed-in code. +C * Make sure we write no dead registers in feed-in code. +C * We might use 32-bit size ops, since n >= 2^32 is non-terminating. Watch +C out for negative sizes being zero-extended, though. +C * The straight-line code for n <= 3 comes from the K8 code, and might be +C quite sub-optimal here. Write specific code, and add code for n = 4. +C * The mul_2 loop has a 10 insn common sequence in the loop start and the +C wind-down code. Try re-rolling it. +C * This file has been the subject to just basic micro-optimisation. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. 
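+C (With the default $1, an operand written I(-16(up),-16(up,i,8)) thus +C assembles as -16(up); defining I as $2 instead selects the conservative +C indexed form -16(up,i,8).)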
+define(`I',`$1') + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') + + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_sqr_basecase) + FUNC_ENTRY(3) + + cmp $2, un_param + jae L(gt1) + + mov (up), %rax + mul %rax + mov %rax, (rp) + mov %rdx, 8(rp) + FUNC_EXIT() + ret + +L(gt1): jne L(gt2) + + mov (up), %rax + mov %rax, %r8 + mul %rax + mov 8(up), %r11 + mov %rax, (rp) + mov %r11, %rax + mov %rdx, %r9 + mul %rax + mov %rax, %r10 + mov %r11, %rax + mov %rdx, %r11 + mul %r8 + xor %r8, %r8 + add %rax, %r9 + adc %rdx, %r10 + adc %r8, %r11 + add %rax, %r9 + mov %r9, 8(rp) + adc %rdx, %r10 + mov %r10, 16(rp) + adc %r8, %r11 + mov %r11, 24(rp) + FUNC_EXIT() + ret + +L(gt2): cmp $4, un_param + jae L(gt3) +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%r10') +define(`w2', `%r11') + + mov (up), %rax + mov %rax, %r10 + mul %rax + mov 8(up), %r11 + mov %rax, (rp) + mov %r11, %rax + mov %rdx, 8(rp) + mul %rax + mov 16(up), %rcx + mov %rax, 16(rp) + mov %rcx, %rax + mov %rdx, 24(rp) + mul %rax + mov %rax, 32(rp) + mov %rdx, 40(rp) + + mov %r11, %rax + mul %r10 + mov %rax, %r8 + mov %rcx, %rax + mov %rdx, %r9 + mul %r10 + xor %r10, %r10 + add %rax, %r9 + mov %r11, %rax + mov %r10, %r11 + adc %rdx, %r10 + + mul %rcx + add %rax, %r10 + adc %r11, %rdx + add %r8, %r8 + adc %r9, %r9 + adc %r10, %r10 + adc %rdx, %rdx + adc %r11, %r11 + add %r8, 8(rp) + adc %r9, 16(rp) + adc %r10, 24(rp) + adc %rdx, 32(rp) + adc %r11, 40(rp) + FUNC_EXIT() + ret + +L(gt3): + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%r10') +define(`w1', `%r11') +define(`w2', `%rbx') +define(`w3', `%rbp') +define(`un', `%r12') +define(`n', `%rcx') + +define(`X0', `%r13') +define(`X1', `%r14') + +L(do_mul_2): + mov (up), v0 + push %rbx + lea (rp,un_param,8), rp C point rp at R[un] + mov 8(up), %rax + push %rbp + lea (up,un_param,8), up C point up right after U's end + mov %rax, v1 + push %r12 + mov $1, R32(un) C free up rdx + push %r13 + sub un_param, un + push %r14 + push un + mul v0 + mov %rax, (rp,un,8) + mov 8(up,un,8), %rax + test $1, R8(un) + jnz L(m2b1) + +L(m2b0):lea 2(un), n + xor R32(w1), R32(w1) C FIXME + xor R32(w2), R32(w2) C FIXME + mov %rdx, w0 + jmp L(m2l0) + +L(m2b1):lea 1(un), n + xor R32(w3), R32(w3) C FIXME + xor R32(w0), R32(w0) C FIXME + mov %rdx, w2 + jmp L(m2l1) + + ALIGN(32) +L(m2tp): +L(m2l0):mul v0 + add %rax, w0 + mov %rdx, w3 + adc $0, w3 + mov -8(up,n,8), %rax + mul v1 + add w1, w0 + adc $0, w3 + add %rax, w2 + mov w0, -8(rp,n,8) + mov %rdx, w0 + adc $0, w0 + mov (up,n,8), %rax +L(m2l1):mul v0 + add %rax, w2 + mov %rdx, w1 + adc $0, w1 + add w3, w2 + mov (up,n,8), %rax + adc $0, w1 + mul v1 + mov w2, (rp,n,8) + add %rax, w0 + mov %rdx, w2 + mov 8(up,n,8), %rax + adc $0, w2 + add $2, n + jnc L(m2tp) + +L(m2ed):mul v0 + add %rax, w0 + mov %rdx, w3 + adc $0, w3 + mov I(-8(up),-8(up,n,8)), %rax + mul v1 + add w1, w0 + adc $0, w3 + add %rax, w2 + mov w0, I(-8(rp),-8(rp,n,8)) + adc $0, %rdx + add w3, w2 + mov w2, I((rp),(rp,n,8)) + adc $0, %rdx + mov %rdx, I(8(rp),8(rp,n,8)) + + add $2, un C decrease |un| + +L(do_addmul_2): +L(outer): + lea 16(rp), rp + cmp $-2, R32(un) C jump if un C {-1,0} FIXME jump if un C {-2,1} + jge L(corner) C FIXME: move to before the lea above + + mov -8(up,un,8), v0 + mov (up,un,8), %rax + mov %rax, v1 + mul v0 + test $1, R8(un) + jnz L(a1x1) + +L(a1x0):mov (rp,un,8), X0 + xor w0, w0 + mov 8(rp,un,8), X1 + add %rax, X0 + mov %rdx, w1 + adc $0, w1 + xor w2, w2 + mov X0, (rp,un,8) + mov 8(up,un,8), %rax + test 
$2, R8(un) + jnz L(a110) + +L(a100):lea 2(un), n C un = 4, 8, 12, ... + jmp L(lo0) + +L(a110):lea (un), n C un = 2, 6, 10, ... + jmp L(lo2) + +L(a1x1):mov (rp,un,8), X1 + xor w2, w2 + mov 8(rp,un,8), X0 + add %rax, X1 + mov %rdx, w3 + adc $0, w3 + xor w0, w0 + mov 8(up,un,8), %rax + test $2, R8(un) + jz L(a111) + +L(a101):lea 3(un), n C un = 1, 5, 9, ... + jmp L(lo1) + +L(a111):lea 1(un), n C un = 3, 7, 11, ... + jmp L(lo3) + + ALIGN(32) +L(top): mul v1 + mov %rdx, w0 + add %rax, X0 + adc $0, w0 + add w1, X1 + adc $0, w3 + add w2, X0 + adc $0, w0 + mov -16(up,n,8), %rax +L(lo1): mul v0 + add %rax, X0 + mov %rdx, w1 + adc $0, w1 + mov -16(up,n,8), %rax + mul v1 + mov X1, -24(rp,n,8) + mov -8(rp,n,8), X1 + add w3, X0 + adc $0, w1 + mov %rdx, w2 + mov X0, -16(rp,n,8) + add %rax, X1 + adc $0, w2 + mov -8(up,n,8), %rax + add w0, X1 + adc $0, w2 +L(lo0): mul v0 + add %rax, X1 + mov %rdx, w3 + adc $0, w3 + mov -8(up,n,8), %rax + mul v1 + add w1, X1 + mov (rp,n,8), X0 + adc $0, w3 + mov %rdx, w0 + add %rax, X0 + adc $0, w0 + mov (up,n,8), %rax +L(lo3): mul v0 + add w2, X0 + mov X1, -8(rp,n,8) + mov %rdx, w1 + adc $0, w0 + add %rax, X0 + adc $0, w1 + mov (up,n,8), %rax + add w3, X0 + adc $0, w1 + mul v1 + mov 8(rp,n,8), X1 + add %rax, X1 + mov %rdx, w2 + adc $0, w2 + mov 8(up,n,8), %rax + mov X0, (rp,n,8) +L(lo2): mul v0 + add w0, X1 + mov %rdx, w3 + adc $0, w2 + add %rax, X1 + mov 8(up,n,8), %rax + mov 16(rp,n,8), X0 + adc $0, w3 + add $4, n + jnc L(top) + +L(end): mul v1 + add w1, X1 + adc $0, w3 + add w2, %rax + adc $0, %rdx + mov X1, I(-8(rp),-24(rp,n,8)) + add w3, %rax + adc $0, %rdx + mov %rax, I((rp),-16(rp,n,8)) + mov %rdx, I(8(rp),-8(rp,n,8)) + + add $2, un C decrease |un| + jmp L(outer) C loop until a small corner remains + +L(corner): + pop n + jg L(small_corner) + + lea 8(rp), rp + mov -24(up), v0 + mov -16(up), %rax + mov %rax, v1 + mul v0 + mov -24(rp), X0 + mov -16(rp), X1 + add %rax, X0 + mov %rdx, w1 + adc $0, w1 + xor w2, w2 + mov X0, -24(rp) + mov -8(up), %rax + mul v0 + add $0, X1 + mov %rdx, w3 + adc $0, w2 + add %rax, X1 + mov -8(up), %rax + adc $0, w3 + mul v1 + add w1, X1 + adc $0, w3 + add w2, %rax + adc $0, %rdx + mov X1, -16(rp) + jmp L(com) + +L(small_corner): + mov -8(rp), w3 + mov -16(up), v0 + mov -8(up), %rax + mul v0 +L(com): add w3, %rax + adc $0, %rdx + mov %rax, -8(rp) + mov %rdx, (rp) + +L(sqr_diag_addlsh1): + mov -8(up,n,8), %rax + shl n + mul %rax + mov %rax, (rp,n,8) + + xor R32(%rbx), R32(%rbx) + mov 8(rp,n,8), %r8 + mov 16(rp,n,8), %r9 + jmp L(dm) + + ALIGN(32) +L(dtop):add %r8, %r10 + adc %r9, %rax + mov 8(rp,n,8), %r8 + mov 16(rp,n,8), %r9 + mov %r10, -8(rp,n,8) + mov %rax, (rp,n,8) +L(dm): adc %r8, %r8 + adc %r9, %r9 + mov (up,n,4), %rax + lea (%rdx,%rbx), %r10 + setc R8(%rbx) + mul %rax + add $2, n + js L(dtop) + +L(dend):add %r8, %r10 + adc %r9, %rax + mov %r10, I(-8(rp),-8(rp,n,8)) + mov %rax, I((rp),(rp,n,8)) + adc %rbx, %rdx + mov %rdx, I(8(rp),8(rp,n,8)) + + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/darwin.m4 b/gmp-6.3.0/mpn/x86_64/darwin.m4 new file mode 100644 index 0000000..7771476 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/darwin.m4 @@ -0,0 +1,82 @@ +divert(-1) +dnl Copyright 2008, 2011, 2012 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +define(`DARWIN') + +define(`LEA',`dnl +ifdef(`PIC', + `lea $1(%rip), $2' +, + `movabs `$'$1, $2') +') + +dnl Usage: CALL(funcname) +dnl +dnl Simply override the definition in x86_64-defs.m4. + +define(`CALL',`call GSYM_PREFIX`'$1') +define(`TCALL',`jmp GSYM_PREFIX`'$1') + + +dnl Usage: JUMPTABSECT +dnl +dnl CAUTION: Do not put anything sensible here, like RODATA. That works with +dnl some Darwin tool chains, but silently breaks with other. (Note that +dnl putting jump tables in the text segment is a really poor idea for many PC +dnl processors, since they cannot cache the same thing in both L1D and L2I.) + +define(`JUMPTABSECT', `.text') + + +dnl Usage: JMPENT(targlabel,tablabel) + +define(`JMPENT',`dnl +ifdef(`PIC', + `.set $1_tmp, $1-$2 + .long $1_tmp' +, + `.quad $1' +)') + +dnl Target ABI macros. For Darwin we override IFELF (and leave default for +dnl IFDOS and IFSTD). + +define(`IFELF', `') + + +dnl Usage: PROTECT(symbol) +dnl +dnl Used for private GMP symbols that should never be overridden by users. +dnl This can save reloc entries and improve shlib sharing as well as +dnl application startup times + +define(`PROTECT', `.private_extern $1') + + +divert`'dnl diff --git a/gmp-6.3.0/mpn/x86_64/div_qr_1n_pi1.asm b/gmp-6.3.0/mpn/x86_64/div_qr_1n_pi1.asm new file mode 100644 index 0000000..b3d45e2 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/div_qr_1n_pi1.asm @@ -0,0 +1,247 @@ +dnl x86-64 mpn_div_qr_1n_pi1 +dnl -- Divide an mpn number by a normalized single-limb number, +dnl using a single-limb inverse. + +dnl Contributed to the GNU project by Niels Möller + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C c/l +C AMD K8,K9 13 +C AMD K10 13 +C AMD bull 16.5 +C AMD pile 15 +C AMD steam ? +C AMD bobcat 16 +C AMD jaguar ? +C Intel P4 47 poor +C Intel core 19.25 +C Intel NHM 18 +C Intel SBR 15 poor +C Intel IBR 13 +C Intel HWL 11.7 +C Intel BWL ? +C Intel atom 52 very poor +C VIA nano 19 + + +C INPUT Parameters +define(`QP', `%rdi') +define(`UP', `%rsi') +define(`UN_INPUT', `%rdx') +define(`U1', `%rcx') C Also in %rax +define(`D', `%r8') +define(`DINV', `%r9') + +C Invariants +define(`B2', `%rbp') +define(`B2md', `%rbx') + +C Variables +define(`UN', `%r8') C Overlaps D input +define(`T', `%r10') +define(`U0', `%r11') +define(`U2', `%r12') +define(`Q0', `%r13') +define(`Q1', `%r14') +define(`Q2', `%r15') + +ABI_SUPPORT(STD64) + + ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_div_qr_1n_pi1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +IFDOS(` mov 64(%rsp), %r9 ') + dec UN_INPUT + jnz L(first) + + C Just a single 2/1 division. + C T, U0 are allocated in scratch registers + lea 1(U1), T + mov U1, %rax + mul DINV + mov (UP), U0 + add U0, %rax + adc T, %rdx + mov %rdx, T + imul D, %rdx + sub %rdx, U0 + cmp U0, %rax + lea (U0, D), %rax + cmovnc U0, %rax + sbb $0, T + cmp D, %rax + jc L(single_div_done) + sub D, %rax + add $1, T +L(single_div_done): + mov T, (QP) + FUNC_EXIT() + ret +L(first): + C FIXME: Could delay some of these until we enter the loop. + push %r15 + push %r14 + push %r13 + push %r12 + push %rbx + push %rbp + + mov D, B2 + imul DINV, B2 + neg B2 + mov B2, B2md + sub D, B2md + + C D not needed until final reduction + push D + mov UN_INPUT, UN C Clobbers D + + mov DINV, %rax + mul U1 + mov %rax, Q0 + add U1, %rdx + mov %rdx, T + + mov B2, %rax + mul U1 + mov -8(UP, UN, 8), U0 + mov (UP, UN, 8), U1 + mov T, (QP, UN, 8) + add %rax, U0 + adc %rdx, U1 + sbb U2, U2 + dec UN + mov U1, %rax + jz L(final) + + ALIGN(16) + + C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles. 
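+	C In outline, each round consumes one dividend limb and emits one +	C quotient limb; the invariant B2 == B^2 mod D (B = 2^64) computed +	C above lets the pending high limb U2 be folded into the partial +	C remainder without a serial 2/1 division per limb, in the style of +	C Möller and Granlund, "Improved division by invariant integers".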
+ C At entry, %rax holds an extra copy of U1 +L(loop): + C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2 + C Remains to add in B (U1 + c) + mov DINV, Q1 + mov U2, Q2 + and U2, Q1 + neg Q2 + mul DINV + add %rdx, Q1 + adc $0, Q2 + add Q0, Q1 + mov %rax, Q0 + mov B2, %rax + lea (B2md, U0), T + adc $0, Q2 + + C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u + mul U1 + and B2, U2 + add U2, U0 + cmovnc U0, T + + C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c + adc U1, Q1 + mov -8(UP, UN, 8), U0 + adc Q2, 8(QP, UN, 8) + jc L(q_incr) +L(q_incr_done): + add %rax, U0 + mov T, %rax + adc %rdx, %rax + mov Q1, (QP, UN, 8) + sbb U2, U2 + dec UN + mov %rax, U1 + jnz L(loop) + +L(final): + pop D + + mov U2, Q1 + and D, U2 + sub U2, %rax + neg Q1 + + mov %rax, U1 + sub D, %rax + cmovc U1, %rax + sbb $-1, Q1 + + lea 1(%rax), T + mul DINV + add U0, %rax + adc T, %rdx + mov %rdx, T + imul D, %rdx + sub %rdx, U0 + cmp U0, %rax + lea (U0, D), %rax + cmovnc U0, %rax + sbb $0, T + cmp D, %rax + jc L(div_done) + sub D, %rax + add $1, T +L(div_done): + add T, Q0 + mov Q0, (QP) + adc Q1, 8(QP) + jnc L(done) +L(final_q_incr): + addq $1, 16(QP) + lea 8(QP), QP + jc L(final_q_incr) + +L(done): + pop %rbp + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + FUNC_EXIT() + ret + +L(q_incr): + C U1 is not live, so use it for indexing + lea 16(QP, UN, 8), U1 +L(q_incr_loop): + addq $1, (U1) + jnc L(q_incr_done) + lea 8(U1), U1 + jmp L(q_incr_loop) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/div_qr_2n_pi1.asm b/gmp-6.3.0/mpn/x86_64/div_qr_2n_pi1.asm new file mode 100644 index 0000000..5e59a0a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/div_qr_2n_pi1.asm @@ -0,0 +1,158 @@ +dnl x86-64 mpn_div_qr_2n_pi1 +dnl -- Divide an mpn number by a normalized 2-limb number, +dnl using a single-limb inverse. + +dnl Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C c/l +C INPUT PARAMETERS +define(`qp', `%rdi') +define(`rp', `%rsi') +define(`up_param', `%rdx') +define(`un', `%rcx') +define(`d1', `%r8') +define(`d0', `%r9') +define(`di_param', `8(%rsp)') + +define(`di', `%r10') +define(`up', `%r11') +define(`u2', `%rbx') +define(`u1', `%r12') +define(`t1', `%r13') +define(`t0', `%r14') +define(`md1', `%r15') + +C TODO +C * Store qh in the same stack slot as di_param, instead of pushing +C it. (we could put it in register %rbp, but then we would need to +C save and restore that instead, which doesn't seem like a win). 
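+C Operation sketch: {up,un} is divided by the normalized two-limb divisor +C d1:d0, di being a precomputed 3/2 inverse such as invert_pi1() produces; +C the un-2 low quotient limbs are written to {qp,un-2}, the high quotient +C limb qh is the return value, and the two remainder limbs are stored at +C {rp,2}.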
+ +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_div_qr_2n_pi1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +IFDOS(` mov 64(%rsp), %r9 ') +IFDOS(`define(`di_param', `72(%rsp)')') + mov di_param, di + mov up_param, up + push %r15 + push %r14 + push %r13 + push %r12 + push %rbx + + mov -16(up, un, 8), u1 + mov -8(up, un, 8), u2 + + mov u1, t0 + mov u2, t1 + sub d0, t0 + sbb d1, t1 + cmovnc t0, u1 + cmovnc t1, u2 + C push qh which is !carry + sbb %rax, %rax + inc %rax + push %rax + lea -2(un), un + mov d1, md1 + neg md1 + + jmp L(next) + + ALIGN(16) +L(loop): + C udiv_qr_3by2 (q,u2,u1,u2,u1,n0, d1,d0,di) + C Based on the optimized divrem_2.asm code. + + mov di, %rax + mul u2 + mov u1, t0 + add %rax, t0 C q0 in t0 + adc u2, %rdx + mov %rdx, t1 C q in t1 + imul md1, %rdx + mov d0, %rax + lea (%rdx, u1), u2 + mul t1 + mov (up, un, 8), u1 + sub d0, u1 + sbb d1, u2 + sub %rax, u1 + sbb %rdx, u2 + xor R32(%rax), R32(%rax) + xor R32(%rdx), R32(%rdx) + cmp t0, u2 + cmovnc d0, %rax + cmovnc d1, %rdx + adc $0, t1 + nop + add %rax, u1 + adc %rdx, u2 + cmp d1, u2 + jae L(fix) +L(bck): + mov t1, (qp, un, 8) +L(next): + sub $1, un + jnc L(loop) +L(end): + mov u2, 8(rp) + mov u1, (rp) + + C qh on stack + pop %rax + + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + FUNC_EXIT() + ret + +L(fix): C Unlikely update. u2 >= d1 + seta %dl + cmp d0, u1 + setae %al + orb %dl, %al C "orb" form to placate Sun tools + je L(bck) + inc t1 + sub d0, u1 + sbb d1, u2 + jmp L(bck) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/div_qr_2u_pi1.asm b/gmp-6.3.0/mpn/x86_64/div_qr_2u_pi1.asm new file mode 100644 index 0000000..85af96f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/div_qr_2u_pi1.asm @@ -0,0 +1,200 @@ +dnl x86-64 mpn_div_qr_2u_pi1 +dnl -- Divide an mpn number by an unnormalized 2-limb number, +dnl using a single-limb inverse and shifting the dividend on the fly. + +dnl Copyright 2007, 2008, 2010, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C c/l +C INPUT PARAMETERS +define(`qp', `%rdi') +define(`rp', `%rsi') +define(`up_param', `%rdx') +define(`un_param', `%rcx') dnl %rcx needed for shift count +define(`d1', `%r8') +define(`d0', `%r9') +define(`shift_param', `FRAME+8(%rsp)') +define(`di_param', `FRAME+16(%rsp)') + +define(`di', `%r10') +define(`up', `%r11') +define(`un', `%rbp') +define(`u2', `%rbx') +define(`u1', `%r12') +define(`u0', `%rsi') dnl Same as rp, which is saved and restored. 
+define(`t1', `%r13') +define(`t0', `%r14') +define(`md1', `%r15') + +ASM_START() + TEXT + ALIGN(16) +deflit(`FRAME', 0) +PROLOGUE(mpn_div_qr_2u_pi1) + mov di_param, di + mov up_param, up + push %r15 + push %r14 + push %r13 + push %r12 + push %rbx + push %rbp + push rp +deflit(`FRAME', 56) + lea -2(un_param), un + mov d1, md1 + neg md1 + + C int parameter, 32 bits only + movl shift_param, R32(%rcx) + + C FIXME: Different code for SHLD_SLOW + + xor R32(u2), R32(u2) + mov 8(up, un, 8), u1 + shld %cl, u1, u2 + C Remains to read (up, un, 8) and shift u1, u0 + C udiv_qr_3by2 (qh,u2,u1,u2,u1,n0, d1,d0,di) + mov di, %rax + mul u2 + mov (up, un, 8), u0 + shld %cl, u0, u1 + mov u1, t0 + add %rax, t0 C q0 in t0 + adc u2, %rdx + mov %rdx, t1 C q in t1 + imul md1, %rdx + mov d0, %rax + lea (%rdx, u1), u2 + mul t1 + mov u0, u1 + shl %cl, u1 + sub d0, u1 + sbb d1, u2 + sub %rax, u1 + sbb %rdx, u2 + xor R32(%rax), R32(%rax) + xor R32(%rdx), R32(%rdx) + cmp t0, u2 + cmovnc d0, %rax + cmovnc d1, %rdx + adc $0, t1 + nop + add %rax, u1 + adc %rdx, u2 + cmp d1, u2 + jae L(fix_qh) +L(bck_qh): + push t1 C push qh on stack + + jmp L(next) + + ALIGN(16) +L(loop): + C udiv_qr_3by2 (q,u2,u1,u2,u1,n0, d1,d0,di) + C Based on the optimized divrem_2.asm code. + + mov di, %rax + mul u2 + mov (up, un, 8), u0 + xor R32(t1), R32(t1) + shld %cl, u0, t1 + or t1, u1 + mov u1, t0 + add %rax, t0 C q0 in t0 + adc u2, %rdx + mov %rdx, t1 C q in t1 + imul md1, %rdx + mov d0, %rax + lea (%rdx, u1), u2 + mul t1 + mov u0, u1 + shl %cl, u1 + sub d0, u1 + sbb d1, u2 + sub %rax, u1 + sbb %rdx, u2 + xor R32(%rax), R32(%rax) + xor R32(%rdx), R32(%rdx) + cmp t0, u2 + cmovnc d0, %rax + cmovnc d1, %rdx + adc $0, t1 + nop + add %rax, u1 + adc %rdx, u2 + cmp d1, u2 + jae L(fix) +L(bck): + mov t1, (qp, un, 8) +L(next): + sub $1, un + jnc L(loop) +L(end): + C qh on stack + pop %rax + pop rp + shrd %cl, u2, u1 + shr %cl, u2 + mov u2, 8(rp) + mov u1, (rp) + + pop %rbp + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + ret + +L(fix): C Unlikely update. u2 >= d1 + seta %dl + cmp d0, u1 + setae %al + orb %dl, %al C "orb" form to placate Sun tools + je L(bck) + inc t1 + sub d0, u1 + sbb d1, u2 + jmp L(bck) + +C Duplicated, just jumping back to a different address. +L(fix_qh): C Unlikely update. u2 >= d1 + seta %dl + cmp d0, u1 + setae %al + orb %dl, %al C "orb" form to placate Sun tools + je L(bck_qh) + inc t1 + sub d0, u1 + sbb d1, u2 + jmp L(bck_qh) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/dive_1.asm b/gmp-6.3.0/mpn/x86_64/dive_1.asm new file mode 100644 index 0000000..988bdab --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/dive_1.asm @@ -0,0 +1,158 @@ +dnl AMD64 mpn_divexact_1 -- mpn by limb exact division. + +dnl Copyright 2001, 2002, 2004-2006, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 10 +C AMD K10 10 +C Intel P4 33 +C Intel core2 13.25 +C Intel corei 14 +C Intel atom 42 +C VIA nano 43 + +C A quick adoption of the 32-bit K7 code. + + +C INPUT PARAMETERS +C rp rdi +C up rsi +C n rdx +C divisor rcx + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_divexact_1) + FUNC_ENTRY(4) + push %rbx + + mov %rcx, %rax + xor R32(%rcx), R32(%rcx) C shift count + mov %rdx, %r8 + + bt $0, R32(%rax) + jnc L(evn) C skip bsfq unless divisor is even + +L(odd): mov %rax, %rbx + shr R32(%rax) + and $127, R32(%rax) C d/2, 7 bits + + LEA( binvert_limb_table, %rdx) + + movzbl (%rdx,%rax), R32(%rax) C inv 8 bits + + mov %rbx, %r11 C d without twos + + lea (%rax,%rax), R32(%rdx) C 2*inv + imul R32(%rax), R32(%rax) C inv*inv + imul R32(%rbx), R32(%rax) C inv*inv*d + sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits + + lea (%rdx,%rdx), R32(%rax) C 2*inv + imul R32(%rdx), R32(%rdx) C inv*inv + imul R32(%rbx), R32(%rdx) C inv*inv*d + sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits + + lea (%rax,%rax), %r10 C 2*inv + imul %rax, %rax C inv*inv + imul %rbx, %rax C inv*inv*d + sub %rax, %r10 C inv = 2*inv - inv*inv*d, 64 bits + + lea (%rsi,%r8,8), %rsi C up end + lea -8(%rdi,%r8,8), %rdi C rp end + neg %r8 C -n + + mov (%rsi,%r8,8), %rax C up[0] + + inc %r8 + jz L(one) + + mov (%rsi,%r8,8), %rdx C up[1] + + shrd R8(%rcx), %rdx, %rax + + xor R32(%rbx), R32(%rbx) + jmp L(ent) + +L(evn): bsf %rax, %rcx + shr R8(%rcx), %rax + jmp L(odd) + + ALIGN(8) +L(top): + C rax q + C rbx carry bit, 0 or 1 + C rcx shift + C rdx + C rsi up end + C rdi rp end + C r8 counter, limbs, negative + C r10 d^(-1) mod 2^64 + C r11 d, shifted down + + mul %r11 C carry limb in rdx 0 10 + mov -8(%rsi,%r8,8), %rax C + mov (%rsi,%r8,8), %r9 C + shrd R8(%rcx), %r9, %rax C + nop C + sub %rbx, %rax C apply carry bit + setc %bl C + sub %rdx, %rax C apply carry limb 5 + adc $0, %rbx C 6 +L(ent): imul %r10, %rax C 6 + mov %rax, (%rdi,%r8,8) C + inc %r8 C + jnz L(top) + + mul %r11 C carry limb in rdx + mov -8(%rsi), %rax C up high limb + shr R8(%rcx), %rax + sub %rbx, %rax C apply carry bit + sub %rdx, %rax C apply carry limb + imul %r10, %rax + mov %rax, (%rdi) + pop %rbx + FUNC_EXIT() + ret + +L(one): shr R8(%rcx), %rax + imul %r10, %rax + mov %rax, (%rdi) + pop %rbx + FUNC_EXIT() + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/divrem_1.asm b/gmp-6.3.0/mpn/x86_64/divrem_1.asm new file mode 100644 index 0000000..d4d61ad --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/divrem_1.asm @@ -0,0 +1,314 @@ +dnl x86-64 mpn_divrem_1 -- mpn by limb division. + +dnl Copyright 2004, 2005, 2007-2012, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C norm unorm frac +C AMD K8,K9 13 13 12 +C AMD K10 13 13 12 +C Intel P4 43 44 43 +C Intel core2 24.5 24.5 19.5 +C Intel corei 20.5 19.5 18 +C Intel atom 43 46 36 +C VIA nano 25.5 25.5 24 + +C mp_limb_t +C mpn_divrem_1 (mp_ptr qp, mp_size_t fn, +C mp_srcptr np, mp_size_t nn, mp_limb_t d) + +C mp_limb_t +C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn, +C mp_srcptr np, mp_size_t nn, mp_limb_t d, +C mp_limb_t dinv, int cnt) + +C INPUT PARAMETERS +define(`qp', `%rdi') +define(`fn_param', `%rsi') +define(`up_param', `%rdx') +define(`un_param', `%rcx') +define(`d', `%r8') +define(`dinv', `%r9') C only for mpn_preinv_divrem_1 +C shift passed on stack C only for mpn_preinv_divrem_1 + +define(`cnt', `%rcx') +define(`up', `%rsi') +define(`fn', `%r12') +define(`un', `%rbx') + + +C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15 +C cnt qp d dinv + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +IFSTD(`define(`CNTOFF', `40($1)')') +IFDOS(`define(`CNTOFF', `104($1)')') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_preinv_divrem_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +IFDOS(` mov 64(%rsp), %r9 ') + xor R32(%rax), R32(%rax) + push %r13 + push %r12 + push %rbp + push %rbx + + mov fn_param, fn + mov un_param, un + add fn_param, un_param + mov up_param, up + + lea -8(qp,un_param,8), qp + + test d, d + js L(nent) + + mov CNTOFF(%rsp), R8(cnt) + shl R8(cnt), d + jmp L(uent) +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_divrem_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + xor R32(%rax), R32(%rax) + push %r13 + push %r12 + push %rbp + push %rbx + + mov fn_param, fn + mov un_param, un + add fn_param, un_param + mov up_param, up + je L(ret) + + lea -8(qp,un_param,8), qp + xor R32(%rbp), R32(%rbp) + + test d, d + jns L(unnormalized) + +L(normalized): + test un, un + je L(8) C un == 0 + mov -8(up,un,8), %rbp + dec un + mov %rbp, %rax + sub d, %rbp + cmovc %rax, %rbp + sbb R32(%rax), R32(%rax) + inc R32(%rax) + mov %rax, (qp) + lea -8(qp), qp +L(8): +IFSTD(` push %rdi ') +IFSTD(` push %rsi ') + push %r8 +IFSTD(` mov d, %rdi ') +IFDOS(` sub $32, %rsp ') +IFDOS(` mov d, %rcx ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_invert_limb) +IFDOS(` add $32, %rsp ') + pop %r8 +IFSTD(` pop %rsi ') +IFSTD(` pop %rdi ') + + mov %rax, dinv + mov %rbp, %rax + jmp L(nent) + + ALIGN(16) +L(ntop):mov (up,un,8), %r10 C K8-K10 P6-CNR P6-NHM P4 + mul dinv C 0,13 0,20 0,18 0,45 + add %r10, %rax C 4 8 3 12 + adc %rbp, %rdx C 5 9 10 13 + mov %rax, %rbp C 5 9 4 13 + mov %rdx, %r13 C 6 11 12 23 + imul d, %rdx C 6 11 11 23 + sub %rdx, %r10 C 10 16 14 33 + mov d, %rax C + add %r10, %rax C 11 17 15 34 + cmp %rbp, %r10 C 11 17 15 34 + cmovc %r10, %rax C 12 18 16 35 + adc $-1, %r13 C + cmp d, %rax C + jae L(nfx) C +L(nok): mov %r13, (qp) C + sub $8, qp C +L(nent):lea 1(%rax), %rbp C + dec un C + jns L(ntop) C + + xor 
R32(%rcx), R32(%rcx) + jmp L(frac) + +L(nfx): sub d, %rax + inc %r13 + jmp L(nok) + +L(unnormalized): + test un, un + je L(44) + mov -8(up,un,8), %rax + cmp d, %rax + jae L(44) + mov %rbp, (qp) + mov %rax, %rbp + lea -8(qp), qp + je L(ret) + dec un +L(44): + bsr d, %rcx + not R32(%rcx) + shl R8(%rcx), d + shl R8(%rcx), %rbp + + push %rcx +IFSTD(` push %rdi ') +IFSTD(` push %rsi ') + push %r8 +IFSTD(` sub $8, %rsp ') +IFSTD(` mov d, %rdi ') +IFDOS(` sub $40, %rsp ') +IFDOS(` mov d, %rcx ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_invert_limb) +IFSTD(` add $8, %rsp ') +IFDOS(` add $40, %rsp ') + pop %r8 +IFSTD(` pop %rsi ') +IFSTD(` pop %rdi ') + pop %rcx + + mov %rax, dinv + mov %rbp, %rax + test un, un + je L(frac) + +L(uent):dec un + mov (up,un,8), %rbp + neg R32(%rcx) + shr R8(%rcx), %rbp + neg R32(%rcx) + or %rbp, %rax + jmp L(ent) + + ALIGN(16) +L(utop):mov (up,un,8), %r10 + shl R8(%rcx), %rbp + neg R32(%rcx) + shr R8(%rcx), %r10 + neg R32(%rcx) + or %r10, %rbp + mul dinv + add %rbp, %rax + adc %r11, %rdx + mov %rax, %r11 + mov %rdx, %r13 + imul d, %rdx + sub %rdx, %rbp + mov d, %rax + add %rbp, %rax + cmp %r11, %rbp + cmovc %rbp, %rax + adc $-1, %r13 + cmp d, %rax + jae L(ufx) +L(uok): mov %r13, (qp) + sub $8, qp +L(ent): mov (up,un,8), %rbp + dec un + lea 1(%rax), %r11 + jns L(utop) + +L(uend):shl R8(%rcx), %rbp + mul dinv + add %rbp, %rax + adc %r11, %rdx + mov %rax, %r11 + mov %rdx, %r13 + imul d, %rdx + sub %rdx, %rbp + mov d, %rax + add %rbp, %rax + cmp %r11, %rbp + cmovc %rbp, %rax + adc $-1, %r13 + cmp d, %rax + jae L(efx) +L(eok): mov %r13, (qp) + sub $8, qp + jmp L(frac) + +L(ufx): sub d, %rax + inc %r13 + jmp L(uok) +L(efx): sub d, %rax + inc %r13 + jmp L(eok) + +L(frac):mov d, %rbp + neg %rbp + jmp L(fent) + + ALIGN(16) C K8-K10 P6-CNR P6-NHM P4 +L(ftop):mul dinv C 0,12 0,17 0,17 + add %r11, %rdx C 5 8 10 + mov %rax, %r11 C 4 8 3 + mov %rdx, %r13 C 6 9 11 + imul %rbp, %rdx C 6 9 11 + mov d, %rax C + add %rdx, %rax C 10 14 14 + cmp %r11, %rdx C 10 14 14 + cmovc %rdx, %rax C 11 15 15 + adc $-1, %r13 C + mov %r13, (qp) C + sub $8, qp C +L(fent):lea 1(%rax), %r11 C + dec fn C + jns L(ftop) C + + shr R8(%rcx), %rax +L(ret): pop %rbx + pop %rbp + pop %r12 + pop %r13 + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/divrem_2.asm b/gmp-6.3.0/mpn/x86_64/divrem_2.asm new file mode 100644 index 0000000..20811cc --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/divrem_2.asm @@ -0,0 +1,192 @@ +dnl x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number. + +dnl Copyright 2007, 2008, 2010, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb best +C AMD K8,K9 18 +C AMD K10 18 +C AMD bull +C AMD pile +C AMD bobcat +C AMD jaguar +C Intel P4 68 +C Intel core 34 +C Intel NHM 30.25 +C Intel SBR 21.3 +C Intel IBR 21.4 +C Intel HWL 20.6 +C Intel BWL +C Intel atom 73 +C VIA nano 33 + + +C INPUT PARAMETERS +define(`qp', `%rdi') +define(`fn', `%rsi') +define(`up_param', `%rdx') +define(`un_param', `%rcx') +define(`dp', `%r8') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_divrem_2) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %r15 + push %r14 + push %r13 + push %r12 + lea -24(%rdx,%rcx,8), %r12 C r12 = &up[un-1] + mov %rsi, %r13 + push %rbp + mov %rdi, %rbp + push %rbx + mov 8(%r8), %r11 C d1 + mov 16(%r12), %rbx + mov (%r8), %r8 C d0 + mov 8(%r12), %r10 + + xor R32(%r15), R32(%r15) + cmp %rbx, %r11 + ja L(2) + setb %dl + cmp %r10, %r8 + setbe %al + orb %al, %dl C "orb" form to placate Sun tools + je L(2) + inc R32(%r15) + sub %r8, %r10 + sbb %r11, %rbx +L(2): + lea -3(%rcx,%r13), %r14 C un + fn - 3 + test %r14, %r14 + js L(end) + + push %r8 + push %r10 + push %r11 +IFSTD(` mov %r11, %rdi ') +IFDOS(` mov %r11, %rcx ') +IFDOS(` sub $32, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_invert_limb) +IFDOS(` add $32, %rsp ') + pop %r11 + pop %r10 + pop %r8 + + mov %r11, %rdx + mov %rax, %rdi + imul %rax, %rdx + mov %rdx, %r9 + mul %r8 + xor R32(%rcx), R32(%rcx) + add %r8, %r9 + adc $-1, %rcx + add %rdx, %r9 + adc $0, %rcx + js 2f +1: dec %rdi + sub %r11, %r9 + sbb $0, %rcx + jns 1b +2: + + lea (%rbp,%r14,8), %rbp + mov %r11, %rsi + neg %rsi C -d1 + +C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15 +C n2 un -d1 dinv qp d0 q0 d1 up fn msl + + ALIGN(16) +L(top): mov %rdi, %rax C di ncp + mul %rbx C 0, 17 + mov %r10, %rcx C + add %rax, %rcx C 4 + adc %rbx, %rdx C 5 + mov %rdx, %r9 C q 6 + imul %rsi, %rdx C 6 + mov %r8, %rax C ncp + lea (%rdx, %r10), %rbx C n1 -= ... 10 + xor R32(%r10), R32(%r10) C + mul %r9 C 7 + cmp %r14, %r13 C + jg L(19) C + mov (%r12), %r10 C + sub $8, %r12 C +L(19): sub %r8, %r10 C ncp + sbb %r11, %rbx C 11 + sub %rax, %r10 C 11 + sbb %rdx, %rbx C 12 + xor R32(%rax), R32(%rax) C + xor R32(%rdx), R32(%rdx) C + cmp %rcx, %rbx C 13 + cmovnc %r8, %rax C 14 + cmovnc %r11, %rdx C 14 + adc $0, %r9 C adjust q 14 + nop + add %rax, %r10 C 15 + adc %rdx, %rbx C 16 + cmp %r11, %rbx C + jae L(fix) C +L(bck): mov %r9, (%rbp) C + sub $8, %rbp C + dec %r14 + jns L(top) + +L(end): mov %r10, 8(%r12) + mov %rbx, 16(%r12) + pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + mov %r15, %rax + pop %r15 + FUNC_EXIT() + ret + +L(fix): seta %dl + cmp %r8, %r10 + setae %al + orb %dl, %al C "orb" form to placate Sun tools + je L(bck) + inc %r9 + sub %r8, %r10 + sbb %r11, %rbx + jmp L(bck) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/dos64.m4 b/gmp-6.3.0/mpn/x86_64/dos64.m4 new file mode 100644 index 0000000..0da1b36 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/dos64.m4 @@ -0,0 +1,101 @@ +divert(-1) +dnl Copyright 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +define(`HOST_DOS64') + + +dnl On DOS64 we always generate position-independent-code +dnl + +define(`PIC') + + +define(`LEA',` + lea $1(%rip), $2 +') + + +dnl Usage: CALL(funcname) +dnl +dnl Simply override the definition in x86_64-defs.m4. + +define(`CALL',`call GSYM_PREFIX`'$1') +define(`TCALL',`jmp GSYM_PREFIX`'$1') + + +dnl Usage: JUMPTABSECT + +define(`JUMPTABSECT', `RODATA') + + +dnl Usage: JMPENT(targlabel,tablabel) + +define(`JMPENT', `.long $1-$2') + + +dnl Usage: FUNC_ENTRY(nregparmas) +dnl Usage: FUNC_EXIT() + +dnl FUNC_ENTRY and FUNC_EXIT provide an easy path for adoption of standard +dnl ABI assembly to the DOS64 ABI. + +define(`FUNC_ENTRY', + `push %rdi + push %rsi + mov %rcx, %rdi +ifelse(eval($1>=2),1,`dnl + mov %rdx, %rsi +ifelse(eval($1>=3),1,`dnl + mov %r8, %rdx +ifelse(eval($1>=4),1,`dnl + mov %r9, %rcx +')')')') + +define(`FUNC_EXIT', + `pop %rsi + pop %rdi') + + +dnl Target ABI macros. For DOS64 we override the defaults. + +define(`IFDOS', `$1') +define(`IFSTD', `') +define(`IFELF', `') + + +dnl Usage: PROTECT(symbol) +dnl +dnl Used for private GMP symbols that should never be overridden by users. +dnl This can save reloc entries and improve shlib sharing as well as +dnl application startup times + +define(`PROTECT', `') + + +divert`'dnl diff --git a/gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm b/gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm new file mode 100644 index 0000000..21ab210 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm @@ -0,0 +1,181 @@ +dnl AMD64 mpn_copyd optimised for CPUs with fast AVX. + +dnl Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 n/a +C AMD bd2 4.87 4.87 N +C AMD bd3 ? ? +C AMD bd4 0.53 ? +C AMD zn1 0.51 ? +C AMD zn2 0.25 ? Y +C AMD zn3 0.25 ? Y +C AMD bt1 n/a +C AMD bt2 n/a +C Intel P4 n/a +C Intel CNR n/a +C Intel PNR n/a +C Intel NHM n/a +C Intel WSM n/a +C Intel SBR 0.50 0.91 N +C Intel IBR 0.50 0.65 N +C Intel HWL 0.25 0.30 Y +C Intel BWL 0.28 0.37 Y +C Intel SKL 0.27 ? Y +C Intel atom n/a +C Intel SLM n/a +C Intel GLM n/a +C VIA nano n/a + +C We try to do as many 32-byte operations as possible. The top-most and +C bottom-most writes might need 8-byte operations. For the bulk copying, we +C write using aligned 32-byte operations, but we read with both aligned and +C unaligned 32-byte operations. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +dnl define(`vmovdqu', vlddqu) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_copyd) + FUNC_ENTRY(3) + + lea -32(rp,n,8), rp + lea -32(up,n,8), up + + cmp $7, n C basecase needed for correctness + jbe L(bc) + + test $8, R8(rp) C is rp 16-byte aligned? + jz L(a2) C jump if rp aligned + mov 24(up), %rax + lea -8(up), up + mov %rax, 24(rp) + lea -8(rp), rp + dec n +L(a2): test $16, R8(rp) C is rp 32-byte aligned? + jz L(a3) C jump if rp aligned + vmovdqu 16(up), %xmm0 + lea -16(up), up + vmovdqa %xmm0, 16(rp) + lea -16(rp), rp + sub $2, n +L(a3): sub $16, n + jc L(sma) + + ALIGN(16) +L(top): vmovdqu (up), %ymm0 + vmovdqu -32(up), %ymm1 + vmovdqu -64(up), %ymm2 + vmovdqu -96(up), %ymm3 + lea -128(up), up + vmovdqa %ymm0, (rp) + vmovdqa %ymm1, -32(rp) + vmovdqa %ymm2, -64(rp) + vmovdqa %ymm3, -96(rp) + lea -128(rp), rp +L(ali): sub $16, n + jnc L(top) + +L(sma): test $8, R8(n) + jz 1f + vmovdqu (up), %ymm0 + vmovdqu -32(up), %ymm1 + lea -64(up), up + vmovdqa %ymm0, (rp) + vmovdqa %ymm1, -32(rp) + lea -64(rp), rp +1: + test $4, R8(n) + jz 1f + vmovdqu (up), %ymm0 + lea -32(up), up + vmovdqa %ymm0, (rp) + lea -32(rp), rp +1: + test $2, R8(n) + jz 1f + vmovdqu 16(up), %xmm0 + lea -16(up), up + vmovdqa %xmm0, 16(rp) + lea -16(rp), rp +1: + test $1, R8(n) + jz 1f + mov 24(up), %r8 + mov %r8, 24(rp) +1: + FUNC_EXIT() + ret + + ALIGN(16) +L(bc): test $4, R8(n) + jz 1f + mov 24(up), %rax + mov 16(up), %rcx + mov 8(up), %r8 + mov (up), %r9 + lea -32(up), up + mov %rax, 24(rp) + mov %rcx, 16(rp) + mov %r8, 8(rp) + mov %r9, (rp) + lea -32(rp), rp +1: + test $2, R8(n) + jz 1f + mov 24(up), %rax + mov 16(up), %rcx + lea -16(up), up + mov %rax, 24(rp) + mov %rcx, 16(rp) + lea -16(rp), rp +1: + test $1, R8(n) + jz 1f + mov 24(up), %rax + mov %rax, 24(rp) +1: + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm b/gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm new file mode 100644 index 0000000..03c2440 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm @@ -0,0 +1,178 @@ +dnl AMD64 mpn_copyi optimised for CPUs with fast AVX. + +dnl Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 n/a +C AMD bd2 4.87 4.87 N +C AMD bd3 ? ? +C AMD bd4 0.53 ? +C AMD zn1 0.51 ? +C AMD zn2 0.25 ? Y +C AMD zn3 0.25 ? Y +C AMD bt1 n/a +C AMD bt2 n/a +C Intel P4 n/a +C Intel CNR n/a +C Intel PNR n/a +C Intel NHM n/a +C Intel WSM n/a +C Intel SBR 0.50 0.91 N +C Intel IBR 0.50 0.65 N +C Intel HWL 0.25 0.30 Y +C Intel BWL 0.28 0.37 Y +C Intel SKL 0.27 ? Y +C Intel atom n/a +C Intel SLM n/a +C Intel GLM n/a +C VIA nano n/a + +C We try to do as many 32-byte operations as possible. The top-most and +C bottom-most writes might need 8-byte operations. For the bulk copying, we +C write using aligned 32-byte operations, but we read with both aligned and +C unaligned 32-byte operations. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +dnl define(`vmovdqu', vlddqu) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_copyi) + FUNC_ENTRY(3) + + cmp $7, n + jbe L(bc) + + test $8, R8(rp) C is rp 16-byte aligned? + jz L(a2) C jump if rp aligned + mov (up), %rax + lea 8(up), up + mov %rax, (rp) + lea 8(rp), rp + dec n +L(a2): test $16, R8(rp) C is rp 32-byte aligned? 
+ jz L(a3) C jump if rp aligned + vmovdqu (up), %xmm0 + lea 16(up), up + vmovdqa %xmm0, (rp) + lea 16(rp), rp + sub $2, n +L(a3): sub $16, n + jc L(sma) + + ALIGN(16) +L(top): vmovdqu (up), %ymm0 + vmovdqu 32(up), %ymm1 + vmovdqu 64(up), %ymm2 + vmovdqu 96(up), %ymm3 + lea 128(up), up + vmovdqa %ymm0, (rp) + vmovdqa %ymm1, 32(rp) + vmovdqa %ymm2, 64(rp) + vmovdqa %ymm3, 96(rp) + lea 128(rp), rp +L(ali): sub $16, n + jnc L(top) + +L(sma): test $8, R8(n) + jz 1f + vmovdqu (up), %ymm0 + vmovdqu 32(up), %ymm1 + lea 64(up), up + vmovdqa %ymm0, (rp) + vmovdqa %ymm1, 32(rp) + lea 64(rp), rp +1: + test $4, R8(n) + jz 1f + vmovdqu (up), %ymm0 + lea 32(up), up + vmovdqa %ymm0, (rp) + lea 32(rp), rp +1: + test $2, R8(n) + jz 1f + vmovdqu (up), %xmm0 + lea 16(up), up + vmovdqa %xmm0, (rp) + lea 16(rp), rp +1: +L(end): test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, (rp) +1: + FUNC_EXIT() + ret + + ALIGN(16) +L(bc): test $4, R8(n) + jz 1f + mov (up), %rax + mov 8(up), %rcx + mov 16(up), %r8 + mov 24(up), %r9 + lea 32(up), up + mov %rax, (rp) + mov %rcx, 8(rp) + mov %r8, 16(rp) + mov %r9, 24(rp) + lea 32(rp), rp +1: + test $2, R8(n) + jz 1f + mov (up), %rax + mov 8(up), %rcx + lea 16(up), up + mov %rax, (rp) + mov %rcx, 8(rp) + lea 16(rp), rp +1: + test $1, R8(n) + jz 1f + mov (up), %rax + mov %rax, (rp) +1: + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/README b/gmp-6.3.0/mpn/x86_64/fastsse/README new file mode 100644 index 0000000..5538b2d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/README @@ -0,0 +1,22 @@ +This directory contains code for x86-64 processors with fast +implementations of SSE operations, hence the name "fastsse". + +Current processors that might benefit from this code are: + + AMD K10 + AMD Bulldozer/Piledriver/Steamroller/Excavator + Intel Nocona + Intel Nehalem/Westmere + Intel Sandybridge/Ivybridge + Intel Haswell/Broadwell + VIA Nano + +Current processors that do not benefit from this code are: + + AMD K8 + AMD Bobcat + Intel Atom + +Intel Conroe/Penryn is a border case; its handling of non-aligned +128-bit memory operands is poor. VIA Nano also have poor handling of +non-aligned operands. diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm b/gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm new file mode 100644 index 0000000..69027bc --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm @@ -0,0 +1,311 @@ +dnl AMD64 mpn_com optimised for CPUs with fast SSE copying and SSSE3. + +dnl Copyright 2012, 2013, 2015 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 2.0 illop 1.0/1.0 N +C AMD K10 0.85 illop Y/N +C AMD bd1 1.39 ? 1.45 Y/N +C AMD bd2 0.8-1.4 0.7-1.4 Y +C AMD bd3 +C AMD bd4 +C AMD bobcat 1.97 ? 8.17 1.5/1.5 N +C AMD jaguar 1.02 1.02 0.91/0.91 N +C Intel P4 2.26 illop Y/N +C Intel core 0.58 0.87 opt/0.74 Y +C Intel NHM 0.64 1.14 opt/bad Y +C Intel SBR 0.51 0.65 opt/opt Y +C Intel IBR 0.50 0.64 opt/0.57 Y +C Intel HWL 0.51 0.58 opt/opt Y +C Intel BWL 0.52 0.64 opt/opt Y +C Intel SKL 0.51 0.63 opt/opt Y +C Intel atom 1.16 1.70 opt/opt Y +C Intel SLM 1.02 1.52 N +C VIA nano 1.09 1.10 opt/opt Y + +C We use only 16-byte operations, except for unaligned top-most and bottom-most +C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). That +C instruction is better adapted to mpn_copyd's needs, we need to contort the +C code to use it here. +C +C For operands of < COM_SSE_THRESHOLD limbs, we use a plain 64-bit loop, taken +C from the x86_64 default code. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +C There are three instructions for loading an aligned 128-bit quantity. We use +C movaps, since it has the shortest coding. +define(`movdqa', ``movaps'') + +ifdef(`COM_SSE_THRESHOLD',`',`define(`COM_SSE_THRESHOLD', 7)') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_com) + FUNC_ENTRY(3) + + cmp $COM_SSE_THRESHOLD, n + jbe L(bc) + + pcmpeqb %xmm5, %xmm5 C set to 111...111 + + test $8, R8(rp) C is rp 16-byte aligned? + jz L(rp_aligned) C jump if rp aligned + + mov (up), %r8 + lea 8(up), up + not %r8 + mov %r8, (rp) + lea 8(rp), rp + dec n + +L(rp_aligned): + test $8, R8(up) + jnz L(uent) + +ifelse(eval(COM_SSE_THRESHOLD >= 8),1, +` sub $8, n', +` jmp L(am)') + + ALIGN(16) +L(atop):movdqa 0(up), %xmm0 + movdqa 16(up), %xmm1 + movdqa 32(up), %xmm2 + movdqa 48(up), %xmm3 + lea 64(up), up + pxor %xmm5, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm5, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + movdqa %xmm2, 32(rp) + movdqa %xmm3, 48(rp) + lea 64(rp), rp +L(am): sub $8, n + jnc L(atop) + + test $4, R8(n) + jz 1f + movdqa (up), %xmm0 + movdqa 16(up), %xmm1 + lea 32(up), up + pxor %xmm5, %xmm0 + pxor %xmm5, %xmm1 + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + lea 32(rp), rp + +1: test $2, R8(n) + jz 1f + movdqa (up), %xmm0 + lea 16(up), up + pxor %xmm5, %xmm0 + movdqa %xmm0, (rp) + lea 16(rp), rp + +1: test $1, R8(n) + jz 1f + mov (up), %r8 + not %r8 + mov %r8, (rp) + +1: FUNC_EXIT() + ret + +L(uent): +C Code handling up - rp = 8 (mod 16) + +C FIXME: The code below only handles overlap if it is close to complete, or +C quite separate: up-rp < 5 or up-up > 15 limbs + lea -40(up), %rax C 40 = 5 * GMP_LIMB_BYTES + sub rp, %rax + cmp $80, %rax C 80 = (15-5) * GMP_LIMB_BYTES + jbe L(bc) C deflect to plain loop + + sub $16, n + jc L(uend) + + movdqa 120(up), %xmm3 + + sub $16, n + jmp L(um) + + ALIGN(16) +L(utop):movdqa 120(up), %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm0, -128(rp) + sub $16, n +L(um): movdqa 104(up), %xmm2 + palignr($8, %xmm2, %xmm3) + movdqa 88(up), %xmm1 + pxor %xmm5, %xmm3 + movdqa %xmm3, 112(rp) + palignr($8, %xmm1, %xmm2) + movdqa 72(up), %xmm0 + pxor %xmm5, %xmm2 + movdqa %xmm2, 96(rp) + palignr($8, %xmm0, %xmm1) + movdqa 56(up), %xmm3 + pxor 
%xmm5, %xmm1 + movdqa %xmm1, 80(rp) + palignr($8, %xmm3, %xmm0) + movdqa 40(up), %xmm2 + pxor %xmm5, %xmm0 + movdqa %xmm0, 64(rp) + palignr($8, %xmm2, %xmm3) + movdqa 24(up), %xmm1 + pxor %xmm5, %xmm3 + movdqa %xmm3, 48(rp) + palignr($8, %xmm1, %xmm2) + movdqa 8(up), %xmm0 + pxor %xmm5, %xmm2 + movdqa %xmm2, 32(rp) + palignr($8, %xmm0, %xmm1) + movdqa -8(up), %xmm3 + pxor %xmm5, %xmm1 + movdqa %xmm1, 16(rp) + palignr($8, %xmm3, %xmm0) + lea 128(up), up + lea 128(rp), rp + jnc L(utop) + + pxor %xmm5, %xmm0 + movdqa %xmm0, -128(rp) + +L(uend):test $8, R8(n) + jz 1f + movdqa 56(up), %xmm3 + movdqa 40(up), %xmm2 + palignr($8, %xmm2, %xmm3) + movdqa 24(up), %xmm1 + pxor %xmm5, %xmm3 + movdqa %xmm3, 48(rp) + palignr($8, %xmm1, %xmm2) + movdqa 8(up), %xmm0 + pxor %xmm5, %xmm2 + movdqa %xmm2, 32(rp) + palignr($8, %xmm0, %xmm1) + movdqa -8(up), %xmm3 + pxor %xmm5, %xmm1 + movdqa %xmm1, 16(rp) + palignr($8, %xmm3, %xmm0) + lea 64(up), up + pxor %xmm5, %xmm0 + movdqa %xmm0, (rp) + lea 64(rp), rp + +1: test $4, R8(n) + jz 1f + movdqa 24(up), %xmm1 + movdqa 8(up), %xmm0 + palignr($8, %xmm0, %xmm1) + movdqa -8(up), %xmm3 + pxor %xmm5, %xmm1 + movdqa %xmm1, 16(rp) + palignr($8, %xmm3, %xmm0) + lea 32(up), up + pxor %xmm5, %xmm0 + movdqa %xmm0, (rp) + lea 32(rp), rp + +1: test $2, R8(n) + jz 1f + movdqa 8(up), %xmm0 + movdqa -8(up), %xmm3 + palignr($8, %xmm3, %xmm0) + lea 16(up), up + pxor %xmm5, %xmm0 + movdqa %xmm0, (rp) + lea 16(rp), rp + +1: test $1, R8(n) + jz 1f + mov (up), %r8 + not %r8 + mov %r8, (rp) + +1: FUNC_EXIT() + ret + +C Basecase code. Needed for good small operands speed, not for +C correctness as the above code is currently written. + +L(bc): lea -8(rp), rp + sub $4, R32(n) + jc L(end) + +ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1, +` ALIGN(16)') +L(top): mov (up), %r8 + mov 8(up), %r9 + lea 32(rp), rp + mov 16(up), %r10 + mov 24(up), %r11 + lea 32(up), up + not %r8 + not %r9 + not %r10 + not %r11 + mov %r8, -24(rp) + mov %r9, -16(rp) +ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1, +` sub $4, R32(n)') + mov %r10, -8(rp) + mov %r11, (rp) +ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1, +` jnc L(top)') + +L(end): test $1, R8(n) + jz 1f + mov (up), %r8 + not %r8 + mov %r8, 8(rp) + lea 8(rp), rp + lea 8(up), up +1: test $2, R8(n) + jz 1f + mov (up), %r8 + mov 8(up), %r9 + not %r8 + not %r9 + mov %r8, 8(rp) + mov %r9, 16(rp) +1: FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/com.asm b/gmp-6.3.0/mpn/x86_64/fastsse/com.asm new file mode 100644 index 0000000..c867222 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/com.asm @@ -0,0 +1,175 @@ +dnl AMD64 mpn_com optimised for CPUs with fast SSE. + +dnl Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation, +dnl Inc. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 2.0 2.0 N +C AMD K10 0.85 1.3 Y/N +C AMD bull 1.40 1.40 Y +C AMD pile 0.9-1.4 0.9-1.4 Y +C AMD steam +C AMD excavator +C AMD bobcat 3.1 3.1 N +C AMD jaguar 0.91 0.91 opt/opt Y +C Intel P4 2.28 illop Y +C Intel core2 1.02 1.02 N +C Intel NHM 0.53 0.68 Y +C Intel SBR 0.51 0.75 opt/0.65 Y/N +C Intel IBR 0.50 0.57 opt/opt Y +C Intel HWL 0.51 0.64 opt/0.58 Y +C Intel BWL 0.61 0.65 0.57/opt Y +C Intel atom 3.68 3.68 N +C Intel SLM 1.09 1.35 N +C VIA nano 1.17 5.09 Y/N + +C We try to do as many 16-byte operations as possible. The top-most and +C bottom-most writes might need 8-byte operations. We can always write using +C aligned 16-byte operations, we read with both aligned and unaligned 16-byte +C operations. + +C Instead of having separate loops for reading aligned and unaligned, we read +C using MOVDQU. This seems to work great except for core2; there performance +C doubles when reading using MOVDQA (for aligned source). It is unclear how to +C best handle the unaligned case there. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_com) + FUNC_ENTRY(3) + +IFDOS(` add $-56, %rsp ') +IFDOS(` movdqa %xmm6, (%rsp) ') +IFDOS(` movdqa %xmm7, 16(%rsp) ') + + pcmpeqb %xmm7, %xmm7 C set to 111...111 + + test $8, R8(rp) C is rp 16-byte aligned? 
+ jz L(ali) C jump if rp aligned + mov (up), %rax + lea 8(up), up + not %rax + mov %rax, (rp) + lea 8(rp), rp + dec n + + sub $14, n + jc L(sma) + + ALIGN(16) +L(top): movdqu (up), %xmm0 + movdqu 16(up), %xmm1 + movdqu 32(up), %xmm2 + movdqu 48(up), %xmm3 + movdqu 64(up), %xmm4 + movdqu 80(up), %xmm5 + movdqu 96(up), %xmm6 + lea 112(up), up + pxor %xmm7, %xmm0 + pxor %xmm7, %xmm1 + pxor %xmm7, %xmm2 + pxor %xmm7, %xmm3 + pxor %xmm7, %xmm4 + pxor %xmm7, %xmm5 + pxor %xmm7, %xmm6 + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + movdqa %xmm2, 32(rp) + movdqa %xmm3, 48(rp) + movdqa %xmm4, 64(rp) + movdqa %xmm5, 80(rp) + movdqa %xmm6, 96(rp) + lea 112(rp), rp +L(ali): sub $14, n + jnc L(top) + +L(sma): add $14, n + test $8, R8(n) + jz 1f + movdqu (up), %xmm0 + movdqu 16(up), %xmm1 + movdqu 32(up), %xmm2 + movdqu 48(up), %xmm3 + lea 64(up), up + pxor %xmm7, %xmm0 + pxor %xmm7, %xmm1 + pxor %xmm7, %xmm2 + pxor %xmm7, %xmm3 + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + movdqa %xmm2, 32(rp) + movdqa %xmm3, 48(rp) + lea 64(rp), rp +1: + test $4, R8(n) + jz 1f + movdqu (up), %xmm0 + movdqu 16(up), %xmm1 + lea 32(up), up + pxor %xmm7, %xmm0 + pxor %xmm7, %xmm1 + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + lea 32(rp), rp +1: + test $2, R8(n) + jz 1f + movdqu (up), %xmm0 + lea 16(up), up + pxor %xmm7, %xmm0 + movdqa %xmm0, (rp) + lea 16(rp), rp +1: + test $1, R8(n) + jz 1f + mov (up), %rax + not %rax + mov %rax, (rp) +1: +L(don): +IFDOS(` movdqa (%rsp), %xmm6 ') +IFDOS(` movdqa 16(%rsp), %xmm7 ') +IFDOS(` add $56, %rsp ') + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm b/gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm new file mode 100644 index 0000000..fac6f8a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm @@ -0,0 +1,254 @@ +dnl AMD64 mpn_copyd optimised for CPUs with fast SSE copying and SSSE3. + +dnl Copyright 2012, 2015 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? 
+C AMD K8,K9 2.0 illop 1.0/1.0 N +C AMD K10 0.85 illop Y/N +C AMD bull 0.70 0.70 Y +C AMD pile 0.68 0.68 Y +C AMD steam +C AMD excavator +C AMD bobcat 1.97 8.24 1.5/1.5 N +C AMD jaguar 0.77 0.89 0.65/opt N/Y +C Intel P4 2.26 illop Y/N +C Intel core 0.52 0.80 opt/opt Y +C Intel NHM 0.52 0.64 opt/opt Y +C Intel SBR 0.51 0.51 opt/opt Y +C Intel IBR 0.50 0.50 opt/opt Y +C Intel HWL 0.50 0.51 opt/opt Y +C Intel BWL 0.55 0.55 opt/opt Y +C Intel atom 1.16 1.66 opt/opt Y +C Intel SLM 1.02 1.04 opt/opt Y +C VIA nano 1.08 1.06 opt/opt Y + +C We use only 16-byte operations, except for unaligned top-most and bottom-most +C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). +C +C For operands of < COPYD_SSE_THRESHOLD limbs, we use a plain 64-bit loop, +C taken from the x86_64 default code. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +C There are three instructions for loading an aligned 128-bit quantity. We use +C movaps, since it has the shortest coding. +define(`movdqa', ``movaps'') + +ifdef(`COPYD_SSE_THRESHOLD',`',`define(`COPYD_SSE_THRESHOLD', 7)') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_copyd) + FUNC_ENTRY(3) + + lea -8(up,n,8), up + lea -8(rp,n,8), rp + + cmp $COPYD_SSE_THRESHOLD, n + jbe L(bc) + + test $8, R8(rp) C is rp 16-byte aligned? + jnz L(rp_aligned) C jump if rp aligned + + mov (up), %rax C copy one limb + mov %rax, (rp) + lea -8(up), up + lea -8(rp), rp + dec n + +L(rp_aligned): + test $8, R8(up) + jz L(uent) + +ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1, +` sub $8, n', +` jmp L(am)') + + ALIGN(16) +L(atop):movdqa -8(up), %xmm0 + movdqa -24(up), %xmm1 + movdqa -40(up), %xmm2 + movdqa -56(up), %xmm3 + lea -64(up), up + movdqa %xmm0, -8(rp) + movdqa %xmm1, -24(rp) + movdqa %xmm2, -40(rp) + movdqa %xmm3, -56(rp) + lea -64(rp), rp +L(am): sub $8, n + jnc L(atop) + + test $4, R8(n) + jz 1f + movdqa -8(up), %xmm0 + movdqa -24(up), %xmm1 + lea -32(up), up + movdqa %xmm0, -8(rp) + movdqa %xmm1, -24(rp) + lea -32(rp), rp + +1: test $2, R8(n) + jz 1f + movdqa -8(up), %xmm0 + lea -16(up), up + movdqa %xmm0, -8(rp) + lea -16(rp), rp + +1: test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, (rp) + +1: FUNC_EXIT() + ret + +L(uent):sub $16, n + movdqa (up), %xmm0 + jc L(uend) + + ALIGN(16) +L(utop):sub $16, n + movdqa -16(up), %xmm1 + palignr($8, %xmm1, %xmm0) + movdqa %xmm0, -8(rp) + movdqa -32(up), %xmm2 + palignr($8, %xmm2, %xmm1) + movdqa %xmm1, -24(rp) + movdqa -48(up), %xmm3 + palignr($8, %xmm3, %xmm2) + movdqa %xmm2, -40(rp) + movdqa -64(up), %xmm0 + palignr($8, %xmm0, %xmm3) + movdqa %xmm3, -56(rp) + movdqa -80(up), %xmm1 + palignr($8, %xmm1, %xmm0) + movdqa %xmm0, -72(rp) + movdqa -96(up), %xmm2 + palignr($8, %xmm2, %xmm1) + movdqa %xmm1, -88(rp) + movdqa -112(up), %xmm3 + palignr($8, %xmm3, %xmm2) + movdqa %xmm2, -104(rp) + movdqa -128(up), %xmm0 + palignr($8, %xmm0, %xmm3) + movdqa %xmm3, -120(rp) + lea -128(up), up + lea -128(rp), rp + jnc L(utop) + +L(uend):test $8, R8(n) + jz 1f + movdqa -16(up), %xmm1 + palignr($8, %xmm1, %xmm0) + movdqa %xmm0, -8(rp) + movdqa -32(up), %xmm0 + palignr($8, %xmm0, %xmm1) + movdqa %xmm1, -24(rp) + movdqa -48(up), %xmm1 + palignr($8, %xmm1, %xmm0) + movdqa %xmm0, -40(rp) + movdqa -64(up), %xmm0 + palignr($8, %xmm0, %xmm1) + movdqa %xmm1, -56(rp) + lea -64(up), up + lea -64(rp), rp + +1: test $4, R8(n) + jz 1f + movdqa -16(up), %xmm1 + palignr($8, %xmm1, %xmm0) + movdqa %xmm0, -8(rp) + movdqa -32(up), %xmm0 + palignr($8, %xmm0, %xmm1) + movdqa %xmm1, -24(rp) + lea -32(up), up + lea -32(rp), 
rp + +1: test $2, R8(n) + jz 1f + movdqa -16(up), %xmm1 + palignr($8, %xmm1, %xmm0) + movdqa %xmm0, -8(rp) + lea -16(up), up + lea -16(rp), rp + +1: test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, (rp) + +1: FUNC_EXIT() + ret + +C Basecase code. Needed for good small operands speed, not for +C correctness as the above code is currently written. + +L(bc): sub $4, R32(n) + jc L(end) + + ALIGN(16) +L(top): mov (up), %r8 + mov -8(up), %r9 + lea -32(rp), rp + mov -16(up), %r10 + mov -24(up), %r11 + lea -32(up), up + mov %r8, 32(rp) + mov %r9, 24(rp) +ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1, +` sub $4, R32(n)') + mov %r10, 16(rp) + mov %r11, 8(rp) +ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1, +` jnc L(top)') + +L(end): test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, (rp) + lea -8(rp), rp + lea -8(up), up +1: test $2, R8(n) + jz 1f + mov (up), %r8 + mov -8(up), %r9 + mov %r8, (rp) + mov %r9, -8(rp) +1: FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm b/gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm new file mode 100644 index 0000000..b3c4706 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm @@ -0,0 +1,166 @@ +dnl AMD64 mpn_copyd optimised for CPUs with fast SSE. + +dnl Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation, +dnl Inc. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 +C AMD K10 0.85 1.64 Y/N +C AMD bull 1.4 1.4 Y +C AMD pile 0.68 0.98 Y/N +C AMD steam +C AMD excavator +C AMD bobcat +C AMD jaguar 0.65 1.02 opt/0.93 Y/N +C Intel P4 2.3 2.3 Y +C Intel core 1.0 1.0 0.52/0.80 N +C Intel NHM 0.5 0.67 Y +C Intel SBR 0.51 0.75 opt/0.54 Y/N +C Intel IBR 0.50 0.57 opt/0.50 Y +C Intel HWL 0.50 0.57 opt/0.51 Y +C Intel BWL 0.55 0.62 opt/0.55 Y +C Intel atom +C Intel SLM 1.02 1.27 opt/1.04 Y/N +C VIA nano 1.16 5.16 Y/N + +C We try to do as many 16-byte operations as possible. The top-most and +C bottom-most writes might need 8-byte operations. We can always write using +C aligned 16-byte operations, we read with both aligned and unaligned 16-byte +C operations. + +C Instead of having separate loops for reading aligned and unaligned, we read +C using MOVDQU. This seems to work great except for core2; there performance +C doubles when reading using MOVDQA (for aligned source). It is unclear how to +C best handle the unaligned case there. 
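As a cross-check on the comment above, this whole file computes nothing more than a top-down limb copy; a minimal plain-C sketch (the name ref_copyd and the mp_limb_t typedef are illustrative, assuming the 64-bit limbs used throughout these x86-64 files):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;   /* 64-bit limb on x86-64 builds */

    /* Copy n limbs from up to rp, highest limb first, as the SSE loops do. */
    void ref_copyd(mp_limb_t *rp, const mp_limb_t *up, size_t n)
    {
        while (n-- > 0)
            rp[n] = up[n];        /* descending order */
    }

The descending order is the point of mpn_copyd: it keeps the copy safe when the destination overlaps the source from above (rp >= up). All the alignment juggling below only decides which 8- or 16-byte loads and stores carry the limbs.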
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +dnl define(`movdqu', lddqu) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_copyd) + FUNC_ENTRY(3) + + test n, n + jz L(don) + + lea -16(rp,n,8), rp + lea -16(up,n,8), up + + test $8, R8(rp) C is rp 16-byte aligned? + jz L(ali) C jump if rp aligned + mov 8(up), %rax + lea -8(up), up + mov %rax, 8(rp) + lea -8(rp), rp + dec n + +L(ali): sub $16, n + jc L(sma) + +IFDOS(` add $-56, %rsp ') +IFDOS(` movdqa %xmm6, (%rsp) ') +IFDOS(` movdqa %xmm7, 16(%rsp) ') + + ALIGN(16) +L(top): movdqu (up), %xmm0 + movdqu -16(up), %xmm1 + movdqu -32(up), %xmm2 + movdqu -48(up), %xmm3 + movdqu -64(up), %xmm4 + movdqu -80(up), %xmm5 + movdqu -96(up), %xmm6 + movdqu -112(up), %xmm7 + lea -128(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, -16(rp) + movdqa %xmm2, -32(rp) + movdqa %xmm3, -48(rp) + movdqa %xmm4, -64(rp) + movdqa %xmm5, -80(rp) + movdqa %xmm6, -96(rp) + movdqa %xmm7, -112(rp) + lea -128(rp), rp + sub $16, n + jnc L(top) + +IFDOS(` movdqa (%rsp), %xmm6 ') +IFDOS(` movdqa 16(%rsp), %xmm7 ') +IFDOS(` add $56, %rsp ') + +L(sma): test $8, R8(n) + jz 1f + movdqu (up), %xmm0 + movdqu -16(up), %xmm1 + movdqu -32(up), %xmm2 + movdqu -48(up), %xmm3 + lea -64(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, -16(rp) + movdqa %xmm2, -32(rp) + movdqa %xmm3, -48(rp) + lea -64(rp), rp +1: + test $4, R8(n) + jz 1f + movdqu (up), %xmm0 + movdqu -16(up), %xmm1 + lea -32(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, -16(rp) + lea -32(rp), rp +1: + test $2, R8(n) + jz 1f + movdqu (up), %xmm0 + lea -16(up), up + movdqa %xmm0, (rp) + lea -16(rp), rp +1: + test $1, R8(n) + jz 1f + mov 8(up), %r8 + mov %r8, 8(rp) +1: +L(don): FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm b/gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm new file mode 100644 index 0000000..9876a47 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm @@ -0,0 +1,300 @@ +dnl AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3. + +dnl Copyright 2012, 2013, 2015 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 2.0 illop 1.0/1.0 N +C AMD K10 0.85 illop Y/N +C AMD bd1 0.70 0.66 Y +C AMD bd2 0.68 0.66 Y +C AMD bd3 ? ? +C AMD bd4 ? ? 
+C AMD bt1 1.97 8.16 1.5/1.5 N +C AMD bt2 0.77 0.93 0.65/opt N/Y +C AMD zn1 ? ? +C AMD zn2 ? ? +C Intel P4 2.26 illop Y/N +C Intel CNR 0.52 0.64 opt/opt Y +C Intel NHM 0.52 0.71 0.50/0.67 N +C Intel SBR 0.51 0.54 opt/0.51 Y +C Intel IBR 0.50 0.54 opt/opt Y +C Intel HWL 0.50 0.51 opt/opt Y +C Intel BWL 0.55 0.55 opt/opt Y +C Intel atom 1.16 1.61 opt/opt Y +C Intel SLM 1.02 1.07 opt/opt Y +C VIA nano 1.09 1.08 opt/opt Y + +C We use only 16-byte operations, except for unaligned top-most and bottom-most +C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). That +C instruction is better adapted to mpn_copyd's needs, we need to contort the +C code to use it here. +C +C For operands of < COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit loop, +C taken from the x86_64 default code. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +C There are three instructions for loading an aligned 128-bit quantity. We use +C movaps, since it has the shortest coding. +dnl define(`movdqa', ``movaps'') + +ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_copyi) + FUNC_ENTRY(3) + + cmp $COPYI_SSE_THRESHOLD, n + jbe L(bc) + + test $8, R8(rp) C is rp 16-byte aligned? + jz L(rp_aligned) C jump if rp aligned + + movsq C copy one limb + dec n + +L(rp_aligned): + test $8, R8(up) + jnz L(uent) + +ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1, +` sub $8, n', +` jmp L(am)') + + ALIGN(16) +L(atop):movdqa 0(up), %xmm0 + movdqa 16(up), %xmm1 + movdqa 32(up), %xmm2 + movdqa 48(up), %xmm3 + lea 64(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + movdqa %xmm2, 32(rp) + movdqa %xmm3, 48(rp) + lea 64(rp), rp +L(am): sub $8, n + jnc L(atop) + + test $4, R8(n) + jz 1f + movdqa (up), %xmm0 + movdqa 16(up), %xmm1 + lea 32(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + lea 32(rp), rp + +1: test $2, R8(n) + jz 1f + movdqa (up), %xmm0 + lea 16(up), up + movdqa %xmm0, (rp) + lea 16(rp), rp + +1: test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, (rp) + +1: FUNC_EXIT() + ret + +L(uent): +C Code handling up - rp = 8 (mod 16) + + cmp $16, n + jc L(ued0) + +IFDOS(` add $-56, %rsp ') +IFDOS(` movdqa %xmm6, (%rsp) ') +IFDOS(` movdqa %xmm7, 16(%rsp) ') +IFDOS(` movdqa %xmm8, 32(%rsp) ') + + movaps 120(up), %xmm7 + movaps 104(up), %xmm6 + movaps 88(up), %xmm5 + movaps 72(up), %xmm4 + movaps 56(up), %xmm3 + movaps 40(up), %xmm2 + lea 128(up), up + sub $32, n + jc L(ued1) + + ALIGN(16) +L(utop):movaps -104(up), %xmm1 + sub $16, n + movaps -120(up), %xmm0 + palignr($8, %xmm6, %xmm7) + movaps -136(up), %xmm8 + movdqa %xmm7, 112(rp) + palignr($8, %xmm5, %xmm6) + movaps 120(up), %xmm7 + movdqa %xmm6, 96(rp) + palignr($8, %xmm4, %xmm5) + movaps 104(up), %xmm6 + movdqa %xmm5, 80(rp) + palignr($8, %xmm3, %xmm4) + movaps 88(up), %xmm5 + movdqa %xmm4, 64(rp) + palignr($8, %xmm2, %xmm3) + movaps 72(up), %xmm4 + movdqa %xmm3, 48(rp) + palignr($8, %xmm1, %xmm2) + movaps 56(up), %xmm3 + movdqa %xmm2, 32(rp) + palignr($8, %xmm0, %xmm1) + movaps 40(up), %xmm2 + movdqa %xmm1, 16(rp) + palignr($8, %xmm8, %xmm0) + lea 128(up), up + movdqa %xmm0, (rp) + lea 128(rp), rp + jnc L(utop) + +L(ued1):movaps -104(up), %xmm1 + movaps -120(up), %xmm0 + movaps -136(up), %xmm8 + palignr($8, %xmm6, %xmm7) + movdqa %xmm7, 112(rp) + palignr($8, %xmm5, %xmm6) + movdqa %xmm6, 96(rp) + palignr($8, %xmm4, %xmm5) + movdqa %xmm5, 80(rp) + palignr($8, %xmm3, %xmm4) + movdqa %xmm4, 64(rp) + palignr($8, %xmm2, %xmm3) + movdqa %xmm3, 48(rp) + palignr($8, %xmm1, 
%xmm2) + movdqa %xmm2, 32(rp) + palignr($8, %xmm0, %xmm1) + movdqa %xmm1, 16(rp) + palignr($8, %xmm8, %xmm0) + movdqa %xmm0, (rp) + lea 128(rp), rp + +IFDOS(` movdqa (%rsp), %xmm6 ') +IFDOS(` movdqa 16(%rsp), %xmm7 ') +IFDOS(` movdqa 32(%rsp), %xmm8 ') +IFDOS(` add $56, %rsp ') + +L(ued0):test $8, R8(n) + jz 1f + movaps 56(up), %xmm3 + movaps 40(up), %xmm2 + movaps 24(up), %xmm1 + movaps 8(up), %xmm0 + movaps -8(up), %xmm4 + palignr($8, %xmm2, %xmm3) + movdqa %xmm3, 48(rp) + palignr($8, %xmm1, %xmm2) + movdqa %xmm2, 32(rp) + palignr($8, %xmm0, %xmm1) + movdqa %xmm1, 16(rp) + palignr($8, %xmm4, %xmm0) + lea 64(up), up + movdqa %xmm0, (rp) + lea 64(rp), rp + +1: test $4, R8(n) + jz 1f + movaps 24(up), %xmm1 + movaps 8(up), %xmm0 + palignr($8, %xmm0, %xmm1) + movaps -8(up), %xmm3 + movdqa %xmm1, 16(rp) + palignr($8, %xmm3, %xmm0) + lea 32(up), up + movdqa %xmm0, (rp) + lea 32(rp), rp + +1: test $2, R8(n) + jz 1f + movdqa 8(up), %xmm0 + movdqa -8(up), %xmm3 + palignr($8, %xmm3, %xmm0) + lea 16(up), up + movdqa %xmm0, (rp) + lea 16(rp), rp + +1: test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, (rp) + +1: FUNC_EXIT() + ret + +C Basecase code. Needed for good small operands speed, not for +C correctness as the above code is currently written. + +L(bc): lea -8(rp), rp + sub $4, R32(n) + jc L(end) + + ALIGN(16) +L(top): mov (up), %r8 + mov 8(up), %r9 + lea 32(rp), rp + mov 16(up), %r10 + mov 24(up), %r11 + lea 32(up), up + mov %r8, -24(rp) + mov %r9, -16(rp) +ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1, +` sub $4, R32(n)') + mov %r10, -8(rp) + mov %r11, (rp) +ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1, +` jnc L(top)') + +L(end): test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, 8(rp) + lea 8(rp), rp + lea 8(up), up +1: test $2, R8(n) + jz 1f + mov (up), %r8 + mov 8(up), %r9 + mov %r8, 8(rp) + mov %r9, 16(rp) +1: FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm b/gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm new file mode 100644 index 0000000..97f7865 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm @@ -0,0 +1,185 @@ +dnl AMD64 mpn_copyi optimised for CPUs with fast SSE. + +dnl Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation, +dnl Inc. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 +C AMD K10 0.85 1.64 Y/N +C AMD bull 1.4 1.4 N +C AMD pile 0.77 0.93 N +C AMD steam ? ? +C AMD excavator ? ? 
+C AMD bobcat +C AMD jaguar 0.65 1.02 opt/0.93 Y/N +C Intel P4 2.3 2.3 Y +C Intel core 1.0 1.0 0.52/0.64 N +C Intel NHM 0.5 0.67 Y +C Intel SBR 0.51 0.75 opt/0.54 Y/N +C Intel IBR 0.50 0.57 opt/0.54 Y +C Intel HWL 0.50 0.57 opt/0.51 Y +C Intel BWL 0.55 0.62 opt/0.55 Y +C Intel atom +C Intel SLM 1.02 1.27 opt/1.07 Y/N +C VIA nano 1.16 5.16 Y/N + +C We try to do as many 16-byte operations as possible. The top-most and +C bottom-most writes might need 8-byte operations. We can always write using +C aligned 16-byte operations, we read with both aligned and unaligned 16-byte +C operations. + +C Instead of having separate loops for reading aligned and unaligned, we read +C using MOVDQU. This seems to work great except for core2; there performance +C doubles when reading using MOVDQA (for aligned source). It is unclear how to +C best handle the unaligned case there. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +dnl define(`movdqu', lddqu) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_copyi) + FUNC_ENTRY(3) + + cmp $3, n C NB: bc code below assumes this limit + jc L(bc) + + test $8, R8(rp) C is rp 16-byte aligned? + jz L(ali) C jump if rp aligned + movsq C copy single limb + dec n + +L(ali): sub $16, n + jc L(sma) + +IFDOS(` add $-56, %rsp ') +IFDOS(` movdqa %xmm6, (%rsp) ') +IFDOS(` movdqa %xmm7, 16(%rsp) ') + + ALIGN(16) +L(top): movdqu (up), %xmm0 + movdqu 16(up), %xmm1 + movdqu 32(up), %xmm2 + movdqu 48(up), %xmm3 + movdqu 64(up), %xmm4 + movdqu 80(up), %xmm5 + movdqu 96(up), %xmm6 + movdqu 112(up), %xmm7 + lea 128(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + movdqa %xmm2, 32(rp) + movdqa %xmm3, 48(rp) + movdqa %xmm4, 64(rp) + movdqa %xmm5, 80(rp) + movdqa %xmm6, 96(rp) + movdqa %xmm7, 112(rp) + lea 128(rp), rp + sub $16, n + jnc L(top) + +IFDOS(` movdqa (%rsp), %xmm6 ') +IFDOS(` movdqa 16(%rsp), %xmm7 ') +IFDOS(` add $56, %rsp ') + +L(sma): test $8, R8(n) + jz 1f + movdqu (up), %xmm0 + movdqu 16(up), %xmm1 + movdqu 32(up), %xmm2 + movdqu 48(up), %xmm3 + lea 64(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + movdqa %xmm2, 32(rp) + movdqa %xmm3, 48(rp) + lea 64(rp), rp +1: + test $4, R8(n) + jz 1f + movdqu (up), %xmm0 + movdqu 16(up), %xmm1 + lea 32(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + lea 32(rp), rp +1: + test $2, R8(n) + jz 1f + movdqu (up), %xmm0 + lea 16(up), up + movdqa %xmm0, (rp) + lea 16(rp), rp + ALIGN(16) +1: +L(end): test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, (rp) +1: + FUNC_EXIT() + ret + +C Basecase code. Needed for good small operands speed, not for correctness as +C the above code is currently written. The commented-out lines need to be +C reinstated if this code is to be used for n > 3, and then the post loop +C offsets need fixing. + +L(bc): sub $2, n + jc L(end) + ALIGN(16) +1: mov (up), %rax + mov 8(up), %rcx +dnl lea 16(up), up + mov %rax, (rp) + mov %rcx, 8(rp) +dnl lea 16(rp), rp +dnl sub $2, n +dnl jnc 1b + + test $1, R8(n) + jz L(ret) + mov 16(up), %rax + mov %rax, 16(rp) +L(ret): FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm b/gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm new file mode 100644 index 0000000..a05e850 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm @@ -0,0 +1,182 @@ +dnl AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2010-2012 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 3 3 2.35 no, use shl/shr +C AMD K10 1.5-1.8 1.5-1.8 1.33 yes +C AMD bd1 1.7-1.9 1.7-1.9 1.33 yes +C AMD bobcat 3.17 3.17 yes, bad for n < 20 +C Intel P4 4.67 4.67 2.7 no, slow movdqu +C Intel core2 2.15 2.15 1.25 no, use shld/shrd +C Intel NHM 1.66 1.66 1.25 no, use shld/shrd +C Intel SBR 1.3 1.3 1.25 yes, bad for n = 4-6 +C Intel atom 11.7 11.7 4.5 no +C VIA nano 5.7 5.95 2.0 no, slow movdqu + +C We try to do as many aligned 16-byte operations as possible. The top-most +C and bottom-most writes might need 8-byte operations. +C +C This variant rely on fast load movdqu, and uses it even for aligned operands, +C in order to avoid the need for two separate loops. +C +C TODO +C * Could 2-limb wind-down code be simplified? +C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts +C for other affected CPUs. 
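Before the parameter definitions, a scalar sketch of the operation may help: each result limb combines one limb shifted left with the bits shifted out of the limb below it, and the function returns the bits shifted out at the top. The name ref_lshift is illustrative, and 1 <= cnt <= 63 is assumed, as for mpn_lshift:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    /* The return value matches the `shr R8(%rcx), %rax` done before the
       loop: with %rcx = -cnt the count is 64-cnt, i.e. ap[n-1] >> (64-cnt). */
    mp_limb_t ref_lshift(mp_limb_t *rp, const mp_limb_t *ap,
                         size_t n, unsigned cnt)
    {
        mp_limb_t retval = ap[n - 1] >> (64 - cnt);
        for (size_t i = n - 1; i > 0; i--)  /* top-down: rp >= ap overlap ok */
            rp[i] = (ap[i] << cnt) | (ap[i - 1] >> (64 - cnt));
        rp[0] = ap[0] << cnt;
        return retval;
    }

Each psllq/psrlq/por triple in the unrolled loop evaluates two iterations of this recurrence at once.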
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_lshift) + FUNC_ENTRY(4) + movd R32(%rcx), %xmm4 + mov $64, R32(%rax) + sub R32(%rcx), R32(%rax) + movd R32(%rax), %xmm5 + + neg R32(%rcx) + mov -8(ap,n,8), %rax + shr R8(%rcx), %rax + + cmp $3, n + jle L(bc) + + lea (rp,n,8), R32(%rcx) + test $8, R8(%rcx) + jz L(rp_aligned) + +C Do one initial limb in order to make rp aligned + movq -8(ap,n,8), %xmm0 + movq -16(ap,n,8), %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movq %xmm0, -8(rp,n,8) + dec n + +L(rp_aligned): + lea 1(n), %r8d + + and $6, R32(%r8) + jz L(ba0) + cmp $4, R32(%r8) + jz L(ba4) + jc L(ba2) +L(ba6): add $-4, n + jmp L(i56) +L(ba0): add $-6, n + jmp L(i70) +L(ba4): add $-2, n + jmp L(i34) +L(ba2): add $-8, n + jle L(end) + + ALIGN(16) +L(top): movdqu 40(ap,n,8), %xmm1 + movdqu 48(ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, 48(rp,n,8) +L(i70): + movdqu 24(ap,n,8), %xmm1 + movdqu 32(ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, 32(rp,n,8) +L(i56): + movdqu 8(ap,n,8), %xmm1 + movdqu 16(ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, 16(rp,n,8) +L(i34): + movdqu -8(ap,n,8), %xmm1 + movdqu (ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, (rp,n,8) + sub $8, n + jg L(top) + +L(end): test $1, R8(n) + jnz L(end8) + + movdqu (ap), %xmm1 + pxor %xmm0, %xmm0 + punpcklqdq %xmm1, %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movdqa %xmm0, (rp) + FUNC_EXIT() + ret + +C Basecase + ALIGN(16) +L(bc): dec R32(n) + jz L(end8) + + movq (ap,n,8), %xmm1 + movq -8(ap,n,8), %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movq %xmm0, (rp,n,8) + sub $2, R32(n) + jl L(end8) + movq 8(ap), %xmm1 + movq (ap), %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movq %xmm0, 8(rp) + +L(end8):movq (ap), %xmm0 + psllq %xmm4, %xmm0 + movq %xmm0, (rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm b/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm new file mode 100644 index 0000000..6a17b93 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm @@ -0,0 +1,173 @@ +dnl AMD64 mpn_lshift optimised for CPUs with fast SSE. + +dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund. + +dnl Copyright 2010-2012, 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb good +C 16-byte aligned 16-byte unaligned for cpu? +C AMD K8,K9 ? ? +C AMD K10 1.68 (1.45) 1.75 (1.49) Y +C AMD bd1 1.82 (1.75) 1.82 (1.75) Y +C AMD bobcat 4 4 +C Intel P4 3 (2.7) 3 (2.7) Y +C Intel core2 2.05 (1.67) 2.55 (1.75) +C Intel NHM 2.05 (1.75) 2.09 (2) +C Intel SBR 1.5 (1.3125) 1.5 (1.4375) Y +C Intel atom ? ? +C VIA nano 2.25 (2) 2.5 (2) Y + +C We try to do as many 16-byte operations as possible. The top-most and +C bottom-most writes might need 8-byte operations. + +C There are two inner-loops, one for when rp = ap (mod 16) and one when this is +C not true. The aligned case reads 16+8 bytes, the unaligned case reads +C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented. + +C This is not yet great code: +C (1) The unaligned case makes many reads. +C (2) We should do some unrolling, at least 2-way. +C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on +C Nano. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_lshift) + FUNC_ENTRY(4) + movd R32(%rcx), %xmm4 + mov $64, R32(%rax) + sub R32(%rcx), R32(%rax) + movd R32(%rax), %xmm5 + + neg R32(%rcx) + mov -8(ap,n,8), %rax + shr R8(%rcx), %rax + + cmp $2, n + jle L(le2) + + lea (rp,n,8), R32(%rcx) + test $8, R8(%rcx) + je L(rp_aligned) + +C Do one initial limb in order to make rp aligned + movq -8(ap,n,8), %xmm0 + movq -16(ap,n,8), %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movq %xmm0, -8(rp,n,8) + dec n + +L(rp_aligned): + lea (ap,n,8), R32(%rcx) + test $8, R8(%rcx) + je L(aent) + jmp L(uent) +C ***************************************************************************** + +C Handle the case when ap != rp (mod 16). + + ALIGN(16) +L(utop):movdqa -8(ap,n,8), %xmm0 + movq (ap,n,8), %xmm1 + punpcklqdq 8(ap,n,8), %xmm1 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movdqa %xmm0, (rp,n,8) +L(uent):sub $2, n + ja L(utop) + + jne L(end8) + + movq (ap), %xmm1 + pxor %xmm0, %xmm0 + punpcklqdq %xmm1, %xmm0 + punpcklqdq 8(ap), %xmm1 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movdqa %xmm0, (rp) + FUNC_EXIT() + ret +C ***************************************************************************** + +C Handle the case when ap = rp (mod 16). 
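The loop that follows leans on punpcklqdq to manufacture the limb pair that straddles the aligned one, so each two-limb step costs one aligned 16-byte load plus one 8-byte load (the "16+8 bytes" of the header comment). A minimal intrinsics sketch of one step, assuming a + i is 16-byte aligned and 1 <= cnt <= 63 (the helper name is illustrative):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Produce the result pair {r[i], r[i+1]} from the aligned pair
       {a[i], a[i+1]} and the limb a[i-1] just below it. */
    static inline __m128i lshift_step(const uint64_t *a, size_t i, int cnt)
    {
        __m128i hi = _mm_load_si128((const __m128i *)(a + i));      /* {a[i], a[i+1]} */
        __m128i lo = _mm_loadl_epi64((const __m128i *)(a + i - 1)); /* {a[i-1], 0} */
        lo = _mm_unpacklo_epi64(lo, hi);          /* punpcklqdq: {a[i-1], a[i]} */
        return _mm_or_si128(_mm_slli_epi64(hi, cnt),        /* psllq */
                            _mm_srli_epi64(lo, 64 - cnt));  /* psrlq + por */
    }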
+ + ALIGN(16) +L(atop):movdqa (ap,n,8), %xmm0 C xmm0 = B*ap[n-1] + ap[n-2] + movq -8(ap,n,8), %xmm1 C xmm1 = ap[n-3] + punpcklqdq %xmm0, %xmm1 C xmm1 = B*ap[n-2] + ap[n-3] + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, (rp,n,8) +L(aent): + sub $2, n + ja L(atop) + jne L(end8) + + movdqa (ap), %xmm1 + pxor %xmm0, %xmm0 + punpcklqdq %xmm1, %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movdqa %xmm0, (rp) + FUNC_EXIT() + ret +C ***************************************************************************** + + ALIGN(16) +L(le2): jne L(end8) + + movq 8(ap), %xmm0 + movq (ap), %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movq %xmm0, 8(rp) + +L(end8):movq (ap), %xmm0 + psllq %xmm4, %xmm0 + movq %xmm0, (rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm b/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm new file mode 100644 index 0000000..8250910 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm @@ -0,0 +1,193 @@ +dnl AMD64 mpn_lshiftc optimised for CPUs with fast SSE including fast movdqu. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 3 3 ? no, use shl/shr +C AMD K10 1.8-2.0 1.8-2.0 ? yes +C AMD bd1 1.9 1.9 ? yes +C AMD bobcat 3.67 3.67 yes, bad for n < 20 +C Intel P4 4.75 4.75 ? no, slow movdqu +C Intel core2 2.27 2.27 ? no, use shld/shrd +C Intel NHM 2.15 2.15 ? no, use shld/shrd +C Intel SBR 1.45 1.45 ? yes, bad for n = 4-6 +C Intel atom 12.9 12.9 ? no +C VIA nano 6.18 6.44 ? no, slow movdqu + +C We try to do as many aligned 16-byte operations as possible. The top-most +C and bottom-most writes might need 8-byte operations. +C +C This variant rely on fast load movdqu, and uses it even for aligned operands, +C in order to avoid the need for two separate loops. +C +C TODO +C * Could 2-limb wind-down code be simplified? +C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts +C for other affected CPUs. 
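mpn_lshiftc is mpn_lshift with the result complemented on the way out; in the code below the all-ones register built with pcmpeqb supplies the NOT via one extra pxor per store. A scalar sketch (illustrative name, 1 <= cnt <= 63 assumed):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    mp_limb_t ref_lshiftc(mp_limb_t *rp, const mp_limb_t *ap,
                          size_t n, unsigned cnt)
    {
        /* The out-shifted bits are returned uncomplemented; note that the
           asm never applies pxor to %rax. */
        mp_limb_t retval = ap[n - 1] >> (64 - cnt);
        for (size_t i = n - 1; i > 0; i--)
            rp[i] = ~((ap[i] << cnt) | (ap[i - 1] >> (64 - cnt)));
        rp[0] = ~(ap[0] << cnt);
        return retval;
    }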
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_lshiftc) + FUNC_ENTRY(4) + movd R32(%rcx), %xmm4 + mov $64, R32(%rax) + sub R32(%rcx), R32(%rax) + movd R32(%rax), %xmm5 + + neg R32(%rcx) + mov -8(ap,n,8), %rax + shr R8(%rcx), %rax + + pcmpeqb %xmm3, %xmm3 C set to 111...111 + + cmp $3, n + jle L(bc) + + lea (rp,n,8), R32(%rcx) + test $8, R8(%rcx) + jz L(rp_aligned) + +C Do one initial limb in order to make rp aligned + movq -8(ap,n,8), %xmm0 + movq -16(ap,n,8), %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movq %xmm0, -8(rp,n,8) + dec n + +L(rp_aligned): + lea 1(n), %r8d + + and $6, R32(%r8) + jz L(ba0) + cmp $4, R32(%r8) + jz L(ba4) + jc L(ba2) +L(ba6): add $-4, n + jmp L(i56) +L(ba0): add $-6, n + jmp L(i70) +L(ba4): add $-2, n + jmp L(i34) +L(ba2): add $-8, n + jle L(end) + + ALIGN(16) +L(top): movdqu 40(ap,n,8), %xmm1 + movdqu 48(ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, 48(rp,n,8) +L(i70): + movdqu 24(ap,n,8), %xmm1 + movdqu 32(ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, 32(rp,n,8) +L(i56): + movdqu 8(ap,n,8), %xmm1 + movdqu 16(ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, 16(rp,n,8) +L(i34): + movdqu -8(ap,n,8), %xmm1 + movdqu (ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, (rp,n,8) + sub $8, n + jg L(top) + +L(end): test $1, R8(n) + jnz L(end8) + + movdqu (ap), %xmm1 + pxor %xmm0, %xmm0 + punpcklqdq %xmm1, %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, (rp) + FUNC_EXIT() + ret + +C Basecase + ALIGN(16) +L(bc): dec R32(n) + jz L(end8) + + movq (ap,n,8), %xmm1 + movq -8(ap,n,8), %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movq %xmm0, (rp,n,8) + sub $2, R32(n) + jl L(end8) + movq 8(ap), %xmm1 + movq (ap), %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movq %xmm0, 8(rp) + +L(end8):movq (ap), %xmm0 + psllq %xmm4, %xmm0 + pxor %xmm3, %xmm0 + movq %xmm0, (rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm new file mode 100644 index 0000000..a616075 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm @@ -0,0 +1,183 @@ +dnl AMD64 mpn_lshiftc optimised for CPUs with fast SSE. + +dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund. + +dnl Copyright 2010-2012, 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb good +C 16-byte aligned 16-byte unaligned for cpu? +C AMD K8,K9 ? ? +C AMD K10 1.85 (1.635) 1.9 (1.67) Y +C AMD bd1 1.82 (1.75) 1.82 (1.75) Y +C AMD bobcat 4.5 4.5 +C Intel P4 3.6 (3.125) 3.6 (3.125) Y +C Intel core2 2.05 (1.67) 2.55 (1.75) +C Intel NHM 2.05 (1.875) 2.6 (2.25) +C Intel SBR 1.55 (1.44) 2 (1.57) Y +C Intel atom ? ? +C VIA nano 2.5 (2.5) 2.5 (2.5) Y + +C We try to do as many 16-byte operations as possible. The top-most and +C bottom-most writes might need 8-byte operations. We always write using +C 16-byte operations, we read with both 8-byte and 16-byte operations. + +C There are two inner-loops, one for when rp = ap (mod 16) and one when this is +C not true. The aligned case reads 16+8 bytes, the unaligned case reads +C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented. + +C This is not yet great code: +C (1) The unaligned case makes too many reads. +C (2) We should do some unrolling, at least 2-way. +C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on +C Nano. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_lshiftc) + FUNC_ENTRY(4) + movd R32(%rcx), %xmm4 + mov $64, R32(%rax) + sub R32(%rcx), R32(%rax) + movd R32(%rax), %xmm5 + + neg R32(%rcx) + mov -8(ap,n,8), %rax + shr R8(%rcx), %rax + + pcmpeqb %xmm2, %xmm2 C set to 111...111 + + cmp $2, n + jle L(le2) + + lea (rp,n,8), R32(%rcx) + test $8, R8(%rcx) + je L(rp_aligned) + +C Do one initial limb in order to make rp aligned + movq -8(ap,n,8), %xmm0 + movq -16(ap,n,8), %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movq %xmm0, -8(rp,n,8) + dec n + +L(rp_aligned): + lea (ap,n,8), R32(%rcx) + test $8, R8(%rcx) + je L(aent) + jmp L(uent) +C ***************************************************************************** + +C Handle the case when ap != rp (mod 16). + + ALIGN(16) +L(utop):movq (ap,n,8), %xmm1 + punpcklqdq 8(ap,n,8), %xmm1 + movdqa -8(ap,n,8), %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movdqa %xmm0, (rp,n,8) +L(uent):sub $2, n + ja L(utop) + + jne L(end8) + + movq (ap), %xmm1 + pxor %xmm0, %xmm0 + punpcklqdq %xmm1, %xmm0 + punpcklqdq 8(ap), %xmm1 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movdqa %xmm0, (rp) + FUNC_EXIT() + ret +C ***************************************************************************** + +C Handle the case when ap = rp (mod 16). 
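The aligned loop below uses the same punpcklqdq pairing as in lshift.asm; the only extra work is the pxor with the all-ones mask (%xmm2, built once with pcmpeqb) that complements each 16-byte result before it is stored. In intrinsic form the fused complement is just the following (sketch, illustrative name):

    #include <emmintrin.h>

    /* XOR with an all-ones mask, i.e. a 128-bit bitwise NOT. */
    static inline __m128i com128(__m128i x)
    {
        return _mm_xor_si128(x, _mm_set1_epi32(-1));
    }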
+ + ALIGN(16) +L(atop):movdqa (ap,n,8), %xmm0 C xmm0 = B*ap[n-1] + ap[n-2] + movq -8(ap,n,8), %xmm1 C xmm1 = ap[n-3] + punpcklqdq %xmm0, %xmm1 C xmm1 = B*ap[n-2] + ap[n-3] + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movdqa %xmm0, (rp,n,8) +L(aent):sub $2, n + ja L(atop) + + jne L(end8) + + movdqa (ap), %xmm0 + pxor %xmm1, %xmm1 + punpcklqdq %xmm0, %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movdqa %xmm0, (rp) + FUNC_EXIT() + ret +C ***************************************************************************** + + ALIGN(16) +L(le2): jne L(end8) + + movq 8(ap), %xmm0 + movq (ap), %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movq %xmm0, 8(rp) + +L(end8):movq (ap), %xmm0 + psllq %xmm4, %xmm0 + pxor %xmm2, %xmm0 + movq %xmm0, (rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm b/gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm new file mode 100644 index 0000000..1e270b1 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm @@ -0,0 +1,201 @@ +dnl AMD64 mpn_rshift optimised for CPUs with fast SSE including fast movdqu. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 3 3 2.35 no, use shl/shr +C AMD K10 1.5-1.8 1.5-1.8 1.33 yes +C AMD bd1 1.7-1.9 1.7-1.9 1.33 yes +C AMD bobcat 3.17 3.17 yes, bad for n < 20 +C Intel P4 4.67 4.67 2.7 no, slow movdqu +C Intel core2 2.15 2.15 1.25 no, use shld/shrd +C Intel NHM 1.66 1.66 1.25 no, use shld/shrd +C Intel SBR 1.3 1.3 1.25 yes, bad for n = 4-6 +C Intel atom 11.7 11.7 4.5 no +C VIA nano 5.7 5.95 2.0 no, slow movdqu + +C We try to do as many aligned 16-byte operations as possible. The top-most +C and bottom-most writes might need 8-byte operations. +C +C This variant rely on fast load movdqu, and uses it even for aligned operands, +C in order to avoid the need for two separate loops. +C +C TODO +C * Could 2-limb wind-down code be simplified? +C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts +C for other affected CPUs. 
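This file is the mirror image of lshift-movdqu2.asm: the scan runs from low limbs to high, so the value returned, namely the bits shifted out of the bottom limb, is computed up front with `shl R8(%rcx), %rax`. A scalar sketch (illustrative name, 1 <= cnt <= 63 assumed):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    mp_limb_t ref_rshift(mp_limb_t *rp, const mp_limb_t *ap,
                         size_t n, unsigned cnt)
    {
        mp_limb_t retval = ap[0] << (64 - cnt);  /* shl by -cnt mod 64 */
        for (size_t i = 0; i < n - 1; i++)       /* bottom-up: rp <= ap overlap ok */
            rp[i] = (ap[i] >> cnt) | (ap[i + 1] << (64 - cnt));
        rp[n - 1] = ap[n - 1] >> cnt;
        return retval;
    }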
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_rshift) + FUNC_ENTRY(4) + movd R32(%rcx), %xmm4 + mov $64, R32(%rax) + sub R32(%rcx), R32(%rax) + movd R32(%rax), %xmm5 + + neg R32(%rcx) + mov (ap), %rax + shl R8(%rcx), %rax + + cmp $3, n + jle L(bc) + + test $8, R8(rp) + jz L(rp_aligned) + +C Do one initial limb in order to make rp aligned + movq (ap), %xmm0 + movq 8(ap), %xmm1 + psrlq %xmm4, %xmm0 + psllq %xmm5, %xmm1 + por %xmm1, %xmm0 + movq %xmm0, (rp) + lea 8(ap), ap + lea 8(rp), rp + dec n + +L(rp_aligned): + lea 1(n), %r8d + lea (ap,n,8), ap + lea (rp,n,8), rp + neg n + + and $6, R32(%r8) + jz L(bu0) + cmp $4, R32(%r8) + jz L(bu4) + jc L(bu2) +L(bu6): add $4, n + jmp L(i56) +L(bu0): add $6, n + jmp L(i70) +L(bu4): add $2, n + jmp L(i34) +L(bu2): add $8, n + jge L(end) + + ALIGN(16) +L(top): movdqu -64(ap,n,8), %xmm1 + movdqu -56(ap,n,8), %xmm0 + psllq %xmm5, %xmm0 + psrlq %xmm4, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, -64(rp,n,8) +L(i70): + movdqu -48(ap,n,8), %xmm1 + movdqu -40(ap,n,8), %xmm0 + psllq %xmm5, %xmm0 + psrlq %xmm4, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, -48(rp,n,8) +L(i56): + movdqu -32(ap,n,8), %xmm1 + movdqu -24(ap,n,8), %xmm0 + psllq %xmm5, %xmm0 + psrlq %xmm4, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, -32(rp,n,8) +L(i34): + movdqu -16(ap,n,8), %xmm1 + movdqu -8(ap,n,8), %xmm0 + psllq %xmm5, %xmm0 + psrlq %xmm4, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, -16(rp,n,8) + add $8, n + jl L(top) + +L(end): test $1, R8(n) + jnz L(e1) + + movdqu -16(ap), %xmm1 + movq -8(ap), %xmm0 + psrlq %xmm4, %xmm1 + psllq %xmm5, %xmm0 + por %xmm1, %xmm0 + movdqa %xmm0, -16(rp) + FUNC_EXIT() + ret + +L(e1): movq -8(ap), %xmm0 + psrlq %xmm4, %xmm0 + movq %xmm0, -8(rp) + FUNC_EXIT() + ret + +C Basecase + ALIGN(16) +L(bc): dec R32(n) + jnz 1f + movq (ap), %xmm0 + psrlq %xmm4, %xmm0 + movq %xmm0, (rp) + FUNC_EXIT() + ret + +1: movq (ap), %xmm1 + movq 8(ap), %xmm0 + psrlq %xmm4, %xmm1 + psllq %xmm5, %xmm0 + por %xmm1, %xmm0 + movq %xmm0, (rp) + dec R32(n) + jnz 1f + movq 8(ap), %xmm0 + psrlq %xmm4, %xmm0 + movq %xmm0, 8(rp) + FUNC_EXIT() + ret + +1: movq 8(ap), %xmm1 + movq 16(ap), %xmm0 + psrlq %xmm4, %xmm1 + psllq %xmm5, %xmm0 + por %xmm1, %xmm0 + movq %xmm0, 8(rp) + movq 16(ap), %xmm0 + psrlq %xmm4, %xmm0 + movq %xmm0, 16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm new file mode 100644 index 0000000..e7b7feb --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm @@ -0,0 +1,204 @@ +dnl AMD64 SSE mpn_sec_tabselect. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb cycles/limb +C ali,evn n unal,evn n other cases +C AMD K8,K9 1.65 1.65 1.8 +C AMD K10 0.78 0.78 0.85 +C AMD bd1 0.80 0.91 1.25 +C AMD bobcat 2.15 2.15 2.37 +C Intel P4 2.5 2.5 2.95 +C Intel core2 1.17 1.25 1.25 +C Intel NHM 0.87 0.90 0.90 +C Intel SBR 0.63 0.79 0.77 +C Intel atom 4.3 4.3 4.3 slower than plain code +C VIA nano 1.4 5.1 3.14 too alignment dependent + +C NOTES +C * We only honour the least significant 32 bits of the `which' and `nents' +C arguments to allow efficient code using just SSE2. We would need to +C either use the SSE4_1 pcmpeqq, or find some other SSE2 sequence. +C * We use movd for copying between xmm and plain registers, since old gas +C rejects movq. But gas assembles movd as movq when given a 64-bit greg. + +define(`rp', `%rdi') +define(`tp', `%rsi') +define(`n', `%rdx') +define(`nents', `%rcx') +define(`which', `%r8') + +define(`i', `%r10') +define(`j', `%r9') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 +C nents n rp tab which j i temp * * * * + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sec_tabselect) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + +IFDOS(` add $-88, %rsp ') +IFDOS(` movdqu %xmm6, (%rsp) ') +IFDOS(` movdqu %xmm7, 16(%rsp) ') +IFDOS(` movdqu %xmm8, 32(%rsp) ') +IFDOS(` movdqu %xmm9, 48(%rsp) ') + + movd which, %xmm8 + pshufd $0, %xmm8, %xmm8 C 4 `which' copies + mov $1, R32(%rax) + movd %rax, %xmm9 + pshufd $0, %xmm9, %xmm9 C 4 copies of 1 + + mov n, j + add $-8, j + js L(outer_end) + +L(outer_top): + mov nents, i + mov tp, %r11 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + ALIGN(16) +L(top): movdqa %xmm8, %xmm0 + pcmpeqd %xmm1, %xmm0 + paddd %xmm9, %xmm1 + movdqu 0(tp), %xmm2 + movdqu 16(tp), %xmm3 + pand %xmm0, %xmm2 + pand %xmm0, %xmm3 + por %xmm2, %xmm4 + por %xmm3, %xmm5 + movdqu 32(tp), %xmm2 + movdqu 48(tp), %xmm3 + pand %xmm0, %xmm2 + pand %xmm0, %xmm3 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + lea (tp,n,8), tp + add $-1, i + jne L(top) + + movdqu %xmm4, 0(rp) + movdqu %xmm5, 16(rp) + movdqu %xmm6, 32(rp) + movdqu %xmm7, 48(rp) + + lea 64(%r11), tp + lea 64(rp), rp + add $-8, j + jns L(outer_top) +L(outer_end): + + test $4, R8(n) + je L(b0xx) +L(b1xx):mov nents, i + mov tp, %r11 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + ALIGN(16) +L(tp4): movdqa %xmm8, %xmm0 + pcmpeqd %xmm1, %xmm0 + paddd %xmm9, %xmm1 + movdqu 0(tp), %xmm2 + movdqu 16(tp), %xmm3 + pand %xmm0, %xmm2 + pand %xmm0, %xmm3 + por %xmm2, %xmm4 + por %xmm3, %xmm5 + lea (tp,n,8), tp + add $-1, i + jne L(tp4) + movdqu %xmm4, 0(rp) + movdqu %xmm5, 16(rp) + lea 32(%r11), tp + lea 32(rp), rp + +L(b0xx):test $2, R8(n) + je L(b00x) +L(b01x):mov nents, i + mov tp, %r11 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + ALIGN(16) +L(tp2): movdqa %xmm8, %xmm0 + pcmpeqd %xmm1, %xmm0 + paddd %xmm9, %xmm1 + movdqu 0(tp), %xmm2 + pand %xmm0, %xmm2 + por %xmm2, %xmm4 + lea (tp,n,8), tp + add $-1, i + jne L(tp2) + movdqu %xmm4, 0(rp) + lea 16(%r11), tp + 
lea 16(rp), rp + +L(b00x):test $1, R8(n) + je L(b000) +L(b001):mov nents, i + mov tp, %r11 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + ALIGN(16) +L(tp1): movdqa %xmm8, %xmm0 + pcmpeqd %xmm1, %xmm0 + paddd %xmm9, %xmm1 + movq 0(tp), %xmm2 + pand %xmm0, %xmm2 + por %xmm2, %xmm4 + lea (tp,n,8), tp + add $-1, i + jne L(tp1) + movq %xmm4, 0(rp) + +L(b000): +IFDOS(` movdqu (%rsp), %xmm6 ') +IFDOS(` movdqu 16(%rsp), %xmm7 ') +IFDOS(` movdqu 32(%rsp), %xmm8 ') +IFDOS(` movdqu 48(%rsp), %xmm9 ') +IFDOS(` add $88, %rsp ') + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fat/addmul_2.c b/gmp-6.3.0/mpn/x86_64/fat/addmul_2.c new file mode 100644 index 0000000..e0d7358 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fat/addmul_2.c @@ -0,0 +1,38 @@ +/* Fat binary fallback mpn_addmul_2. + +Copyright 2016 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +mp_limb_t +mpn_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, const mp_limb_t vp[2]) +{ + rp[n] = mpn_addmul_1 (rp, up, n, vp[0]); + return mpn_addmul_1 (rp + 1, up, n, vp[1]); +} diff --git a/gmp-6.3.0/mpn/x86_64/fat/fat.c b/gmp-6.3.0/mpn/x86_64/fat/fat.c new file mode 100644 index 0000000..cc35afa --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fat/fat.c @@ -0,0 +1,473 @@ +/* x86_64 fat binary initializers. + + Contributed to the GNU project by Kevin Ryde (original x86_32 code) and + Torbjorn Granlund (port to x86_64) + + THE FUNCTIONS AND VARIABLES IN THIS FILE ARE FOR INTERNAL USE ONLY. + THEY'RE ALMOST CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR + COMPLETELY IN FUTURE GNU MP RELEASES. + +Copyright 2003, 2004, 2009, 2011-2015, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/
+
+#include <stdio.h>   /* for printf */
+#include <stdlib.h>  /* for getenv */
+#include <string.h>
+
+#include "gmp-impl.h"
+
+/* Change this to "#define TRACE(x) x" for some traces. */
+#define TRACE(x)
+
+
+/* fat_entry.asm */
+long __gmpn_cpuid (char [12], int);
+
+
+#if WANT_FAKE_CPUID
+/* The "name"s in the table are values for the GMP_CPU_TYPE environment
+   variable.  Anything can be used, but for now it's the canonical cpu types
+   as per config.guess/config.sub. */
+
+#define __gmpn_cpuid fake_cpuid
+
+#define MAKE_FMS(family, model)						\
+  ((((family) & 0xf) << 8) + (((family) & 0xff0) << 20)			\
+   + (((model) & 0xf) << 4) + (((model) & 0xf0) << 12))
+
+static struct {
+  const char *name;
+  const char *vendor;
+  unsigned   fms;
+} fake_cpuid_table[] = {
+  { "core2",      "GenuineIntel", MAKE_FMS (6, 0xf) },
+  { "nehalem",    "GenuineIntel", MAKE_FMS (6, 0x1a) },
+  { "nhm",        "GenuineIntel", MAKE_FMS (6, 0x1a) },
+  { "atom",       "GenuineIntel", MAKE_FMS (6, 0x1c) },
+  { "westmere",   "GenuineIntel", MAKE_FMS (6, 0x25) },
+  { "wsm",        "GenuineIntel", MAKE_FMS (6, 0x25) },
+  { "sandybridge","GenuineIntel", MAKE_FMS (6, 0x2a) },
+  { "sbr",        "GenuineIntel", MAKE_FMS (6, 0x2a) },
+  { "silvermont", "GenuineIntel", MAKE_FMS (6, 0x37) },
+  { "slm",        "GenuineIntel", MAKE_FMS (6, 0x37) },
+  { "haswell",    "GenuineIntel", MAKE_FMS (6, 0x3c) },
+  { "hwl",        "GenuineIntel", MAKE_FMS (6, 0x3c) },
+  { "broadwell",  "GenuineIntel", MAKE_FMS (6, 0x3d) },
+  { "bwl",        "GenuineIntel", MAKE_FMS (6, 0x3d) },
+  { "skylake",    "GenuineIntel", MAKE_FMS (6, 0x5e) },
+  { "sky",        "GenuineIntel", MAKE_FMS (6, 0x5e) },
+  { "pentium4",   "GenuineIntel", MAKE_FMS (15, 3) },
+
+  { "k8",         "AuthenticAMD", MAKE_FMS (15, 0) },
+  { "k10",        "AuthenticAMD", MAKE_FMS (16, 0) },
+  { "bobcat",     "AuthenticAMD", MAKE_FMS (20, 1) },
+  { "bulldozer",  "AuthenticAMD", MAKE_FMS (21, 1) },
+  { "piledriver", "AuthenticAMD", MAKE_FMS (21, 2) },
+  { "steamroller","AuthenticAMD", MAKE_FMS (21, 0x30) },
+  { "excavator",  "AuthenticAMD", MAKE_FMS (21, 0x60) },
+  { "jaguar",     "AuthenticAMD", MAKE_FMS (22, 1) },
+  { "zen",        "AuthenticAMD", MAKE_FMS (23, 1) },
+
+  { "nano",       "CentaurHauls", MAKE_FMS (6, 15) },
+};
+
+static int
+fake_cpuid_lookup (void)
+{
+  char *s;
+  int  i;
+
+  s = getenv ("GMP_CPU_TYPE");
+  if (s == NULL)
+    {
+      printf ("Need GMP_CPU_TYPE environment variable for fake cpuid\n");
+      abort ();
+    }
+
+  for (i = 0; i < numberof (fake_cpuid_table); i++)
+    if (strcmp (s, fake_cpuid_table[i].name) == 0)
+      return i;
+
+  printf ("GMP_CPU_TYPE=%s unknown\n", s);
+  abort ();
+}
+
+static long
+fake_cpuid (char dst[12], unsigned int id)
+{
+  int i = fake_cpuid_lookup();
+
+  switch (id) {
+  case 0:
+    memcpy (dst, fake_cpuid_table[i].vendor, 12);
+    return 0;
+  case 1:
+    return fake_cpuid_table[i].fms;
+  case 7:
+    dst[0] = 0xff;  /* BMI1, AVX2, etc */
+    dst[1] = 0xff;  /* BMI2, etc */
+    return 0;
+  case 0x80000001:
+    dst[4 + 29 / 8] = (1 << (29 % 8));  /* "long" mode */
+    return 0;
+  default:
+    printf ("fake_cpuid(): oops, unknown id %d\n", id);
+    abort ();
+  }
+}
+#endif
+
+
+typedef DECL_preinv_divrem_1 ((*preinv_divrem_1_t));
+typedef DECL_preinv_mod_1 ((*preinv_mod_1_t));
+
+struct cpuvec_t __gmpn_cpuvec = {
+  __MPN(add_n_init),
+  __MPN(addlsh1_n_init),
+  __MPN(addlsh2_n_init),
+  __MPN(addmul_1_init),
+  __MPN(addmul_2_init),
+  __MPN(bdiv_dbm1c_init),
+  __MPN(cnd_add_n_init),
+  __MPN(cnd_sub_n_init),
+  __MPN(com_init),
+  __MPN(copyd_init),
+  __MPN(copyi_init),
+  __MPN(divexact_1_init),
+  __MPN(divrem_1_init),
+  __MPN(gcd_11_init),
+  __MPN(lshift_init),
+  __MPN(lshiftc_init),
+  __MPN(mod_1_init),
+  __MPN(mod_1_1p_init),
+  __MPN(mod_1_1p_cps_init),
+  __MPN(mod_1s_2p_init),
+  __MPN(mod_1s_2p_cps_init),
+  __MPN(mod_1s_4p_init),
+  __MPN(mod_1s_4p_cps_init),
+  __MPN(mod_34lsub1_init),
+  __MPN(modexact_1c_odd_init),
+  __MPN(mul_1_init),
+  __MPN(mul_basecase_init),
+  __MPN(mullo_basecase_init),
+  __MPN(preinv_divrem_1_init),
+  __MPN(preinv_mod_1_init),
+  __MPN(redc_1_init),
+  __MPN(redc_2_init),
+  __MPN(rshift_init),
+  __MPN(sqr_basecase_init),
+  __MPN(sub_n_init),
+  __MPN(sublsh1_n_init),
+  __MPN(submul_1_init),
+  0
+};
+
+int __gmpn_cpuvec_initialized = 0;
+
+/* The following setups start with generic x86, then overwrite with
+   specifics for a chip, and higher versions of that chip.
+
+   The arrangement of the setups here will normally be the same as the $path
+   selections in configure.in for the respective chips.
+
+   This code is reentrant and thread safe.  We always calculate the same
+   decided_cpuvec, so if two copies of the code are running it doesn't
+   matter which completes first, both write the same to __gmpn_cpuvec.
+
+   We need to go via decided_cpuvec because if one thread has completed
+   __gmpn_cpuvec then it may be making use of the threshold values in that
+   vector.  If another thread is still running __gmpn_cpuvec_init then we
+   don't want it to write different values to those fields since some of the
+   asm routines only operate correctly up to their own defined threshold,
+   not an arbitrary value.  */
+
+static int
+gmp_workaround_skylake_cpuid_bug ()
+{
+  char feature_string[49];
+  char processor_name_string[49];
+  static const char *bad_cpus[] = {" G44", " G45", " G39" /* , "6600" */ };
+  int i;
+
+  /* Example strings:                                  */
+  /* "Intel(R) Pentium(R) CPU G4400 @ 3.30GHz"         */
+  /* "Intel(R) Core(TM) i5-6600K CPU @ 3.50GHz"        */
+  /*        ^               ^               ^          */
+  /*    0x80000002      0x80000003      0x80000004     */
+  /* We match out just the 0x80000003 part here. */
+
+  /* In their infinite wisdom, Intel decided to use one register order for
+     the vendor string, and another for the processor name string.  We shuffle
+     things about here, rather than write a new variant of our assembly cpuid.
+  */
+
+  unsigned int eax, ebx, ecx, edx;
+  eax = __gmpn_cpuid (feature_string, 0x80000003);
+  ebx = ((unsigned int *)feature_string)[0];
+  edx = ((unsigned int *)feature_string)[1];
+  ecx = ((unsigned int *)feature_string)[2];
+
+  ((unsigned int *) (processor_name_string))[0] = eax;
+  ((unsigned int *) (processor_name_string))[1] = ebx;
+  ((unsigned int *) (processor_name_string))[2] = ecx;
+  ((unsigned int *) (processor_name_string))[3] = edx;
+
+  processor_name_string[16] = 0;
+
+  for (i = 0; i < sizeof (bad_cpus) / sizeof (char *); i++)
+    {
+      if (strstr (processor_name_string, bad_cpus[i]) != 0)
+	return 1;
+    }
+  return 0;
+}
+
+enum {BMI2_BIT = 8};
+
+void
+__gmpn_cpuvec_init (void)
+{
+  struct cpuvec_t decided_cpuvec;
+  char vendor_string[13];
+  char dummy_string[12];
+  long fms;
+  int family, model;
+
+  TRACE (printf ("__gmpn_cpuvec_init:\n"));
+
+  memset (&decided_cpuvec, '\0', sizeof (decided_cpuvec));
+
+  CPUVEC_SETUP_x86_64;
+  CPUVEC_SETUP_fat;
+
+  __gmpn_cpuid (vendor_string, 0);
+  vendor_string[12] = 0;
+
+  fms = __gmpn_cpuid (dummy_string, 1);
+  family = ((fms >> 8) & 0xf) + ((fms >> 20) & 0xff);
+  model = ((fms >> 4) & 0xf) + ((fms >> 12) & 0xf0);
+
+  /* Check extended feature flags */
+  __gmpn_cpuid (dummy_string, 0x80000001);
+  if ((dummy_string[4 + 29 / 8] & (1 << (29 % 8))) == 0)
+    abort (); /* longmode-capable-bit turned off!
*/ + + /*********************************************************/ + /*** WARNING: keep this list in sync with config.guess ***/ + /*********************************************************/ + if (strcmp (vendor_string, "GenuineIntel") == 0) + { + switch (family) + { + case 6: + switch (model) + { + case 0x0f: /* Conroe Merom Kentsfield Allendale */ + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + case 0x16: + case 0x17: /* PNR Wolfdale Yorkfield */ + case 0x18: + case 0x19: + case 0x1d: /* PNR Dunnington */ + CPUVEC_SETUP_core2; + break; + + case 0x1c: /* Atom Silverthorne */ + case 0x26: /* Atom Lincroft */ + case 0x27: /* Atom Saltwell? */ + case 0x36: /* Atom Cedarview/Saltwell */ + CPUVEC_SETUP_atom; + break; + + case 0x1a: /* NHM Gainestown */ + case 0x1b: + case 0x1e: /* NHM Lynnfield/Jasper */ + case 0x1f: + case 0x20: + case 0x21: + case 0x22: + case 0x23: + case 0x24: + case 0x25: /* WSM Clarkdale/Arrandale */ + case 0x28: + case 0x29: + case 0x2b: + case 0x2c: /* WSM Gulftown */ + case 0x2e: /* NHM Beckton */ + case 0x2f: /* WSM Eagleton */ + CPUVEC_SETUP_core2; + CPUVEC_SETUP_coreinhm; + break; + + case 0x37: /* Silvermont */ + case 0x4a: /* Silvermont */ + case 0x4c: /* Airmont */ + case 0x4d: /* Silvermont/Avoton */ + case 0x5a: /* Silvermont */ + CPUVEC_SETUP_atom; + CPUVEC_SETUP_silvermont; + break; + + case 0x5c: /* Goldmont */ + case 0x5f: /* Goldmont */ + case 0x7a: /* Goldmont Plus */ + CPUVEC_SETUP_atom; + CPUVEC_SETUP_silvermont; + CPUVEC_SETUP_goldmont; + break; + + case 0x2a: /* SB */ + case 0x2d: /* SBC-EP */ + case 0x3a: /* IBR */ + case 0x3e: /* IBR Ivytown */ + CPUVEC_SETUP_core2; + CPUVEC_SETUP_coreinhm; + CPUVEC_SETUP_coreisbr; + break; + case 0x3c: /* Haswell client */ + case 0x3f: /* Haswell server */ + case 0x45: /* Haswell ULT */ + case 0x46: /* Crystal Well */ + CPUVEC_SETUP_core2; + CPUVEC_SETUP_coreinhm; + CPUVEC_SETUP_coreisbr; + /* Some Haswells lack BMI2. Let them appear as Sandybridges for + now. 
*/ + __gmpn_cpuid (dummy_string, 7); + if ((dummy_string[0 + BMI2_BIT / 8] & (1 << (BMI2_BIT % 8))) == 0) + break; + CPUVEC_SETUP_coreihwl; + break; + case 0x3d: /* Broadwell */ + case 0x47: /* Broadwell */ + case 0x4f: /* Broadwell server */ + case 0x56: /* Broadwell microserver */ + CPUVEC_SETUP_core2; + CPUVEC_SETUP_coreinhm; + CPUVEC_SETUP_coreisbr; + if ((dummy_string[0 + BMI2_BIT / 8] & (1 << (BMI2_BIT % 8))) == 0) + break; + CPUVEC_SETUP_coreihwl; + CPUVEC_SETUP_coreibwl; + break; + case 0x4e: /* Skylake client */ + case 0x55: /* Skylake server */ + case 0x5e: /* Skylake */ + case 0x8e: /* Kabylake */ + case 0x9e: /* Kabylake */ + CPUVEC_SETUP_core2; + CPUVEC_SETUP_coreinhm; + CPUVEC_SETUP_coreisbr; + if ((dummy_string[0 + BMI2_BIT / 8] & (1 << (BMI2_BIT % 8))) == 0) + break; + if (gmp_workaround_skylake_cpuid_bug ()) + break; + CPUVEC_SETUP_coreihwl; + CPUVEC_SETUP_coreibwl; + CPUVEC_SETUP_skylake; + break; + } + break; + + case 15: + CPUVEC_SETUP_pentium4; + break; + } + } + else if (strcmp (vendor_string, "AuthenticAMD") == 0) + { + switch (family) + { + case 0x0f: /* k8 */ + case 0x11: /* "fam 11h", mix of k8 and k10 */ + case 0x13: + CPUVEC_SETUP_k8; + break; + + case 0x10: /* k10 */ + case 0x12: /* k10 (llano) */ + CPUVEC_SETUP_k8; + CPUVEC_SETUP_k10; + break; + + case 0x14: /* bobcat */ + CPUVEC_SETUP_k8; + CPUVEC_SETUP_k10; + CPUVEC_SETUP_bt1; + break; + + case 0x16: /* jaguar */ + CPUVEC_SETUP_k8; + CPUVEC_SETUP_k10; + CPUVEC_SETUP_bt1; + CPUVEC_SETUP_bt2; + break; + + case 0x15: /* bulldozer, piledriver, steamroller, excavator */ + CPUVEC_SETUP_k8; + CPUVEC_SETUP_k10; + CPUVEC_SETUP_bd1; + break; + + case 0x17: /* zen */ + case 0x19: /* zen3 */ + CPUVEC_SETUP_zen; + break; + } + } + else if (strcmp (vendor_string, "CentaurHauls") == 0) + { + switch (family) + { + case 6: + if (model >= 15) + CPUVEC_SETUP_nano; + break; + } + } + + /* There's no x86 generic mpn_preinv_divrem_1 or mpn_preinv_mod_1. + Instead default to the plain versions from whichever CPU we detected. + The function arguments are compatible, no need for any glue code. */ + if (decided_cpuvec.preinv_divrem_1 == NULL) + decided_cpuvec.preinv_divrem_1 =(preinv_divrem_1_t)decided_cpuvec.divrem_1; + if (decided_cpuvec.preinv_mod_1 == NULL) + decided_cpuvec.preinv_mod_1 =(preinv_mod_1_t) decided_cpuvec.mod_1; + + ASSERT_CPUVEC (decided_cpuvec); + CPUVEC_INSTALL (decided_cpuvec); + + /* Set this once the threshold fields are ready. + Use volatile to prevent it getting moved. */ + *((volatile int *) &__gmpn_cpuvec_initialized) = 1; +} diff --git a/gmp-6.3.0/mpn/x86_64/fat/fat_entry.asm b/gmp-6.3.0/mpn/x86_64/fat/fat_entry.asm new file mode 100644 index 0000000..5f244ac --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fat/fat_entry.asm @@ -0,0 +1,209 @@ +dnl x86 fat binary entrypoints. + +dnl Contributed to the GNU project by Kevin Ryde (original x86_32 code) and +dnl Torbjorn Granlund (port to x86_64) + +dnl Copyright 2003, 2009, 2011-2014, 2016 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+dnl Forcibly disable profiling.
+dnl
+dnl The entrypoints and inits are small enough not to worry about, the real
+dnl routines arrived at will have any profiling.  Also, the way the code
+dnl here ends with a jump means we won't work properly with the
+dnl "instrument" profiling scheme anyway.
+
+define(`WANT_PROFILING',no)
+
+
+dnl We define PRETEND_PIC as a helper symbol, then use it for suppressing
+dnl normal, fast call code, since that triggers problems on Darwin, OpenBSD
+dnl and some versions of GNU/Linux.  This will go away when symbol hiding is
+dnl finished.
+
+ifdef(`DARWIN',
+`define(`PRETEND_PIC')')
+ifdef(`OPENBSD',
+`define(`PRETEND_PIC')')
+ifdef(`LINUX',
+`define(`PRETEND_PIC')')
+ifdef(`PIC',
+`define(`PRETEND_PIC')')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+	TEXT
+
+dnl Usage: FAT_ENTRY(name, offset)
+dnl
+dnl Emit a fat binary entrypoint function of the given name.  This is the
+dnl normal entry for applications, eg. __gmpn_add_n.
+dnl
+dnl The code simply jumps through the function pointer in __gmpn_cpuvec at
+dnl the given "offset" (in bytes).
+dnl
+dnl For non-PIC, the jumps are 5 bytes each, aligning them to 8 should be
+dnl fine for all x86s.
+dnl
+dnl For ELF/DARWIN PIC, the jumps are 20 bytes each, and are best aligned to
+dnl 16 to ensure at least the first two instructions don't cross a cache line
+dnl boundary.
+dnl
+dnl For DOS64, the jumps are 6 bytes.  The same form works also for GNU/Linux
+dnl (at least with certain assembler/linkers) but FreeBSD 8.2 crashes.  Not
+dnl tested on Darwin, Slowaris, NetBSD, etc.
+dnl
+dnl Note the extra `' ahead of PROLOGUE obscures it from the HAVE_NATIVE
+dnl grepping in configure, stopping that code trying to eval something with
+dnl $1 in it.
+
+define(FAT_ENTRY,
+m4_assert_numargs(2)
+`ifdef(`HOST_DOS64',
+`	ALIGN(8)
+`'PROLOGUE($1)
+	jmp	*$2+GSYM_PREFIX`'__gmpn_cpuvec(%rip)
+EPILOGUE()
+',
+`	ALIGN(ifdef(`PIC',16,8))
+`'PROLOGUE($1)
+ifdef(`PRETEND_PIC',
+`	LEA(	GSYM_PREFIX`'__gmpn_cpuvec, %rax)
+	jmp	*$2(%rax)
+',`dnl non-PIC
+	jmp	*GSYM_PREFIX`'__gmpn_cpuvec+$2
+')
+EPILOGUE()
+')')
+
+
+dnl FAT_ENTRY for each CPUVEC_FUNCS_LIST
+dnl
+
+define(`CPUVEC_offset',0)
+foreach(i,
+`FAT_ENTRY(MPN(i),CPUVEC_offset)
+define(`CPUVEC_offset',eval(CPUVEC_offset + 8))',
+CPUVEC_FUNCS_LIST)
+
+
+dnl Usage: FAT_INIT(name, offset)
+dnl
+dnl Emit a fat binary initializer function of the given name.  These
+dnl functions are the initial values for the pointers in __gmpn_cpuvec.
+dnl
+dnl The code simply calls __gmpn_cpuvec_init, and then jumps back through
+dnl the __gmpn_cpuvec pointer, at the given "offset" (in bytes).
+dnl __gmpn_cpuvec_init will have stored the address of the selected
+dnl implementation there.
+dnl
+dnl Only one of these routines will be executed, and only once, since after
+dnl that all the __gmpn_cpuvec pointers go to real routines.  So there's no
+dnl need for anything special here, just something small and simple.
To +dnl keep code size down, "fat_init" is a shared bit of code, arrived at +dnl with the offset in %al. %al is used since the movb instruction is 2 +dnl bytes where %eax would be 4. +dnl +dnl Note having `PROLOGUE in FAT_INIT obscures that PROLOGUE from the +dnl HAVE_NATIVE grepping in configure, preventing that code trying to eval +dnl something with $1 in it. +dnl +dnl We need to preserve parameter registers over the __gmpn_cpuvec_init call + +define(FAT_INIT, +m4_assert_numargs(2) +`PROLOGUE($1) + mov $`'$2, %al + jmp L(fat_init) +EPILOGUE() +') + +dnl FAT_INIT for each CPUVEC_FUNCS_LIST +dnl + +define(`CPUVEC_offset',0) +foreach(i, +`FAT_INIT(MPN(i`'_init),CPUVEC_offset) +define(`CPUVEC_offset',eval(CPUVEC_offset + 1))', +CPUVEC_FUNCS_LIST) + +L(fat_init): + C al __gmpn_cpuvec byte offset + + movzbl %al, %eax +IFSTD(` push %rdi ') +IFSTD(` push %rsi ') + push %rdx + push %rcx + push %r8 + push %r9 + push %rax +IFDOS(` sub $32, %rsp ') + CALL( __gmpn_cpuvec_init) +IFDOS(` add $32, %rsp ') + pop %rax + pop %r9 + pop %r8 + pop %rcx + pop %rdx +IFSTD(` pop %rsi ') +IFSTD(` pop %rdi ') +ifdef(`PRETEND_PIC',` + LEA( GSYM_PREFIX`'__gmpn_cpuvec, %r10) + jmp *(%r10,%rax,8) +',`dnl non-PIC + jmp *GSYM_PREFIX`'__gmpn_cpuvec(,%rax,8) +') + + +C long __gmpn_cpuid (char dst[12], int id); +C +C This is called only 3 times, so just something simple and compact is fine. +C +C The rcx/ecx zeroing here is needed for the BMI2 check. + +define(`rp', `%rdi') +define(`idx', `%rsi') + +PROLOGUE(__gmpn_cpuid) + FUNC_ENTRY(2) + mov %rbx, %r8 + mov R32(idx), R32(%rax) + xor %ecx, %ecx + cpuid + mov %ebx, (rp) + mov %edx, 4(rp) + mov %ecx, 8(rp) + mov %r8, %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/fat/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/fat/gmp-mparam.h new file mode 100644 index 0000000..005c893 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fat/gmp-mparam.h @@ -0,0 +1,72 @@ +/* Fat binary x86_64 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000-2003, 2009, 2011 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + + +/* mpn_divexact_1 is faster than mpn_divrem_1 at all sizes. The only time + this might not be true currently is for actual 80386 and 80486 chips, + where mpn/x86/dive_1.asm might be slower than mpn/x86/divrem_1.asm, but + that's not worth worrying about. 
*/ +#define DIVEXACT_1_THRESHOLD 0 + +/* Only some of the x86s have an mpn_preinv_divrem_1, but we set + USE_PREINV_DIVREM_1 so that all callers use it, and then let the + __gmpn_cpuvec pointer go to plain mpn_divrem_1 if there's not an actual + preinv. */ +#define USE_PREINV_DIVREM_1 1 + +#define BMOD_1_TO_MOD_1_THRESHOLD 20 + +/* mpn_sqr_basecase is faster than mpn_mul_basecase at all sizes, no need + for mpn_sqr to call the latter. */ +#define SQR_BASECASE_THRESHOLD 0 + +/* Sensible fallbacks for these, when not taken from a cpu-specific + gmp-mparam.h. */ +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 130 +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 200 + +/* These are values more or less in the middle of what the typical x86 chips + come out as. For a fat binary it's necessary to have values for these, + since the defaults for MUL_FFT_TABLE and SQR_FFT_TABLE otherwise come out + as non-constant array initializers. FIXME: Perhaps these should be done + in the cpuvec structure like other thresholds. */ +#define MUL_FFT_TABLE { 464, 928, 1920, 3584, 10240, 40960, 0 } +#define MUL_FFT_MODF_THRESHOLD 400 +#define MUL_FFT_THRESHOLD 2000 + +#define SQR_FFT_TABLE { 528, 1184, 1920, 4608, 14336, 40960, 0 } +#define SQR_FFT_MODF_THRESHOLD 500 +#define SQR_FFT_THRESHOLD 3000 diff --git a/gmp-6.3.0/mpn/x86_64/fat/mod_1.c b/gmp-6.3.0/mpn/x86_64/fat/mod_1.c new file mode 100644 index 0000000..4f149cc --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fat/mod_1.c @@ -0,0 +1,32 @@ +/* Fat binary fallback mpn_mod_1. + +Copyright 2003, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "mpn/generic/mod_1.c" diff --git a/gmp-6.3.0/mpn/x86_64/fat/mul_basecase.c b/gmp-6.3.0/mpn/x86_64/fat/mul_basecase.c new file mode 100644 index 0000000..d9eb471 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fat/mul_basecase.c @@ -0,0 +1,32 @@ +/* Fat binary fallback mpn_mul_basecase. + +Copyright 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "mpn/generic/mul_basecase.c" diff --git a/gmp-6.3.0/mpn/x86_64/fat/mullo_basecase.c b/gmp-6.3.0/mpn/x86_64/fat/mullo_basecase.c new file mode 100644 index 0000000..7f86be6 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fat/mullo_basecase.c @@ -0,0 +1,32 @@ +/* Fat binary fallback mpn_mullo_basecase. + +Copyright 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "mpn/generic/mullo_basecase.c" diff --git a/gmp-6.3.0/mpn/x86_64/fat/redc_1.c b/gmp-6.3.0/mpn/x86_64/fat/redc_1.c new file mode 100644 index 0000000..0025403 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fat/redc_1.c @@ -0,0 +1,32 @@ +/* Fat binary fallback mpn_redc_1. + +Copyright 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "mpn/generic/redc_1.c" diff --git a/gmp-6.3.0/mpn/x86_64/fat/redc_2.c b/gmp-6.3.0/mpn/x86_64/fat/redc_2.c new file mode 100644 index 0000000..1932d58 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fat/redc_2.c @@ -0,0 +1,32 @@ +/* Fat binary fallback mpn_redc_2. + +Copyright 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "mpn/generic/redc_2.c" diff --git a/gmp-6.3.0/mpn/x86_64/fat/sqr_basecase.c b/gmp-6.3.0/mpn/x86_64/fat/sqr_basecase.c new file mode 100644 index 0000000..d1c5dcd --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fat/sqr_basecase.c @@ -0,0 +1,32 @@ +/* Fat binary fallback mpn_sqr_basecase. + +Copyright 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "mpn/generic/sqr_basecase.c" diff --git a/gmp-6.3.0/mpn/x86_64/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/gcd_11.asm new file mode 100644 index 0000000..f9b3bcc --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/gcd_11.asm @@ -0,0 +1,114 @@ +dnl AMD64 mpn_gcd_11 -- 1 x 1 gcd. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 5.5 +C AMD K10 ? +C AMD bd1 ? +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD bt1 7.1 +C AMD bt2 ? +C AMD zn1 ? +C AMD zn2 ? +C Intel P4 ? +C Intel CNR ? +C Intel PNR ? +C Intel NHM ? +C Intel WSM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom 9.1 +C Intel SLM 6.9 +C Intel GLM 6.0 +C Intel GLM+ 5.8 +C VIA nano ? + + +C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. + +deflit(MAXSHIFT, 7) +deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) + +DEF_OBJECT(ctz_table,64) + .byte MAXSHIFT +forloop(i,1,MASK, +` .byte m4_count_trailing_zeros(i) +') +END_OBJECT(ctz_table) + +define(`u0', `%rdi') +define(`v0', `%rsi') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_11) + FUNC_ENTRY(2) + LEA( ctz_table, %r8) + jmp L(ent) + + ALIGN(16) +L(top): cmovc %rdx, u0 C u = |u - v| + cmovc %rax, v0 C v = min(u,v) +L(mid): and $MASK, R32(%rdx) + movzbl (%r8,%rdx), R32(%rcx) + jz L(shift_alot) + shr R8(%rcx), u0 +L(ent): mov u0, %rax + mov v0, %rdx + sub u0, %rdx + sub v0, u0 + jnz L(top) + +L(end): C rax = result + C rdx = 0 for the benefit of internal gcd_22 call + FUNC_EXIT() + ret + +L(shift_alot): + shr $MAXSHIFT, u0 + mov u0, %rdx + jmp L(mid) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/gcd_22.asm new file mode 100644 index 0000000..78f985f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/gcd_22.asm @@ -0,0 +1,163 @@ +dnl AMD64 mpn_gcd_22. Assumes useless bsf, useless shrd, no tzcnt, no shlx. +dnl We actually use tzcnt here, when table cannot count bits, as tzcnt always +dnl works for our use, and helps a lot for certain CPUs. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 8.9 +C AMD K10 8.8 +C AMD bd1 9.7 +C AMD bd2 7.8 +C AMD bd3 ? +C AMD bd4 7.4 +C AMD bt1 9.2 +C AMD bt2 9.1 +C AMD zn1 7.5 +C AMD zn2 7.5 +C Intel P4 ? +C Intel CNR 10.5 +C Intel PNR 10.5 +C Intel NHM 9.7 +C Intel WSM 9.7 +C Intel SBR 10.7 +C Intel IBR ? +C Intel HWL 9.5 +C Intel BWL 8.7 +C Intel SKL 8.6 +C Intel atom 18.9 +C Intel SLM 14.0 +C Intel GLM 9.8 +C Intel GLM+ 8.8 +C VIA nano ? + + +C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. 
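+C
+C As an illustrative sketch of how such a table drives the reduction (this
+C mirrors the gcd_11 loop above; gcd_11_model is a hypothetical name, and
+C MASK/MAXSHIFT stand for the definitions just below):
+C
+C	unsigned long
+C	gcd_11_model (unsigned long u, unsigned long v)	/* u, v odd */
+C	{
+C	  while (u != v)
+C	    {
+C	      unsigned long d = u > v ? u - v : v - u;	/* |u - v|, even */
+C	      if (v < u)
+C	        u = v;					/* u = min(u,v) */
+C	      do					/* strip factors of 2 */
+C	        d >>= ctz_table[d & MASK];		/* index 0 => MAXSHIFT */
+C	      while ((d & 1) == 0);
+C	      v = d;
+C	    }
+C	  return u;
+C	}
+C
+C gcd_22 below applies the same step to two-limb values, except that when the
+C masked index is zero it counts the bits with tzcnt instead of looping.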
+ +deflit(MAXSHIFT, 8) +deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) + +DEF_OBJECT(ctz_table,64) + .byte MAXSHIFT +forloop(i,1,MASK, +` .byte m4_count_trailing_zeros(i) +') +END_OBJECT(ctz_table) + +define(`u1', `%rdi') +define(`u0', `%rsi') +define(`v1', `%rdx') +define(`v0_param', `%rcx') + +define(`v0', `%rax') +define(`cnt', `%rcx') + +define(`s0', `%r8') +define(`s1', `%r9') +define(`t0', `%rcx') +define(`t1', `%r11') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_22) + FUNC_ENTRY(4) + mov v0_param, v0 + + LEA( ctz_table, %r10) + + ALIGN(16) +L(top): mov v0, t0 + sub u0, t0 + jz L(lowz) C jump when low limb result = 0 + mov v1, t1 + sbb u1, t1 + + mov u0, s0 + mov u1, s1 + + sub v0, u0 + sbb v1, u1 + +L(bck): cmovc t0, u0 C u = |u - v| + cmovc t1, u1 C u = |u - v| + cmovc s0, v0 C v = min(u,v) + cmovc s1, v1 C v = min(u,v) + + and $MASK, R32(t0) + movzbl (%r10,t0), R32(cnt) + jz L(count_better) +C Rightshift (u1,,u0) into (u1,,u0) +L(shr): shr R8(cnt), u0 + mov u1, t1 + shr R8(cnt), u1 + neg cnt + shl R8(cnt), t1 + or t1, u0 + + test v1, v1 + jnz L(top) + test u1, u1 + jnz L(top) + +L(gcd_11): + mov v0, %rdi +C mov u0, %rsi + TCALL( mpn_gcd_11) + +L(count_better): + rep;bsf u0, cnt C tzcnt! + jmp L(shr) + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + mov v1, t0 + sub u1, t0 + je L(end) + + xor t1, t1 + mov u0, s0 + mov u1, s1 + mov u1, u0 + xor u1, u1 + sub v1, u0 + jmp L(bck) + +L(end): C mov v0, %rax + C mov v1, %rdx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/gmp-mparam.h new file mode 100644 index 0000000..db94fb7 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/gmp-mparam.h @@ -0,0 +1,217 @@ +/* AMD K8-K10 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000-2010, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 28 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 15 + +#define MUL_TOOM22_THRESHOLD 27 +#define MUL_TOOM33_THRESHOLD 81 +#define MUL_TOOM44_THRESHOLD 234 +#define MUL_TOOM6H_THRESHOLD 418 +#define MUL_TOOM8H_THRESHOLD 466 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 160 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 145 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 175 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 36 +#define SQR_TOOM3_THRESHOLD 117 +#define SQR_TOOM4_THRESHOLD 327 +#define SQR_TOOM6_THRESHOLD 446 +#define SQR_TOOM8_THRESHOLD 547 + +#define MULMID_TOOM42_THRESHOLD 36 + +#define MULMOD_BNM1_THRESHOLD 17 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define POWM_SEC_TABLE 2,67,322,991 + +#define MUL_FFT_MODF_THRESHOLD 570 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 570, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 25, 8}, { 13, 7}, { 29, 8}, \ + { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 21, 7}, { 43, 8}, \ + { 23, 7}, { 47, 8}, { 25, 7}, { 51, 8}, \ + { 29, 9}, { 15, 8}, { 37, 9}, { 19, 8}, \ + { 43, 9}, { 23, 8}, { 51, 9}, { 27, 8}, \ + { 55,10}, { 15, 9}, { 43,10}, { 23, 9}, \ + { 55,10}, { 31, 9}, { 63, 5}, { 1023, 4}, \ + { 2431, 5}, { 1279, 6}, { 671, 7}, { 367, 8}, \ + { 189, 9}, { 95, 8}, { 195, 9}, { 111,11}, \ + { 31, 9}, { 131,10}, { 71, 9}, { 155,10}, \ + { 79, 9}, { 159,10}, { 87,11}, { 47,10}, \ + { 111,11}, { 63,10}, { 135,11}, { 79,10}, \ + { 167,11}, { 95,10}, { 191,11}, { 111,12}, \ + { 63,11}, { 143,10}, { 287,11}, { 159,10}, \ + { 319,11}, { 175,12}, { 95,11}, { 207,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 543,11}, \ + { 287,12}, { 159,11}, { 319,10}, { 639,11}, \ + { 335,10}, { 671,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ + { 223,13}, { 127,12}, { 255,11}, { 543,12}, \ + { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \ + { 319,11}, { 639,10}, { 1279,11}, { 671,12}, \ + { 351,11}, { 703,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 415,11}, { 831,12}, { 447,14}, \ + { 127,13}, { 255,12}, { 543,11}, { 1087,12}, \ + { 607,11}, { 1215,13}, { 319,12}, { 671,11}, \ + { 1343,12}, { 735,13}, { 383,12}, { 767,11}, \ + { 1535,12}, { 799,11}, { 1599,12}, { 831,13}, \ + { 447,12}, { 895,11}, { 1791,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1215,13}, { 639,12}, { 1343,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1599,13}, \ + { 831,12}, { 1663,13}, { 895,12}, { 1791,13}, \ + { 959,15}, { 255,14}, { 511,13}, { 1087,12}, \ + { 2175,13}, { 1215,14}, { 639,13}, { 1471,14}, \ + { 767,13}, { 1663,14}, { 895,13}, { 1855,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2431,14}, { 1279,13}, { 2687,14}, { 1407,15}, \ + { 767,14}, { 1535,13}, { 3071,14}, { 1791,16}, \ + { 511,15}, { 1023,14}, { 2431,15}, { 1279,14}, \ + { 2815,15}, { 1535,14}, { 3199,15}, { 1791,14}, \ + { 3583,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, 
{4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 185 +#define MUL_FFT_THRESHOLD 7552 + +#define SQR_FFT_MODF_THRESHOLD 460 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 460, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 27, 7}, { 14, 6}, \ + { 29, 7}, { 15, 6}, { 31, 7}, { 29, 8}, \ + { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 21, 7}, { 43, 8}, \ + { 25, 7}, { 51, 8}, { 29, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \ + { 51, 9}, { 27, 8}, { 55,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 43,10}, { 23, 9}, \ + { 55,11}, { 15,10}, { 31, 9}, { 71,10}, \ + { 39, 9}, { 83,10}, { 47, 6}, { 767, 4}, \ + { 3263, 5}, { 1727, 4}, { 3455, 5}, { 1791, 6}, \ + { 927, 7}, { 479, 6}, { 959, 7}, { 511, 8}, \ + { 271, 9}, { 147,10}, { 87,11}, { 47,10}, \ + { 95,12}, { 31,11}, { 63,10}, { 135,11}, \ + { 79,10}, { 167,11}, { 95,10}, { 191,11}, \ + { 111,12}, { 63,11}, { 127,10}, { 255,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,12}, { 95,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 399,11}, { 207,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543,11}, { 287,10}, { 575,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 335,10}, { 671,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,11}, { 447,13}, \ + { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \ + { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 639,10}, \ + { 1279,11}, { 671,12}, { 351,11}, { 703,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,12}, { 447,14}, { 127,13}, { 255,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \ + { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \ + { 639,11}, { 1279,12}, { 671,11}, { 1343,12}, \ + { 703,11}, { 1407,12}, { 735,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \ + { 831,13}, { 447,12}, { 959,14}, { 255,13}, \ + { 511,12}, { 1087,13}, { 575,12}, { 1215,13}, \ + { 639,12}, { 1343,13}, { 703,12}, { 1407,14}, \ + { 383,13}, { 767,12}, { 1599,13}, { 831,12}, \ + { 1663,13}, { 895,12}, { 1791,13}, { 959,15}, \ + { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \ + { 1215,14}, { 639,13}, { 1471,14}, { 767,13}, \ + { 1663,14}, { 895,13}, { 1855,15}, { 511,14}, \ + { 1023,13}, { 2175,14}, { 1151,13}, { 2303,14}, \ + { 1279,13}, { 2559,14}, { 1407,15}, { 767,14}, \ + { 1535,13}, { 3071,14}, { 1791,16}, { 511,15}, \ + { 1023,14}, { 2303,15}, { 1279,14}, { 2687,15}, \ + { 1535,14}, { 3199,15}, { 1791,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 203 +#define SQR_FFT_THRESHOLD 5248 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 35 +#define MULLO_MUL_N_THRESHOLD 15604 + +#define DC_DIV_QR_THRESHOLD 56 +#define DC_DIVAPPR_Q_THRESHOLD 220 +#define DC_BDIV_QR_THRESHOLD 52 +#define DC_BDIV_Q_THRESHOLD 152 + +#define INV_MULMOD_BNM1_THRESHOLD 54 +#define INV_NEWTON_THRESHOLD 226 +#define INV_APPR_THRESHOLD 214 + +#define BINV_NEWTON_THRESHOLD 327 +#define REDC_1_TO_REDC_2_THRESHOLD 4 +#define REDC_2_TO_REDC_N_THRESHOLD 79 + +#define MU_DIV_QR_THRESHOLD 1895 +#define MU_DIVAPPR_Q_THRESHOLD 1895 +#define MUPI_DIV_QR_THRESHOLD 106 +#define MU_BDIV_QR_THRESHOLD 1589 +#define MU_BDIV_Q_THRESHOLD 1718 + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD_THRESHOLD 125 +#define HGCD_APPR_THRESHOLD 173 +#define HGCD_REDUCE_THRESHOLD 3524 +#define GCD_DC_THRESHOLD 555 +#define GCDEXT_DC_THRESHOLD 478 
+#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 28 +#define SET_STR_DC_THRESHOLD 248 +#define SET_STR_PRECOMPUTE_THRESHOLD 1648 + +#define FAC_DSC_THRESHOLD 1075 +#define FAC_ODD_THRESHOLD 0 /* always */ diff --git a/gmp-6.3.0/mpn/x86_64/goldmont/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/goldmont/aorrlsh_n.asm new file mode 100644 index 0000000..06c5d5d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/goldmont/aorrlsh_n.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_addlsh_n, mpn_rsblsh_n, optimised for Intel Goldmont. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) +include_mpn(`x86_64/k8/aorrlsh_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/goldmont/aors_n.asm b/gmp-6.3.0/mpn/x86_64/goldmont/aors_n.asm new file mode 100644 index 0000000..1818f9f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/goldmont/aors_n.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_add_n, mpn_sub_n, optimised for Intel Goldmont. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) +include_mpn(`x86_64/coreihwl/aors_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/goldmont/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/goldmont/aorsmul_1.asm new file mode 100644 index 0000000..9c5f631 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/goldmont/aorsmul_1.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Goldmont. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) +include_mpn(`x86_64/bd1/aorsmul_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/goldmont/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/goldmont/gmp-mparam.h new file mode 100644 index 0000000..531521d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/goldmont/gmp-mparam.h @@ -0,0 +1,264 @@ +/* Intel Goldmont gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 2200 MHz Intel Atom C3758 Goldmont/Denverton */ +/* FFT tuning limit = 468,030,122 */ +/* Generated by tuneup.c, 2019-10-12, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 5 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 38 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 3 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 17 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 19 + +#define DIV_1_VS_MUL_1_PERCENT 301 + +#define MUL_TOOM22_THRESHOLD 23 +#define MUL_TOOM33_THRESHOLD 65 +#define MUL_TOOM44_THRESHOLD 178 +#define MUL_TOOM6H_THRESHOLD 258 +#define MUL_TOOM8H_THRESHOLD 357 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 121 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 131 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 121 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 129 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 178 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 113 +#define SQR_TOOM4_THRESHOLD 290 +#define SQR_TOOM6_THRESHOLD 351 +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 36 + +#define MULMOD_BNM1_THRESHOLD 14 +#define SQRMOD_BNM1_THRESHOLD 16 + +#define MUL_FFT_MODF_THRESHOLD 440 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 440, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 24, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 103,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 159,11}, { 95,10}, \ + { 191,11}, { 111,12}, { 63,11}, { 127,10}, \ + { 255,11}, { 143,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319,12}, { 95,11}, { 191,10}, \ + { 383, 9}, { 767,11}, { 207,10}, { 415,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 303,12}, { 159,11}, { 319,10}, { 639,11}, \ + { 367,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,12}, { 223,11}, { 479,13}, { 127,12}, \ + { 255,11}, { 543,12}, { 287,11}, { 607,12}, \ + { 319,11}, { 639,12}, { 351,11}, { 703,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,12}, { 479,14}, { 127,13}, { 255,12}, \ + { 543,11}, { 1087,12}, { 607,13}, { 319,12}, \ + { 671,11}, { 1343,12}, { 703,11}, { 1407,12}, \ + { 735,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 831,13}, { 447,12}, { 959,14}, { 255,13}, \ + { 511,12}, { 1023,11}, { 2047,12}, { 1087,13}, \ + { 575,12}, { 1215,11}, { 2431,10}, { 4863,13}, \ + { 639,12}, { 1279,11}, { 2559,12}, { 1343,13}, \ + { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1727,13}, { 959,15}, \ + { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1087,12}, { 2175,13}, { 1151,12}, { 2303,13}, \ + { 1215,12}, { 2431,11}, { 4863,14}, { 639,13}, \ + { 1279,12}, { 2559,13}, { 1343,12}, { 2687,13}, \ + { 1407,12}, { 2815,13}, { 1471,12}, { 2943,11}, \ + { 5887,14}, { 
767,13}, { 1535,12}, { 3071,13}, \ + { 1727,14}, { 895,13}, { 1791,12}, { 3583,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2303,12}, { 4607,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2943,12}, { 5887,15}, { 767,14}, { 1535,13}, \ + { 3071,14}, { 1663,13}, { 3455,12}, { 6911,14}, \ + { 1791,13}, { 3583,14}, { 1919,13}, { 3839,16}, \ + { 511,15}, { 1023,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,12}, { 15359,14}, { 3967,16}, \ + { 1023,15}, { 2047,14}, { 4351,15}, { 2303,14}, \ + { 4863,15}, { 2815,14}, { 5887,13}, { 11775,16}, \ + { 1535,15}, { 3071,14}, { 6143,15}, { 3327,14}, \ + { 6911,15}, { 3839,14}, { 7679,13}, { 15359,17}, \ + { 1023,16}, { 2047,15}, { 4351,14}, { 8703,15}, \ + { 4863,16}, { 2559,15}, { 5887,14}, { 11775,16}, \ + { 3071,15}, { 6911,16}, { 3583,15}, { 7679,14}, \ + { 15359,15}, { 7935,17}, { 2047,16}, { 4095,15}, \ + { 8703,16}, { 4607,15}, { 9983,14}, { 19967,16}, \ + { 5119,15}, { 10239,16}, { 5631,15}, { 11775,17}, \ + { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 261 +#define MUL_FFT_THRESHOLD 4544 + +#define SQR_FFT_MODF_THRESHOLD 380 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 380, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287, 9}, \ + { 575,10}, { 303, 9}, { 607,10}, { 319, 9}, \ + { 639,12}, { 95,11}, { 191,10}, { 383,11}, \ + { 207,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,10}, { 607,11}, { 319,10}, \ + { 639,11}, { 351,10}, { 703,11}, { 367,12}, \ + { 191,11}, { 415,12}, { 223,11}, { 479,13}, \ + { 127,12}, { 255,11}, { 543,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 639,12}, { 351,11}, \ + { 703,10}, { 1407,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 479,14}, { 127,13}, { 255,12}, \ + { 607,13}, { 319,12}, { 703,11}, { 1407,12}, \ + { 735,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 799,13}, { 447,12}, { 895,11}, { 1791,14}, \ + { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \ + { 1151,11}, { 2303,12}, { 1215,13}, { 639,12}, \ + { 1279,13}, { 703,12}, { 1407,14}, { 383,13}, \ + { 767,12}, { 1535,13}, { 831,12}, { 1663,13}, \ + { 895,12}, { 1791,13}, { 959,15}, { 255,14}, \ + { 511,13}, { 1023,12}, { 2047,13}, { 1087,12}, \ + { 2175,13}, { 1151,12}, { 2303,13}, { 1215,12}, \ + { 2431,14}, { 639,13}, { 1279,12}, { 2559,13}, \ + { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \ + { 1471,12}, { 2943,11}, { 5887,14}, { 767,13}, \ + { 1535,12}, { 3071,13}, { 1599,12}, { 3199,13}, \ + { 1663,14}, { 895,13}, { 1791,12}, { 3583,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2303,12}, { 4607,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2943,12}, \ + { 5887,15}, { 767,14}, { 
1535,13}, { 3199,14}, \ + { 1663,13}, { 3455,12}, { 6911,14}, { 1791,13}, \ + { 3583,14}, { 1919,16}, { 511,15}, { 1023,14}, \ + { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3583,13}, { 7167,14}, { 3839,13}, { 7679,12}, \ + { 15359,16}, { 1023,15}, { 2047,14}, { 4223,15}, \ + { 2303,14}, { 4863,15}, { 2559,14}, { 5119,15}, \ + { 2815,14}, { 5887,13}, { 11775,16}, { 1535,15}, \ + { 3071,14}, { 6143,15}, { 3327,14}, { 6911,15}, \ + { 3583,14}, { 7167,15}, { 3839,14}, { 7679,13}, \ + { 15359,17}, { 1023,16}, { 2047,15}, { 4095,14}, \ + { 8191,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7679,14}, { 15359,15}, { 7935,14}, { 15871,17}, \ + { 2047,16}, { 4095,15}, { 8447,16}, { 4607,15}, \ + { 9983,14}, { 19967,16}, { 5119,15}, { 10239,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 259 +#define SQR_FFT_THRESHOLD 3520 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 62 +#define MULLO_MUL_N_THRESHOLD 8907 +#define SQRLO_BASECASE_THRESHOLD 10 +#define SQRLO_DC_THRESHOLD 13 +#define SQRLO_SQR_THRESHOLD 7035 + +#define DC_DIV_QR_THRESHOLD 51 +#define DC_DIVAPPR_Q_THRESHOLD 183 +#define DC_BDIV_QR_THRESHOLD 47 +#define DC_BDIV_Q_THRESHOLD 88 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define INV_NEWTON_THRESHOLD 226 +#define INV_APPR_THRESHOLD 204 + +#define BINV_NEWTON_THRESHOLD 264 +#define REDC_1_TO_REDC_2_THRESHOLD 28 +#define REDC_2_TO_REDC_N_THRESHOLD 54 + +#define MU_DIV_QR_THRESHOLD 1589 +#define MU_DIVAPPR_Q_THRESHOLD 1620 +#define MUPI_DIV_QR_THRESHOLD 83 +#define MU_BDIV_QR_THRESHOLD 1334 +#define MU_BDIV_Q_THRESHOLD 1470 + +#define POWM_SEC_TABLE 1,16,194,642 + +#define GET_STR_DC_THRESHOLD 10 +#define GET_STR_PRECOMPUTE_THRESHOLD 17 +#define SET_STR_DC_THRESHOLD 381 +#define SET_STR_PRECOMPUTE_THRESHOLD 1042 + +#define FAC_DSC_THRESHOLD 218 +#define FAC_ODD_THRESHOLD 25 + +#define MATRIX22_STRASSEN_THRESHOLD 21 +#define HGCD2_DIV1_METHOD 1 /* 6.58% faster than 3 */ +#define HGCD_THRESHOLD 136 +#define HGCD_APPR_THRESHOLD 168 +#define HGCD_REDUCE_THRESHOLD 3014 +#define GCD_DC_THRESHOLD 416 +#define GCDEXT_DC_THRESHOLD 393 +#define JACOBI_BASE_METHOD 4 /* 1.17% faster than 3 */ + +/* Tuneup completed successfully, took 800192 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/goldmont/mul_1.asm b/gmp-6.3.0/mpn/x86_64/goldmont/mul_1.asm new file mode 100644 index 0000000..ed1ec54 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/goldmont/mul_1.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_mul_1 optimised for Intel Goldmont. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mul_1 mpn_mul_1c) +include_mpn(`x86_64/coreisbr/mul_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/goldmont/redc_1.asm b/gmp-6.3.0/mpn/x86_64/goldmont/redc_1.asm new file mode 100644 index 0000000..1192635 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/goldmont/redc_1.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_redc_1 optimised for Intel Goldmont. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_redc_1) +include_mpn(`x86_64/k8/redc_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/invert_limb.asm b/gmp-6.3.0/mpn/x86_64/invert_limb.asm new file mode 100644 index 0000000..b375ad3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/invert_limb.asm @@ -0,0 +1,112 @@ +dnl AMD64 mpn_invert_limb -- Invert a normalized limb. + +dnl Contributed to the GNU project by Torbjorn Granlund and Niels Möller. + +dnl Copyright 2004, 2007-2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb (approx) div +C AMD K8,K9 48 71 +C AMD K10 48 77 +C Intel P4 135 161 +C Intel core2 69 116 +C Intel corei 55 89 +C Intel atom 129 191 +C VIA nano 79 157 + +C rax rcx rdx rdi rsi r8 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +PROTECT(`mpn_invert_limb_table') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_invert_limb) C Kn C2 Ci + FUNC_ENTRY(1) + mov %rdi, %rax C 0 0 0 + shr $55, %rax C 1 1 1 +ifdef(`DARWIN',` + lea mpn_invert_limb_table(%rip), %r8 + add $-512, %r8 +',` + lea -512+mpn_invert_limb_table(%rip), %r8 +') + movzwl (%r8,%rax,2), R32(%rcx) C %rcx = v0 + + C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1 + mov %rdi, %rsi C 0 0 0 + mov R32(%rcx), R32(%rax) C 4 5 5 + imul R32(%rcx), R32(%rcx) C 4 5 5 + shr $24, %rsi C 1 1 1 + inc %rsi C %rsi = d40 + imul %rsi, %rcx C 8 10 8 + shr $40, %rcx C 12 15 11 + sal $11, R32(%rax) C 5 6 6 + dec R32(%rax) + sub R32(%rcx), R32(%rax) C %rax = v1 + + C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47) + mov $0x1000000000000000, %rcx + imul %rax, %rsi C 14 17 13 + sub %rsi, %rcx + imul %rax, %rcx + sal $13, %rax + shr $47, %rcx + add %rax, %rcx C %rcx = v2 + + C v3 = (v2 << 31) + (v2 * (2^96 - v2 * d63 + ((v2 >> 1) & mask)) >> 65) + mov %rdi, %rsi C 0 0 0 + shr %rsi C d/2 + sbb %rax, %rax C -d0 = -(d mod 2) + sub %rax, %rsi C d63 = ceil(d/2) + imul %rcx, %rsi C v2 * d63 + and %rcx, %rax C v2 * d0 + shr %rax C (v2>>1) * d0 + sub %rsi, %rax C (v2>>1) * d0 - v2 * d63 + mul %rcx + sal $31, %rcx + shr %rdx + add %rdx, %rcx C %rcx = v3 + + mov %rdi, %rax + mul %rcx + add %rdi, %rax + mov %rcx, %rax + adc %rdi, %rdx + sub %rdx, %rax + + FUNC_EXIT() + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/invert_limb_table.asm b/gmp-6.3.0/mpn/x86_64/invert_limb_table.asm new file mode 100644 index 0000000..739d59e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/invert_limb_table.asm @@ -0,0 +1,50 @@ +dnl Table used for mpn_invert_limb + +dnl Contributed to the GNU project by Torbjorn Granlund and Niels Möller. + +dnl Copyright 2004, 2007-2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
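For reference: mpn_invert_limb above returns, for a normalized divisor d (top bit set), the reciprocal floor((B^2 - 1)/d) - B with B = 2^64. The assembly reaches that value without any hardware division: an 8-bit seed from mpn_invert_limb_table (defined just below; entry X holds floor(0x7fd00/(0x100+X)), so the seed is v0 = floor(0x7fd00/(d >> 55))), then the refinements v1, v2, v3 annotated in its comments, and a final correction. A minimal C sketch of the same value, assuming a compiler with unsigned __int128; invert_limb_ref is a hypothetical helper name, not a GMP entry point:

#include <stdint.h>

/* Sketch only: reference semantics of mpn_invert_limb for normalized d.
   The quotient floor((B^2 - 1)/d) lies in [B, 2B), so truncating it to
   64 bits drops the leading B, yielding floor((B^2 - 1)/d) - B.  */
uint64_t
invert_limb_ref (uint64_t d)        /* requires d >= 2^63 */
{
  unsigned __int128 b2m1 = ~(unsigned __int128) 0;   /* B^2 - 1 */
  return (uint64_t) (b2m1 / d);
}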
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +PROTECT(`mpn_invert_limb_table') + +ASM_START() +C Table entry X contains floor (0x7fd00 / (0x100 + X)) + + RODATA + ALIGN(2) + GLOBL mpn_invert_limb_table +mpn_invert_limb_table: +forloop(i,256,512-1,dnl +` .value eval(0x7fd00/i) +')dnl +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/k10/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/k10/gcd_11.asm new file mode 100644 index 0000000..4723093 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k10/gcd_11.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_11. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_11) +include_mpn(`x86_64/core2/gcd_11.asm') diff --git a/gmp-6.3.0/mpn/x86_64/k10/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/k10/gcd_22.asm new file mode 100644 index 0000000..f58b4cc --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k10/gcd_22.asm @@ -0,0 +1,142 @@ +dnl AMD64 mpn_gcd_22. Assumes useful bsf, useless shrd, no tzcnt, no shlx. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 ? +C AMD K10 7.4 +C AMD bd1 9.9 +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD bt1 ? +C AMD bt2 ? +C AMD zn1 ? +C AMD zn2 ? +C Intel P4 ? +C Intel CNR ? +C Intel PNR ? +C Intel NHM 9.2 +C Intel WSM 9.0 +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom ? +C Intel SLM ? +C Intel GLM ? +C Intel GLM+ ? 
+C VIA nano ? + + +define(`u1', `%rdi') +define(`u0', `%rsi') +define(`v1', `%rdx') +define(`v0_param', `%rcx') + +define(`v0', `%rax') +define(`cnt', `%rcx') + +define(`s0', `%r8') +define(`s1', `%r9') +define(`t0', `%r10') +define(`t1', `%r11') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_22) + FUNC_ENTRY(4) + mov v0_param, v0 + + ALIGN(16) +L(top): mov v0, t0 + sub u0, t0 + jz L(lowz) C jump when low limb result = 0 + mov v1, t1 + sbb u1, t1 + + mov u0, s0 + mov u1, s1 + + bsf t0, cnt + + sub v0, u0 + sbb v1, u1 + +L(bck): cmovc t0, u0 C u = |u - v| + cmovnc u1, t1 C u = |u - v| + cmovc s0, v0 C v = min(u,v) + cmovc s1, v1 C v = min(u,v) + + shr R8(cnt), u0 + mov t1, u1 + shr R8(cnt), u1 + neg cnt + shl R8(cnt), t1 + or t1, u0 + + test u1, u1 + jnz L(top) + test v1, v1 + jnz L(top) + +L(gcd_11): + mov v0, %rdi +C mov u0, %rsi + TCALL( mpn_gcd_11) + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + mov v1, t0 + sub u1, t0 + je L(end) + + xor t1, t1 + mov u0, s0 + mov u1, s1 + bsf t0, cnt + mov u1, u0 + xor u1, u1 + sub v1, u0 + jmp L(bck) + +L(end): C mov v0, %rax + C mov v1, %rdx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k10/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/k10/gmp-mparam.h new file mode 100644 index 0000000..349bace --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k10/gmp-mparam.h @@ -0,0 +1,248 @@ +/* AMD K10 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +#if 0 +#undef mpn_sublsh_n +#define mpn_sublsh_n(rp,up,vp,n,c) \ + (((rp) == (up)) ? 
mpn_submul_1 (rp, vp, n, CNST_LIMB(1) << (c)) \ + : MPN(mpn_sublsh_n)(rp,up,vp,n,c)) +#endif + +/* 3200-3600 MHz K10 Thuban */ +/* FFT tuning limit = 427,161,280 */ +/* Generated by tuneup.c, 2019-10-22, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 17 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 28 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 15 + +#define DIV_1_VS_MUL_1_PERCENT 324 + +#define MUL_TOOM22_THRESHOLD 27 +#define MUL_TOOM33_THRESHOLD 81 +#define MUL_TOOM44_THRESHOLD 232 +#define MUL_TOOM6H_THRESHOLD 363 +#define MUL_TOOM8H_THRESHOLD 478 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 155 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 145 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 160 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 142 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 117 +#define SQR_TOOM4_THRESHOLD 280 +#define SQR_TOOM6_THRESHOLD 446 +#define SQR_TOOM8_THRESHOLD 547 + +#define MULMID_TOOM42_THRESHOLD 34 + +#define MULMOD_BNM1_THRESHOLD 15 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 530 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 530, 5}, { 24, 6}, { 13, 5}, { 27, 6}, \ + { 27, 7}, { 14, 6}, { 29, 7}, { 15, 6}, \ + { 31, 7}, { 29, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 36, 8}, { 19, 7}, { 39, 8}, \ + { 21, 7}, { 43, 8}, { 23, 7}, { 47, 8}, \ + { 25, 7}, { 51, 8}, { 29, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \ + { 51, 9}, { 27, 8}, { 55,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 35, 8}, { 71, 9}, \ + { 39, 8}, { 81, 9}, { 43,10}, { 23, 9}, \ + { 55,11}, { 15,10}, { 31, 9}, { 71,10}, \ + { 39, 9}, { 87,10}, { 47, 9}, { 99,10}, \ + { 55,11}, { 31,10}, { 87,11}, { 47,10}, \ + { 111,12}, { 31,11}, { 63,10}, { 143,11}, \ + { 79,10}, { 167,11}, { 95,10}, { 191,11}, \ + { 111,12}, { 63,11}, { 143,10}, { 287,11}, \ + { 159,12}, { 95,11}, { 207,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543,11}, { 287,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 335,10}, { 671,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,12}, { 223,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 543,12}, { 287,11}, { 575,10}, \ + { 1151,11}, { 607,12}, { 319,11}, { 671,12}, \ + { 351,11}, { 703,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 415,11}, { 831,12}, { 447,14}, \ + { 127,13}, { 255,12}, { 543,11}, { 1087,12}, \ + { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \ + { 703,11}, { 1407,12}, { 735,13}, { 383,12}, \ + { 831,13}, { 447,12}, { 959,14}, { 255,13}, \ + { 511,12}, { 1087,13}, { 575,12}, { 1215,13}, \ + { 639,12}, { 1343,13}, { 703,12}, { 1471,14}, \ + { 383,13}, { 767,12}, { 1535,13}, { 831,12}, \ + { 1663,13}, { 959,14}, { 511,13}, { 1087,12}, \ + { 2175,13}, { 1215,14}, { 639,13}, { 1471,14}, \ + { 767,13}, { 1663,14}, { 895,13}, { 1855,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2431,14}, { 1279,13}, { 2559,14}, { 1407,15}, \ + { 767,14}, { 1791,16}, { 511,15}, { 1023,14}, \ + { 2431,15}, { 1279,14}, { 2943,15}, { 1535,14}, \ + { 3199,15}, { 
1791,14}, { 3583,16}, { 1023,15}, \ + { 2047,14}, { 4223,15}, { 2303,14}, { 4863,15}, \ + { 2559,14}, { 5247,15}, { 2815,16}, { 1535,15}, \ + { 3071,14}, { 6271,15}, { 3327,14}, { 6911,15}, \ + { 3583,17}, { 1023,16}, { 2047,15}, { 4351,14}, \ + { 8959,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7167,17}, { 2047,16}, { 4095,15}, { 8959,16}, \ + { 4607,15}, { 9983,16}, { 5631,15}, { 11775,17}, \ + { 3071,16}, { 6143,15}, { 12543,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 207 +#define MUL_FFT_THRESHOLD 7552 + +#define SQR_FFT_MODF_THRESHOLD 476 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 476, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 29, 7}, { 28, 8}, \ + { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 21, 7}, { 43, 8}, \ + { 23, 7}, { 47, 8}, { 29, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \ + { 49, 9}, { 27, 8}, { 55,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 43,10}, { 23, 9}, \ + { 55,11}, { 15,10}, { 31, 9}, { 67,10}, \ + { 39, 9}, { 83,10}, { 47, 9}, { 95,10}, \ + { 55,11}, { 31,10}, { 79,11}, { 47,10}, \ + { 103,12}, { 31,11}, { 63,10}, { 135,11}, \ + { 79,10}, { 167,11}, { 111,12}, { 63,11}, \ + { 127,10}, { 255,11}, { 143,10}, { 287, 9}, \ + { 575,11}, { 159,10}, { 319,12}, { 95,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 399,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 335,10}, \ + { 671,11}, { 351,10}, { 703,11}, { 367,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,12}, \ + { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \ + { 319,11}, { 639,10}, { 1279,11}, { 671,12}, \ + { 351,11}, { 703,10}, { 1407,13}, { 191,12}, \ + { 383,11}, { 799,12}, { 415,11}, { 831,12}, \ + { 447,14}, { 127,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \ + { 1151,12}, { 607,13}, { 319,12}, { 639,11}, \ + { 1279,12}, { 671,11}, { 1343,12}, { 703,11}, \ + { 1407,12}, { 735,13}, { 383,12}, { 767,11}, \ + { 1535,12}, { 831,11}, { 1663,13}, { 447,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1087,13}, \ + { 575,12}, { 1215,13}, { 639,12}, { 1343,13}, \ + { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1727,13}, { 895,12}, \ + { 1791,13}, { 959,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \ + { 1471,14}, { 767,13}, { 1727,14}, { 895,13}, \ + { 1791,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2303,14}, { 1279,13}, { 2559,14}, \ + { 1407,15}, { 767,14}, { 1791,16}, { 511,15}, \ + { 1023,14}, { 2303,15}, { 1279,14}, { 2815,15}, \ + { 1535,14}, { 3199,15}, { 1791,16}, { 1023,15}, \ + { 2047,14}, { 4223,15}, { 2303,14}, { 4863,15}, \ + { 2559,14}, { 5247,15}, { 2815,16}, { 1535,15}, \ + { 3071,14}, { 6271,15}, { 3327,14}, { 6911,17}, \ + { 1023,16}, { 2047,15}, { 4351,14}, { 8959,15}, \ + { 4863,16}, { 2559,15}, { 5887,14}, { 11775,16}, \ + { 3071,15}, { 6911,16}, { 3583,15}, { 7679,17}, \ + { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \ + { 9983,16}, { 5119,15}, { 10495,16}, { 5631,15}, \ + { 11775,17}, { 3071,16}, { 6143,15}, { 12287,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 224 +#define 
SQR_FFT_THRESHOLD 5568 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 61 +#define MULLO_MUL_N_THRESHOLD 14281 +#define SQRLO_BASECASE_THRESHOLD 9 +#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */ +#define SQRLO_SQR_THRESHOLD 10950 + +#define DC_DIV_QR_THRESHOLD 54 +#define DC_DIVAPPR_Q_THRESHOLD 238 +#define DC_BDIV_QR_THRESHOLD 54 +#define DC_BDIV_Q_THRESHOLD 42 + +#define INV_MULMOD_BNM1_THRESHOLD 54 +#define INV_NEWTON_THRESHOLD 252 +#define INV_APPR_THRESHOLD 230 + +#define BINV_NEWTON_THRESHOLD 327 +#define REDC_1_TO_REDC_2_THRESHOLD 25 +#define REDC_2_TO_REDC_N_THRESHOLD 67 + +#define MU_DIV_QR_THRESHOLD 1620 +#define MU_DIVAPPR_Q_THRESHOLD 1620 +#define MUPI_DIV_QR_THRESHOLD 104 +#define MU_BDIV_QR_THRESHOLD 1528 +#define MU_BDIV_Q_THRESHOLD 1652 + +#define POWM_SEC_TABLE 1,22,321,473,2144 + +#define GET_STR_DC_THRESHOLD 15 +#define GET_STR_PRECOMPUTE_THRESHOLD 24 +#define SET_STR_DC_THRESHOLD 248 +#define SET_STR_PRECOMPUTE_THRESHOLD 1304 + +#define FAC_DSC_THRESHOLD 470 +#define FAC_ODD_THRESHOLD 25 + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD2_DIV1_METHOD 5 /* 8.38% faster than 4 */ +#define HGCD_THRESHOLD 115 +#define HGCD_APPR_THRESHOLD 146 +#define HGCD_REDUCE_THRESHOLD 3524 +#define GCD_DC_THRESHOLD 535 +#define GCDEXT_DC_THRESHOLD 460 +#define JACOBI_BASE_METHOD 1 /* 0.90% faster than 4 */ + +/* Tuneup completed successfully, took 448763 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/k10/hamdist.asm b/gmp-6.3.0/mpn/x86_64/k10/hamdist.asm new file mode 100644 index 0000000..f70494a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k10/hamdist.asm @@ -0,0 +1,109 @@ +dnl AMD64 mpn_hamdist -- hamming distance. + +dnl Copyright 2008, 2010-2012, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
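The routine below computes the Hamming distance of two n-limb operands as the total population count of the per-limb XORs, handling two limbs per iteration, with POPCNT hand-encoded as .byte sequences (per the note in k10/popcount.asm further down, outdated gas versions cannot assemble the instruction). A hedged C sketch of the reference semantics; __builtin_popcountll is a GCC/Clang builtin standing in for POPCNT, and hamdist_ref is a hypothetical name:

#include <stddef.h>
#include <stdint.h>

/* Sketch only: one-limb-at-a-time reference for mpn_hamdist.  */
unsigned long
hamdist_ref (const uint64_t *ap, const uint64_t *bp, size_t n)
{
  unsigned long cnt = 0;
  for (size_t i = 0; i < n; i++)
    cnt += (unsigned long) __builtin_popcountll (ap[i] ^ bp[i]);
  return cnt;
}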
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 2.0 = +C AMD bd1 ~4.4 = +C AMD bd2 ~4.4 = +C AMD bd3 +C AMD bd4 +C AMD bobcat 7.55 = +C AMD jaguar 2.52 - +C Intel P4 - +C Intel core2 - +C Intel NHM 2.03 + +C Intel SBR 2.01 + +C Intel IBR 1.96 + +C Intel HWL 1.64 = +C Intel BWL 1.56 - +C Intel SKL 1.52 = +C Intel atom +C Intel SLM 3.0 - +C VIA nano + +define(`ap', `%rdi') +define(`bp', `%rsi') +define(`n', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_hamdist) + FUNC_ENTRY(3) + mov (ap), %r8 + xor (bp), %r8 + + lea (ap,n,8), ap C point at A operand end + lea (bp,n,8), bp C point at B operand end + neg n + + test $1, R8(n) + jz L(2) + +L(1): .byte 0xf3,0x49,0x0f,0xb8,0xc0 C popcnt %r8, %rax + xor R32(%r10), R32(%r10) + inc n + js L(top) + FUNC_EXIT() + ret + + ALIGN(16) +L(2): mov 8(ap,n,8), %r9 + .byte 0xf3,0x49,0x0f,0xb8,0xc0 C popcnt %r8, %rax + xor 8(bp,n,8), %r9 + .byte 0xf3,0x4d,0x0f,0xb8,0xd1 C popcnt %r9, %r10 + add $2, n + js L(top) + lea (%r10, %rax), %rax + FUNC_EXIT() + ret + + ALIGN(16) +L(top): mov (ap,n,8), %r8 + lea (%r10, %rax), %rax + mov 8(ap,n,8), %r9 + xor (bp,n,8), %r8 + xor 8(bp,n,8), %r9 + .byte 0xf3,0x49,0x0f,0xb8,0xc8 C popcnt %r8, %rcx + lea (%rcx, %rax), %rax + .byte 0xf3,0x4d,0x0f,0xb8,0xd1 C popcnt %r9, %r10 + add $2, n + js L(top) + + lea (%r10, %rax), %rax + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k10/lshift.asm b/gmp-6.3.0/mpn/x86_64/k10/lshift.asm new file mode 100644 index 0000000..cadf9b9 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k10/lshift.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshift optimised for AMD K10. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshift) +include_mpn(`x86_64/fastsse/lshift-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/k10/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/k10/lshiftc.asm new file mode 100644 index 0000000..48a92e5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k10/lshiftc.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshiftc optimised for AMD K10. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshiftc) +include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/k10/popcount.asm b/gmp-6.3.0/mpn/x86_64/k10/popcount.asm new file mode 100644 index 0000000..3814aea --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k10/popcount.asm @@ -0,0 +1,138 @@ +dnl AMD64 mpn_popcount -- population count. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 n/a +C AMD K10 1.125 +C Intel P4 n/a +C Intel core2 n/a +C Intel corei 1.25 +C Intel atom n/a +C VIA nano n/a + +C * The zero-offset of popcount is misassembled to the offset-less form, which +C is one byte shorter and therefore will mess up the switching code. +C * The outdated gas used in FreeBSD and NetBSD cannot handle the POPCNT insn, +C which is the main reason for our usage of '.byte'. + +C TODO +C * Improve switching code, the current code sucks. 
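The "switching code" the TODO above refers to computes a jump target from n mod 8 and enters the 8-way unrolled loop at the matching offset, so the leftover limbs need no separate cleanup loop (each popcnt/add pair below occupies 10 bytes, hence the scale-by-10 address arithmetic). A C analogue of the idea in the style of Duff's device; this sketches the technique only, not the actual register-indirect jump:

#include <stddef.h>
#include <stdint.h>

/* Sketch only: enter an 8-way unrolled popcount loop at n mod 8.
   Requires n >= 1, as the assembly likewise assumes.  */
unsigned long
popcount_unrolled (const uint64_t *up, size_t n)
{
  unsigned long c = 0;
  size_t i = 0;
  switch (n % 8)
    {
    case 0: do { c += __builtin_popcountll (up[i++]);
    case 7:      c += __builtin_popcountll (up[i++]);
    case 6:      c += __builtin_popcountll (up[i++]);
    case 5:      c += __builtin_popcountll (up[i++]);
    case 4:      c += __builtin_popcountll (up[i++]);
    case 3:      c += __builtin_popcountll (up[i++]);
    case 2:      c += __builtin_popcountll (up[i++]);
    case 1:      c += __builtin_popcountll (up[i++]);
               } while (i < n);
    }
  return c;
}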
+ +define(`up', `%rdi') +define(`n', `%rsi') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_popcount) + FUNC_ENTRY(2) + +ifelse(1,1,` + lea (up,n,8), up + +C mov R32(n), R32(%rcx) +C neg R32(%rcx) + imul $-1, R32(n), R32(%rcx) + and $8-1, R32(%rcx) + + neg n + + mov R32(%rcx), R32(%rax) + neg %rax + lea (up,%rax,8),up + + xor R32(%rax), R32(%rax) + + lea (%rcx,%rcx,4), %rcx + + lea L(top)(%rip), %rdx + lea (%rdx,%rcx,2), %rdx + jmp *%rdx +',` + lea (up,n,8), up + + mov R32(n), R32(%rcx) + neg R32(%rcx) + and $8-1, R32(%rcx) + + neg n + + mov R32(%rcx), R32(%rax) + shl $3, R32(%rax) + sub %rax, up + + xor R32(%rax), R32(%rax) + +C add R32(%rcx), R32(%rcx) C 2x +C lea (%rcx,%rcx,4), %rcx C 10x + imul $10, R32(%rcx) + + lea L(top)(%rip), %rdx + add %rcx, %rdx + jmp *%rdx +') + + ALIGN(32) +L(top): +C 0 = n mod 8 + .byte 0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x00 C popcnt 0(up,n,8), %r8 + add %r8, %rax +C 7 = n mod 8 + .byte 0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x08 C popcnt 8(up,n,8), %r9 + add %r9, %rax +C 6 = n mod 8 + .byte 0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x10 C popcnt 16(up,n,8), %r8 + add %r8, %rax +C 5 = n mod 8 + .byte 0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x18 C popcnt 24(up,n,8), %r9 + add %r9, %rax +C 4 = n mod 8 + .byte 0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x20 C popcnt 32(up,n,8), %r8 + add %r8, %rax +C 3 = n mod 8 + .byte 0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x28 C popcnt 40(up,n,8), %r9 + add %r9, %rax +C 2 = n mod 8 + .byte 0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x30 C popcnt 48(up,n,8), %r8 + add %r8, %rax +C 1 = n mod 8 + .byte 0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x38 C popcnt 56(up,n,8), %r9 + add %r9, %rax + + add $8, n + js L(top) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k10/rshift.asm b/gmp-6.3.0/mpn/x86_64/k10/rshift.asm new file mode 100644 index 0000000..249051a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k10/rshift.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_rshift optimised for AMD K10. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_rshift) +include_mpn(`x86_64/fastsse/rshift-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/k10/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/k10/sec_tabselect.asm new file mode 100644 index 0000000..e436034 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k10/sec_tabselect.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sec_tabselect. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sec_tabselect) +include_mpn(`x86_64/fastsse/sec_tabselect.asm') diff --git a/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm b/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm new file mode 100644 index 0000000..3e1898b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm @@ -0,0 +1,153 @@ +dnl AMD64 mpn_addaddmul_1msb0, R = Au + Bv, u,v < 2^63. + +dnl Copyright 2008, 2021 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.167 +C AMD K10 2.167 +C Intel P4 12.0 +C Intel core2 4.0 +C Intel corei ? +C Intel atom ? +C VIA nano ? + +C TODO +C * Perhaps handle various n mod 3 sizes better. The code now is too large. 
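The restriction u,v < 2^63 in the header comment above is what makes a single two-limb accumulator safe here: each step adds two products below 2^127 plus an incoming carry, and 2*(2^64-1)*(2^63-1) + (2^64-1) still fits in 128 bits. A hedged C sketch of the semantics, assuming unsigned __int128; the assembly unrolls the same recurrence three limbs per iteration, and addaddmul_1msb0_ref is a hypothetical name:

#include <stddef.h>
#include <stdint.h>

/* Sketch only: rp[] = ap[]*u + bp[]*v, returning the carry limb.
   The msb0 condition u, v < 2^63 keeps acc from overflowing.  */
uint64_t
addaddmul_1msb0_ref (uint64_t *rp, const uint64_t *ap, const uint64_t *bp,
                     size_t n, uint64_t u, uint64_t v)
{
  unsigned __int128 acc = 0;
  for (size_t i = 0; i < n; i++)
    {
      acc += (unsigned __int128) ap[i] * u;
      acc += (unsigned __int128) bp[i] * v;
      rp[i] = (uint64_t) acc;       /* low limb of the running sum */
      acc >>= 64;                   /* carry into the next position */
    }
  return (uint64_t) acc;
}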
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`bp_param', `%rdx') +define(`n', `%rcx') +define(`u0', `%r8') +define(`v0', `%r9') + + +define(`bp', `%rbp') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_addaddmul_1msb0) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +IFDOS(` mov 64(%rsp), %r9 ') + push %rbp + + lea (ap,n,8), ap + lea (bp_param,n,8), bp + lea (rp,n,8), rp + neg n + + mov (ap,n,8), %rax + mul %r8 + mov %rax, %r11 + mov (bp,n,8), %rax + mov %rdx, %r10 + add $3, n + jns L(end) + + push %r13 + + ALIGN(16) +L(top): mul %r9 + add %rax, %r11 + mov -16(ap,n,8), %rax + adc %rdx, %r10 + mov %r11, -24(rp,n,8) + mul %r8 + add %rax, %r10 + mov -16(bp,n,8), %rax + mov $0, R32(%r13) + adc %rdx, %r13 + mul %r9 + add %rax, %r10 + mov -8(ap,n,8), %rax + adc %rdx, %r13 + mov %r10, -16(rp,n,8) + mul %r8 + add %rax, %r13 + mov -8(bp,n,8), %rax + mov $0, R32(%r11) + adc %rdx, %r11 + mul %r9 + add %rax, %r13 + adc %rdx, %r11 + mov (ap,n,8), %rax + mul %r8 + add %rax, %r11 + mov %r13, -8(rp,n,8) + mov (bp,n,8), %rax + mov $0, R32(%r10) + adc %rdx, %r10 + add $3, n + js L(top) + + pop %r13 + +L(end): mul %r9 + add %rax, %r11 + adc %rdx, %r10 + cmp $1, R32(n) + ja L(two) + mov -16(ap,n,8), %rax + mov %r11, -24(rp,n,8) + mov %r10, %r11 + jz L(one) + +L(nul): mul %r8 + add %rax, %r10 + mov -16(bp), %rax + mov $0, R32(%r11) + adc %rdx, %r11 + mul %r9 + add %rax, %r10 + mov -8(ap), %rax + adc %rdx, %r11 + mov %r10, -16(rp) +L(one): mul %r8 + add %rax, %r11 + mov -8(bp), %rax + mov $0, R32(%r10) + adc %rdx, %r10 + mul %r9 + add %rax, %r11 + adc %rdx, %r10 + +L(two): mov %r11, -8(rp) + mov %r10, %rax +L(ret): pop %rbp + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm new file mode 100644 index 0000000..78bcba1 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm @@ -0,0 +1,195 @@ +dnl AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and +dnl add the result to a third limb vector. + +dnl Copyright 2008, 2011, 2012, 2016 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cfg cycles/limb am1+am1 +C AMD K8,K9 2.375 +C AMD K10 2.375 +C AMD bull 5.2 <- 4.6-4.75 bad +C AMD pile 4.96 <- 4.6-4.75 bad +C AMD steam ? +C AMD excavator ? 
+C AMD bobcat 5.75 5.0 bad +C AMD jaguar 5.9 5.2-5.4 bad +C Intel P4 15-16 +C Intel core2 4.5 4.25-4.5 bad +C Intel NHM 4.33 4.55 bad +C Intel SBR 3.4 2.93 3.24 bad +C Intel IBR 3.35 2.6 2.95 bad +C Intel HWL 3.3 2.15 2.3 bad +C Intel BWL 2.33 2.33 1.65 bad +C Intel SKL 2.37 2.21 1.64 bad +C Intel atom 20 18.7 +C Intel SLM 8 8.5 +C VIA nano 4.4 + +C This code is the result of running a code generation and optimization tool +C suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Tune feed-in and wind-down code. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n_param',`%rdx') +define(`vp', `%rcx') + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_addmul_2) + FUNC_ENTRY(4) + mov n_param, n + push %rbx + push %rbp + + mov 0(vp), v0 + mov 8(vp), v1 + + mov R32(n_param), R32(%rbx) + mov (up), %rax + lea -8(up,n_param,8), up + lea -8(rp,n_param,8), rp + mul v0 + neg n + and $3, R32(%rbx) + jz L(b0) + cmp $2, R32(%rbx) + jc L(b1) + jz L(b2) + +L(b3): mov %rax, w1 + mov %rdx, w2 + xor R32(w3), R32(w3) + mov 8(up,n,8), %rax + dec n + jmp L(lo3) + +L(b2): mov %rax, w2 + mov 8(up,n,8), %rax + mov %rdx, w3 + xor R32(w0), R32(w0) + add $-2, n + jmp L(lo2) + +L(b1): mov %rax, w3 + mov 8(up,n,8), %rax + mov %rdx, w0 + xor R32(w1), R32(w1) + inc n + jmp L(lo1) + +L(b0): mov $0, R32(w3) + mov %rax, w0 + mov 8(up,n,8), %rax + mov %rdx, w1 + xor R32(w2), R32(w2) + jmp L(lo0) + + ALIGN(32) +L(top): mov $0, R32(w1) + mul v0 + add %rax, w3 + mov (up,n,8), %rax + adc %rdx, w0 + adc $0, R32(w1) +L(lo1): mul v1 + add w3, (rp,n,8) + mov $0, R32(w3) + adc %rax, w0 + mov $0, R32(w2) + mov 8(up,n,8), %rax + adc %rdx, w1 + mul v0 + add %rax, w0 + mov 8(up,n,8), %rax + adc %rdx, w1 + adc $0, R32(w2) +L(lo0): mul v1 + add w0, 8(rp,n,8) + adc %rax, w1 + adc %rdx, w2 + mov 16(up,n,8), %rax + mul v0 + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 16(up,n,8), %rax +L(lo3): mul v1 + add w1, 16(rp,n,8) + adc %rax, w2 + adc %rdx, w3 + xor R32(w0), R32(w0) + mov 24(up,n,8), %rax + mul v0 + add %rax, w2 + mov 24(up,n,8), %rax + adc %rdx, w3 + adc $0, R32(w0) +L(lo2): mul v1 + add w2, 24(rp,n,8) + adc %rax, w3 + adc %rdx, w0 + mov 32(up,n,8), %rax + add $4, n + js L(top) + +L(end): xor R32(w1), R32(w1) + mul v0 + add %rax, w3 + mov (up), %rax + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 + add w3, (rp) + adc %rax, w0 + adc %rdx, w1 + mov w0, 8(rp) + mov w1, %rax + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm new file mode 100644 index 0000000..ff3a184 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm @@ -0,0 +1,217 @@ +dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V2^k +- U. + +dnl Copyright 2006, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.87 < 3.85 for lshift + add_n +C AMD K10 2.75 < 3.85 for lshift + add_n +C Intel P4 22 > 7.33 for lshift + add_n +C Intel core2 4.1 > 3.27 for lshift + add_n +C Intel NHM 4.4 > 3.75 for lshift + add_n +C Intel SBR 3.17 < 3.46 for lshift + add_n +C Intel atom ? ? 8.75 for lshift + add_n +C VIA nano 4.7 < 6.25 for lshift + add_n + +C TODO +C * Can we propagate carry into rdx instead of using a special carry register? +C That could save enough insns to get to 10 cycles/iteration. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp_param', `%rdx') +define(`n_param', `%rcx') +define(`cnt', `%r8') + +define(`vp', `%r12') +define(`n', `%rbp') + +ifdef(`OPERATION_addlsh_n',` + define(ADDSUB, `add') + define(ADCSBB, `adc') + define(func, mpn_addlsh_n) +') +ifdef(`OPERATION_rsblsh_n',` + define(ADDSUB, `sub') + define(ADCSBB, `sbb') + define(func, mpn_rsblsh_n) +') + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + push %r12 + push %rbp + push %rbx + + mov (vp_param), %rax C load first V limb early + + mov $0, R32(n) + sub n_param, n + + lea -16(up,n_param,8), up + lea -16(rp,n_param,8), rp + lea 16(vp_param,n_param,8), vp + + mov n_param, %r9 + + mov %r8, %rcx + mov $1, R32(%r8) + shl R8(%rcx), %r8 + + mul %r8 C initial multiply + + and $3, R32(%r9) + jz L(b0) + cmp $2, R32(%r9) + jc L(b1) + jz L(b2) + +L(b3): mov %rax, %r11 + ADDSUB 16(up,n,8), %r11 + mov -8(vp,n,8), %rax + sbb R32(%rcx), R32(%rcx) + mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov (vp,n,8), %rax + mov %rdx, %r9 + mul %r8 + or %rax, %r9 + add $3, n + jnz L(lo3) + jmp L(cj3) + +L(b2): mov %rax, %rbx + mov -8(vp,n,8), %rax + mov %rdx, %r9 + mul %r8 + or %rax, %r9 + add $2, n + jz L(cj2) + mov %rdx, %r10 + mov -16(vp,n,8), %rax + mul %r8 + or %rax, %r10 + xor R32(%rcx), R32(%rcx) C clear carry register + jmp L(lo2) + +L(b1): mov %rax, %r9 + mov %rdx, %r10 + add $1, n + jnz L(gt1) + ADDSUB 8(up,n,8), %r9 + jmp L(cj1) +L(gt1): mov -16(vp,n,8), %rax + mul %r8 + or %rax, %r10 + mov %rdx, %r11 + mov -8(vp,n,8), %rax + mul %r8 + or %rax, %r11 + ADDSUB 8(up,n,8), %r9 + ADCSBB 16(up,n,8), %r10 + ADCSBB 24(up,n,8), %r11 + mov (vp,n,8), %rax + sbb R32(%rcx), R32(%rcx) + jmp L(lo1) + +L(b0): mov %rax, %r10 + mov %rdx, %r11 + mov -8(vp,n,8), %rax + mul %r8 + or %rax, %r11 + ADDSUB 16(up,n,8), %r10 + ADCSBB 24(up,n,8), %r11 + mov (vp,n,8), %rax + sbb R32(%rcx), R32(%rcx) + mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov 8(vp,n,8), %rax + add $4, n + jz L(end) + + ALIGN(8) +L(top): mov %rdx, %r9 + mul %r8 + or %rax, %r9 + mov %r10, -16(rp,n,8) +L(lo3): mov %rdx, %r10 + mov -16(vp,n,8), %rax + mul %r8 + or %rax, %r10 + mov %r11, -8(rp,n,8) +L(lo2): mov %rdx, %r11 + mov -8(vp,n,8), %rax + mul %r8 + or %rax, %r11 + add R32(%rcx), R32(%rcx) + ADCSBB (up,n,8), %rbx + ADCSBB 8(up,n,8), %r9 + ADCSBB 16(up,n,8), %r10 + ADCSBB 24(up,n,8), %r11 + mov (vp,n,8), %rax + 
sbb R32(%rcx), R32(%rcx) + mov %rbx, (rp,n,8) +L(lo1): mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov %r9, 8(rp,n,8) +L(lo0): mov 8(vp,n,8), %rax + add $4, n + jnz L(top) + +L(end): mov %rdx, %r9 + mul %r8 + or %rax, %r9 + mov %r10, -16(rp,n,8) +L(cj3): mov %r11, -8(rp,n,8) +L(cj2): add R32(%rcx), R32(%rcx) + ADCSBB (up,n,8), %rbx + ADCSBB 8(up,n,8), %r9 + mov %rbx, (rp,n,8) +L(cj1): mov %r9, 8(rp,n,8) + mov %rdx, %rax + ADCSBB $0, %rax + pop %rbx + pop %rbp + pop %r12 + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm b/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm new file mode 100644 index 0000000..1172b0d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm @@ -0,0 +1,179 @@ +dnl AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor, +dnl returning quotient only. + +dnl Copyright 2001, 2002, 2004-2006, 2009, 2011, 2012, 2017 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
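The routine below computes the inverse of an odd divisor modulo 2^64 (d * inv == 1 mod 2^64): an 8-bit seed is looked up in binvert_limb_table, then three Newton/Hensel steps of the form inv = 2*inv - inv*inv*d double the number of correct bits to 16, 32 and 64. A minimal C sketch of the same iteration, with an illustrative name and the seed derived arithmetically instead of from the table (at the cost of two extra steps):

    #include <stdint.h>

    uint64_t binvert64 (uint64_t d)        /* d must be odd */
    {
      uint64_t inv = d;                    /* 3 bits: d*d == 1 (mod 8) */
      inv *= 2 - d * inv;                  /* 6 bits */
      inv *= 2 - d * inv;                  /* 12 bits */
      inv *= 2 - d * inv;                  /* 24 bits */
      inv *= 2 - d * inv;                  /* 48 bits */
      inv *= 2 - d * inv;                  /* 64 bits */
      return inv;
    }

Note that inv*(2 - d*inv) is the same quantity as the 2*inv - inv*inv*d formed by the lea/imul/sub triplets below. Even divisors are handled by first stripping the trailing zeros with bsf, since only odd numbers are invertible mod 2^64; each quotient limb is then produced as q = (u - borrow) * inv mod B, with the next borrow taken from the high half of q*d (the imul/mul pair in the main loop).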
+ +include(`../config.m4') + +C cycles/limb +C norm/unorm +C AMD K8,K9 10 + +C AMD K10 10 + +C AMD bull 13.7 - +C AMD pile 13.7 + +C AMD steam +C AMD excavator +C AMD bobcat 15 - +C AMD jaguar 16 - +C Intel P4 33 = +C Intel core2 13.25 = +C Intel NHM 14 = +C Intel SBR 8.5 - +C Intel IBR 8.5 - +C Intel HWL 8 = +C Intel BWL 8 = +C Intel SKL 8 = +C Intel atom 42 -- +C Intel SLM 20.4 -- +C VIA nano + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`d', `%rcx') +define(`di', `%r8') C just mpn_pi1_bdiv_q_1 +define(`ncnt', `%r9') C just mpn_pi1_bdiv_q_1 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_bdiv_q_1) + FUNC_ENTRY(4) + push %rbx + + mov %rcx, %rax + xor R32(%rcx), R32(%rcx) C ncnt count + mov %rdx, %r10 + + bt $0, R32(%rax) + jnc L(evn) C skip bsf unless divisor is even + +L(odd): mov %rax, %rbx + shr R32(%rax) + and $127, R32(%rax) C d/2, 7 bits + + LEA( binvert_limb_table, %rdx) + + movzbl (%rdx,%rax), R32(%rax) C inv 8 bits + + mov %rbx, %r11 C d without twos + + lea (%rax,%rax), R32(%rdx) C 2*inv + imul R32(%rax), R32(%rax) C inv*inv + imul R32(%rbx), R32(%rax) C inv*inv*d + sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits + + lea (%rdx,%rdx), R32(%rax) C 2*inv + imul R32(%rdx), R32(%rdx) C inv*inv + imul R32(%rbx), R32(%rdx) C inv*inv*d + sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits + + lea (%rax,%rax), %r8 C 2*inv + imul %rax, %rax C inv*inv + imul %rbx, %rax C inv*inv*d + sub %rax, %r8 C inv = 2*inv - inv*inv*d, 64 bits + + jmp L(pi1) + +L(evn): bsf %rax, %rcx + shr R8(%rcx), %rax + jmp L(odd) +EPILOGUE() + +PROLOGUE(mpn_pi1_bdiv_q_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +IFDOS(` mov 64(%rsp), %r9 ') + push %rbx + + mov %rcx, %r11 C d + mov %rdx, %r10 C n + mov %r9, %rcx C ncnt + +L(pi1): mov (up), %rax C up[0] + + dec %r10 + jz L(one) + + mov 8(up), %rdx C up[1] + lea (up,%r10,8), up C up end + lea (rp,%r10,8), rp C rp end + neg %r10 C -n + + shrd R8(%rcx), %rdx, %rax + + xor R32(%rbx), R32(%rbx) + jmp L(ent) + + ALIGN(8) +L(top): + C rax q + C rbx carry bit, 0 or 1 + C rcx ncnt + C rdx + C r10 counter, limbs, negative + C r11 d + + mul %r11 C carry limb in rdx + mov (up,%r10,8), %rax + mov 8(up,%r10,8), %r9 + shrd R8(%rcx), %r9, %rax + nop + sub %rbx, %rax C apply carry bit + setc R8(%rbx) + sub %rdx, %rax C apply carry limb + adc $0, R32(%rbx) +L(ent): imul %r8, %rax + mov %rax, (rp,%r10,8) + inc %r10 + jnz L(top) + + mul %r11 C carry limb in rdx + mov (up), %rax C up high limb + shr R8(%rcx), %rax + sub %rbx, %rax C apply carry bit + sub %rdx, %rax C apply carry limb + imul %r8, %rax + mov %rax, (rp) + pop %rbx + FUNC_EXIT() + ret + +L(one): shr R8(%rcx), %rax + imul %r8, %rax + mov %rax, (rp) + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm b/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm new file mode 100644 index 0000000..86de08c --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm @@ -0,0 +1,249 @@ +dnl x86-64 mpn_div_qr_1n_pi1 +dnl -- Divide an mpn number by a normalized single-limb number, +dnl using a single-limb inverse. + +dnl Contributed to the GNU project by Niels Möller + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C c/l +C AMD K8,K9 11 +C AMD K10 11 +C AMD bull 16 +C AMD pile 14.25 +C AMD steam ? +C AMD bobcat 16 +C AMD jaguar ? +C Intel P4 47.5 poor +C Intel core 28.5 very poor +C Intel NHM 29 very poor +C Intel SBR 16 poor +C Intel IBR 13.5 +C Intel HWL 12 +C Intel BWL ? +C Intel atom 53 very poor +C VIA nano 19 + + +C INPUT Parameters +define(`QP', `%rdi') +define(`UP', `%rsi') +define(`UN_INPUT', `%rdx') +define(`U1', `%rcx') C Also in %rax +define(`D', `%r8') +define(`DINV', `%r9') + +C Invariants +define(`B2', `%rbp') +define(`B2md', `%rbx') + +C Variables +define(`UN', `%r8') C Overlaps D input +define(`T', `%r10') +define(`U0', `%r11') +define(`U2', `%r12') +define(`Q0', `%r13') +define(`Q1', `%r14') +define(`Q2', `%r15') + +ABI_SUPPORT(STD64) + + ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_div_qr_1n_pi1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +IFDOS(` mov 64(%rsp), %r9 ') + dec UN_INPUT + jnz L(first) + + C Just a single 2/1 division. + C T, U0 are allocated in scratch registers + lea 1(U1), T + mov U1, %rax + mul DINV + mov (UP), U0 + add U0, %rax + adc T, %rdx + mov %rdx, T + imul D, %rdx + sub %rdx, U0 + cmp U0, %rax + lea (U0, D), %rax + cmovnc U0, %rax + sbb $0, T + cmp D, %rax + jc L(single_div_done) + sub D, %rax + add $1, T +L(single_div_done): + mov T, (QP) + FUNC_EXIT() + ret +L(first): + C FIXME: Could delay some of these until we enter the loop. + push %r15 + push %r14 + push %r13 + push %r12 + push %rbx + push %rbp + + mov D, B2 + imul DINV, B2 + neg B2 + mov B2, B2md + sub D, B2md + + C D not needed until final reduction + push D + mov UN_INPUT, UN C Clobbers D + + mov DINV, %rax + mul U1 + mov %rax, Q0 + add U1, %rdx + mov %rdx, T + + mov B2, %rax + mul U1 + mov -8(UP, UN, 8), U0 + mov (UP, UN, 8), U1 + mov T, (QP, UN, 8) + add %rax, U0 + adc %rdx, U1 + sbb U2, U2 + dec UN + mov U1, %rax + jz L(final) + mov $0, R32(Q1) + + ALIGN(16) + + C Loop is 28 instructions, 30 K8/K10 decoder slots, should run + C in 10 cycles. At entry, %rax holds an extra copy of U1, Q1 + C is zero, and carry holds an extra copy of U2. 
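Both the single-division path above and the final reduction at L(final) end with the same 2/1 step: divide a two-limb value by the normalized d using the precomputed inverse. A scalar C rendering, with illustrative names and assuming DINV follows GMP's usual reciprocal convention dinv = floor((B^2-1)/d) - B, B = 2^64:

    #include <stdint.h>

    /* u1*B + u0 = q*d + r with 0 <= r < d; requires u1 < d, d normalized */
    static uint64_t div_2by1 (uint64_t *q, uint64_t u1, uint64_t u0,
                              uint64_t d, uint64_t dinv)
    {
      unsigned __int128 p = (unsigned __int128) dinv * u1;
      uint64_t q0 = (uint64_t) p + u0;
      uint64_t q1 = (uint64_t) (p >> 64) + u1 + 1 + (q0 < u0);
      uint64_t r  = u0 - q1 * d;            /* computed mod 2^64 */
      if (r > q0)  { q1--; r += d; }        /* usual adjustment */
      if (r >= d)  { q1++; r -= d; }        /* rare second adjustment */
      *q = q1;
      return r;
    }

This is the Möller-Granlund division-by-invariant-integers step; the lea 1(U1), cmovnc and sbb $0 above are the branch-free form of the two adjustments, and the loop below software-pipelines the same computation across iterations.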
+L(loop): + C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2 + C Remains to add in B (U1 + c) + cmovc DINV, Q1 + mov U2, Q2 + neg Q2 + mul DINV + add %rdx, Q1 + adc $0, Q2 + add Q0, Q1 + mov %rax, Q0 + mov B2, %rax + lea (B2md, U0), T + adc $0, Q2 + + C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u + mul U1 + and B2, U2 + add U2, U0 + cmovnc U0, T + + C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c + adc U1, Q1 + mov -8(UP, UN, 8), U0 + adc Q2, 8(QP, UN, 8) + jc L(q_incr) +L(q_incr_done): + add %rax, U0 + mov T, %rax + adc %rdx, %rax + mov Q1, (QP, UN, 8) + mov $0, R32(Q1) + sbb U2, U2 + dec UN + mov %rax, U1 + jnz L(loop) + +L(final): + pop D + + mov U2, Q1 + and D, U2 + sub U2, %rax + neg Q1 + + mov %rax, U1 + sub D, %rax + cmovc U1, %rax + sbb $-1, Q1 + + lea 1(%rax), T + mul DINV + add U0, %rax + adc T, %rdx + mov %rdx, T + imul D, %rdx + sub %rdx, U0 + cmp U0, %rax + lea (U0, D), %rax + cmovnc U0, %rax + sbb $0, T + cmp D, %rax + jc L(div_done) + sub D, %rax + add $1, T +L(div_done): + add T, Q0 + mov Q0, (QP) + adc Q1, 8(QP) + jnc L(done) +L(final_q_incr): + addq $1, 16(QP) + lea 8(QP), QP + jc L(final_q_incr) + +L(done): + pop %rbp + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + FUNC_EXIT() + ret + +L(q_incr): + C U1 is not live, so use it for indexing + lea 16(QP, UN, 8), U1 +L(q_incr_loop): + addq $1, (U1) + jnc L(q_incr_done) + lea 8(U1), U1 + jmp L(q_incr_loop) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h new file mode 100644 index 0000000..d87cc3b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h @@ -0,0 +1,237 @@ +/* AMD K8 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +#if 0 +#undef mpn_sublsh_n +#define mpn_sublsh_n(rp,up,vp,n,c) \ + (((rp) == (up)) ? 
mpn_submul_1 (rp, vp, n, CNST_LIMB(1) << (c)) \ + : MPN(mpn_sublsh_n)(rp,up,vp,n,c)) +#endif + +/* 2500 MHz K8 Brisbane */ +/* FFT tuning limit = 115,768,433 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 35 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 16 + +#define DIV_1_VS_MUL_1_PERCENT 309 + +#define MUL_TOOM22_THRESHOLD 28 +#define MUL_TOOM33_THRESHOLD 81 +#define MUL_TOOM44_THRESHOLD 232 +#define MUL_TOOM6H_THRESHOLD 324 +#define MUL_TOOM8H_THRESHOLD 478 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 153 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 154 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 160 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 226 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 34 +#define SQR_TOOM3_THRESHOLD 114 +#define SQR_TOOM4_THRESHOLD 336 +#define SQR_TOOM6_THRESHOLD 430 +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 36 + +#define MULMOD_BNM1_THRESHOLD 17 +#define SQRMOD_BNM1_THRESHOLD 19 + +#define MUL_FFT_MODF_THRESHOLD 654 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 654, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 27, 7}, { 14, 6}, { 29, 7}, { 15, 6}, \ + { 31, 7}, { 29, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 37, 8}, { 19, 7}, { 39, 8}, \ + { 21, 7}, { 44, 8}, { 23, 7}, { 47, 8}, \ + { 25, 7}, { 51, 8}, { 31, 7}, { 63, 8}, \ + { 37, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \ + { 53, 9}, { 27, 8}, { 57, 9}, { 31, 8}, \ + { 67, 9}, { 35, 8}, { 71, 9}, { 39, 8}, \ + { 81, 9}, { 43,10}, { 23, 9}, { 55, 8}, \ + { 111,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 87,10}, { 47, 9}, { 99,10}, { 55, 9}, \ + { 111,11}, { 31,10}, { 63, 9}, { 131,10}, \ + { 71, 9}, { 147,10}, { 87,11}, { 47,10}, \ + { 111,11}, { 63,10}, { 143,11}, { 79,10}, \ + { 167,11}, { 95,10}, { 199,11}, { 111,12}, \ + { 63,11}, { 143,10}, { 287,11}, { 159,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 335,10}, \ + { 671,11}, { 351,12}, { 191,11}, { 415,12}, \ + { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,12}, { 319,11}, { 671,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,11}, { 895,12}, \ + { 479,14}, { 127,13}, { 255,12}, { 543,11}, \ + { 1087,12}, { 575,11}, { 1151,12}, { 607,13}, \ + { 319,12}, { 735,13}, { 383,12}, { 831,13}, \ + { 447,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1087,13}, { 575,12}, { 1215,13}, { 639,12}, \ + { 1279,13}, { 703,12}, { 1407,14}, { 383,13}, \ + { 767,12}, { 1535,13}, { 831,12}, { 1663,13}, \ + { 959,15}, { 255,14}, { 511,13}, { 1215,14}, \ + { 639,13}, { 1471,14}, { 767,13}, { 1663,14}, \ + { 895,13}, { 1855,15}, { 511,14}, { 1023,13}, \ + { 2047,14}, { 1151,13}, { 2367,14}, { 1407,15}, \ + { 767,14}, { 1791,16}, { 511,15}, { 1023,14}, \ + { 2303,15}, { 1279,14}, { 
2687,15}, { 1535,14}, \ + { 3199,15}, { 1791,16}, { 1023,15}, { 2047,14}, \ + { 4223,15}, { 2303,14}, { 4735,15}, { 2559,16}, \ + { 1535,15}, { 3071,14}, { 6271,15}, { 3327,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 183 +#define MUL_FFT_THRESHOLD 11520 + +#define SQR_FFT_MODF_THRESHOLD 540 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 540, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 16, 5}, { 33, 6}, { 29, 7}, { 15, 6}, \ + { 31, 7}, { 16, 6}, { 33, 7}, { 33, 8}, \ + { 17, 7}, { 37, 8}, { 19, 7}, { 39, 8}, \ + { 21, 7}, { 43, 8}, { 23, 7}, { 47, 8}, \ + { 25, 7}, { 51, 8}, { 29, 9}, { 15, 8}, \ + { 37, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \ + { 51, 9}, { 27, 8}, { 55, 9}, { 31, 8}, \ + { 65, 9}, { 35, 8}, { 71, 9}, { 43,10}, \ + { 23, 9}, { 55,10}, { 31, 9}, { 71,10}, \ + { 39, 9}, { 83,10}, { 47, 9}, { 99,10}, \ + { 55, 9}, { 111,11}, { 31,10}, { 63, 9}, \ + { 127,10}, { 87,11}, { 47,10}, { 111,12}, \ + { 31,11}, { 63,10}, { 143,11}, { 79,10}, \ + { 167,11}, { 95,10}, { 191,11}, { 111,12}, \ + { 63,11}, { 127, 9}, { 511,11}, { 143,10}, \ + { 287, 9}, { 575,11}, { 159,12}, { 95,11}, \ + { 191,10}, { 383, 9}, { 767,11}, { 207,10}, \ + { 415,13}, { 63,12}, { 127,10}, { 511, 9}, \ + { 1023,11}, { 271,10}, { 543, 9}, { 1087,11}, \ + { 287,10}, { 575,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 335,10}, { 671,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,10}, { 831,12}, { 223,11}, { 447,13}, \ + { 127,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,12}, { 319,11}, { 639,10}, { 1279,11}, \ + { 671,12}, { 351,11}, { 703,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,11}, { 895,14}, { 127,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \ + { 1151,12}, { 607,11}, { 1215,13}, { 319,12}, \ + { 639,11}, { 1279,12}, { 671,11}, { 1343,12}, \ + { 703,11}, { 1407,12}, { 735,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 831,13}, { 447,12}, \ + { 959,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1215,13}, { 639,12}, { 1343,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 895,12}, { 1791,13}, \ + { 959,14}, { 511,13}, { 1215,14}, { 639,13}, \ + { 1471,14}, { 767,13}, { 1663,14}, { 895,13}, \ + { 1791,15}, { 511,14}, { 1023,13}, { 2111,14}, \ + { 1151,13}, { 2303,14}, { 1407,15}, { 767,14}, \ + { 1791,16}, { 511,15}, { 1023,14}, { 2303,15}, \ + { 1279,14}, { 2687,15}, { 1535,14}, { 3199,15}, \ + { 1791,16}, { 1023,15}, { 2047,14}, { 4223,15}, \ + { 2303,14}, { 4863,15}, { 2559,16}, { 1535,15}, \ + { 3071,14}, { 6271,15}, { 3327,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 202 +#define SQR_FFT_THRESHOLD 7296 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 61 +#define MULLO_MUL_N_THRESHOLD 22239 +#define SQRLO_BASECASE_THRESHOLD 8 +#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */ +#define SQRLO_SQR_THRESHOLD 14281 + +#define DC_DIV_QR_THRESHOLD 47 +#define DC_DIVAPPR_Q_THRESHOLD 266 +#define DC_BDIV_QR_THRESHOLD 38 +#define DC_BDIV_Q_THRESHOLD 104 + +#define INV_MULMOD_BNM1_THRESHOLD 54 +#define INV_NEWTON_THRESHOLD 252 +#define INV_APPR_THRESHOLD 250 + +#define BINV_NEWTON_THRESHOLD 258 +#define REDC_1_TO_REDC_2_THRESHOLD 35 +#define REDC_2_TO_REDC_N_THRESHOLD 79 + +#define 
MU_DIV_QR_THRESHOLD 2089 +#define MU_DIVAPPR_Q_THRESHOLD 1895 +#define MUPI_DIV_QR_THRESHOLD 99 +#define MU_BDIV_QR_THRESHOLD 1787 +#define MU_BDIV_Q_THRESHOLD 1895 + +#define POWM_SEC_TABLE 1,16,194,960,2825 + +#define GET_STR_DC_THRESHOLD 16 +#define GET_STR_PRECOMPUTE_THRESHOLD 26 +#define SET_STR_DC_THRESHOLD 248 +#define SET_STR_PRECOMPUTE_THRESHOLD 1747 + +#define FAC_DSC_THRESHOLD 1240 +#define FAC_ODD_THRESHOLD 27 + +#define MATRIX22_STRASSEN_THRESHOLD 21 +#define HGCD2_DIV1_METHOD 3 /* 4.10% faster than 5 */ +#define HGCD_THRESHOLD 141 +#define HGCD_APPR_THRESHOLD 181 +#define HGCD_REDUCE_THRESHOLD 4633 +#define GCD_DC_THRESHOLD 622 +#define GCDEXT_DC_THRESHOLD 496 +#define JACOBI_BASE_METHOD 1 /* 0.97% faster than 3 */ + +/* Tuneup completed successfully, took 131832 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm new file mode 100644 index 0000000..ca2efb9 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm @@ -0,0 +1,469 @@ +dnl AMD64 mpn_mul_basecase. + +dnl Contributed to the GNU project by Torbjorn Granlund and David Harvey. + +dnl Copyright 2008, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.375 +C AMD K10 2.375 +C Intel P4 15-16 +C Intel core2 4.45 +C Intel corei 4.35 +C Intel atom ? +C VIA nano 4.5 + +C The inner loops of this code are the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Use fewer registers. (how??? I can't see it -- david) +C * Avoid some "mov $0,r" and instead use "xor r,r". +C * Can the top of each L(addmul_outer_n) prologue be folded into the +C mul_1/mul_2 prologues, saving a LEA (%rip)? It would slow down the +C case where vn = 1 or 2; is it worth it? 
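Beneath the unrolling, the routine is plain schoolbook multiplication: one full-length initial row, then vn-1 rows accumulated on top of it. A reference version with one v limb per row, assuming 64-bit limbs (the assembly processes two v limbs per pass via mul_2/addmul_2):

    #include <stdint.h>

    static uint64_t mul_1 (uint64_t *rp, const uint64_t *up, long n,
                           uint64_t v)
    {
      uint64_t cy = 0;
      for (long i = 0; i < n; i++) {
        unsigned __int128 t = (unsigned __int128) up[i] * v + cy;
        rp[i] = (uint64_t) t;
        cy = (uint64_t) (t >> 64);
      }
      return cy;
    }

    static uint64_t addmul_1 (uint64_t *rp, const uint64_t *up, long n,
                              uint64_t v)
    {
      uint64_t cy = 0;
      for (long i = 0; i < n; i++) {
        unsigned __int128 t = (unsigned __int128) up[i] * v + rp[i] + cy;
        rp[i] = (uint64_t) t;
        cy = (uint64_t) (t >> 64);
      }
      return cy;
    }

    /* rp[0..un+vn-1] = {up,un} * {vp,vn}; un >= vn >= 1, rp disjoint */
    void mul_basecase_ref (uint64_t *rp, const uint64_t *up, long un,
                           const uint64_t *vp, long vn)
    {
      rp[un] = mul_1 (rp, up, un, vp[0]);
      for (long j = 1; j < vn; j++)
        rp[un + j] = addmul_1 (rp + j, up, un, vp[j]);
    }

The four prologues selected below on un mod 4 are pure unrolling bookkeeping: they align the 4-way software pipeline to the operand size so the inner loops never need a cleanup pass, and outer_addr caches the matching addmul entry point for all later rows.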
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') +define(`vp', `%rcx') +define(`vn', `%r8') + +define(`v0', `%r12') +define(`v1', `%r9') + +define(`w0', `%rbx') +define(`w1', `%r15') +define(`w2', `%rbp') +define(`w3', `%r10') + +define(`n', `%r11') +define(`outer_addr', `%r14') +define(`un', `%r13') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + xor R32(un), R32(un) + mov (up), %rax + mov (vp), v0 + + sub un_param, un C rdx used by mul + mov un, n + mov R32(un_param), R32(w0) + + lea (rp,un_param,8), rp + lea (up,un_param,8), up + + mul v0 + + test $1, R8(vn) + jz L(mul_2) + +C =========================================================== +C mul_1 for vp[0] if vn is odd + +L(mul_1): + and $3, R32(w0) + jz L(mul_1_prologue_0) + cmp $2, R32(w0) + jc L(mul_1_prologue_1) + jz L(mul_1_prologue_2) + +L(mul_1_prologue_3): + add $-1, n + lea L(addmul_outer_3)(%rip), outer_addr + mov %rax, w3 + mov %rdx, w0 + jmp L(mul_1_entry_3) + +L(mul_1_prologue_0): + mov %rax, w2 + mov %rdx, w3 C note: already w0 == 0 + lea L(addmul_outer_0)(%rip), outer_addr + jmp L(mul_1_entry_0) + +L(mul_1_prologue_1): + cmp $-1, un + jne 2f + mov %rax, -8(rp) + mov %rdx, (rp) + jmp L(ret) +2: add $1, n + lea L(addmul_outer_1)(%rip), outer_addr + mov %rax, w1 + mov %rdx, w2 + xor R32(w3), R32(w3) + mov (up,n,8), %rax + jmp L(mul_1_entry_1) + +L(mul_1_prologue_2): + add $-2, n + lea L(addmul_outer_2)(%rip), outer_addr + mov %rax, w0 + mov %rdx, w1 + mov 24(up,n,8), %rax + xor R32(w2), R32(w2) + xor R32(w3), R32(w3) + jmp L(mul_1_entry_2) + + + C this loop is 10 c/loop = 2.5 c/l on K8, for all up/rp alignments + + ALIGN(16) +L(mul_1_top): + mov w0, -16(rp,n,8) + add %rax, w1 + mov (up,n,8), %rax + adc %rdx, w2 +L(mul_1_entry_1): + xor R32(w0), R32(w0) + mul v0 + mov w1, -8(rp,n,8) + add %rax, w2 + adc %rdx, w3 +L(mul_1_entry_0): + mov 8(up,n,8), %rax + mul v0 + mov w2, (rp,n,8) + add %rax, w3 + adc %rdx, w0 +L(mul_1_entry_3): + mov 16(up,n,8), %rax + mul v0 + mov w3, 8(rp,n,8) + xor R32(w2), R32(w2) C zero + mov w2, w3 C zero + add %rax, w0 + mov 24(up,n,8), %rax + mov w2, w1 C zero + adc %rdx, w1 +L(mul_1_entry_2): + mul v0 + add $4, n + js L(mul_1_top) + + mov w0, -16(rp) + add %rax, w1 + mov w1, -8(rp) + adc %rdx, w2 + mov w2, (rp) + + add $-1, vn C vn -= 1 + jz L(ret) + + mov 8(vp), v0 + mov 16(vp), v1 + + lea 8(vp), vp C vp += 1 + lea 8(rp), rp C rp += 1 + + jmp *outer_addr + +C =========================================================== +C mul_2 for vp[0], vp[1] if vn is even + + ALIGN(16) +L(mul_2): + mov 8(vp), v1 + + and $3, R32(w0) + jz L(mul_2_prologue_0) + cmp $2, R32(w0) + jz L(mul_2_prologue_2) + jc L(mul_2_prologue_1) + +L(mul_2_prologue_3): + lea L(addmul_outer_3)(%rip), outer_addr + add $2, n + mov %rax, -16(rp,n,8) + mov %rdx, w2 + xor R32(w3), R32(w3) + xor R32(w0), R32(w0) + mov -16(up,n,8), %rax + jmp L(mul_2_entry_3) + + ALIGN(16) +L(mul_2_prologue_0): + add $3, n + mov %rax, w0 + mov %rdx, w1 + xor R32(w2), R32(w2) + mov -24(up,n,8), %rax + lea L(addmul_outer_0)(%rip), outer_addr + jmp L(mul_2_entry_0) + + ALIGN(16) +L(mul_2_prologue_1): + mov %rax, w3 + mov %rdx, w0 + xor R32(w1), R32(w1) + lea L(addmul_outer_1)(%rip), outer_addr + jmp L(mul_2_entry_1) + + ALIGN(16) +L(mul_2_prologue_2): + add $1, n + lea L(addmul_outer_2)(%rip), outer_addr + mov $0, R32(w0) + mov $0, R32(w1) + mov 
%rax, w2 + mov -8(up,n,8), %rax + mov %rdx, w3 + jmp L(mul_2_entry_2) + + C this loop is 18 c/loop = 2.25 c/l on K8, for all up/rp alignments + + ALIGN(16) +L(mul_2_top): + mov -32(up,n,8), %rax + mul v1 + add %rax, w0 + adc %rdx, w1 + mov -24(up,n,8), %rax + xor R32(w2), R32(w2) + mul v0 + add %rax, w0 + mov -24(up,n,8), %rax + adc %rdx, w1 + adc $0, R32(w2) +L(mul_2_entry_0): + mul v1 + add %rax, w1 + mov w0, -24(rp,n,8) + adc %rdx, w2 + mov -16(up,n,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + mov -16(up,n,8), %rax + adc $0, R32(w3) + mov $0, R32(w0) + mov w1, -16(rp,n,8) +L(mul_2_entry_3): + mul v1 + add %rax, w2 + mov -8(up,n,8), %rax + adc %rdx, w3 + mov $0, R32(w1) + mul v0 + add %rax, w2 + mov -8(up,n,8), %rax + adc %rdx, w3 + adc R32(w1), R32(w0) C adc $0, w0 +L(mul_2_entry_2): + mul v1 + add %rax, w3 + mov w2, -8(rp,n,8) + adc %rdx, w0 + mov (up,n,8), %rax + mul v0 + add %rax, w3 + adc %rdx, w0 + adc $0, R32(w1) +L(mul_2_entry_1): + add $4, n + mov w3, -32(rp,n,8) + js L(mul_2_top) + + mov -32(up,n,8), %rax C FIXME: n is constant + mul v1 + add %rax, w0 + mov w0, (rp) + adc %rdx, w1 + mov w1, 8(rp) + + add $-2, vn C vn -= 2 + jz L(ret) + + mov 16(vp), v0 + mov 24(vp), v1 + + lea 16(vp), vp C vp += 2 + lea 16(rp), rp C rp += 2 + + jmp *outer_addr + + +C =========================================================== +C addmul_2 for remaining vp's + + C in the following prologues, we reuse un to store the + C adjusted value of n that is reloaded on each iteration + +L(addmul_outer_0): + add $3, un + lea 0(%rip), outer_addr + + mov un, n + mov -24(up,un,8), %rax + mul v0 + mov %rax, w0 + mov -24(up,un,8), %rax + mov %rdx, w1 + xor R32(w2), R32(w2) + jmp L(addmul_entry_0) + +L(addmul_outer_1): + mov un, n + mov (up,un,8), %rax + mul v0 + mov %rax, w3 + mov (up,un,8), %rax + mov %rdx, w0 + xor R32(w1), R32(w1) + jmp L(addmul_entry_1) + +L(addmul_outer_2): + add $1, un + lea 0(%rip), outer_addr + + mov un, n + mov -8(up,un,8), %rax + mul v0 + xor R32(w0), R32(w0) + mov %rax, w2 + xor R32(w1), R32(w1) + mov %rdx, w3 + mov -8(up,un,8), %rax + jmp L(addmul_entry_2) + +L(addmul_outer_3): + add $2, un + lea 0(%rip), outer_addr + + mov un, n + mov -16(up,un,8), %rax + xor R32(w3), R32(w3) + mul v0 + mov %rax, w1 + mov -16(up,un,8), %rax + mov %rdx, w2 + jmp L(addmul_entry_3) + + C this loop is 19 c/loop = 2.375 c/l on K8, for all up/rp alignments + + ALIGN(16) +L(addmul_top): + add w3, -32(rp,n,8) + adc %rax, w0 + mov -24(up,n,8), %rax + adc %rdx, w1 + xor R32(w2), R32(w2) + mul v0 + add %rax, w0 + mov -24(up,n,8), %rax + adc %rdx, w1 + adc R32(w2), R32(w2) C adc $0, w2 +L(addmul_entry_0): + mul v1 + xor R32(w3), R32(w3) + add w0, -24(rp,n,8) + adc %rax, w1 + mov -16(up,n,8), %rax + adc %rdx, w2 + mul v0 + add %rax, w1 + mov -16(up,n,8), %rax + adc %rdx, w2 + adc $0, R32(w3) +L(addmul_entry_3): + mul v1 + add w1, -16(rp,n,8) + adc %rax, w2 + mov -8(up,n,8), %rax + adc %rdx, w3 + mul v0 + xor R32(w0), R32(w0) + add %rax, w2 + adc %rdx, w3 + mov $0, R32(w1) + mov -8(up,n,8), %rax + adc R32(w1), R32(w0) C adc $0, w0 +L(addmul_entry_2): + mul v1 + add w2, -8(rp,n,8) + adc %rax, w3 + adc %rdx, w0 + mov (up,n,8), %rax + mul v0 + add %rax, w3 + mov (up,n,8), %rax + adc %rdx, w0 + adc $0, R32(w1) +L(addmul_entry_1): + mul v1 + add $4, n + js L(addmul_top) + + add w3, -8(rp) + adc %rax, w0 + mov w0, (rp) + adc %rdx, w1 + mov w1, 8(rp) + + add $-2, vn C vn -= 2 + jz L(ret) + + lea 16(rp), rp C rp += 2 + lea 16(vp), vp C vp += 2 + + mov (vp), v0 + mov 8(vp), v1 + + jmp *outer_addr + 
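+C The indirect jmp above re-enters one of the L(addmul_outer_k) stubs;
+C outer_addr was selected once from un mod 4 in the mul_1/mul_2
+C prologue, so every addmul_2 pass drops into the 4-way unrolled
+C pipeline at the offset matching the operand size, with no per-pass
+C cleanup loop.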
+ ALIGN(16) +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm new file mode 100644 index 0000000..fa00f42 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm @@ -0,0 +1,436 @@ +dnl AMD64 mpn_mullo_basecase. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C NOTES +C * There is a major stupidity in that we call mpn_mul_1 initially, for a +C large trip count. Instead, we should start with mul_2 for any operand +C size congruence class. +C * Stop iterating addmul_2 earlier, falling into straight-line triangle code +C for the last 2-3 iterations. +C * Perhaps implement n=4 special code. +C * The reload of the outer loop jump address hurts branch prediction. +C * The addmul_2 loop ends with an MUL whose high part is not used upon loop +C exit. 
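mullo returns just the low n limbs of the 2n-limb product, i.e. the product mod B^n, so row j of the schoolbook triangle shortens to n - j limbs and any carry out of limb n-1 is dropped. A reference version, assuming 64-bit limbs and illustrative names:

    #include <stdint.h>

    /* rp[0..n-1] = ({up,n} * {vp,n}) mod B^n, B = 2^64 */
    void mullo_ref (uint64_t *rp, const uint64_t *up, const uint64_t *vp,
                    long n)
    {
      for (long i = 0; i < n; i++)
        rp[i] = 0;
      for (long j = 0; j < n; j++) {
        uint64_t cy = 0;
        for (long i = 0; i + j < n; i++) {     /* row shortens with j */
          unsigned __int128 t =
            (unsigned __int128) up[i] * vp[j] + rp[i + j] + cy;
          rp[i + j] = (uint64_t) t;
          cy = (uint64_t) (t >> 64);
        }
        /* carry out of limb n-1 discarded: arithmetic is mod B^n */
      }
    }

This is also why the assembly finishes each row with plain imul instructions: the topmost limb of a row needs no high half, so a truncating multiply suffices.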
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp_param', `%rdx') +define(`n', `%rcx') + +define(`vp', `%r11') +define(`outer_addr', `%r8') +define(`j', `%r9') +define(`v0', `%r13') +define(`v1', `%r14') +define(`w0', `%rbx') +define(`w1', `%r15') +define(`w2', `%rbp') +define(`w3', `%r10') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mullo_basecase) + FUNC_ENTRY(4) + cmp $4, n + jge L(gen) + mov (up), %rax C u0 + mov (vp_param), %r8 C v0 + + lea L(tab)(%rip), %r9 +ifdef(`PIC', +` movslq (%r9,%rcx,4), %r10 + add %r10, %r9 + jmp *%r9 +',` + jmp *(%r9,n,8) +') + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(tab), L(tab)) C not allowed + JMPENT( L(1), L(tab)) C 1 + JMPENT( L(2), L(tab)) C 2 + JMPENT( L(3), L(tab)) C 3 +dnl JMPENT( L(0m4), L(tab)) C 4 +dnl JMPENT( L(1m4), L(tab)) C 5 +dnl JMPENT( L(2m4), L(tab)) C 6 +dnl JMPENT( L(3m4), L(tab)) C 7 +dnl JMPENT( L(0m4), L(tab)) C 8 +dnl JMPENT( L(1m4), L(tab)) C 9 +dnl JMPENT( L(2m4), L(tab)) C 10 +dnl JMPENT( L(3m4), L(tab)) C 11 + TEXT + +L(1): imul %r8, %rax + mov %rax, (rp) + FUNC_EXIT() + ret + +L(2): mov 8(vp_param), %r11 + imul %rax, %r11 C u0 x v1 + mul %r8 C u0 x v0 + mov %rax, (rp) + imul 8(up), %r8 C u1 x v0 + lea (%r11, %rdx), %rax + add %r8, %rax + mov %rax, 8(rp) + FUNC_EXIT() + ret + +L(3): mov 8(vp_param), %r9 C v1 + mov 16(vp_param), %r11 + mul %r8 C u0 x v0 -> + mov %rax, (rp) C r0 + mov (up), %rax C u0 + mov %rdx, %rcx C r1 + mul %r9 C u0 x v1 -> + imul 8(up), %r9 C u1 x v1 -> r2 + mov 16(up), %r10 + imul %r8, %r10 C u2 x v0 -> r2 + add %rax, %rcx + adc %rdx, %r9 + add %r10, %r9 + mov 8(up), %rax C u1 + mul %r8 C u1 x v0 -> + add %rax, %rcx + adc %rdx, %r9 + mov %r11, %rax + imul (up), %rax C u0 x v2 -> r2 + add %rax, %r9 + mov %rcx, 8(rp) + mov %r9, 16(rp) + FUNC_EXIT() + ret + +L(0m4): +L(1m4): +L(2m4): +L(3m4): +L(gen): push %rbx + push %rbp + push %r13 + push %r14 + push %r15 + + mov (up), %rax + mov (vp_param), v0 + mov vp_param, vp + + lea (rp,n,8), rp + lea (up,n,8), up + neg n + + mul v0 + + test $1, R8(n) + jz L(mul_2) + +L(mul_1): + lea -8(rp), rp + lea -8(up), up + test $2, R8(n) + jnz L(mul_1_prologue_3) + +L(mul_1_prologue_2): C n = 7, 11, 15, ... + lea -1(n), j + lea L(addmul_outer_1)(%rip), outer_addr + mov %rax, w0 + mov %rdx, w1 + xor R32(w2), R32(w2) + xor R32(w3), R32(w3) + mov 16(up,n,8), %rax + jmp L(mul_1_entry_2) + +L(mul_1_prologue_3): C n = 5, 9, 13, ... 
+ lea 1(n), j + lea L(addmul_outer_3)(%rip), outer_addr + mov %rax, w2 + mov %rdx, w3 + xor R32(w0), R32(w0) + jmp L(mul_1_entry_0) + + ALIGN(16) +L(mul_1_top): + mov w0, -16(rp,j,8) + add %rax, w1 + mov (up,j,8), %rax + adc %rdx, w2 + xor R32(w0), R32(w0) + mul v0 + mov w1, -8(rp,j,8) + add %rax, w2 + adc %rdx, w3 +L(mul_1_entry_0): + mov 8(up,j,8), %rax + mul v0 + mov w2, (rp,j,8) + add %rax, w3 + adc %rdx, w0 + mov 16(up,j,8), %rax + mul v0 + mov w3, 8(rp,j,8) + xor R32(w2), R32(w2) C zero + mov w2, w3 C zero + add %rax, w0 + mov 24(up,j,8), %rax + mov w2, w1 C zero + adc %rdx, w1 +L(mul_1_entry_2): + mul v0 + add $4, j + js L(mul_1_top) + + mov w0, -16(rp) + add %rax, w1 + mov w1, -8(rp) + adc %rdx, w2 + + imul (up), v0 + add v0, w2 + mov w2, (rp) + + add $1, n + jz L(ret) + + mov 8(vp), v0 + mov 16(vp), v1 + + lea 16(up), up + lea 8(vp), vp + lea 24(rp), rp + + jmp *outer_addr + + +L(mul_2): + mov 8(vp), v1 + test $2, R8(n) + jz L(mul_2_prologue_3) + + ALIGN(16) +L(mul_2_prologue_1): + lea 0(n), j + mov %rax, w3 + mov %rdx, w0 + xor R32(w1), R32(w1) + mov (up,n,8), %rax + lea L(addmul_outer_3)(%rip), outer_addr + jmp L(mul_2_entry_1) + + ALIGN(16) +L(mul_2_prologue_3): + lea 2(n), j + mov $0, R32(w3) + mov %rax, w1 + mov (up,n,8), %rax + mov %rdx, w2 + lea L(addmul_outer_1)(%rip), outer_addr + jmp L(mul_2_entry_3) + + ALIGN(16) +L(mul_2_top): + mov -32(up,j,8), %rax + mul v1 + add %rax, w0 + adc %rdx, w1 + mov -24(up,j,8), %rax + xor R32(w2), R32(w2) + mul v0 + add %rax, w0 + mov -24(up,j,8), %rax + adc %rdx, w1 + adc $0, R32(w2) + mul v1 + add %rax, w1 + mov w0, -24(rp,j,8) + adc %rdx, w2 + mov -16(up,j,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + mov -16(up,j,8), %rax + adc $0, R32(w3) +L(mul_2_entry_3): + mov $0, R32(w0) + mov w1, -16(rp,j,8) + mul v1 + add %rax, w2 + mov -8(up,j,8), %rax + adc %rdx, w3 + mov $0, R32(w1) + mul v0 + add %rax, w2 + mov -8(up,j,8), %rax + adc %rdx, w3 + adc R32(w1), R32(w0) + mul v1 + add %rax, w3 + mov w2, -8(rp,j,8) + adc %rdx, w0 + mov (up,j,8), %rax + mul v0 + add %rax, w3 + adc %rdx, w0 + adc $0, R32(w1) +L(mul_2_entry_1): + add $4, j + mov w3, -32(rp,j,8) + js L(mul_2_top) + + imul -16(up), v1 + add v1, w0 + imul -8(up), v0 + add v0, w0 + mov w0, -8(rp) + + add $2, n + jz L(ret) + + mov 16(vp), v0 + mov 24(vp), v1 + + lea 16(vp), vp + lea 16(rp), rp + + jmp *outer_addr + + +L(addmul_outer_1): + lea -2(n), j + mov -16(up,n,8), %rax + mul v0 + mov %rax, w3 + mov -16(up,n,8), %rax + mov %rdx, w0 + xor R32(w1), R32(w1) + lea L(addmul_outer_3)(%rip), outer_addr + jmp L(addmul_entry_1) + +L(addmul_outer_3): + lea 0(n), j + mov -16(up,n,8), %rax + xor R32(w3), R32(w3) + mul v0 + mov %rax, w1 + mov -16(up,n,8), %rax + mov %rdx, w2 + lea L(addmul_outer_1)(%rip), outer_addr + jmp L(addmul_entry_3) + + ALIGN(16) +L(addmul_top): + add w3, -32(rp,j,8) + adc %rax, w0 + mov -24(up,j,8), %rax + adc %rdx, w1 + xor R32(w2), R32(w2) + mul v0 + add %rax, w0 + mov -24(up,j,8), %rax + adc %rdx, w1 + adc R32(w2), R32(w2) + mul v1 + xor R32(w3), R32(w3) + add w0, -24(rp,j,8) + adc %rax, w1 + mov -16(up,j,8), %rax + adc %rdx, w2 + mul v0 + add %rax, w1 + mov -16(up,j,8), %rax + adc %rdx, w2 + adc $0, R32(w3) +L(addmul_entry_3): + mul v1 + add w1, -16(rp,j,8) + adc %rax, w2 + mov -8(up,j,8), %rax + adc %rdx, w3 + mul v0 + xor R32(w0), R32(w0) + add %rax, w2 + adc %rdx, w3 + mov $0, R32(w1) + mov -8(up,j,8), %rax + adc R32(w1), R32(w0) + mul v1 + add w2, -8(rp,j,8) + adc %rax, w3 + adc %rdx, w0 + mov (up,j,8), %rax + mul v0 + add %rax, w3 + mov 
(up,j,8), %rax + adc %rdx, w0 + adc $0, R32(w1) +L(addmul_entry_1): + mul v1 + add $4, j + js L(addmul_top) + + add w3, -32(rp) + adc %rax, w0 + + imul -24(up), v0 + add v0, w0 + add w0, -24(rp) + + add $2, n + jns L(ret) + + lea 16(vp), vp + + mov (vp), v0 + mov 8(vp), v1 + + lea -16(up), up + + jmp *outer_addr + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm b/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm new file mode 100644 index 0000000..86f1414 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm @@ -0,0 +1,559 @@ +dnl AMD64 mpn_mulmid_basecase + +dnl Contributed by David Harvey. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb +C K8,K9: 2.375 (2.5 when un - vn is "small") +C K10: ? +C P4: ? +C P6-15: ? 
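The middle product keeps only the diagonals of the full product that every v limb participates in: all up[i]*vp[j] with vn-1 <= i+j <= un-1, summed into un - vn + 3 result limbs (two extra limbs absorb the diagonal carries; this is the interface of GMP's mulmid functions). A reference version under those assumptions:

    #include <stdint.h>

    /* rp[0..un-vn+2] = sum of up[i]*vp[j] * B^(i+j-(vn-1)),
       over vn-1 <= i+j <= un-1; un >= vn >= 1, B = 2^64 */
    void mulmid_ref (uint64_t *rp, const uint64_t *up, long un,
                     const uint64_t *vp, long vn)
    {
      unsigned __int128 acc = 0;    /* low 128 bits of the running sum */
      uint64_t ovf = 0;             /* overflow above 128 bits */
      for (long d = 0; d <= un - vn; d++) {     /* diagonal i+j = d+vn-1 */
        for (long j = 0; j < vn; j++) {
          unsigned __int128 p =
            (unsigned __int128) up[d + vn - 1 - j] * vp[j];
          acc += p;
          ovf += (acc < p);                     /* carry past bit 127 */
        }
        rp[d] = (uint64_t) acc;                 /* emit one limb */
        acc = (acc >> 64) | ((unsigned __int128) ovf << 64);
        ovf = 0;
      }
      rp[un - vn + 1] = (uint64_t) acc;         /* two carry limbs */
      rp[un - vn + 2] = (uint64_t) (acc >> 64);
    }

The L(diagonal) path below does essentially this directly when the row length un - vn + 1 is small; otherwise the addmul_2 pipeline familiar from mul_basecase is reused over the trapezoidal region, which is where the 2.375 c/l figure comes from.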
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') +define(`vp_param',`%rcx') +define(`vn', `%r8') + +define(`v0', `%r12') +define(`v1', `%r9') + +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') + +define(`n', `%r11') +define(`outer_addr', `%r14') +define(`un', `%r13') +define(`vp', `%r15') + +define(`vp_inner', `%r10') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mulmid_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov vp_param, vp + + C use un for row length (= un_param - vn + 1) + lea 1(un_param), un + sub vn, un + + lea (rp,un,8), rp + + cmp $4, un C TODO: needs tuning + jc L(diagonal) + + lea (up,un_param,8), up + + test $1, vn + jz L(mul_2) + +C =========================================================== +C mul_1 for vp[0] if vn is odd + +L(mul_1): + mov R32(un), R32(w0) + + neg un + mov (up,un,8), %rax + mov (vp), v0 + mul v0 + + and $-4, un C round down to multiple of 4 + mov un, n + + and $3, R32(w0) + jz L(mul_1_prologue_0) + cmp $2, R32(w0) + jc L(mul_1_prologue_1) + jz L(mul_1_prologue_2) + +L(mul_1_prologue_3): + mov %rax, w3 + mov %rdx, w0 + lea L(addmul_prologue_3)(%rip), outer_addr + jmp L(mul_1_entry_3) + + ALIGN(16) +L(mul_1_prologue_0): + mov %rax, w2 + mov %rdx, w3 C note already w0 == 0 + lea L(addmul_prologue_0)(%rip), outer_addr + jmp L(mul_1_entry_0) + + ALIGN(16) +L(mul_1_prologue_1): + add $4, n + mov %rax, w1 + mov %rdx, w2 + mov $0, R32(w3) + mov (up,n,8), %rax + lea L(addmul_prologue_1)(%rip), outer_addr + jmp L(mul_1_entry_1) + + ALIGN(16) +L(mul_1_prologue_2): + mov %rax, w0 + mov %rdx, w1 + mov 24(up,n,8), %rax + mov $0, R32(w2) + mov $0, R32(w3) + lea L(addmul_prologue_2)(%rip), outer_addr + jmp L(mul_1_entry_2) + + + C this loop is 10 c/loop = 2.5 c/l on K8 + + ALIGN(16) +L(mul_1_top): + mov w0, -16(rp,n,8) + add %rax, w1 + mov (up,n,8), %rax + adc %rdx, w2 +L(mul_1_entry_1): + mov $0, R32(w0) + mul v0 + mov w1, -8(rp,n,8) + add %rax, w2 + adc %rdx, w3 +L(mul_1_entry_0): + mov 8(up,n,8), %rax + mul v0 + mov w2, (rp,n,8) + add %rax, w3 + adc %rdx, w0 +L(mul_1_entry_3): + mov 16(up,n,8), %rax + mul v0 + mov w3, 8(rp,n,8) + mov $0, R32(w2) C zero + mov w2, w3 C zero + add %rax, w0 + mov 24(up,n,8), %rax + mov w2, w1 C zero + adc %rdx, w1 +L(mul_1_entry_2): + mul v0 + add $4, n + js L(mul_1_top) + + mov w0, -16(rp) + add %rax, w1 + mov w1, -8(rp) + mov w2, 8(rp) C zero last limb of output + adc %rdx, w2 + mov w2, (rp) + + dec vn + jz L(ret) + + lea -8(up), up + lea 8(vp), vp + + mov un, n + mov (vp), v0 + mov 8(vp), v1 + + jmp *outer_addr + +C =========================================================== +C mul_2 for vp[0], vp[1] if vn is even + + ALIGN(16) +L(mul_2): + mov R32(un), R32(w0) + + neg un + mov -8(up,un,8), %rax + mov (vp), v0 + mov 8(vp), v1 + mul v1 + + and $-4, un C round down to multiple of 4 + mov un, n + + and $3, R32(w0) + jz L(mul_2_prologue_0) + cmp $2, R32(w0) + jc L(mul_2_prologue_1) + jz L(mul_2_prologue_2) + +L(mul_2_prologue_3): + mov %rax, w1 + mov %rdx, w2 + lea L(addmul_prologue_3)(%rip), outer_addr + jmp L(mul_2_entry_3) + + ALIGN(16) +L(mul_2_prologue_0): + mov %rax, w0 + mov %rdx, w1 + lea L(addmul_prologue_0)(%rip), outer_addr + jmp L(mul_2_entry_0) + + ALIGN(16) +L(mul_2_prologue_1): + mov %rax, w3 + mov %rdx, w0 + mov $0, R32(w1) + lea L(addmul_prologue_1)(%rip), outer_addr + jmp L(mul_2_entry_1) + + ALIGN(16) 
+L(mul_2_prologue_2): + mov %rax, w2 + mov %rdx, w3 + mov $0, R32(w0) + mov 16(up,n,8), %rax + lea L(addmul_prologue_2)(%rip), outer_addr + jmp L(mul_2_entry_2) + + + C this loop is 18 c/loop = 2.25 c/l on K8 + + ALIGN(16) +L(mul_2_top): + mov -8(up,n,8), %rax + mul v1 + add %rax, w0 + adc %rdx, w1 +L(mul_2_entry_0): + mov $0, R32(w2) + mov (up,n,8), %rax + mul v0 + add %rax, w0 + mov (up,n,8), %rax + adc %rdx, w1 + adc $0, R32(w2) + mul v1 + add %rax, w1 + mov w0, (rp,n,8) + adc %rdx, w2 +L(mul_2_entry_3): + mov 8(up,n,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + mov $0, R32(w0) + adc $0, R32(w3) + mov 8(up,n,8), %rax + mov w1, 8(rp,n,8) + mul v1 + add %rax, w2 + mov 16(up,n,8), %rax + adc %rdx, w3 +L(mul_2_entry_2): + mov $0, R32(w1) + mul v0 + add %rax, w2 + mov 16(up,n,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + add %rax, w3 + mov w2, 16(rp,n,8) + adc %rdx, w0 +L(mul_2_entry_1): + mov 24(up,n,8), %rax + mul v0 + add %rax, w3 + adc %rdx, w0 + adc $0, R32(w1) + add $4, n + mov w3, -8(rp,n,8) + jnz L(mul_2_top) + + mov w0, (rp) + mov w1, 8(rp) + + sub $2, vn + jz L(ret) + + lea 16(vp), vp + lea -16(up), up + + mov un, n + mov (vp), v0 + mov 8(vp), v1 + + jmp *outer_addr + +C =========================================================== +C addmul_2 for remaining vp's + + ALIGN(16) +L(addmul_prologue_0): + mov -8(up,n,8), %rax + mul v1 + mov %rax, w1 + mov %rdx, w2 + mov $0, R32(w3) + jmp L(addmul_entry_0) + + ALIGN(16) +L(addmul_prologue_1): + mov 16(up,n,8), %rax + mul v1 + mov %rax, w0 + mov %rdx, w1 + mov $0, R32(w2) + mov 24(up,n,8), %rax + jmp L(addmul_entry_1) + + ALIGN(16) +L(addmul_prologue_2): + mov 8(up,n,8), %rax + mul v1 + mov %rax, w3 + mov %rdx, w0 + mov $0, R32(w1) + jmp L(addmul_entry_2) + + ALIGN(16) +L(addmul_prologue_3): + mov (up,n,8), %rax + mul v1 + mov %rax, w2 + mov %rdx, w3 + mov $0, R32(w0) + mov $0, R32(w1) + jmp L(addmul_entry_3) + + C this loop is 19 c/loop = 2.375 c/l on K8 + + ALIGN(16) +L(addmul_top): + mov $0, R32(w3) + add %rax, w0 + mov -8(up,n,8), %rax + adc %rdx, w1 + adc $0, R32(w2) + mul v1 + add w0, -8(rp,n,8) + adc %rax, w1 + adc %rdx, w2 +L(addmul_entry_0): + mov (up,n,8), %rax + mul v0 + add %rax, w1 + mov (up,n,8), %rax + adc %rdx, w2 + adc $0, R32(w3) + mul v1 + add w1, (rp,n,8) + mov $0, R32(w1) + adc %rax, w2 + mov $0, R32(w0) + adc %rdx, w3 +L(addmul_entry_3): + mov 8(up,n,8), %rax + mul v0 + add %rax, w2 + mov 8(up,n,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + add w2, 8(rp,n,8) + adc %rax, w3 + adc %rdx, w0 +L(addmul_entry_2): + mov 16(up,n,8), %rax + mul v0 + add %rax, w3 + mov 16(up,n,8), %rax + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add w3, 16(rp,n,8) + nop C don't ask... 
+ adc %rax, w0 + mov $0, R32(w2) + mov 24(up,n,8), %rax + adc %rdx, w1 +L(addmul_entry_1): + mul v0 + add $4, n + jnz L(addmul_top) + + add %rax, w0 + adc %rdx, w1 + adc $0, R32(w2) + + add w0, -8(rp) + adc w1, (rp) + adc w2, 8(rp) + + sub $2, vn + jz L(ret) + + lea 16(vp), vp + lea -16(up), up + + mov un, n + mov (vp), v0 + mov 8(vp), v1 + + jmp *outer_addr + +C =========================================================== +C accumulate along diagonals if un - vn is small + + ALIGN(16) +L(diagonal): + xor R32(w0), R32(w0) + xor R32(w1), R32(w1) + xor R32(w2), R32(w2) + + neg un + + mov R32(vn), %eax + and $3, %eax + jz L(diag_prologue_0) + cmp $2, %eax + jc L(diag_prologue_1) + jz L(diag_prologue_2) + +L(diag_prologue_3): + lea -8(vp), vp + mov vp, vp_inner + add $1, vn + mov vn, n + lea L(diag_entry_3)(%rip), outer_addr + jmp L(diag_entry_3) + +L(diag_prologue_0): + mov vp, vp_inner + mov vn, n + lea 0(%rip), outer_addr + mov -8(up,n,8), %rax + jmp L(diag_entry_0) + +L(diag_prologue_1): + lea 8(vp), vp + mov vp, vp_inner + add $3, vn + mov vn, n + lea 0(%rip), outer_addr + mov -8(vp_inner), %rax + jmp L(diag_entry_1) + +L(diag_prologue_2): + lea -16(vp), vp + mov vp, vp_inner + add $2, vn + mov vn, n + lea 0(%rip), outer_addr + mov 16(vp_inner), %rax + jmp L(diag_entry_2) + + + C this loop is 10 c/loop = 2.5 c/l on K8 + + ALIGN(16) +L(diag_top): + add %rax, w0 + adc %rdx, w1 + mov -8(up,n,8), %rax + adc $0, w2 +L(diag_entry_0): + mulq (vp_inner) + add %rax, w0 + adc %rdx, w1 + adc $0, w2 +L(diag_entry_3): + mov -16(up,n,8), %rax + mulq 8(vp_inner) + add %rax, w0 + mov 16(vp_inner), %rax + adc %rdx, w1 + adc $0, w2 +L(diag_entry_2): + mulq -24(up,n,8) + add %rax, w0 + mov 24(vp_inner), %rax + adc %rdx, w1 + lea 32(vp_inner), vp_inner + adc $0, w2 +L(diag_entry_1): + mulq -32(up,n,8) + sub $4, n + jnz L(diag_top) + + add %rax, w0 + adc %rdx, w1 + adc $0, w2 + + mov w0, (rp,un,8) + + inc un + jz L(diag_end) + + mov vn, n + mov vp, vp_inner + + lea 8(up), up + mov w1, w0 + mov w2, w1 + xor R32(w2), R32(w2) + + jmp *outer_addr + +L(diag_end): + mov w1, (rp) + mov w2, 8(rp) + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm b/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm new file mode 100644 index 0000000..9327b21 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm @@ -0,0 +1,591 @@ +dnl X86-64 mpn_redc_1 optimised for AMD K8-K10. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2004, 2008, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bull ? +C AMD pile ? +C AMD steam ? +C AMD bobcat ? +C AMD jaguar ? +C Intel P4 ? +C Intel core ? +C Intel NHM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel atom ? +C VIA nano ? + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Micro-optimise, none performed thus far. +C * This looks different from other current redc_1.asm variants. Consider +C adapting this to the mainstream style. +C * Is this code really faster than more approaches which compute q0 later? +C Is the use of a jump jump table faster? Or is the edge of this due to the +C inlined add_n code? +C * Put initial m[0] x q0 computation in header. +C * Put basecases at the file's end, single them out before the pushes. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`mp_param', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`u0inv', `%r8') C stack + +define(`i', `%r11') +define(`nneg', `%r12') +define(`mp', `%r13') +define(`q0', `%rbp') +define(`vp', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_redc_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbp + mov (up), q0 C up[0] + push %rbx + imul u0inv, q0 C first q0, for all execution paths + push %r12 + push %r13 + push %r14 + push %r15 + + mov n, nneg + neg nneg + lea (mp_param,n,8), mp C mp += n + lea -16(up,n,8), up C up += n + + mov R32(n), R32(%rax) + and $3, R32(%rax) + lea 4(%rax), %r9 + cmp $4, R32(n) + cmovg %r9, %rax + lea L(tab)(%rip), %r9 +ifdef(`PIC',` + movslq (%r9,%rax,4), %rax + add %r9, %rax + jmp *%rax +',` + jmp *(%r9,%rax,8) +') + + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(1), L(tab)) + JMPENT( L(2), L(tab)) + JMPENT( L(3), L(tab)) + JMPENT( L(0m4), L(tab)) + JMPENT( L(1m4), L(tab)) + JMPENT( L(2m4), L(tab)) + JMPENT( L(3m4), L(tab)) + TEXT + + ALIGN(16) +L(1): mov (mp_param), %rax + mul q0 + add 8(up), %rax + adc 16(up), %rdx + mov %rdx, (rp) + mov $0, R32(%rax) + adc R32(%rax), R32(%rax) + jmp L(ret) + + + ALIGN(16) +L(2): mov (mp_param), %rax + mul q0 + xor R32(%r14), R32(%r14) + mov %rax, %r10 + mov -8(mp), %rax + mov %rdx, %r9 + mul q0 + add (up), %r10 + adc %rax, %r9 + adc %rdx, %r14 + add 8(up), %r9 + adc $0, %r14 + mov %r9, q0 + imul u0inv, q0 + mov -16(mp), %rax + mul q0 + xor R32(%rbx), R32(%rbx) + mov %rax, %r10 + mov -8(mp), %rax + mov %rdx, %r11 + mul q0 + add %r9, %r10 + adc %rax, %r11 + adc %rdx, %rbx + add 16(up), %r11 + adc $0, %rbx + xor R32(%rax), R32(%rax) + add %r11, %r14 + adc 24(up), %rbx + mov %r14, (rp) + mov %rbx, 8(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + +L(3): mov (mp_param), %rax + mul q0 + mov %rax, %rbx + mov %rdx, %r10 + mov -16(mp), %rax + mul q0 + xor R32(%r9), R32(%r9) + xor R32(%r14), R32(%r14) + add -8(up), %rbx + adc %rax, %r10 + mov -8(mp), %rax + adc %rdx, %r9 + mul q0 + add (up), %r10 + mov %r10, (up) + adc %rax, %r9 + adc %rdx, %r14 + mov %r10, q0 + imul u0inv, q0 + add %r9, 8(up) + adc $0, %r14 + mov %r14, -8(up) + + mov -24(mp), %rax + mul q0 + mov %rax, %rbx + mov %rdx, %r10 + mov -16(mp), %rax + mul q0 + xor R32(%r9), R32(%r9) + xor R32(%r14), R32(%r14) + add (up), %rbx + adc %rax, %r10 + 
mov -8(mp), %rax + adc %rdx, %r9 + mul q0 + add 8(up), %r10 + mov %r10, 8(up) + adc %rax, %r9 + adc %rdx, %r14 + mov %r10, q0 + imul u0inv, q0 + add %r9, 16(up) + adc $0, %r14 + mov %r14, (up) + + mov -24(mp), %rax + mul q0 + mov %rax, %rbx + mov %rdx, %r10 + mov -16(mp), %rax + mul q0 + xor R32(%r9), R32(%r9) + xor R32(%r14), R32(%r14) + add 8(up), %rbx + adc %rax, %r10 + mov -8(mp), %rax + adc %rdx, %r9 + mul q0 + add 16(up), %r10 + adc %rax, %r9 + adc %rdx, %r14 + add 24(up), %r9 + adc $0, %r14 + + xor R32(%rax), R32(%rax) + add -8(up), %r10 + adc (up), %r9 + adc 32(up), %r14 + mov %r10, (rp) + mov %r9, 8(rp) + mov %r14, 16(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + + ALIGN(16) +L(2m4): +L(lo2): mov (mp,nneg,8), %rax + mul q0 + xor R32(%r14), R32(%r14) + xor R32(%rbx), R32(%rbx) + mov %rax, %r10 + mov 8(mp,nneg,8), %rax + mov 24(up,nneg,8), %r15 + mov %rdx, %r9 + mul q0 + add 16(up,nneg,8), %r10 + adc %rax, %r9 + mov 16(mp,nneg,8), %rax + adc %rdx, %r14 + mul q0 + mov $0, R32(%r10) C xor? + lea 2(nneg), i + add %r9, %r15 + imul u0inv, %r15 + jmp L(e2) + + ALIGN(16) +L(li2): add %r10, (up,i,8) + adc %rax, %r9 + mov (mp,i,8), %rax + adc %rdx, %r14 + xor R32(%r10), R32(%r10) + mul q0 +L(e2): add %r9, 8(up,i,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(mp,i,8), %rax + mul q0 + add %r14, 16(up,i,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(mp,i,8), %rax + mul q0 + add %rbx, 24(up,i,8) + mov $0, R32(%r14) C zero + mov %r14, %rbx C zero + adc %rax, %r10 + mov 24(mp,i,8), %rax + mov %r14, %r9 C zero + adc %rdx, %r9 + mul q0 + add $4, i + js L(li2) + +L(le2): add %r10, (up) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(up) + adc $0, %rdx + mov %rdx, 16(up,nneg,8) C up[0] + add $8, up + mov %r15, q0 + dec n + jnz L(lo2) + + mov nneg, n + sar $2, n + lea 32(up,nneg,8), up + lea (up,nneg,8), vp + + mov -16(up), %r8 + mov -8(up), %r9 + add -16(vp), %r8 + adc -8(vp), %r9 + mov %r8, (rp) + mov %r9, 8(rp) + lea 16(rp), rp + jmp L(addx) + + + ALIGN(16) +L(1m4): +L(lo1): mov (mp,nneg,8), %rax + xor %r9, %r9 + xor R32(%rbx), R32(%rbx) + mul q0 + mov %rax, %r9 + mov 8(mp,nneg,8), %rax + mov 24(up,nneg,8), %r15 + mov %rdx, %r14 + mov $0, R32(%r10) C xor? 
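+C (mov $0 rather than xor: xor would rewrite the flags, so the
+C equivalent clears inside the adc chains below have to use mov; at
+C this spot, just ahead of a flag-clobbering mul, xor would apparently
+C do, hence the question.)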
+ mul q0 + add 16(up,nneg,8), %r9 + adc %rax, %r14 + adc %rdx, %rbx + mov 16(mp,nneg,8), %rax + mul q0 + lea 1(nneg), i + add %r14, %r15 + imul u0inv, %r15 + jmp L(e1) + + ALIGN(16) +L(li1): add %r10, (up,i,8) + adc %rax, %r9 + mov (mp,i,8), %rax + adc %rdx, %r14 + xor R32(%r10), R32(%r10) + mul q0 + add %r9, 8(up,i,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(mp,i,8), %rax + mul q0 +L(e1): add %r14, 16(up,i,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(mp,i,8), %rax + mul q0 + add %rbx, 24(up,i,8) + mov $0, R32(%r14) C zero + mov %r14, %rbx C zero + adc %rax, %r10 + mov 24(mp,i,8), %rax + mov %r14, %r9 C zero + adc %rdx, %r9 + mul q0 + add $4, i + js L(li1) + +L(le1): add %r10, (up) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(up) + adc $0, %rdx + mov %rdx, 16(up,nneg,8) C up[0] + add $8, up + mov %r15, q0 + dec n + jnz L(lo1) + + mov nneg, n + sar $2, n + lea 24(up,nneg,8), up + lea (up,nneg,8), vp + + mov -8(up), %r8 + add -8(vp), %r8 + mov %r8, (rp) + lea 8(rp), rp + jmp L(addx) + + + ALIGN(16) +L(0): +L(0m4): +L(lo0): mov (mp,nneg,8), %rax + mov nneg, i + mul q0 + xor R32(%r10), R32(%r10) + mov %rax, %r14 + mov %rdx, %rbx + mov 8(mp,nneg,8), %rax + mov 24(up,nneg,8), %r15 + mul q0 + add 16(up,nneg,8), %r14 + adc %rax, %rbx + adc %rdx, %r10 + add %rbx, %r15 + imul u0inv, %r15 + jmp L(e0) + + ALIGN(16) +L(li0): add %r10, (up,i,8) + adc %rax, %r9 + mov (mp,i,8), %rax + adc %rdx, %r14 + xor R32(%r10), R32(%r10) + mul q0 + add %r9, 8(up,i,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(mp,i,8), %rax + mul q0 + add %r14, 16(up,i,8) + adc %rax, %rbx + adc %rdx, %r10 +L(e0): mov 16(mp,i,8), %rax + mul q0 + add %rbx, 24(up,i,8) + mov $0, R32(%r14) C zero + mov %r14, %rbx C zero + adc %rax, %r10 + mov 24(mp,i,8), %rax + mov %r14, %r9 C zero + adc %rdx, %r9 + mul q0 + add $4, i + js L(li0) + +L(le0): add %r10, (up) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(up) + adc $0, %rdx + mov %rdx, 16(up,nneg,8) C up[0] + add $8, up + mov %r15, q0 + dec n + jnz L(lo0) + + mov nneg, n + sar $2, n + clc + lea 16(up,nneg,8), up + lea (up,nneg,8), vp + jmp L(addy) + + + ALIGN(16) +L(3m4): +L(lo3): mov (mp,nneg,8), %rax + mul q0 + mov %rax, %rbx + mov %rdx, %r10 + mov 8(mp,nneg,8), %rax + mov 24(up,nneg,8), %r15 + mul q0 + add 16(up,nneg,8), %rbx C result is zero, might carry + mov $0, R32(%rbx) C zero + mov %rbx, %r14 C zero + adc %rax, %r10 + mov 16(mp,nneg,8), %rax + mov %r14, %r9 C zero + adc %rdx, %r9 + add %r10, %r15 + mul q0 + lea 3(nneg), i + imul u0inv, %r15 +C jmp L(li3) + + ALIGN(16) +L(li3): add %r10, (up,i,8) + adc %rax, %r9 + mov (mp,i,8), %rax + adc %rdx, %r14 + xor R32(%r10), R32(%r10) + mul q0 + add %r9, 8(up,i,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(mp,i,8), %rax + mul q0 + add %r14, 16(up,i,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(mp,i,8), %rax + mul q0 + add %rbx, 24(up,i,8) + mov $0, R32(%r14) C zero + mov %r14, %rbx C zero + adc %rax, %r10 + mov 24(mp,i,8), %rax + mov %r14, %r9 C zero + adc %rdx, %r9 + mul q0 + add $4, i + js L(li3) + +L(le3): add %r10, (up) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(up) + adc $0, %rdx + mov %rdx, 16(up,nneg,8) C up[0] + mov %r15, q0 + lea 8(up), up + dec n + jnz L(lo3) + + +C ==== Addition code ==== + mov nneg, n + sar $2, n + lea 40(up,nneg,8), up + lea (up,nneg,8), vp + + mov -24(up), %r8 + mov -16(up), %r9 + mov -8(up), %r10 + add -24(vp), %r8 + adc -16(vp), %r9 + adc -8(vp), %r10 + mov %r8, (rp) + mov %r9, 8(rp) + mov %r10, 16(rp) + lea 24(rp), rp + +L(addx):inc n + jz L(ad3) + +L(addy):mov (up), %r8 + mov 8(up), %r9 + inc n + jmp L(mid) 
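+C What follows is a 4-way unrolled inline add_n over two n-limb halves
+C of the scratch operand (the unreduced high limbs and the carry words
+C saved by the outer loop), writing the n-limb result at rp.  L(addx),
+C L(addy) and L(mid) are entry bookkeeping for n mod 4, and the adc
+C into eax at L(ad3) materializes the return carry, which the caller
+C is expected to fold back, typically with a conditional subtract of
+C the modulus.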
+ +C ALIGN(16) +L(al3): adc (vp), %r8 + adc 8(vp), %r9 + adc 16(vp), %r10 + adc 24(vp), %r11 + mov %r8, (rp) + lea 32(up), up + mov %r9, 8(rp) + mov %r10, 16(rp) + inc n + mov %r11, 24(rp) + lea 32(vp), vp + mov (up), %r8 + mov 8(up), %r9 + lea 32(rp), rp +L(mid): mov 16(up), %r10 + mov 24(up), %r11 + jnz L(al3) + +L(ae3): adc (vp), %r8 + adc 8(vp), %r9 + adc 16(vp), %r10 + adc 24(vp), %r11 + mov %r8, (rp) + mov %r9, 8(rp) + mov %r10, 16(rp) + mov %r11, 24(rp) + +L(ad3): mov R32(n), R32(%rax) C zero + adc R32(%rax), R32(%rax) + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm new file mode 100644 index 0000000..60cf945 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm @@ -0,0 +1,807 @@ +dnl AMD64 mpn_sqr_basecase. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C The inner loops of this code are the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C NOTES +C * There is a major stupidity in that we call mpn_mul_1 initially, for a +C large trip count. Instead, we should follow the generic/sqr_basecase.c +C code which uses addmul_2s from the start, conditionally leaving a 1x1 +C multiply to the end. (In assembly code, one would stop invoking +C addmul_2s loops when perhaps 3x2s respectively a 2x2s remains.) +C * Another stupidity is in the sqr_diag_addlsh1 code. It does not need to +C save/restore carry, instead it can propagate into the high product word. +C * Align more labels, should shave off a few cycles. +C * We can safely use 32-bit size operations, since operands with (2^32) +C limbs will lead to non-termination in practice. +C * The jump table could probably be optimized, at least for non-pic. +C * The special code for n <= 4 was quickly written. It is probably too +C large and unnecessarily slow. +C * Consider combining small cases code so that the n=k-1 code jumps into the +C middle of the n=k code. +C * Avoid saving registers for small cases code. 
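+C * For reference, the decomposition implemented here is classical basecase
+C   squaring: form the off-diagonal triangle T = sum_{i<j} u_i u_j B^(i+j)
+C   with a mul_1 pass and addmul passes, then compute 2*T + sum_i u_i^2 B^(2i)
+C   in a single pass (the sqr_diag_addlsh1 loop at the end of this file).
+C   A minimal C sketch of that plan, assuming 64-bit limbs and
+C   unsigned __int128; all names below are this note's own, not GMP's API:
+C
+C       typedef uint64_t limb;
+C
+C       /* rp[0..n-1] += up[0..n-1] * v, returning the carry limb */
+C       static limb addmul_1_sketch (limb *rp, const limb *up, int n, limb v)
+C       {
+C         limb cy = 0;
+C         for (int i = 0; i < n; i++)
+C           {
+C             unsigned __int128 t = (unsigned __int128) up[i] * v + rp[i] + cy;
+C             rp[i] = (limb) t;
+C             cy = (limb) (t >> 64);
+C           }
+C         return cy;
+C       }
+C
+C       /* rp[0..2n-1] = up[0..n-1]^2, n >= 1 */
+C       static void sqr_basecase_sketch (limb *rp, const limb *up, int n)
+C       {
+C         for (int i = 0; i < 2 * n; i++)
+C           rp[i] = 0;
+C         /* off-diagonal triangle: u_i * u_j at weight B^(i+j), i < j */
+C         for (int i = 0; i < n - 1; i++)
+C           rp[n + i] = addmul_1_sketch (rp + 2*i + 1, up + i + 1,
+C                                        n - 1 - i, up[i]);
+C         /* double the triangle and add the diagonal squares; the assembly
+C            fuses both steps in its sqr_diag_addlsh1 loop */
+C         limb shcy = 0, adcy = 0;
+C         for (int i = 0; i < 2 * n; i++)
+C           {
+C             unsigned __int128 sq = (unsigned __int128) up[i/2] * up[i/2];
+C             limb d = (i & 1) ? (limb) (sq >> 64) : (limb) sq;
+C             limb t = rp[i];
+C             limb sh = (limb) ((t << 1) | shcy);
+C             shcy = t >> 63;
+C             unsigned __int128 s = (unsigned __int128) sh + d + adcy;
+C             rp[i] = (limb) s;
+C             adcy = (limb) (s >> 64);
+C           }
+C       }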
+C * Needed variables: +C n r11 input size +C i r8 work left, initially n +C j r9 inner loop count +C r15 unused +C v0 r13 +C v1 r14 +C rp rdi +C up rsi +C w0 rbx +C w1 rcx +C w2 rbp +C w3 r10 +C tp r12 +C lo rax +C hi rdx +C rsp + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n_param', `%rdx') + +define(`n', `%r11') +define(`tp', `%r12') +define(`i', `%r8') +define(`j', `%r9') +define(`v0', `%r13') +define(`v1', `%r14') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sqr_basecase) + FUNC_ENTRY(3) + mov R32(n_param), R32(%rcx) + mov R32(n_param), R32(n) C free original n register (rdx) + + add $-40, %rsp + + and $3, R32(%rcx) + cmp $4, R32(n_param) + lea 4(%rcx), %r8 + + mov %rbx, 32(%rsp) + mov %rbp, 24(%rsp) + mov %r12, 16(%rsp) + mov %r13, 8(%rsp) + mov %r14, (%rsp) + + cmovg %r8, %rcx + + lea L(tab)(%rip), %rax +ifdef(`PIC', +` movslq (%rax,%rcx,4), %r10 + add %r10, %rax + jmp *%rax +',` + jmp *(%rax,%rcx,8) +') + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(4), L(tab)) + JMPENT( L(1), L(tab)) + JMPENT( L(2), L(tab)) + JMPENT( L(3), L(tab)) + JMPENT( L(0m4), L(tab)) + JMPENT( L(1m4), L(tab)) + JMPENT( L(2m4), L(tab)) + JMPENT( L(3m4), L(tab)) + TEXT + +L(1): mov (up), %rax + mul %rax + add $40, %rsp + mov %rax, (rp) + mov %rdx, 8(rp) + FUNC_EXIT() + ret + +L(2): mov (up), %rax + mov %rax, %r8 + mul %rax + mov 8(up), %r11 + mov %rax, (rp) + mov %r11, %rax + mov %rdx, %r9 + mul %rax + add $40, %rsp + mov %rax, %r10 + mov %r11, %rax + mov %rdx, %r11 + mul %r8 + xor %r8, %r8 + add %rax, %r9 + adc %rdx, %r10 + adc %r8, %r11 + add %rax, %r9 + mov %r9, 8(rp) + adc %rdx, %r10 + mov %r10, 16(rp) + adc %r8, %r11 + mov %r11, 24(rp) + FUNC_EXIT() + ret + +L(3): mov (up), %rax + mov %rax, %r10 + mul %rax + mov 8(up), %r11 + mov %rax, (rp) + mov %r11, %rax + mov %rdx, 8(rp) + mul %rax + mov 16(up), %rcx + mov %rax, 16(rp) + mov %rcx, %rax + mov %rdx, 24(rp) + mul %rax + mov %rax, 32(rp) + mov %rdx, 40(rp) + + mov %r11, %rax + mul %r10 + mov %rax, %r8 + mov %rcx, %rax + mov %rdx, %r9 + mul %r10 + xor %r10, %r10 + add %rax, %r9 + mov %r11, %rax + mov %r10, %r11 + adc %rdx, %r10 + + mul %rcx + add $40, %rsp + add %rax, %r10 + adc %r11, %rdx + add %r8, %r8 + adc %r9, %r9 + adc %r10, %r10 + adc %rdx, %rdx + adc %r11, %r11 + add %r8, 8(rp) + adc %r9, 16(rp) + adc %r10, 24(rp) + adc %rdx, 32(rp) + adc %r11, 40(rp) + FUNC_EXIT() + ret + +L(4): mov (up), %rax + mov %rax, %r11 + mul %rax + mov 8(up), %rbx + mov %rax, (rp) + mov %rbx, %rax + mov %rdx, 8(rp) + mul %rax + mov %rax, 16(rp) + mov %rdx, 24(rp) + mov 16(up), %rax + mul %rax + mov %rax, 32(rp) + mov %rdx, 40(rp) + mov 24(up), %rax + mul %rax + mov %rax, 48(rp) + mov %rbx, %rax + mov %rdx, 56(rp) + + mul %r11 + add $32, %rsp + mov %rax, %r8 + mov %rdx, %r9 + mov 16(up), %rax + mul %r11 + xor %r10, %r10 + add %rax, %r9 + adc %rdx, %r10 + mov 24(up), %rax + mul %r11 + xor %r11, %r11 + add %rax, %r10 + adc %rdx, %r11 + mov 16(up), %rax + mul %rbx + xor %rcx, %rcx + add %rax, %r10 + adc %rdx, %r11 + adc $0, %rcx + mov 24(up), %rax + mul %rbx + pop %rbx + add %rax, %r11 + adc %rdx, %rcx + mov 16(up), %rdx + mov 24(up), %rax + mul %rdx + add %rax, %rcx + adc $0, %rdx + + add %r8, %r8 + adc %r9, %r9 + adc %r10, %r10 + adc %r11, %r11 + adc %rcx, %rcx + mov $0, R32(%rax) + adc %rdx, %rdx + + adc %rax, %rax + add %r8, 8(rp) + adc %r9, 16(rp) + adc %r10, 24(rp) + adc %r11, 32(rp) + adc %rcx, 40(rp) + adc %rdx, 
48(rp) + adc %rax, 56(rp) + FUNC_EXIT() + ret + + +L(0m4): + lea -16(rp,n,8), tp C point tp in middle of result operand + mov (up), v0 + mov 8(up), %rax + lea (up,n,8), up C point up at end of input operand + + lea -4(n), i +C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1]) + xor R32(j), R32(j) + sub n, j + + mul v0 + xor R32(w2), R32(w2) + mov %rax, w0 + mov 16(up,j,8), %rax + mov %rdx, w3 + jmp L(L3) + + ALIGN(16) +L(mul_1_m3_top): + add %rax, w2 + mov w3, (tp,j,8) + mov (up,j,8), %rax + adc %rdx, w1 + xor R32(w0), R32(w0) + mul v0 + xor R32(w3), R32(w3) + mov w2, 8(tp,j,8) + add %rax, w1 + adc %rdx, w0 + mov 8(up,j,8), %rax + mov w1, 16(tp,j,8) + xor R32(w2), R32(w2) + mul v0 + add %rax, w0 + mov 16(up,j,8), %rax + adc %rdx, w3 +L(L3): xor R32(w1), R32(w1) + mul v0 + add %rax, w3 + mov 24(up,j,8), %rax + adc %rdx, w2 + mov w0, 24(tp,j,8) + mul v0 + add $4, j + js L(mul_1_m3_top) + + add %rax, w2 + mov w3, (tp) + adc %rdx, w1 + mov w2, 8(tp) + mov w1, 16(tp) + + lea eval(2*8)(tp), tp C tp += 2 + lea -8(up), up + jmp L(dowhile) + + +L(1m4): + lea 8(rp,n,8), tp C point tp in middle of result operand + mov (up), v0 C u0 + mov 8(up), %rax C u1 + lea 8(up,n,8), up C point up at end of input operand + + lea -3(n), i +C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1) + lea -3(n), j + neg j + + mov %rax, v1 C u1 + mul v0 C u0 * u1 + mov %rdx, w1 + xor R32(w2), R32(w2) + mov %rax, 8(rp) + jmp L(m0) + + ALIGN(16) +L(mul_2_m0_top): + mul v1 + add %rax, w0 + adc %rdx, w1 + mov -24(up,j,8), %rax + mov $0, R32(w2) + mul v0 + add %rax, w0 + mov -24(up,j,8), %rax + adc %rdx, w1 + adc $0, R32(w2) + mul v1 C v1 * u0 + add %rax, w1 + mov w0, -24(tp,j,8) + adc %rdx, w2 +L(m0): mov -16(up,j,8), %rax C u2, u6 ... + mul v0 C u0 * u2 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + mov -16(up,j,8), %rax + adc $0, R32(w3) + mov $0, R32(w0) + mov w1, -16(tp,j,8) + mul v1 + add %rax, w2 + mov -8(up,j,8), %rax + adc %rdx, w3 + mov $0, R32(w1) + mul v0 + add %rax, w2 + mov -8(up,j,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + add %rax, w3 + mov w2, -8(tp,j,8) + adc %rdx, w0 +L(m2x): mov (up,j,8), %rax + mul v0 + add %rax, w3 + adc %rdx, w0 + adc $0, R32(w1) + add $4, j + mov -32(up,j,8), %rax + mov w3, -32(tp,j,8) + js L(mul_2_m0_top) + + mul v1 + add %rax, w0 + adc %rdx, w1 + mov w0, -8(tp) + mov w1, (tp) + + lea -16(up), up + lea eval(3*8-24)(tp), tp C tp += 3 + jmp L(dowhile_end) + + +L(2m4): + lea -16(rp,n,8), tp C point tp in middle of result operand + mov (up), v0 + mov 8(up), %rax + lea (up,n,8), up C point up at end of input operand + + lea -4(n), i +C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i]) + lea -2(n), j + neg j + + mul v0 + mov %rax, w2 + mov (up,j,8), %rax + mov %rdx, w1 + jmp L(L1) + + ALIGN(16) +L(mul_1_m1_top): + add %rax, w2 + mov w3, (tp,j,8) + mov (up,j,8), %rax + adc %rdx, w1 +L(L1): xor R32(w0), R32(w0) + mul v0 + xor R32(w3), R32(w3) + mov w2, 8(tp,j,8) + add %rax, w1 + adc %rdx, w0 + mov 8(up,j,8), %rax + mov w1, 16(tp,j,8) + xor R32(w2), R32(w2) + mul v0 + add %rax, w0 + mov 16(up,j,8), %rax + adc %rdx, w3 + xor R32(w1), R32(w1) + mul v0 + add %rax, w3 + mov 24(up,j,8), %rax + adc %rdx, w2 + mov w0, 24(tp,j,8) + mul v0 + add $4, j + js L(mul_1_m1_top) + + add %rax, w2 + mov w3, (tp) + adc %rdx, w1 + mov w2, 8(tp) + mov w1, 16(tp) + + lea eval(2*8)(tp), tp C tp += 2 + lea -8(up), up + jmp L(dowhile_mid) + + +L(3m4): + lea 8(rp,n,8), tp C point tp in middle of result operand + mov (up), v0 C u0 + mov 8(up), %rax C u1 + lea 8(up,n,8), up C point up at end of input 
operand + + lea -5(n), i +C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i) + lea -1(n), j + neg j + + mov %rax, v1 C u1 + mul v0 C u0 * u1 + mov %rdx, w3 + xor R32(w0), R32(w0) + xor R32(w1), R32(w1) + mov %rax, 8(rp) + jmp L(m2) + + ALIGN(16) +L(mul_2_m2_top): + mul v1 + add %rax, w0 + adc %rdx, w1 + mov -24(up,j,8), %rax + mov $0, R32(w2) + mul v0 + add %rax, w0 + mov -24(up,j,8), %rax + adc %rdx, w1 + adc $0, R32(w2) + mul v1 C v1 * u0 + add %rax, w1 + mov w0, -24(tp,j,8) + adc %rdx, w2 + mov -16(up,j,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + mov -16(up,j,8), %rax + adc $0, R32(w3) + mov $0, R32(w0) + mov w1, -16(tp,j,8) + mul v1 + add %rax, w2 + mov -8(up,j,8), %rax + adc %rdx, w3 + mov $0, R32(w1) + mul v0 + add %rax, w2 + mov -8(up,j,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + add %rax, w3 + mov w2, -8(tp,j,8) + adc %rdx, w0 +L(m2): mov (up,j,8), %rax + mul v0 + add %rax, w3 + adc %rdx, w0 + adc $0, R32(w1) + add $4, j + mov -32(up,j,8), %rax + mov w3, -32(tp,j,8) + js L(mul_2_m2_top) + + mul v1 + add %rax, w0 + adc %rdx, w1 + mov w0, -8(tp) + mov w1, (tp) + + lea -16(up), up + jmp L(dowhile_mid) + +L(dowhile): +C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i) + lea 4(i), j + neg j + + mov 16(up,j,8), v0 + mov 24(up,j,8), v1 + mov 24(up,j,8), %rax + mul v0 + xor R32(w3), R32(w3) + add %rax, 24(tp,j,8) + adc %rdx, w3 + xor R32(w0), R32(w0) + xor R32(w1), R32(w1) + jmp L(am2) + + ALIGN(16) +L(addmul_2_m2_top): + add w3, (tp,j,8) + adc %rax, w0 + mov 8(up,j,8), %rax + adc %rdx, w1 + mov $0, R32(w2) + mul v0 + add %rax, w0 + mov 8(up,j,8), %rax + adc %rdx, w1 + adc $0, R32(w2) + mul v1 C v1 * u0 + add w0, 8(tp,j,8) + adc %rax, w1 + adc %rdx, w2 + mov 16(up,j,8), %rax + mov $0, R32(w3) + mul v0 C v0 * u1 + add %rax, w1 + mov 16(up,j,8), %rax + adc %rdx, w2 + adc $0, R32(w3) + mul v1 C v1 * u1 + add w1, 16(tp,j,8) + adc %rax, w2 + mov 24(up,j,8), %rax + adc %rdx, w3 + mul v0 + mov $0, R32(w0) + add %rax, w2 + adc %rdx, w3 + mov $0, R32(w1) + mov 24(up,j,8), %rax + adc $0, R32(w0) + mul v1 + add w2, 24(tp,j,8) + adc %rax, w3 + adc %rdx, w0 +L(am2): mov 32(up,j,8), %rax + mul v0 + add %rax, w3 + mov 32(up,j,8), %rax + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add $4, j + js L(addmul_2_m2_top) + + add w3, (tp) + adc %rax, w0 + adc %rdx, w1 + mov w0, 8(tp) + mov w1, 16(tp) + + lea eval(2*8)(tp), tp C tp += 2 + + add $-2, R32(i) C i -= 2 + +L(dowhile_mid): +C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i) + lea 2(i), j + neg j + + mov (up,j,8), v0 + mov 8(up,j,8), v1 + mov 8(up,j,8), %rax + mul v0 + xor R32(w1), R32(w1) + add %rax, 8(tp,j,8) + adc %rdx, w1 + xor R32(w2), R32(w2) + jmp L(20) + + ALIGN(16) +L(addmul_2_m0_top): + add w3, (tp,j,8) + adc %rax, w0 + mov 8(up,j,8), %rax + adc %rdx, w1 + mov $0, R32(w2) + mul v0 + add %rax, w0 + mov 8(up,j,8), %rax + adc %rdx, w1 + adc $0, R32(w2) + mul v1 C v1 * u0 + add w0, 8(tp,j,8) + adc %rax, w1 + adc %rdx, w2 +L(20): mov 16(up,j,8), %rax + mov $0, R32(w3) + mul v0 C v0 * u1 + add %rax, w1 + mov 16(up,j,8), %rax + adc %rdx, w2 + adc $0, R32(w3) + mul v1 C v1 * u1 + add w1, 16(tp,j,8) + adc %rax, w2 + mov 24(up,j,8), %rax + adc %rdx, w3 + mul v0 + mov $0, R32(w0) + add %rax, w2 + adc %rdx, w3 + mov $0, R32(w1) + mov 24(up,j,8), %rax + adc $0, R32(w0) + mul v1 + add w2, 24(tp,j,8) + adc %rax, w3 + adc %rdx, w0 + mov 32(up,j,8), %rax + mul v0 + add %rax, w3 + mov 32(up,j,8), %rax + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add $4, j + js L(addmul_2_m0_top) + + add w3, (tp) + adc %rax, 
w0 + adc %rdx, w1 + mov w0, 8(tp) + mov w1, 16(tp) + + lea eval(2*8)(tp), tp C tp += 2 +L(dowhile_end): + + add $-2, R32(i) C i -= 2 + jne L(dowhile) + +C Function mpn_addmul_2s_2 + mov -16(up), v0 + mov -8(up), v1 + mov -8(up), %rax + mul v0 + xor R32(w3), R32(w3) + add %rax, -8(tp) + adc %rdx, w3 + xor R32(w0), R32(w0) + xor R32(w1), R32(w1) + mov (up), %rax + mul v0 + add %rax, w3 + mov (up), %rax + adc %rdx, w0 + mul v1 + add w3, (tp) + adc %rax, w0 + adc %rdx, w1 + mov w0, 8(tp) + mov w1, 16(tp) + +C Function mpn_sqr_diag_addlsh1 + lea -4(n,n), j + + mov 8(rp), %r11 + lea -8(up), up + lea (rp,j,8), rp + neg j + mov (up,j,4), %rax + mul %rax + test $2, R8(j) + jnz L(odd) + +L(evn): add %r11, %r11 + sbb R32(%rbx), R32(%rbx) C save CF + add %rdx, %r11 + mov %rax, (rp,j,8) + jmp L(d0) + +L(odd): add %r11, %r11 + sbb R32(%rbp), R32(%rbp) C save CF + add %rdx, %r11 + mov %rax, (rp,j,8) + lea -2(j), j + jmp L(d1) + + ALIGN(16) +L(top): mov (up,j,4), %rax + mul %rax + add R32(%rbp), R32(%rbp) C restore carry + adc %rax, %r10 + adc %rdx, %r11 + mov %r10, (rp,j,8) +L(d0): mov %r11, 8(rp,j,8) + mov 16(rp,j,8), %r10 + adc %r10, %r10 + mov 24(rp,j,8), %r11 + adc %r11, %r11 + nop + sbb R32(%rbp), R32(%rbp) C save CF + mov 8(up,j,4), %rax + mul %rax + add R32(%rbx), R32(%rbx) C restore carry + adc %rax, %r10 + adc %rdx, %r11 + mov %r10, 16(rp,j,8) +L(d1): mov %r11, 24(rp,j,8) + mov 32(rp,j,8), %r10 + adc %r10, %r10 + mov 40(rp,j,8), %r11 + adc %r11, %r11 + sbb R32(%rbx), R32(%rbx) C save CF + add $4, j + js L(top) + + mov (up), %rax + mul %rax + add R32(%rbp), R32(%rbp) C restore carry + adc %rax, %r10 + adc %rdx, %r11 + mov %r10, (rp) + mov %r11, 8(rp) + mov 16(rp), %r10 + adc %r10, %r10 + sbb R32(%rbp), R32(%rbp) C save CF + neg R32(%rbp) + mov 8(up), %rax + mul %rax + add R32(%rbx), R32(%rbx) C restore carry + adc %rax, %r10 + adc %rbp, %rdx + mov %r10, 16(rp) + mov %rdx, 24(rp) + + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/logops_n.asm b/gmp-6.3.0/mpn/x86_64/logops_n.asm new file mode 100644 index 0000000..e25854d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/logops_n.asm @@ -0,0 +1,260 @@ +dnl AMD64 logops. + +dnl Copyright 2004-2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C c/l c/l c/l good +C var-1 var-2 var-3 for cpu? 
+C AMD K8,K9 1.5 1.5 1.5 y +C AMD K10 1.5 1.5 1.5 y +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD bt1 2.67 ~2.79 ~2.67 +C AMD bt2 2.0 2.28 2.28 y +C AMD zen 1.5 1.5 1.5 = +C Intel P4 2.8 3.35 3.6 +C Intel PNR 2.0 2.0 2.0 = +C Intel NHM 2.0 2.0 2.0 = +C Intel SBR 1.5 1.75 1.75 n +C Intel IBR 1.48 1.71 1.72 n +C Intel HWL 1.5 1.5 1.5 n +C Intel BWL 1.5 1.5 1.5 n +C Intel SKL 1.5 1.5 1.5 n +C Intel atom 3.82 3.82 3.82 n +C Intel SLM 3.0 3.0 3.0 = +C VIA nano 3.25 + +ifdef(`OPERATION_and_n',` + define(`func',`mpn_and_n') + define(`VARIANT_1') + define(`LOGOP',`and')') +ifdef(`OPERATION_andn_n',` + define(`func',`mpn_andn_n') + define(`VARIANT_2') + define(`LOGOP',`and')') +ifdef(`OPERATION_nand_n',` + define(`func',`mpn_nand_n') + define(`VARIANT_3') + define(`LOGOP',`and')') +ifdef(`OPERATION_ior_n',` + define(`func',`mpn_ior_n') + define(`VARIANT_1') + define(`LOGOP',`or')') +ifdef(`OPERATION_iorn_n',` + define(`func',`mpn_iorn_n') + define(`VARIANT_2') + define(`LOGOP',`or')') +ifdef(`OPERATION_nior_n',` + define(`func',`mpn_nior_n') + define(`VARIANT_3') + define(`LOGOP',`or')') +ifdef(`OPERATION_xor_n',` + define(`func',`mpn_xor_n') + define(`VARIANT_1') + define(`LOGOP',`xor')') +ifdef(`OPERATION_xnor_n',` + define(`func',`mpn_xnor_n') + define(`VARIANT_2') + define(`LOGOP',`xor')') + + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +C INPUT PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`vp',`%rdx') +define(`n',`%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + +ifdef(`VARIANT_1',` + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + mov (vp), %r8 + mov R32(%rcx), R32(%rax) + lea (vp,n,8), vp + lea (up,n,8), up + lea (rp,n,8), rp + neg n + and $3, R32(%rax) + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): LOGOP (up,n,8), %r8 + mov %r8, (rp,n,8) + dec n + jmp L(e11) +L(b10): add $-2, n + jmp L(e10) +L(b01): LOGOP (up,n,8), %r8 + mov %r8, (rp,n,8) + inc n + jz L(ret) + +L(top): mov (vp,n,8), %r8 +L(b00): mov 8(vp,n,8), %r9 + LOGOP (up,n,8), %r8 + LOGOP 8(up,n,8), %r9 + nop C K8/K9/K10 concession + mov %r8, (rp,n,8) + mov %r9, 8(rp,n,8) +L(e11): mov 16(vp,n,8), %r8 +L(e10): mov 24(vp,n,8), %r9 + LOGOP 16(up,n,8), %r8 + LOGOP 24(up,n,8), %r9 + mov %r8, 16(rp,n,8) + mov %r9, 24(rp,n,8) + add $4, n + jnc L(top) + +L(ret): FUNC_EXIT() + ret +EPILOGUE() +') + +ifdef(`VARIANT_2',` + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + mov (vp), %r8 + not %r8 + mov R32(%rcx), R32(%rax) + lea (vp,n,8), vp + lea (up,n,8), up + lea (rp,n,8), rp + neg n + and $3, R32(%rax) + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): LOGOP (up,n,8), %r8 + mov %r8, (rp,n,8) + dec n + jmp L(e11) +L(b10): add $-2, n + jmp L(e10) + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +L(b01): LOGOP (up,n,8), %r8 + mov %r8, (rp,n,8) + inc n + jz L(ret) + +L(top): mov (vp,n,8), %r8 + not %r8 +L(b00): mov 8(vp,n,8), %r9 + not %r9 + LOGOP (up,n,8), %r8 + LOGOP 8(up,n,8), %r9 + mov %r8, (rp,n,8) + mov %r9, 8(rp,n,8) +L(e11): mov 16(vp,n,8), %r8 + not %r8 +L(e10): mov 24(vp,n,8), %r9 + not %r9 + LOGOP 16(up,n,8), %r8 + LOGOP 24(up,n,8), %r9 + mov %r8, 16(rp,n,8) + mov %r9, 24(rp,n,8) + add $4, n + jnc L(top) + +L(ret): FUNC_EXIT() + ret +EPILOGUE() +') + +ifdef(`VARIANT_3',` + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + mov (vp), %r8 + mov R32(%rcx), R32(%rax) + lea (vp,n,8), vp + lea (up,n,8), up + lea (rp,n,8), rp + neg n + and $3, R32(%rax) + je L(b00) + cmp $2, 
R32(%rax) + jc L(b01) + je L(b10) + +L(b11): LOGOP (up,n,8), %r8 + not %r8 + mov %r8, (rp,n,8) + dec n + jmp L(e11) +L(b10): add $-2, n + jmp L(e10) + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +L(b01): LOGOP (up,n,8), %r8 + not %r8 + mov %r8, (rp,n,8) + inc n + jz L(ret) + +L(top): mov (vp,n,8), %r8 +L(b00): mov 8(vp,n,8), %r9 + LOGOP (up,n,8), %r8 + not %r8 + LOGOP 8(up,n,8), %r9 + not %r9 + mov %r8, (rp,n,8) + mov %r9, 8(rp,n,8) +L(e11): mov 16(vp,n,8), %r8 +L(e10): mov 24(vp,n,8), %r9 + LOGOP 16(up,n,8), %r8 + not %r8 + LOGOP 24(up,n,8), %r9 + not %r9 + mov %r8, 16(rp,n,8) + mov %r9, 24(rp,n,8) + add $4, n + jnc L(top) + +L(ret): FUNC_EXIT() + ret +EPILOGUE() +') diff --git a/gmp-6.3.0/mpn/x86_64/lshift.asm b/gmp-6.3.0/mpn/x86_64/lshift.asm new file mode 100644 index 0000000..fff3152 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/lshift.asm @@ -0,0 +1,172 @@ +dnl AMD64 mpn_lshift -- mpn left shift. + +dnl Copyright 2003, 2005, 2007, 2009, 2011, 2012, 2018 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb cnt=1 +C AMD K8,K9 2.375 1.375 +C AMD K10 2.375 1.375 +C Intel P4 8 10.5 +C Intel core2 2.11 4.28 +C Intel corei ? ? +C Intel atom 5.75 3.5 +C VIA nano 3.5 2.25 + + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_lshift) + FUNC_ENTRY(4) + neg R32(%rcx) C put rsh count in cl + mov -8(up,n,8), %rax + shr R8(%rcx), %rax C function return value + + neg R32(%rcx) C put lsh count in cl + lea 1(n), R32(%r8) + and $3, R32(%r8) + je L(rlx) C jump for n = 3, 7, 11, ... + + dec R32(%r8) + jne L(1) +C n = 4, 8, 12, ... + mov -8(up,n,8), %r10 + shl R8(%rcx), %r10 + neg R32(%rcx) C put rsh count in cl + mov -16(up,n,8), %r8 + shr R8(%rcx), %r8 + or %r8, %r10 + mov %r10, -8(rp,n,8) + dec n + jmp L(rll) + +L(1): dec R32(%r8) + je L(1x) C jump for n = 1, 5, 9, 13, ... +C n = 2, 6, 10, 16, ... 
+ mov -8(up,n,8), %r10 + shl R8(%rcx), %r10 + neg R32(%rcx) C put rsh count in cl + mov -16(up,n,8), %r8 + shr R8(%rcx), %r8 + or %r8, %r10 + mov %r10, -8(rp,n,8) + dec n + neg R32(%rcx) C put lsh count in cl +L(1x): + cmp $1, n + je L(ast) + mov -8(up,n,8), %r10 + shl R8(%rcx), %r10 + mov -16(up,n,8), %r11 + shl R8(%rcx), %r11 + neg R32(%rcx) C put rsh count in cl + mov -16(up,n,8), %r8 + mov -24(up,n,8), %r9 + shr R8(%rcx), %r8 + or %r8, %r10 + shr R8(%rcx), %r9 + or %r9, %r11 + mov %r10, -8(rp,n,8) + mov %r11, -16(rp,n,8) + sub $2, n + +L(rll): neg R32(%rcx) C put lsh count in cl +L(rlx): mov -8(up,n,8), %r10 + shl R8(%rcx), %r10 + mov -16(up,n,8), %r11 + shl R8(%rcx), %r11 + + sub $4, n C 4 + jb L(end) C 2 + ALIGN(16) +L(top): + C finish stuff from lsh block + neg R32(%rcx) C put rsh count in cl + mov 16(up,n,8), %r8 + mov 8(up,n,8), %r9 + shr R8(%rcx), %r8 + or %r8, %r10 + shr R8(%rcx), %r9 + or %r9, %r11 + mov %r10, 24(rp,n,8) + mov %r11, 16(rp,n,8) + C start two new rsh + mov 0(up,n,8), %r8 + mov -8(up,n,8), %r9 + shr R8(%rcx), %r8 + shr R8(%rcx), %r9 + + C finish stuff from rsh block + neg R32(%rcx) C put lsh count in cl + mov 8(up,n,8), %r10 + mov 0(up,n,8), %r11 + shl R8(%rcx), %r10 + or %r10, %r8 + shl R8(%rcx), %r11 + or %r11, %r9 + mov %r8, 8(rp,n,8) + mov %r9, 0(rp,n,8) + C start two new lsh + mov -8(up,n,8), %r10 + mov -16(up,n,8), %r11 + shl R8(%rcx), %r10 + shl R8(%rcx), %r11 + + sub $4, n + jae L(top) C 2 +L(end): + neg R32(%rcx) C put rsh count in cl + mov 8(up), %r8 + shr R8(%rcx), %r8 + or %r8, %r10 + mov (up), %r9 + shr R8(%rcx), %r9 + or %r9, %r11 + mov %r10, 16(rp) + mov %r11, 8(rp) + + neg R32(%rcx) C put lsh count in cl +L(ast): mov (up), %r10 + shl R8(%rcx), %r10 + mov %r10, (rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/lshiftc.asm new file mode 100644 index 0000000..c4ba04a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/lshiftc.asm @@ -0,0 +1,182 @@ +dnl AMD64 mpn_lshiftc -- mpn left shift with complement. + +dnl Copyright 2003, 2005, 2006, 2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 2.75 +C AMD K10 2.75 +C Intel P4 ? +C Intel core2 ? +C Intel corei ? +C Intel atom ? 
+C VIA nano 3.75 + + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_lshiftc) + FUNC_ENTRY(4) + neg R32(%rcx) C put rsh count in cl + mov -8(up,n,8), %rax + shr R8(%rcx), %rax C function return value + + neg R32(%rcx) C put lsh count in cl + lea 1(n), R32(%r8) + and $3, R32(%r8) + je L(rlx) C jump for n = 3, 7, 11, ... + + dec R32(%r8) + jne L(1) +C n = 4, 8, 12, ... + mov -8(up,n,8), %r10 + shl R8(%rcx), %r10 + neg R32(%rcx) C put rsh count in cl + mov -16(up,n,8), %r8 + shr R8(%rcx), %r8 + or %r8, %r10 + not %r10 + mov %r10, -8(rp,n,8) + dec n + jmp L(rll) + +L(1): dec R32(%r8) + je L(1x) C jump for n = 1, 5, 9, 13, ... +C n = 2, 6, 10, 16, ... + mov -8(up,n,8), %r10 + shl R8(%rcx), %r10 + neg R32(%rcx) C put rsh count in cl + mov -16(up,n,8), %r8 + shr R8(%rcx), %r8 + or %r8, %r10 + not %r10 + mov %r10, -8(rp,n,8) + dec n + neg R32(%rcx) C put lsh count in cl +L(1x): + cmp $1, n + je L(ast) + mov -8(up,n,8), %r10 + shl R8(%rcx), %r10 + mov -16(up,n,8), %r11 + shl R8(%rcx), %r11 + neg R32(%rcx) C put rsh count in cl + mov -16(up,n,8), %r8 + mov -24(up,n,8), %r9 + shr R8(%rcx), %r8 + or %r8, %r10 + shr R8(%rcx), %r9 + or %r9, %r11 + not %r10 + not %r11 + mov %r10, -8(rp,n,8) + mov %r11, -16(rp,n,8) + sub $2, n + +L(rll): neg R32(%rcx) C put lsh count in cl +L(rlx): mov -8(up,n,8), %r10 + shl R8(%rcx), %r10 + mov -16(up,n,8), %r11 + shl R8(%rcx), %r11 + + sub $4, n C 4 + jb L(end) C 2 + ALIGN(16) +L(top): + C finish stuff from lsh block + neg R32(%rcx) C put rsh count in cl + mov 16(up,n,8), %r8 + mov 8(up,n,8), %r9 + shr R8(%rcx), %r8 + or %r8, %r10 + shr R8(%rcx), %r9 + or %r9, %r11 + not %r10 + not %r11 + mov %r10, 24(rp,n,8) + mov %r11, 16(rp,n,8) + C start two new rsh + mov 0(up,n,8), %r8 + mov -8(up,n,8), %r9 + shr R8(%rcx), %r8 + shr R8(%rcx), %r9 + + C finish stuff from rsh block + neg R32(%rcx) C put lsh count in cl + mov 8(up,n,8), %r10 + mov 0(up,n,8), %r11 + shl R8(%rcx), %r10 + or %r10, %r8 + shl R8(%rcx), %r11 + or %r11, %r9 + not %r8 + not %r9 + mov %r8, 8(rp,n,8) + mov %r9, 0(rp,n,8) + C start two new lsh + mov -8(up,n,8), %r10 + mov -16(up,n,8), %r11 + shl R8(%rcx), %r10 + shl R8(%rcx), %r11 + + sub $4, n + jae L(top) C 2 +L(end): + neg R32(%rcx) C put rsh count in cl + mov 8(up), %r8 + shr R8(%rcx), %r8 + or %r8, %r10 + mov (up), %r9 + shr R8(%rcx), %r9 + or %r9, %r11 + not %r10 + not %r11 + mov %r10, 16(rp) + mov %r11, 8(rp) + + neg R32(%rcx) C put lsh count in cl +L(ast): mov (up), %r10 + shl R8(%rcx), %r10 + not %r10 + mov %r10, (rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/lshsub_n.asm b/gmp-6.3.0/mpn/x86_64/lshsub_n.asm new file mode 100644 index 0000000..4d428c0 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/lshsub_n.asm @@ -0,0 +1,172 @@ +dnl AMD64 mpn_lshsub_n. R = 2^k(U - V). + +dnl Copyright 2006, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l) +C AMD K10 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l) +C Intel P4 16.5 +C Intel core2 4.35 +C Intel corei ? +C Intel atom ? +C VIA nano ? + +C This was written quickly and not optimized at all, but it runs very well on +C K8. But perhaps one could get under 3 c/l. Ideas: +C 1) Use indexing to save the 3 LEA +C 2) Write reasonable feed-in code +C 3) Be more clever about register usage +C 4) Unroll more, handling CL negation, carry save/restore cost much now +C 5) Reschedule + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cnt', `%r8') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_lshsub_n) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + + push %r12 + push %r13 + push %r14 + push %r15 + push %rbx + + mov n, %rax + xor R32(%rbx), R32(%rbx) C clear carry save register + mov R32(%r8), R32(%rcx) C shift count + xor R32(%r15), R32(%r15) C limb carry + + mov R32(%rax), R32(%r11) + and $3, R32(%r11) + je L(4) + sub $1, R32(%r11) + +L(oopette): + add R32(%rbx), R32(%rbx) C restore carry flag + mov 0(up), %r8 + lea 8(up), up + sbb 0(vp), %r8 + mov %r8, %r12 + sbb R32(%rbx), R32(%rbx) C save carry flag + shl R8(%rcx), %r8 + or %r15, %r8 + mov %r12, %r15 + lea 8(vp), vp + neg R8(%rcx) + shr R8(%rcx), %r15 + neg R8(%rcx) + mov %r8, 0(rp) + lea 8(rp), rp + sub $1, R32(%r11) + jnc L(oopette) + +L(4): + sub $4, %rax + jc L(end) + + ALIGN(16) +L(oop): + add R32(%rbx), R32(%rbx) C restore carry flag + + mov 0(up), %r8 + mov 8(up), %r9 + mov 16(up), %r10 + mov 24(up), %r11 + + lea 32(up), up + + sbb 0(vp), %r8 + mov %r8, %r12 + sbb 8(vp), %r9 + mov %r9, %r13 + sbb 16(vp), %r10 + mov %r10, %r14 + sbb 24(vp), %r11 + + sbb R32(%rbx), R32(%rbx) C save carry flag + + shl R8(%rcx), %r8 + shl R8(%rcx), %r9 + shl R8(%rcx), %r10 + or %r15, %r8 + mov %r11, %r15 + shl R8(%rcx), %r11 + + lea 32(vp), vp + + neg R8(%rcx) + + shr R8(%rcx), %r12 + shr R8(%rcx), %r13 + shr R8(%rcx), %r14 + shr R8(%rcx), %r15 C used next loop + + or %r12, %r9 + or %r13, %r10 + or %r14, %r11 + + neg R8(%rcx) + + mov %r8, 0(rp) + mov %r9, 8(rp) + mov %r10, 16(rp) + mov %r11, 24(rp) + + lea 32(rp), rp + + sub $4, %rax + jnc L(oop) +L(end): + neg R32(%rbx) + shl R8(%rcx), %rbx + adc %r15, %rbx + mov %rbx, %rax + pop %rbx + pop %r15 + pop %r14 + pop %r13 + pop %r12 + + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/missing-call.m4 b/gmp-6.3.0/mpn/x86_64/missing-call.m4 new file mode 100644 index 0000000..c024f0e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/missing-call.m4 @@ -0,0 +1,53 @@ +dnl AMD64 MULX/ADX simulation support, function call version. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +define(`adox',` + push $1 + push $2 + call __gmp_adox + pop $2 +') + +define(`adcx',` + push $1 + push $2 + call __gmp_adcx + pop $2 +') + +define(`mulx',` + push $1 + call __gmp_mulx + pop $2 + pop $3 +') diff --git a/gmp-6.3.0/mpn/x86_64/missing-inline.m4 b/gmp-6.3.0/mpn/x86_64/missing-inline.m4 new file mode 100644 index 0000000..bd1df13 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/missing-inline.m4 @@ -0,0 +1,100 @@ +dnl AMD64 MULX/ADX simulation support, inline version. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
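+dnl  As a reader aid: the three macros below simulate the BMI2/ADX
+dnl  instructions mulx, adcx and adox on CPUs that lack them, which is why
+dnl  they go through the pushfq/popfq flag surgery.  The semantics being
+dnl  simulated, written as an illustrative C model (the names and the
+dnl  explicit flags structure are assumptions of this note, not any GMP or
+dnl  x86 API):
+dnl
+dnl     typedef struct { uint64_t cf, of; } flags_t;  /* modelled flag bits */
+dnl
+dnl     /* adcx: dst += src + CF, updating CF only, OF untouched */
+dnl     static void adcx_model (uint64_t *dst, uint64_t src, flags_t *f)
+dnl     {
+dnl       unsigned __int128 s = (unsigned __int128) *dst + src + f->cf;
+dnl       *dst = (uint64_t) s;
+dnl       f->cf = (uint64_t) (s >> 64);
+dnl     }
+dnl
+dnl     /* adox: dst += src + OF, updating OF only, CF untouched */
+dnl     static void adox_model (uint64_t *dst, uint64_t src, flags_t *f)
+dnl     {
+dnl       unsigned __int128 s = (unsigned __int128) *dst + src + f->of;
+dnl       *dst = (uint64_t) s;
+dnl       f->of = (uint64_t) (s >> 64);
+dnl     }
+dnl
+dnl     /* mulx: hi:lo = a * rdx, leaving every flag unchanged */
+dnl     static void mulx_model (uint64_t *hi, uint64_t *lo,
+dnl                             uint64_t a, uint64_t rdx)
+dnl     {
+dnl       unsigned __int128 p = (unsigned __int128) a * rdx;
+dnl       *lo = (uint64_t) p;
+dnl       *hi = (uint64_t) (p >> 64);
+dnl     }
+dnl
+dnl  Keeping two independent carry chains (CF and OF) live at once is what
+dnl  lets mulx/adcx/adox loops dispense with the carry save/restore dance
+dnl  seen elsewhere in this directory.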
+ + +define(`adox',` + push $2 + push %rcx + push %rbx + push %rax + mov $1, %rcx + pushfq + pushfq +C copy 0(%rsp):11 to 0(%rsp):0 + mov (%rsp), %rbx + shr %rbx + bt $`'10, %rbx + adc %rbx, %rbx + mov %rbx, (%rsp) +C put manipulated flags into eflags, execute a plain adc + popfq + adc %rcx, 32(%rsp) +C copy CF to 0(%rsp):11 + mov (%rsp), %rbx + sbb R32(%rax), R32(%rax) + and $`'0x800, R32(%rax) + and $`'0xfffffffffffff7ff, %rbx + or %rax, %rbx + mov %rbx, (%rsp) +C put manipulated flags into eflags + popfq + pop %rax + pop %rbx + pop %rcx + pop $2 +') + +define(`adcx',` + push $2 + push %rcx + push %rbx + push %rax + mov $1, %rcx + pushfq + adc %rcx, 32(%rsp) + mov (%rsp), %rbx + sbb R32(%rax), R32(%rax) + and $`'0xfffffffffffffffe, %rbx + sub %rax, %rbx + mov %rbx, (%rsp) + popfq + pop %rax + pop %rbx + pop %rcx + pop $2 +') + +define(`mulx',` + lea -16(%rsp), %rsp + push %rax + push %rdx + pushfq C preserve all flags + mov $1, %rax + mul %rdx + mov %rax, 24(%rsp) + mov %rdx, 32(%rsp) + popfq C restore eflags + pop %rdx + pop %rax + pop $2 + pop $3 +') diff --git a/gmp-6.3.0/mpn/x86_64/missing.asm b/gmp-6.3.0/mpn/x86_64/missing.asm new file mode 100644 index 0000000..9b65c89 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/missing.asm @@ -0,0 +1,130 @@ + + dnl AMD64 MULX/ADX simulation support. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ASM_START() + +C Fake the MULX instruction +C +C Accept the single explicit parameter on the stack, return the two result +C words on the stack. This calling convention means that we need to move the +C return address up. +C +PROLOGUE(__gmp_mulx) + lea -8(%rsp), %rsp + push %rax + push %rdx + pushfq C preserve all flags + mov 32(%rsp), %rax C move retaddr... + mov %rax, 24(%rsp) C ...up the stack + mov 40(%rsp), %rax C input parameter + mul %rdx + mov %rax, 32(%rsp) + mov %rdx, 40(%rsp) + popfq C restore eflags + pop %rdx + pop %rax + ret +EPILOGUE() +PROTECT(__gmp_mulx) + + +C Fake the ADOX instruction +C +C Accept the two parameters on the stack, return the result word on the stack. +C This calling convention means that we need to move the return address down. +C +PROLOGUE(__gmp_adox) + push %rcx + push %rbx + push %rax + mov 32(%rsp), %rcx C src2 + mov 24(%rsp), %rax C move retaddr... 
+ mov %rax, 32(%rsp) C ...down the stack + pushfq +C copy 0(%rsp):11 to 0(%rsp):0 + mov (%rsp), %rbx + shr %rbx + bt $10, %rbx + adc %rbx, %rbx + push %rbx +C put manipulated flags into eflags, execute a plain adc + popfq + adc %rcx, 48(%rsp) +C copy CF to 0(%rsp):11 + pop %rbx + sbb R32(%rax), R32(%rax) + and $0x800, R32(%rax) + and $0xfffffffffffff7ff, %rbx + or %rax, %rbx + push %rbx +C put manipulated flags into eflags + popfq + pop %rax + pop %rbx + pop %rcx + lea 8(%rsp), %rsp + ret +EPILOGUE() +PROTECT(__gmp_adox) + + +C Fake the ADCX instruction +C +C Accept the two parameters on the stack, return the result word on the stack. +C This calling convention means that we need to move the return address down. +C +PROLOGUE(__gmp_adcx) + push %rcx + push %rbx + push %rax + mov 32(%rsp), %rcx C src2 + mov 24(%rsp), %rax C move retaddr... + mov %rax, 32(%rsp) C ...down the stack + pushfq + adc %rcx, 48(%rsp) + pop %rbx + sbb R32(%rax), R32(%rax) + and $`'0xfffffffffffffffe, %rbx + sub %rax, %rbx + push %rbx + popfq + pop %rax + pop %rbx + pop %rcx + lea 8(%rsp), %rsp + ret +EPILOGUE() +PROTECT(__gmp_adcx) diff --git a/gmp-6.3.0/mpn/x86_64/mod_1_1.asm b/gmp-6.3.0/mpn/x86_64/mod_1_1.asm new file mode 100644 index 0000000..255305f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/mod_1_1.asm @@ -0,0 +1,238 @@ +dnl AMD64 mpn_mod_1_1p + +dnl Contributed to the GNU project by Torbjörn Granlund and Niels Möller. + +dnl Copyright 2009-2012, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 6 +C AMD K10 6 +C Intel P4 26 +C Intel core2 12.5 +C Intel NHM 11.3 +C Intel SBR 8.4 (slowdown, old code took 8.0) +C Intel atom 26 +C VIA nano 13 + +define(`B2mb', `%r10') +define(`B2modb', `%r11') +define(`ap', `%rdi') +define(`n', `%rsi') +define(`pre', `%r8') +define(`b', `%rbx') + +define(`r0', `%rbp') C r1 kept in %rax +define(`r2', `%rcx') C kept negated. Also used as shift count +define(`t0', `%r9') + +C mp_limb_t +C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4]) +C %rdi %rsi %rdx %rcx +C The pre array contains bi, cnt, B1modb, B2modb +C Note: This implementation needs B1modb only when cnt > 0 + +C The iteration is almost as follows, +C +C r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2mod) B + u +C +C where r2 is a single bit represented as a mask. But to make sure that the +C result fits in two limbs and a bit, carry from the addition +C +C r_0 + r_2 B2mod +C +C is handled specially. 
On carry, we subtract b to cancel the carry, +C and we use instead the value +C +C r_0 + B2mb (mod B) +C +C This addition can be issued early since it doesn't depend on r2, and it is +C the source of the cmov in the loop. +C +C We have the invariant that r_2 B^2 + r_1 B + r_0 < B^2 + B b + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mod_1_1p) + FUNC_ENTRY(4) + push %rbp + push %rbx + mov %rdx, b + mov %rcx, pre + + mov -8(ap, n, 8), %rax + cmp $3, n + jnc L(first) + mov -16(ap, n, 8), r0 + jmp L(reduce_two) + +L(first): + C First iteration, no r2 + mov 24(pre), B2modb + mul B2modb + mov -24(ap, n, 8), r0 + add %rax, r0 + mov -16(ap, n, 8), %rax + adc %rdx, %rax + sbb r2, r2 + sub $4, n + jc L(reduce_three) + + mov B2modb, B2mb + sub b, B2mb + + ALIGN(16) +L(top): and B2modb, r2 + lea (B2mb, r0), t0 + mul B2modb + add r0, r2 + mov (ap, n, 8), r0 + cmovc t0, r2 + add %rax, r0 + mov r2, %rax + adc %rdx, %rax + sbb r2, r2 + sub $1, n + jnc L(top) + +L(reduce_three): + C Eliminate r2 + and b, r2 + sub r2, %rax + +L(reduce_two): + mov 8(pre), R32(%rcx) + test R32(%rcx), R32(%rcx) + jz L(normalized) + + C Unnormalized, use B1modb to reduce to size < B (b+1) + mulq 16(pre) + xor t0, t0 + add %rax, r0 + adc %rdx, t0 + mov t0, %rax + + C Left-shift to normalize +ifdef(`SHLD_SLOW',` + shl R8(%rcx), %rax + mov r0, t0 + neg R32(%rcx) + shr R8(%rcx), t0 + or t0, %rax + neg R32(%rcx) +',` + shld R8(%rcx), r0, %rax +') + shl R8(%rcx), r0 + jmp L(udiv) + +L(normalized): + mov %rax, t0 + sub b, t0 + cmovnc t0, %rax + +L(udiv): + lea 1(%rax), t0 + mulq (pre) + add r0, %rax + adc t0, %rdx + imul b, %rdx + sub %rdx, r0 + cmp r0, %rax + lea (b, r0), %rax + cmovnc r0, %rax + cmp b, %rax + jnc L(fix) +L(ok): shr R8(%rcx), %rax + + pop %rbx + pop %rbp + FUNC_EXIT() + ret +L(fix): sub b, %rax + jmp L(ok) +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_mod_1_1p_cps) + FUNC_ENTRY(2) + push %rbp + bsr %rsi, %rcx + push %rbx + mov %rdi, %rbx + push %r12 + xor $63, R32(%rcx) + mov %rsi, %r12 + mov R32(%rcx), R32(%rbp) + sal R8(%rcx), %r12 +IFSTD(` mov %r12, %rdi ') C pass parameter +IFDOS(` mov %r12, %rcx ') C pass parameter +IFDOS(` sub $32, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_invert_limb) +IFDOS(` add $32, %rsp ') + neg %r12 + mov %r12, %r8 + mov %rax, (%rbx) C store bi + mov %rbp, 8(%rbx) C store cnt + imul %rax, %r12 + mov %r12, 24(%rbx) C store B2modb + mov R32(%rbp), R32(%rcx) + test R32(%rcx), R32(%rcx) + jz L(z) + + mov $1, R32(%rdx) +ifdef(`SHLD_SLOW',` + C Destroys %rax, unlike shld. Otherwise, we could do B1modb + C before B2modb, and get rid of the move %r12, %r8 above. + + shl R8(%rcx), %rdx + neg R32(%rcx) + shr R8(%rcx), %rax + or %rax, %rdx + neg R32(%rcx) +',` + shld R8(%rcx), %rax, %rdx +') + imul %rdx, %r8 + shr R8(%rcx), %r8 + mov %r8, 16(%rbx) C store B1modb +L(z): + pop %r12 + pop %rbx + pop %rbp + FUNC_EXIT() + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/mod_1_2.asm b/gmp-6.3.0/mpn/x86_64/mod_1_2.asm new file mode 100644 index 0000000..40fcaeb --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/mod_1_2.asm @@ -0,0 +1,241 @@ +dnl AMD64 mpn_mod_1s_2p + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2009-2012, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4 +C AMD K10 4 +C Intel P4 19 +C Intel core2 8 +C Intel NHM 6.5 +C Intel SBR 4.5 +C Intel atom 28 +C VIA nano 8 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mod_1s_2p) + FUNC_ENTRY(4) + push %r14 + test $1, R8(%rsi) + mov %rdx, %r14 + push %r13 + mov %rcx, %r13 + push %r12 + push %rbp + push %rbx + mov 16(%rcx), %r10 + mov 24(%rcx), %rbx + mov 32(%rcx), %rbp + je L(b0) + dec %rsi + je L(one) + mov -8(%rdi,%rsi,8), %rax + mul %r10 + mov %rax, %r9 + mov %rdx, %r8 + mov (%rdi,%rsi,8), %rax + add -16(%rdi,%rsi,8), %r9 + adc $0, %r8 + mul %rbx + add %rax, %r9 + adc %rdx, %r8 + jmp L(11) + +L(b0): mov -8(%rdi,%rsi,8), %r8 + mov -16(%rdi,%rsi,8), %r9 + +L(11): sub $4, %rsi + jb L(ed2) + lea 40(%rdi,%rsi,8), %rdi + mov -40(%rdi), %r11 + mov -32(%rdi), %rax + jmp L(m0) + + ALIGN(16) +L(top): mov -24(%rdi), %r9 + add %rax, %r11 + mov -16(%rdi), %rax + adc %rdx, %r12 + mul %r10 + add %rax, %r9 + mov %r11, %rax + mov %rdx, %r8 + adc $0, %r8 + mul %rbx + add %rax, %r9 + mov %r12, %rax + adc %rdx, %r8 + mul %rbp + sub $2, %rsi + jb L(ed1) + mov -40(%rdi), %r11 + add %rax, %r9 + mov -32(%rdi), %rax + adc %rdx, %r8 +L(m0): mul %r10 + add %rax, %r11 + mov %r9, %rax + mov %rdx, %r12 + adc $0, %r12 + mul %rbx + add %rax, %r11 + lea -32(%rdi), %rdi C ap -= 4 + mov %r8, %rax + adc %rdx, %r12 + mul %rbp + sub $2, %rsi + jae L(top) + +L(ed0): mov %r11, %r9 + mov %r12, %r8 +L(ed1): add %rax, %r9 + adc %rdx, %r8 +L(ed2): mov 8(%r13), R32(%rdi) C cnt + mov %r8, %rax + mov %r9, %r8 + mul %r10 + add %rax, %r8 + adc $0, %rdx +L(1): xor R32(%rcx), R32(%rcx) + mov %r8, %r9 + sub R32(%rdi), R32(%rcx) + shr R8(%rcx), %r9 + mov R32(%rdi), R32(%rcx) + sal R8(%rcx), %rdx + or %rdx, %r9 + sal R8(%rcx), %r8 + mov %r9, %rax + mulq (%r13) + mov %rax, %rsi + inc %r9 + add %r8, %rsi + adc %r9, %rdx + imul %r14, %rdx + sub %rdx, %r8 + lea (%r8,%r14), %rax + cmp %r8, %rsi + cmovc %rax, %r8 + mov %r8, %rax + sub %r14, %rax + cmovc %r8, %rax + mov R32(%rdi), R32(%rcx) + shr R8(%rcx), %rax + pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + FUNC_EXIT() + ret +L(one): + mov (%rdi), %r8 + mov 8(%rcx), R32(%rdi) + xor %rdx, %rdx + jmp L(1) +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_mod_1s_2p_cps) + FUNC_ENTRY(2) + push %rbp + bsr %rsi, %rcx + push %rbx + mov %rdi, %rbx + push %r12 + xor $63, R32(%rcx) + mov %rsi, %r12 + mov R32(%rcx), R32(%rbp) C preserve cnt over call + sal R8(%rcx), %r12 C b << cnt +IFSTD(` mov %r12, %rdi ') C pass parameter +IFDOS(` mov %r12, %rcx 
') C pass parameter +IFDOS(` sub $32, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_invert_limb) +IFDOS(` add $32, %rsp ') + mov %r12, %r8 + mov %rax, %r11 + mov %rax, (%rbx) C store bi + mov %rbp, 8(%rbx) C store cnt + neg %r8 + mov R32(%rbp), R32(%rcx) + mov $1, R32(%rsi) +ifdef(`SHLD_SLOW',` + shl R8(%rcx), %rsi + neg R32(%rcx) + mov %rax, %rbp + shr R8(%rcx), %rax + or %rax, %rsi + mov %rbp, %rax + neg R32(%rcx) +',` + shld R8(%rcx), %rax, %rsi C FIXME: Slow on Atom and Nano +') + imul %r8, %rsi + mul %rsi + + add %rsi, %rdx + shr R8(%rcx), %rsi + mov %rsi, 16(%rbx) C store B1modb + + not %rdx + imul %r12, %rdx + lea (%rdx,%r12), %rsi + cmp %rdx, %rax + cmovnc %rdx, %rsi + mov %r11, %rax + mul %rsi + + add %rsi, %rdx + shr R8(%rcx), %rsi + mov %rsi, 24(%rbx) C store B2modb + + not %rdx + imul %r12, %rdx + add %rdx, %r12 + cmp %rdx, %rax + cmovnc %rdx, %r12 + + shr R8(%rcx), %r12 + mov %r12, 32(%rbx) C store B3modb + + pop %r12 + pop %rbx + pop %rbp + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/mod_1_4.asm b/gmp-6.3.0/mpn/x86_64/mod_1_4.asm new file mode 100644 index 0000000..6cf304c --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/mod_1_4.asm @@ -0,0 +1,272 @@ +dnl AMD64 mpn_mod_1s_4p + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2009-2012, 2014 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
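+dnl  As a reader aid: the mpn_mod_1s_4p_cps routine at the bottom of this
+dnl  file precomputes the parameter block consumed by the main loop -- the
+dnl  reciprocal bi from mpn_invert_limb, the normalization count cnt, and
+dnl  the folding constants B1modb..B5modb derived from B^k mod b (B = 2^64).
+dnl  A hedged plain-C sketch of that power table, using division where the
+dnl  assembly uses the precomputed inverse, and glossing over the final cnt
+dnl  normalization shifts the real code applies; the name is this note's
+dnl  own, not GMP API:
+dnl
+dnl     /* Bk[k] = B^(k+1) mod (b << cnt), for k = 0..4; b > 1 */
+dnl     static void mod_1s_4p_cps_sketch (uint64_t Bk[5], uint64_t b)
+dnl     {
+dnl       int cnt = __builtin_clzll (b);   /* shift so the top bit is set */
+dnl       uint64_t bn = b << cnt;
+dnl       uint64_t r = (uint64_t) (((unsigned __int128) 1 << 64) % bn);
+dnl       for (int k = 0; k < 5; k++)
+dnl         {
+dnl           Bk[k] = r;                                 /* B^(k+1) mod bn */
+dnl           r = (uint64_t) (((unsigned __int128) r << 64) % bn);
+dnl         }
+dnl     }
+dnl
+dnl  Since each B^k is congruent to a single-limb constant below b, the main
+dnl  loop can replace every limb's weight by such a constant and fold four
+dnl  input limbs plus the running two-limb residue in each iteration.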
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 3 +C AMD K10 3 +C Intel P4 15.5 +C Intel core2 5 +C Intel corei 4 +C Intel atom 23 +C VIA nano 4.75 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mod_1s_4p) + FUNC_ENTRY(4) + push %r15 + push %r14 + push %r13 + push %r12 + push %rbp + push %rbx + + mov %rdx, %r15 + mov %rcx, %r14 + mov 16(%rcx), %r11 C B1modb + mov 24(%rcx), %rbx C B2modb + mov 32(%rcx), %rbp C B3modb + mov 40(%rcx), %r13 C B4modb + mov 48(%rcx), %r12 C B5modb + xor R32(%r8), R32(%r8) + mov R32(%rsi), R32(%rdx) + and $3, R32(%rdx) + je L(b0) + cmp $2, R32(%rdx) + jc L(b1) + je L(b2) + +L(b3): lea -24(%rdi,%rsi,8), %rdi + mov 8(%rdi), %rax + mul %r11 + mov (%rdi), %r9 + add %rax, %r9 + adc %rdx, %r8 + mov 16(%rdi), %rax + mul %rbx + jmp L(m0) + + ALIGN(8) +L(b0): lea -32(%rdi,%rsi,8), %rdi + mov 8(%rdi), %rax + mul %r11 + mov (%rdi), %r9 + add %rax, %r9 + adc %rdx, %r8 + mov 16(%rdi), %rax + mul %rbx + add %rax, %r9 + adc %rdx, %r8 + mov 24(%rdi), %rax + mul %rbp + jmp L(m0) + + ALIGN(8) +L(b1): lea -8(%rdi,%rsi,8), %rdi + mov (%rdi), %r9 + jmp L(m1) + + ALIGN(8) +L(b2): lea -16(%rdi,%rsi,8), %rdi + mov 8(%rdi), %r8 + mov (%rdi), %r9 + jmp L(m1) + + ALIGN(16) +L(top): mov -24(%rdi), %rax + mov -32(%rdi), %r10 + mul %r11 C up[1] * B1modb + add %rax, %r10 + mov -16(%rdi), %rax + mov $0, R32(%rcx) + adc %rdx, %rcx + mul %rbx C up[2] * B2modb + add %rax, %r10 + mov -8(%rdi), %rax + adc %rdx, %rcx + sub $32, %rdi + mul %rbp C up[3] * B3modb + add %rax, %r10 + mov %r13, %rax + adc %rdx, %rcx + mul %r9 C rl * B4modb + add %rax, %r10 + mov %r12, %rax + adc %rdx, %rcx + mul %r8 C rh * B5modb + mov %r10, %r9 + mov %rcx, %r8 +L(m0): add %rax, %r9 + adc %rdx, %r8 +L(m1): sub $4, %rsi + ja L(top) + +L(end): mov 8(%r14), R32(%rsi) + mov %r8, %rax + mul %r11 + mov %rax, %r8 + add %r9, %r8 + adc $0, %rdx + xor R32(%rcx), R32(%rcx) + sub R32(%rsi), R32(%rcx) + mov %r8, %rdi + shr R8(%rcx), %rdi + mov R32(%rsi), R32(%rcx) + sal R8(%rcx), %rdx + or %rdx, %rdi + mov %rdi, %rax + mulq (%r14) + mov %r15, %rbx + mov %rax, %r9 + sal R8(%rcx), %r8 + inc %rdi + add %r8, %r9 + adc %rdi, %rdx + imul %rbx, %rdx + sub %rdx, %r8 + lea (%r8,%rbx), %rax + cmp %r8, %r9 + cmovc %rax, %r8 + mov %r8, %rax + sub %rbx, %rax + cmovc %r8, %rax + shr R8(%rcx), %rax + pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + pop %r15 + FUNC_EXIT() + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_mod_1s_4p_cps) + FUNC_ENTRY(2) + push %rbp + bsr %rsi, %rcx + push %rbx + mov %rdi, %rbx + push %r12 + xor $63, R32(%rcx) + mov %rsi, %r12 + mov R32(%rcx), R32(%rbp) C preserve cnt over call + sal R8(%rcx), %r12 C b << cnt +IFSTD(` mov %r12, %rdi ') C pass parameter +IFDOS(` mov %r12, %rcx ') C pass parameter +IFDOS(` sub $32, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_invert_limb) +IFDOS(` add $32, %rsp ') + mov %r12, %r8 + mov %rax, %r11 + mov %rax, (%rbx) C store bi + mov %rbp, 8(%rbx) C store cnt + neg %r8 + mov R32(%rbp), R32(%rcx) + mov $1, R32(%rsi) +ifdef(`SHLD_SLOW',` + shl R8(%rcx), %rsi + neg R32(%rcx) + mov %rax, %rbp + shr R8(%rcx), %rax + or %rax, %rsi + mov %rbp, %rax + neg R32(%rcx) +',` + shld R8(%rcx), %rax, %rsi C FIXME: Slow on Atom and Nano +') + imul %r8, %rsi + mul %rsi + + add %rsi, %rdx + shr R8(%rcx), %rsi + mov %rsi, 16(%rbx) C store B1modb + + not %rdx + imul %r12, %rdx + lea (%rdx,%r12), %rsi + cmp %rdx, %rax + cmovnc %rdx, %rsi + mov %r11, %rax + mul %rsi + + add %rsi, %rdx + shr R8(%rcx), %rsi + mov %rsi, 24(%rbx) C store B2modb + + not %rdx + 
imul %r12, %rdx + lea (%rdx,%r12), %rsi + cmp %rdx, %rax + cmovnc %rdx, %rsi + mov %r11, %rax + mul %rsi + + add %rsi, %rdx + shr R8(%rcx), %rsi + mov %rsi, 32(%rbx) C store B3modb + + not %rdx + imul %r12, %rdx + lea (%rdx,%r12), %rsi + cmp %rdx, %rax + cmovnc %rdx, %rsi + mov %r11, %rax + mul %rsi + + add %rsi, %rdx + shr R8(%rcx), %rsi + mov %rsi, 40(%rbx) C store B4modb + + not %rdx + imul %r12, %rdx + add %rdx, %r12 + cmp %rdx, %rax + cmovnc %rdx, %r12 + + shr R8(%rcx), %r12 + mov %r12, 48(%rbx) C store B5modb + + pop %r12 + pop %rbx + pop %rbp + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/mod_34lsub1.asm b/gmp-6.3.0/mpn/x86_64/mod_34lsub1.asm new file mode 100644 index 0000000..75421a6 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/mod_34lsub1.asm @@ -0,0 +1,215 @@ +dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1. + +dnl Copyright 2000-2002, 2004, 2005, 2007, 2009-2012 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 0.67 0.583 is possible with zero-reg instead of $0, 4-way +C AMD K10 0.67 this seems hard to beat +C AMD bd1 1 +C AMD bd2 1 +C AMD bd3 ? +C AMD bd4 ? +C AMD zen 0.62 +C AMD bobcat 1.07 +C AMD jaguar 1 +C Intel P4 7.35 terrible, use old code +C Intel core2 1.25 1+epsilon with huge unrolling +C Intel NHM 1.15 this seems hard to beat +C Intel SBR 0.93 +C Intel IBR 0.93 +C Intel HWL 0.82 +C Intel BWL 0.64 +C Intel SKY 0.60 +C Intel atom 2.5 +C Intel SLM 1.59 +C VIA nano 1.25 this seems hard to beat + +C INPUT PARAMETERS +define(`ap', %rdi) +define(`n', %rsi) + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n) + +C TODO +C * Review feed-in and wind-down code. 
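+C As a reader aid: the function returns a value merely congruent to {ap,n}
+C modulo 2^48-1, which the caller reduces further.  Since 2^64 == 2^16
+C (mod 2^48-1), limb i carries weight 2^(16*(i mod 3)); that is why the code
+C keeps three accumulators (the 0mod3/1mod3/2mod3 registers below) and
+C recombines them with 16- and 32-bit shifts.  A behavioral C sketch,
+C assuming 64-bit limbs and unsigned __int128 (the name is this note's own,
+C not GMP API):
+C
+C	static uint64_t mod_34lsub1_sketch (const uint64_t *up, long n)
+C	{
+C	  unsigned __int128 acc[3] = {0, 0, 0};
+C	  for (long i = 0; i < n; i++)
+C	    acc[i % 3] += up[i];		/* weight 2^(16*(i mod 3)) */
+C	  const uint64_t M48 = ((uint64_t) 1 << 48) - 1;
+C	  unsigned __int128 r = 0;
+C	  for (int k = 0; k < 3; k++)
+C	    {
+C	      unsigned __int128 a = acc[k];
+C	      while (a > M48)			/* fold, since 2^48 == 1 */
+C	        a = (a & M48) + (a >> 48);
+C	      r += a << (16 * k);
+C	    }
+C	  return (uint64_t) ((r & M48) + (r >> 48));
+C	}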
+ +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mod_34lsub1) + FUNC_ENTRY(2) + + mov $0x0000FFFFFFFFFFFF, %r11 + + mov (ap), %rax + + cmp $2, %rsi + ja L(gt2) + + jb L(one) + + mov 8(ap), %rsi + mov %rax, %rdx + shr $48, %rax C src[0] low + + and %r11, %rdx C src[0] high + add %rdx, %rax + mov R32(%rsi), R32(%rdx) + + shr $32, %rsi C src[1] high + add %rsi, %rax + + shl $16, %rdx C src[1] low + add %rdx, %rax +L(one): FUNC_EXIT() + ret + + +C Don't change this, the wind-down code is not able to handle greater values +define(UNROLL,3) + +L(gt2): mov 8(ap), %rcx + mov 16(ap), %rdx + xor %r9, %r9 + add $24, ap + sub $eval(UNROLL*3+3), %rsi + jc L(end) + ALIGN(16) +L(top): + add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 +forloop(i,1,UNROLL-1,`dnl + add eval(i*24)(ap), %rax + adc eval(i*24+8)(ap), %rcx + adc eval(i*24+16)(ap), %rdx + adc $0, %r9 +')dnl + add $eval(UNROLL*24), ap + sub $eval(UNROLL*3), %rsi + jnc L(top) + +L(end): + lea L(tab)(%rip), %r8 +ifdef(`PIC', +` movslq 36(%r8,%rsi,4), %r10 + add %r10, %r8 + jmp *%r8 +',` + jmp *72(%r8,%rsi,8) +') + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(1), L(tab)) + JMPENT( L(2), L(tab)) + JMPENT( L(3), L(tab)) + JMPENT( L(4), L(tab)) + JMPENT( L(5), L(tab)) + JMPENT( L(6), L(tab)) + JMPENT( L(7), L(tab)) + JMPENT( L(8), L(tab)) + TEXT + +L(6): add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +L(3): add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + jmp L(cj1) + +L(7): add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +L(4): add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +L(1): add (ap), %rax + adc $0, %rcx + jmp L(cj2) + +L(8): add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +L(5): add (ap), %rax + adc 8(ap), %rcx + adc 16(ap), %rdx + adc $0, %r9 + add $24, ap +L(2): add (ap), %rax + adc 8(ap), %rcx + +L(cj2): adc $0, %rdx +L(cj1): adc $0, %r9 +L(0): add %r9, %rax + adc $0, %rcx + adc $0, %rdx + adc $0, %rax + + mov %rax, %rdi C 0mod3 + shr $48, %rax C 0mod3 high + + and %r11, %rdi C 0mod3 low + mov R32(%rcx), R32(%r10) C 1mod3 + + shr $32, %rcx C 1mod3 high + + add %rdi, %rax C apply 0mod3 low + movzwl %dx, R32(%rdi) C 2mod3 + shl $16, %r10 C 1mod3 low + + add %rcx, %rax C apply 1mod3 high + shr $16, %rdx C 2mod3 high + + add %r10, %rax C apply 1mod3 low + shl $32, %rdi C 2mod3 low + + add %rdx, %rax C apply 2mod3 high + add %rdi, %rax C apply 2mod3 low + + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/mode1o.asm b/gmp-6.3.0/mpn/x86_64/mode1o.asm new file mode 100644 index 0000000..2cd2b08 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/mode1o.asm @@ -0,0 +1,171 @@ +dnl AMD64 mpn_modexact_1_odd -- Hensel norm remainder. + +dnl Copyright 2000-2006, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 10 +C AMD K10 10 +C Intel P4 33 +C Intel core2 13 +C Intel corei 14.5 +C Intel atom 35 +C VIA nano ? + + +C The dependent chain in the main loop is +C +C cycles +C sub %rdx, %rax 1 +C imul %r9, %rax 4 +C mul %r8 5 +C ---- +C total 10 +C +C The mov load from src seems to need to be scheduled back before the jz to +C achieve this speed, out-of-order execution apparently can't completely hide +C the latency otherwise. +C +C The l=src[i]-cbit step is rotated back too, since that allows us to avoid it +C for the first iteration (where there's no cbit). +C +C The code alignment used (32-byte) for the loop also seems necessary. Without +C that the non-PIC case has adc crossing the 0x60 offset, apparently making it +C run at 11 cycles instead of 10. + + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_modexact_1_odd) + FUNC_ENTRY(3) + mov $0, R32(%rcx) +IFDOS(` jmp L(ent) ') + +PROLOGUE(mpn_modexact_1c_odd) + FUNC_ENTRY(4) +L(ent): + C rdi src + C rsi size + C rdx divisor + C rcx carry + + mov %rdx, %r8 C d + shr R32(%rdx) C d/2 + + LEA( binvert_limb_table, %r9) + + and $127, R32(%rdx) + mov %rcx, %r10 C initial carry + + movzbl (%r9,%rdx), R32(%rdx) C inv 8 bits + + mov (%rdi), %rax C src[0] + lea (%rdi,%rsi,8), %r11 C src end + mov %r8, %rdi C d, made available to imull + + lea (%rdx,%rdx), R32(%rcx) C 2*inv + imul R32(%rdx), R32(%rdx) C inv*inv + + neg %rsi C -size + + imul R32(%rdi), R32(%rdx) C inv*inv*d + + sub R32(%rdx), R32(%rcx) C inv = 2*inv - inv*inv*d, 16 bits + + lea (%rcx,%rcx), R32(%rdx) C 2*inv + imul R32(%rcx), R32(%rcx) C inv*inv + + imul R32(%rdi), R32(%rcx) C inv*inv*d + + sub R32(%rcx), R32(%rdx) C inv = 2*inv - inv*inv*d, 32 bits + xor R32(%rcx), R32(%rcx) C initial cbit + + lea (%rdx,%rdx), %r9 C 2*inv + imul %rdx, %rdx C inv*inv + + imul %r8, %rdx C inv*inv*d + + sub %rdx, %r9 C inv = 2*inv - inv*inv*d, 64 bits + mov %r10, %rdx C initial climb + + ASSERT(e,` C d*inv == 1 mod 2^64 + mov %r8, %r10 + imul %r9, %r10 + cmp $1, %r10') + + inc %rsi + jz L(one) + + + ALIGN(16) +L(top): + C rax l = src[i]-cbit + C rcx new cbit, 0 or 1 + C rdx climb, high of last product + C rsi counter, limbs, negative + C rdi + C r8 divisor + C r9 inverse + C r11 src end ptr + + sub %rdx, %rax C l = src[i]-cbit - climb + + adc $0, %rcx C more cbit + imul %r9, %rax C q = l * inverse + + mul %r8 C climb = high (q * d) + + mov (%r11,%rsi,8), %rax C src[i+1] + sub %rcx, %rax C next l = src[i+1] - cbit + setc R8(%rcx) C new cbit + + inc %rsi + jnz L(top) + + +L(one): + sub %rdx, %rax C l = src[i]-cbit - climb + + adc $0, %rcx C more cbit + imul %r9, %rax C q = l * inverse + + mul %r8 C climb = high (q * d) + + lea (%rcx,%rdx), %rax C climb+cbit + FUNC_EXIT() + ret + +EPILOGUE(mpn_modexact_1c_odd) +EPILOGUE(mpn_modexact_1_odd) diff --git a/gmp-6.3.0/mpn/x86_64/mul_1.asm b/gmp-6.3.0/mpn/x86_64/mul_1.asm new file mode 100644 index 0000000..e1ba89b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/mul_1.asm @@ -0,0 +1,192 @@ +dnl AMD64 mpn_mul_1. 
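+dnl
+dnl A minimal C model of the operation, for reference (the name is
+dnl hypothetical, <stdint.h>/<stddef.h> are assumed, and unsigned
+dnl __int128 stands in for the double-limb product; this is not GMP's
+dnl implementation):
+dnl
+dnl	uint64_t mul_1_model (uint64_t *rp, const uint64_t *up,
+dnl	                      size_t n, uint64_t vl)
+dnl	{
+dnl	  uint64_t cy = 0;
+dnl	  for (size_t i = 0; i < n; i++)
+dnl	    {
+dnl	      unsigned __int128 t = (unsigned __int128) up[i] * vl + cy;
+dnl	      rp[i] = (uint64_t) t;		/* low product limb */
+dnl	      cy = (uint64_t) (t >> 64);	/* high limb carries on */
+dnl	    }
+dnl	  return cy;				/* carry-out limb */
+dnl	}
+dnl
+dnl mpn_mul_1c below is the same operation with an initial carry limb
+dnl added in.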
+ +dnl Copyright 2003-2005, 2007, 2008, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.54 +C AMD K10 2.54 +C AMD bull 4.98 +C AMD pile 4.80 +C AMD steam +C AMD excavator +C AMD bobcat 5.37 +C AMD jaguar 6.16 +C Intel P4 12.6 +C Intel core2 4.05 +C Intel NHM 4.0 +C Intel SBR 2.91 +C Intel IBR 2.73 +C Intel HWL 2.44 +C Intel BWL 2.39 +C Intel SKL 2.44 +C Intel atom 19.8 +C Intel SLM 9.0 +C VIA nano 4.25 + +C The loop of this code is the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * The loop is great, but the prologue and epilogue code was quickly written. +C Tune it! + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vl', `%rcx') C r9 + +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +IFDOS(` define(`up', ``%rsi'') ') dnl +IFDOS(` define(`rp', ``%rcx'') ') dnl +IFDOS(` define(`vl', ``%r9'') ') dnl +IFDOS(` define(`r9', ``rdi'') ') dnl +IFDOS(` define(`n', ``%r8'') ') dnl +IFDOS(` define(`r8', ``r11'') ') dnl + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_1c) +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + push %rbx +IFSTD(` mov %r8, %r10') +IFDOS(` mov 64(%rsp), %r10') C 40 + 3*8 (3 push insns) + jmp L(common) +EPILOGUE() + +PROLOGUE(mpn_mul_1) +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + + push %rbx + xor %r10, %r10 +L(common): + mov (up), %rax C read first u limb early +IFSTD(` mov n_param, %rbx ') C move away n from rdx, mul uses it +IFDOS(` mov n, %rbx ') + mul vl +IFSTD(` mov %rbx, n ') + + add %r10, %rax + adc $0, %rdx + + and $3, R32(%rbx) + jz L(b0) + cmp $2, R32(%rbx) + jz L(b2) + jg L(b3) + +L(b1): dec n + jne L(gt1) + mov %rax, (rp) + jmp L(ret) +L(gt1): lea 8(up,n,8), up + lea -8(rp,n,8), rp + neg n + xor %r10, %r10 + xor R32(%rbx), R32(%rbx) + mov %rax, %r9 + mov (up,n,8), %rax + mov %rdx, %r8 + jmp L(L1) + +L(b0): lea (up,n,8), up + lea -16(rp,n,8), rp + neg n + xor %r10, %r10 + mov %rax, %r8 + mov %rdx, %rbx + jmp L(L0) + +L(b3): lea -8(up,n,8), up + lea -24(rp,n,8), rp + neg n + mov %rax, %rbx + mov %rdx, %r10 + jmp L(L3) + +L(b2): lea -16(up,n,8), up + lea -32(rp,n,8), rp + neg n + xor %r8, %r8 + xor R32(%rbx), R32(%rbx) + mov %rax, %r10 + mov 24(up,n,8), %rax + mov %rdx, %r9 + jmp L(L2) + + ALIGN(16) +L(top): mov %r10, (rp,n,8) + add %rax, %r9 + mov (up,n,8), %rax + adc %rdx, %r8 + mov $0, R32(%r10) 
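+C The feed-in paths above enter the 4-way unrolled loop at L(L1), L(L0),
+C L(L3) or L(L2), according to n mod 4.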
+L(L1): mul vl + mov %r9, 8(rp,n,8) + add %rax, %r8 + adc %rdx, %rbx +L(L0): mov 8(up,n,8), %rax + mul vl + mov %r8, 16(rp,n,8) + add %rax, %rbx + adc %rdx, %r10 +L(L3): mov 16(up,n,8), %rax + mul vl + mov %rbx, 24(rp,n,8) + mov $0, R32(%r8) C zero + mov %r8, %rbx C zero + add %rax, %r10 + mov 24(up,n,8), %rax + mov %r8, %r9 C zero + adc %rdx, %r9 +L(L2): mul vl + add $4, n + js L(top) + + mov %r10, (rp,n,8) + add %rax, %r9 + adc %r8, %rdx + mov %r9, 8(rp,n,8) + add %r8, %rdx +L(ret): mov %rdx, %rax + + pop %rbx +IFDOS(``pop %rdi '') +IFDOS(``pop %rsi '') + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/mul_2.asm b/gmp-6.3.0/mpn/x86_64/mul_2.asm new file mode 100644 index 0000000..d64313b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/mul_2.asm @@ -0,0 +1,204 @@ +dnl AMD64 mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and +dnl store the result in a third limb vector. + +dnl Copyright 2008, 2011, 2012, 2016 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.53 +C AMD K10 4.53 +C AMD bull 9.76 10.37 +C AMD pile 9.22 +C AMD steam +C AMD excavator +C AMD bobcat 11.3 +C AMD jaguar 11.9 +C Intel P4 25.0 +C Intel core2 8.05 +C Intel NHM 7.72 +C Intel SBR 6.33 +C Intel IBR 6.15 +C Intel HWL 6.00 +C Intel BWL 4.44 +C Intel SKL 4.54 +C Intel atom 39.0 +C Intel SLM 24.0 +C VIA nano + +C This code is the result of running a code generation and optimization tool +C suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Work on feed-in and wind-down code. +C * Convert "mov $0" to "xor". +C * Adjust initial lea to save some bytes. +C * Perhaps adjust n from n_param&3 value? +C * Replace with 2.25 c/l sequence. 
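+C In outline, this computes {rp,n+1} and returns limb n+1 of the product
+C {up,n} * {vp,2}.  A rough two-pass C model (our naming, not GMP's
+C implementation; <stdint.h>/<stddef.h> assumed, unsigned __int128
+C modelling the double-limb products):
+C
+C	uint64_t mul_2_model (uint64_t *rp, const uint64_t *up,
+C	                      size_t n, const uint64_t *vp)
+C	{
+C	  unsigned __int128 t;
+C	  uint64_t cy = 0;
+C	  for (size_t i = 0; i < n; i++)	/* rp = up * v0 */
+C	    {
+C	      t = (unsigned __int128) up[i] * vp[0] + cy;
+C	      rp[i] = (uint64_t) t;
+C	      cy = (uint64_t) (t >> 64);
+C	    }
+C	  rp[n] = cy;
+C	  cy = 0;
+C	  for (size_t i = 0; i < n; i++)	/* rp+1 += up * v1 */
+C	    {
+C	      t = (unsigned __int128) up[i] * vp[1] + rp[i+1] + cy;
+C	      rp[i+1] = (uint64_t) t;
+C	      cy = (uint64_t) (t >> 64);
+C	    }
+C	  return cy;				/* limb n+1 */
+C	}
+C
+C The interleaved loop below fuses the two passes, keeping a rotating
+C window of partial sums (w0-w3, defined below) in registers instead of
+C sweeping rp a second time.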
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n_param',`%rdx') +define(`vp', `%rcx') + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (vp), v0 + mov 8(vp), v1 + + mov (up), %rax + + mov n_param, n + neg n + lea -8(up,n_param,8), up + lea -8(rp,n_param,8), rp + + and $3, R32(n_param) + jz L(m2p0) + cmp $2, R32(n_param) + jc L(m2p1) + jz L(m2p2) +L(m2p3): + mul v0 + xor R32(w3), R32(w3) + mov %rax, w1 + mov %rdx, w2 + mov 8(up,n,8), %rax + add $-1, n + mul v1 + add %rax, w2 + jmp L(m23) +L(m2p0): + mul v0 + xor R32(w2), R32(w2) + mov %rax, w0 + mov %rdx, w1 + jmp L(m20) +L(m2p1): + mul v0 + xor R32(w3), R32(w3) + xor R32(w0), R32(w0) + xor R32(w1), R32(w1) + add $1, n + jmp L(m2top) +L(m2p2): + mul v0 + xor R32(w0), R32(w0) + xor R32(w1), R32(w1) + mov %rax, w2 + mov %rdx, w3 + mov 8(up,n,8), %rax + add $-2, n + jmp L(m22) + + + ALIGN(32) +L(m2top): + add %rax, w3 + adc %rdx, w0 + mov 0(up,n,8), %rax + adc $0, R32(w1) + mov $0, R32(w2) + mul v1 + add %rax, w0 + mov w3, 0(rp,n,8) + adc %rdx, w1 + mov 8(up,n,8), %rax + mul v0 + add %rax, w0 + adc %rdx, w1 + adc $0, R32(w2) +L(m20): mov 8(up,n,8), %rax + mul v1 + add %rax, w1 + adc %rdx, w2 + mov 16(up,n,8), %rax + mov $0, R32(w3) + mul v0 + add %rax, w1 + mov 16(up,n,8), %rax + adc %rdx, w2 + adc $0, R32(w3) + mul v1 + add %rax, w2 + mov w0, 8(rp,n,8) +L(m23): adc %rdx, w3 + mov 24(up,n,8), %rax + mul v0 + mov $0, R32(w0) + add %rax, w2 + adc %rdx, w3 + mov w1, 16(rp,n,8) + mov 24(up,n,8), %rax + mov $0, R32(w1) + adc $0, R32(w0) +L(m22): mul v1 + add %rax, w3 + mov w2, 24(rp,n,8) + adc %rdx, w0 + mov 32(up,n,8), %rax + mul v0 + add $4, n + js L(m2top) + + + add %rax, w3 + adc %rdx, w0 + adc $0, R32(w1) + mov (up), %rax + mul v1 + mov w3, (rp) + add %rax, w0 + adc %rdx, w1 + mov w0, 8(rp) + mov w1, %rax + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/mulx/adx/addmul_1.asm b/gmp-6.3.0/mpn/x86_64/mulx/adx/addmul_1.asm new file mode 100644 index 0000000..9ceb611 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/mulx/adx/addmul_1.asm @@ -0,0 +1,157 @@ +dnl AMD64 mpn_addmul_1 for CPUs with mulx and adx. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
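+dnl The point of the mulx/adcx/adox mix: mulx forms a 128-bit product
+dnl without touching any flags, adcx uses only CF and adox only OF, so
+dnl two independent carry chains (product carries and the rp[] additions)
+dnl can run interleaved with no flag save/restore.  A behavioural sketch
+dnl with C intrinsics (our naming; assumes <stddef.h>, -mbmi2 -madx; the
+dnl compiler need not emit the exact two-chain schedule shown):
+dnl
+dnl	#include <immintrin.h>
+dnl
+dnl	unsigned long long
+dnl	addmul_1_model (unsigned long long *rp, const unsigned long long *up,
+dnl	                size_t n, unsigned long long v0)
+dnl	{
+dnl	  unsigned long long lo, hi, cylimb = 0;
+dnl	  unsigned char cf = 0, of = 0;	/* the two carry chains */
+dnl	  for (size_t i = 0; i < n; i++)
+dnl	    {
+dnl	      lo = _mulx_u64 (up[i], v0, &hi);		/* flags untouched */
+dnl	      cf = _addcarry_u64 (cf, lo, cylimb, &lo);	/* product chain */
+dnl	      of = _addcarry_u64 (of, lo, rp[i], &rp[i]); /* rp chain */
+dnl	      cylimb = hi;
+dnl	    }
+dnl	  return cylimb + cf + of;	/* exact; fits in one limb */
+dnl	}
+dnl
+dnl The loop below also counts with jrcxz, which tests rcx without
+dnl reading or writing flags, so loop control disturbs neither chain.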
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bd1 - +C AMD bd2 - +C AMD bd3 - +C AMD bd4 - +C AMD zen ? +C AMD bt1 - +C AMD bt2 - +C Intel P4 - +C Intel PNR - +C Intel NHM - +C Intel SBR - +C Intel IBR - +C Intel HWL - +C Intel BWL ? +C Intel SKL ? +C Intel atom - +C Intel SLM - +C VIA nano - + +define(`rp', `%rdi') dnl rcx +define(`up', `%rsi') dnl rdx +define(`n_param', `%rdx') dnl r8 +define(`v0_param',`%rcx') dnl r9 + +define(`n', `%rcx') dnl +define(`v0', `%rdx') dnl + +C Testing mechanism for running this on older AMD64 processors +ifelse(FAKE_MULXADX,1,` + include(CONFIG_TOP_SRCDIR`/mpn/x86_64/missing-call.m4') +',` + define(`adox', ``adox' $1, $2') + define(`adcx', ``adcx' $1, $2') + define(`mulx', ``mulx' $1, $2, $3') +') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_addmul_1) + mov (up), %r8 + + push %rbx + push %r12 + push %r13 + + lea (up,n_param,8), up + lea -16(rp,n_param,8), rp + mov R32(n_param), R32(%rax) + xchg v0_param, v0 C FIXME: is this insn fast? + + neg n + + and $3, R8(%rax) + jz L(b0) + cmp $2, R8(%rax) + jl L(b1) + jz L(b2) + +L(b3): mulx( (up,n,8), %r11, %r10) + mulx( 8(up,n,8), %r13, %r12) + mulx( 16(up,n,8), %rbx, %rax) + dec n + jmp L(lo3) + +L(b0): mulx( (up,n,8), %r9, %r8) + mulx( 8(up,n,8), %r11, %r10) + mulx( 16(up,n,8), %r13, %r12) + jmp L(lo0) + +L(b2): mulx( (up,n,8), %r13, %r12) + mulx( 8(up,n,8), %rbx, %rax) + lea 2(n), n + jrcxz L(wd2) +L(gt2): mulx( (up,n,8), %r9, %r8) + jmp L(lo2) + +L(b1): and R8(%rax), R8(%rax) + mulx( (up,n,8), %rbx, %rax) + lea 1(n), n + jrcxz L(wd1) + mulx( (up,n,8), %r9, %r8) + mulx( 8(up,n,8), %r11, %r10) + jmp L(lo1) + +L(end): adcx( %r10, %r13) + mov %r11, -8(rp) +L(wd2): adox( (rp), %r13) + adcx( %r12, %rbx) + mov %r13, (rp) +L(wd1): adox( 8(rp), %rbx) + adcx( %rcx, %rax) + adox( %rcx, %rax) + mov %rbx, 8(rp) + pop %r13 + pop %r12 + pop %rbx + ret + +L(top): jrcxz L(end) + mulx( (up,n,8), %r9, %r8) + adcx( %r10, %r13) + mov %r11, -8(rp,n,8) +L(lo2): adox( (rp,n,8), %r13) + mulx( 8(up,n,8), %r11, %r10) + adcx( %r12, %rbx) + mov %r13, (rp,n,8) +L(lo1): adox( 8(rp,n,8), %rbx) + mulx( 16(up,n,8), %r13, %r12) + adcx( %rax, %r9) + mov %rbx, 8(rp,n,8) +L(lo0): adox( 16(rp,n,8), %r9) + mulx( 24(up,n,8), %rbx, %rax) + adcx( %r8, %r11) + mov %r9, 16(rp,n,8) +L(lo3): adox( 24(rp,n,8), %r11) + lea 4(n), n + jmp L(top) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/nano/copyd.asm b/gmp-6.3.0/mpn/x86_64/nano/copyd.asm new file mode 100644 index 0000000..f0dc54a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/nano/copyd.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyd optimised for Intel Sandy Bridge. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyd) +include_mpn(`x86_64/fastsse/copyd-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/nano/copyi.asm b/gmp-6.3.0/mpn/x86_64/nano/copyi.asm new file mode 100644 index 0000000..9c26e00 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/nano/copyi.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyi optimised for Intel Sandy Bridge. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyi) +include_mpn(`x86_64/fastsse/copyi-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/nano/dive_1.asm b/gmp-6.3.0/mpn/x86_64/nano/dive_1.asm new file mode 100644 index 0000000..e9a0763 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/nano/dive_1.asm @@ -0,0 +1,166 @@ +dnl AMD64 mpn_divexact_1 -- mpn by limb exact division. + +dnl Copyright 2001, 2002, 2004-2006, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C norm unorm +C AMD K8,K9 11 11 +C AMD K10 11 11 +C Intel P4 ? 
+C Intel core2 13.5 13.25 +C Intel corei 14.25 +C Intel atom 34 36 +C VIA nano 19.25 19.25 + + +C INPUT PARAMETERS +C rp rdi +C up rsi +C n rdx +C divisor rcx + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_divexact_1) + FUNC_ENTRY(4) + push %rbx + + mov %rcx, %rax + xor R32(%rcx), R32(%rcx) C shift count + mov %rdx, %r8 + + bt $0, R32(%rax) + jc L(odd) C skip bsfq unless divisor is even + bsf %rax, %rcx + shr R8(%rcx), %rax +L(odd): mov %rax, %rbx + shr R32(%rax) + and $127, R32(%rax) C d/2, 7 bits + + LEA( binvert_limb_table, %rdx) + + movzbl (%rdx,%rax), R32(%rax) C inv 8 bits + + mov %rbx, %r11 C d without twos + + lea (%rax,%rax), R32(%rdx) C 2*inv + imul R32(%rax), R32(%rax) C inv*inv + imul R32(%rbx), R32(%rax) C inv*inv*d + sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits + + lea (%rdx,%rdx), R32(%rax) C 2*inv + imul R32(%rdx), R32(%rdx) C inv*inv + imul R32(%rbx), R32(%rdx) C inv*inv*d + sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits + + lea (%rax,%rax), %r10 C 2*inv + imul %rax, %rax C inv*inv + imul %rbx, %rax C inv*inv*d + sub %rax, %r10 C inv = 2*inv - inv*inv*d, 64 bits + + lea (%rsi,%r8,8), %rsi C up end + lea -8(%rdi,%r8,8), %rdi C rp end + neg %r8 C -n + + mov (%rsi,%r8,8), %rax C up[0] + + inc %r8 + jz L(one) + + test R32(%rcx), R32(%rcx) + jnz L(unorm) C branch if count != 0 + xor R32(%rbx), R32(%rbx) + jmp L(nent) + + ALIGN(8) +L(ntop):mul %r11 C carry limb in rdx 0 10 + mov -8(%rsi,%r8,8), %rax C + sub %rbx, %rax C apply carry bit + setc %bl C + sub %rdx, %rax C apply carry limb 5 + adc $0, %rbx C 6 +L(nent):imul %r10, %rax C 6 + mov %rax, (%rdi,%r8,8) C + inc %r8 C + jnz L(ntop) + + mov -8(%rsi), %r9 C up high limb + jmp L(com) + +L(unorm): + mov (%rsi,%r8,8), %r9 C up[1] + shr R8(%rcx), %rax C + neg R32(%rcx) + shl R8(%rcx), %r9 C + neg R32(%rcx) + or %r9, %rax + xor R32(%rbx), R32(%rbx) + jmp L(uent) + + ALIGN(8) +L(utop):mul %r11 C carry limb in rdx 0 10 + mov (%rsi,%r8,8), %rax C + shl R8(%rcx), %rax C + neg R32(%rcx) + or %r9, %rax + sub %rbx, %rax C apply carry bit + setc %bl C + sub %rdx, %rax C apply carry limb 5 + adc $0, %rbx C 6 +L(uent):imul %r10, %rax C 6 + mov (%rsi,%r8,8), %r9 C + shr R8(%rcx), %r9 C + neg R32(%rcx) + mov %rax, (%rdi,%r8,8) C + inc %r8 C + jnz L(utop) + +L(com): mul %r11 C carry limb in rdx + sub %rbx, %r9 C apply carry bit + sub %rdx, %r9 C apply carry limb + imul %r10, %r9 + mov %r9, (%rdi) + pop %rbx + FUNC_EXIT() + ret + +L(one): shr R8(%rcx), %rax + imul %r10, %rax + mov %rax, (%rdi) + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/nano/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/nano/gcd_11.asm new file mode 100644 index 0000000..4723093 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/nano/gcd_11.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_11. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_11) +include_mpn(`x86_64/core2/gcd_11.asm') diff --git a/gmp-6.3.0/mpn/x86_64/nano/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/nano/gmp-mparam.h new file mode 100644 index 0000000..fde69db --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/nano/gmp-mparam.h @@ -0,0 +1,243 @@ +/* VIA Nano gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000-2010, 2012, 2014 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +#define SHLD_SLOW 1 +#define SHRD_SLOW 1 + +/* 1600 MHz Nano 2xxx */ +/* FFT tuning limit = 25000000 */ +/* Generated by tuneup.c, 2014-03-12, gcc 4.2 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 18 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 20 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 22 + +#define MUL_TOOM22_THRESHOLD 27 +#define MUL_TOOM33_THRESHOLD 38 +#define MUL_TOOM44_THRESHOLD 324 +#define MUL_TOOM6H_THRESHOLD 450 +#define MUL_TOOM8H_THRESHOLD 632 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 207 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 211 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 219 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 315 + +#define SQR_BASECASE_THRESHOLD 10 +#define SQR_TOOM2_THRESHOLD 52 +#define SQR_TOOM3_THRESHOLD 73 +#define SQR_TOOM4_THRESHOLD 387 +#define SQR_TOOM6_THRESHOLD 662 +#define SQR_TOOM8_THRESHOLD 781 + +#define MULMID_TOOM42_THRESHOLD 32 + +#define MULMOD_BNM1_THRESHOLD 14 +#define SQRMOD_BNM1_THRESHOLD 15 + +#define MUL_FFT_MODF_THRESHOLD 376 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 376, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \ + { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 49, 9}, { 27,10}, { 15, 9}, { 43,10}, \ + { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,10}, { 79,11}, { 47,10}, { 103,12}, \ + { 31,11}, { 63,10}, { 143,11}, { 79,10}, \ + { 159, 9}, { 319,10}, { 175,11}, { 95, 9}, \ + { 383, 8}, { 767,10}, { 207,11}, { 111,12}, \ + { 63,11}, { 127,10}, { 255,11}, { 143, 9}, \ + { 575, 8}, { 1151,10}, { 303,11}, { 159,10}, \ + { 319, 9}, { 639, 8}, { 1279,10}, { 335,12}, \ + { 95,11}, { 191,10}, { 383, 9}, { 767,11}, \ + { 207,10}, { 415, 9}, { 831, 8}, { 1663,10}, \ + { 447,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511, 9}, { 1023,11}, { 271,10}, { 543, 9}, \ + { 1087,10}, { 575, 9}, { 1215,12}, { 159,11}, \ + { 319,10}, { 639, 9}, { 1279,11}, { 335,10}, \ + { 671, 9}, { 1343,11}, { 351,10}, { 703, 9}, \ + { 1407,12}, { 191,11}, { 383,10}, { 767, 9}, \ + { 1535,10}, { 831, 9}, { 1663,12}, { 223,11}, \ + { 447,10}, { 895,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,10}, { 1087,12}, \ + { 287,11}, { 575,10}, { 1151,11}, { 607,10}, \ + { 1215,12}, { 319,11}, { 639,10}, { 1279,11}, \ + { 671,10}, { 1343,12}, { 351,11}, { 703,10}, \ + { 1407,13}, { 191,12}, { 383,11}, { 767,10}, \ + { 1535,12}, { 415,11}, { 831,10}, { 1663,12}, \ + { 447,11}, { 895,10}, { 1791,14}, { 127,13}, \ + { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \ + { 1087,12}, { 575,11}, { 1151,12}, { 607,11}, \ + { 1215,13}, { 319,12}, { 639,11}, { 1279,12}, \ + { 671,11}, { 1343,12}, { 703,11}, { 1407,13}, \ + { 383,12}, { 767,11}, { 1535,12}, { 831,11}, \ + { 1663,13}, { 447,12}, { 895,11}, { 1791,13}, \ + { 511,12}, { 1023,11}, { 2047,12}, { 1087,13}, \ + { 575,12}, { 1151,11}, { 2303,12}, { 1215,13}, \ + { 639,12}, { 
1279,11}, { 2559,12}, { 1343,13}, \ + { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1663,13}, { 895,12}, \ + { 1791,13}, { 959,14}, { 511,13}, { 1023,12}, \ + { 2047,13}, { 1087,12}, { 2175,13}, { 1151,12}, \ + { 2303,13}, { 1215,14}, { 639,13}, { 1279,12}, \ + { 2559,13}, { 1407,12}, { 2815,13}, { 1471,14}, \ + { 767,13}, { 1535,12}, { 3071,13}, { 1663,14}, \ + { 895,13}, { 1791,12}, { 3583,13}, { 1919,15}, \ + { 511,14}, { 1023,13}, { 2047,12}, { 4095,13}, \ + { 2175,14}, { 1151,13}, { 2303,12}, { 4607,13}, \ + { 2431,14}, { 1279,13}, { 2559,12}, { 5119,14}, \ + { 1407,13}, { 2815,12}, { 5631,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 224 +#define MUL_FFT_THRESHOLD 3520 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \ + { 11, 7}, { 24, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 63, 9}, { 127,10}, { 71, 9}, \ + { 143,10}, { 79,11}, { 47,10}, { 95, 9}, \ + { 191,10}, { 103,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135, 7}, { 1087, 9}, \ + { 287,11}, { 79, 9}, { 319, 8}, { 639,10}, \ + { 167,11}, { 95,10}, { 191, 9}, { 383, 8}, \ + { 767,11}, { 111,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511, 8}, { 1023,10}, { 271, 9}, \ + { 543, 8}, { 1087,11}, { 143, 9}, { 575, 8}, \ + { 1151,10}, { 303, 9}, { 639, 8}, { 1279,10}, \ + { 335, 9}, { 671,10}, { 351, 9}, { 703,12}, \ + { 95,11}, { 191,10}, { 383, 9}, { 767,11}, \ + { 207,10}, { 415, 9}, { 831,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511, 9}, { 1023,11}, \ + { 271,10}, { 543, 9}, { 1087,10}, { 575, 9}, \ + { 1151,11}, { 303,10}, { 607, 9}, { 1215,12}, \ + { 159,11}, { 319,10}, { 639, 9}, { 1279,10}, \ + { 671, 9}, { 1343,11}, { 351,10}, { 703, 9}, \ + { 1407,12}, { 191,11}, { 383,10}, { 767, 9}, \ + { 1535,11}, { 415,10}, { 831, 9}, { 1663,12}, \ + { 223,11}, { 447,10}, { 959,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,11}, { 575,10}, { 1215,12}, { 319,11}, \ + { 639,10}, { 1279,11}, { 671,10}, { 1343,12}, \ + { 351,11}, { 703,10}, { 1407,13}, { 191,12}, \ + { 383,11}, { 767,10}, { 1535,12}, { 415,11}, \ + { 831,10}, { 1663,12}, { 447,11}, { 895,10}, \ + { 1791,12}, { 479,11}, { 959,14}, { 127,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \ + { 575,11}, { 1151,12}, { 607,11}, { 1215,13}, \ + { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \ + { 1343,12}, { 703,11}, { 1407,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 895,11}, { 1791,12}, { 959,13}, \ + { 511,12}, { 1023,11}, { 2047,12}, { 1087,13}, \ + { 575,12}, { 1215,13}, { 639,12}, { 1343,13}, \ + { 703,12}, { 1407,11}, { 2815,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1663,13}, { 895,12}, \ + { 1791,13}, { 959,14}, { 511,13}, { 1023,12}, \ + { 2047,13}, { 1087,12}, { 2175,13}, { 1215,14}, \ + { 639,13}, { 1279,12}, { 2559,13}, { 1407,12}, \ + { 2815,14}, { 767,13}, { 1535,12}, { 3071,13}, \ + { 1663,14}, { 895,13}, { 1791,12}, { 3583,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2047,12}, \ + { 4095,13}, { 2175,14}, { 1151,13}, { 2303,12}, \ + { 4607,14}, { 1279,13}, { 
2559,14}, { 1407,13}, \ + { 2815,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 230 +#define SQR_FFT_THRESHOLD 2496 + +#define MULLO_BASECASE_THRESHOLD 13 +#define MULLO_DC_THRESHOLD 38 +#define MULLO_MUL_N_THRESHOLD 6633 + +#define DC_DIV_QR_THRESHOLD 56 +#define DC_DIVAPPR_Q_THRESHOLD 173 +#define DC_BDIV_QR_THRESHOLD 55 +#define DC_BDIV_Q_THRESHOLD 96 + +#define INV_MULMOD_BNM1_THRESHOLD 54 +#define INV_NEWTON_THRESHOLD 202 +#define INV_APPR_THRESHOLD 166 + +#define BINV_NEWTON_THRESHOLD 246 +#define REDC_1_TO_REDC_2_THRESHOLD 7 +#define REDC_2_TO_REDC_N_THRESHOLD 85 + +#define MU_DIV_QR_THRESHOLD 1499 +#define MU_DIVAPPR_Q_THRESHOLD 1652 +#define MUPI_DIV_QR_THRESHOLD 83 +#define MU_BDIV_QR_THRESHOLD 1210 +#define MU_BDIV_Q_THRESHOLD 1499 + +#define POWM_SEC_TABLE 1,28,129,642,2387 + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD_THRESHOLD 127 +#define HGCD_APPR_THRESHOLD 214 +#define HGCD_REDUCE_THRESHOLD 2479 +#define GCD_DC_THRESHOLD 487 +#define GCDEXT_DC_THRESHOLD 505 +#define JACOBI_BASE_METHOD 4 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 24 +#define SET_STR_DC_THRESHOLD 802 +#define SET_STR_PRECOMPUTE_THRESHOLD 2042 + +#define FAC_DSC_THRESHOLD 1737 +#define FAC_ODD_THRESHOLD 44 diff --git a/gmp-6.3.0/mpn/x86_64/nano/popcount.asm b/gmp-6.3.0/mpn/x86_64/nano/popcount.asm new file mode 100644 index 0000000..fb14dd3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/nano/popcount.asm @@ -0,0 +1,35 @@ +dnl x86-64 mpn_popcount. + +dnl Copyright 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_popcount) +include_mpn(`x86/pentium4/sse2/popcount.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/addmul_2.asm b/gmp-6.3.0/mpn/x86_64/pentium4/addmul_2.asm new file mode 100644 index 0000000..7ae6a1a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/addmul_2.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_addmul_2 optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_2) +include_mpn(`x86_64/bd1/addmul_2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/aors_n.asm b/gmp-6.3.0/mpn/x86_64/pentium4/aors_n.asm new file mode 100644 index 0000000..8e6ee1b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/aors_n.asm @@ -0,0 +1,196 @@ +dnl x86-64 mpn_add_n/mpn_sub_n optimized for Pentium 4. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 2.8 +C AMD K10 2.8 +C Intel P4 4 +C Intel core2 3.6-5 (fluctuating) +C Intel corei ? +C Intel atom ? +C VIA nano ? + + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cy', `%r8') + +ifdef(`OPERATION_add_n', ` + define(ADDSUB, add) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADDSUB, sub) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) +ASM_START() + TEXT +PROLOGUE(func) + FUNC_ENTRY(4) + xor %r8, %r8 +IFDOS(` jmp L(ent) ') +EPILOGUE() +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +L(ent): push %rbx + push %r12 + + mov (vp), %r9 + + mov R32(n), R32(%rax) + and $3, R32(%rax) + jne L(n00) C n = 0, 4, 8, ... + mov R32(%r8), R32(%rbx) + mov (up), %r8 + mov 8(up), %r10 + ADDSUB %r9, %r8 + mov 8(vp), %r9 + setc R8(%rax) + lea -16(rp), rp + jmp L(L00) + +L(n00): cmp $2, R32(%rax) + jnc L(n01) C n = 1, 5, 9, ... 
+ mov (up), %r11 + mov R32(%r8), R32(%rax) + xor R32(%rbx), R32(%rbx) + dec n + jnz L(gt1) + ADDSUB %r9, %r11 + setc R8(%rbx) + ADDSUB %rax, %r11 + adc $0, R32(%rbx) + mov %r11, (rp) + jmp L(ret) +L(gt1): mov 8(up), %r8 + ADDSUB %r9, %r11 + mov 8(vp), %r9 + setc R8(%rbx) + lea -8(rp), rp + lea 8(up), up + lea 8(vp), vp + jmp L(L01) + +L(n01): jne L(n10) C n = 2, 6, 10, ... + mov (up), %r12 + mov R32(%r8), R32(%rbx) + mov 8(up), %r11 + ADDSUB %r9, %r12 + mov 8(vp), %r9 + setc R8(%rax) + lea -32(rp), rp + lea 16(up), up + lea 16(vp), vp + jmp L(L10) + +L(n10): mov (up), %r10 C n = 3, 7, 11, ... + mov R32(%r8), R32(%rax) + xor R32(%rbx), R32(%rbx) + mov 8(up), %r12 + ADDSUB %r9, %r10 + mov 8(vp), %r9 + setc R8(%rbx) + lea -24(rp), rp + lea -8(up), up + lea -8(vp), vp + jmp L(L11) + +L(c0): mov $1, R8(%rbx) + jmp L(rc0) +L(c1): mov $1, R8(%rax) + jmp L(rc1) +L(c2): mov $1, R8(%rbx) + jmp L(rc2) +L(c3): mov $1, R8(%rax) + jmp L(rc3) + + ALIGN(16) +L(top): mov (up), %r8 C not on critical path + ADDSUB %r9, %r11 C not on critical path + mov (vp), %r9 C not on critical path + setc R8(%rbx) C save carry out + mov %r12, (rp) +L(L01): ADDSUB %rax, %r11 C apply previous carry out + jc L(c0) C jump if ripple +L(rc0): mov 8(up), %r10 + ADDSUB %r9, %r8 + mov 8(vp), %r9 + setc R8(%rax) + mov %r11, 8(rp) +L(L00): ADDSUB %rbx, %r8 + jc L(c1) +L(rc1): mov 16(up), %r12 + ADDSUB %r9, %r10 + mov 16(vp), %r9 + setc R8(%rbx) + mov %r8, 16(rp) +L(L11): ADDSUB %rax, %r10 + jc L(c2) +L(rc2): mov 24(up), %r11 + ADDSUB %r9, %r12 + lea 32(up), up + mov 24(vp), %r9 + lea 32(vp), vp + setc R8(%rax) + mov %r10, 24(rp) +L(L10): ADDSUB %rbx, %r12 + jc L(c3) +L(rc3): lea 32(rp), rp + sub $4, n + ja L(top) + +L(end): ADDSUB %r9, %r11 + setc R8(%rbx) + mov %r12, (rp) + ADDSUB %rax, %r11 + jnc L(1) + mov $1, R8(%rbx) +L(1): mov %r11, 8(rp) + +L(ret): mov R32(%rbx), R32(%rax) + pop %r12 + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/aorslsh1_n.asm b/gmp-6.3.0/mpn/x86_64/pentium4/aorslsh1_n.asm new file mode 100644 index 0000000..66937d3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/aorslsh1_n.asm @@ -0,0 +1,50 @@ +dnl AMD64 mpn_addlsh1_n, mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1), +dnl optimised for Pentium 4. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
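+dnl Semantically, addlsh1_n computes rp[] = up[] + (vp[] << 1) over n
+dnl limbs and returns the carry (0..2).  A plain C model (our naming,
+dnl 64-bit limbs and <stdint.h>/<stddef.h> assumed; sublsh1_n is the
+dnl same with borrow instead of carry):
+dnl
+dnl	uint64_t addlsh1_n_model (uint64_t *rp, const uint64_t *up,
+dnl	                          const uint64_t *vp, size_t n)
+dnl	{
+dnl	  uint64_t cy = 0, hibit = 0;
+dnl	  for (size_t i = 0; i < n; i++)
+dnl	    {
+dnl	      uint64_t s = (vp[i] << 1) | hibit;	/* bit from limb below */
+dnl	      hibit = vp[i] >> 63;			/* bit shifted out */
+dnl	      unsigned __int128 t = (unsigned __int128) up[i] + s + cy;
+dnl	      rp[i] = (uint64_t) t;
+dnl	      cy = (uint64_t) (t >> 64);
+dnl	    }
+dnl	  return cy + hibit;
+dnl	}
+dnl
+dnl The shared code pulled in via aorslshC_n.asm does that recombination
+dnl on 32-bit register halves, which is why RSH below is 31 rather
+dnl than 63.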
+ +include(`../config.m4') + +define(LSH, 1) +define(RSH, 31) C 31, not 63, since we use 32-bit ops + +ifdef(`OPERATION_addlsh1_n', ` + define(ADDSUB, add) + define(func, mpn_addlsh1_n)') +ifdef(`OPERATION_sublsh1_n', ` + define(ADDSUB, sub) + define(func, mpn_sublsh1_n)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n) +include_mpn(`x86_64/pentium4/aorslshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/aorslsh2_n.asm b/gmp-6.3.0/mpn/x86_64/pentium4/aorslsh2_n.asm new file mode 100644 index 0000000..001f0ac --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/aorslsh2_n.asm @@ -0,0 +1,50 @@ +dnl AMD64 mpn_addlsh2_n, mpn_sublsh2_n -- rp[] = up[] +- (vp[] << 2), +dnl optimised for Pentium 4. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 2) +define(RSH, 30) C 30, not 62, since we use 32-bit ops + +ifdef(`OPERATION_addlsh2_n', ` + define(ADDSUB, add) + define(func, mpn_addlsh2_n)') +ifdef(`OPERATION_sublsh2_n', ` + define(ADDSUB, sub) + define(func, mpn_sublsh2_n)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n) +include_mpn(`x86_64/pentium4/aorslshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/aorslshC_n.asm b/gmp-6.3.0/mpn/x86_64/pentium4/aorslshC_n.asm new file mode 100644 index 0000000..d03c6a3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/aorslshC_n.asm @@ -0,0 +1,203 @@ +dnl AMD64 mpn_addlshC_n, mpn_sublshC_n -- rp[] = up[] +- (vp[] << C), where +dnl C is 1, 2, 3. Optimized for Pentium 4. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +C cycles/limb +C AMD K8,K9 3.8 +C AMD K10 3.8 +C Intel P4 5.8 +C Intel core2 4.75 +C Intel corei 4.75 +C Intel atom ? +C VIA nano 4.75 + + +C INPUT PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`vp',`%rdx') +define(`n', `%rcx') + +define(M, eval(m4_lshift(1,LSH))) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + push %rbx + push %r12 + push %rbp + + mov (vp), %r9 + shl $LSH, %r9 + mov 4(vp), R32(%rbp) + + xor R32(%rbx), R32(%rbx) + + mov R32(n), R32(%rax) + and $3, R32(%rax) + jne L(n00) C n = 0, 4, 8, ... + + mov (up), %r8 + mov 8(up), %r10 + shr $RSH, R32(%rbp) + ADDSUB %r9, %r8 + mov 8(vp), %r9 + lea (%rbp,%r9,M), %r9 + setc R8(%rax) + mov 12(vp), R32(%rbp) + lea -16(rp), rp + jmp L(L00) + +L(n00): cmp $2, R32(%rax) + jnc L(n01) C n = 1, 5, 9, ... + mov (up), %r11 + lea -8(rp), rp + shr $RSH, R32(%rbp) + ADDSUB %r9, %r11 + setc R8(%rbx) + dec n + jz L(1) C jump for n = 1 + mov 8(up), %r8 + mov 8(vp), %r9 + lea (%rbp,%r9,M), %r9 + mov 12(vp), R32(%rbp) + lea 8(up), up + lea 8(vp), vp + jmp L(L01) + +L(n01): jne L(n10) C n = 2, 6, 10, ... + mov (up), %r12 + mov 8(up), %r11 + shr $RSH, R32(%rbp) + ADDSUB %r9, %r12 + mov 8(vp), %r9 + lea (%rbp,%r9,M), %r9 + setc R8(%rax) + mov 12(vp), R32(%rbp) + lea 16(up), up + lea 16(vp), vp + jmp L(L10) + +L(n10): mov (up), %r10 + mov 8(up), %r12 + shr $RSH, R32(%rbp) + ADDSUB %r9, %r10 + mov 8(vp), %r9 + lea (%rbp,%r9,M), %r9 + setc R8(%rbx) + mov 12(vp), R32(%rbp) + lea -24(rp), rp + lea -8(up), up + lea -8(vp), vp + jmp L(L11) + +L(c0): mov $1, R8(%rbx) + jmp L(rc0) +L(c1): mov $1, R8(%rax) + jmp L(rc1) +L(c2): mov $1, R8(%rbx) + jmp L(rc2) + + ALIGN(16) +L(top): mov (up), %r8 C not on critical path + shr $RSH, R32(%rbp) + ADDSUB %r9, %r11 C not on critical path + mov (vp), %r9 + lea (%rbp,%r9,M), %r9 + setc R8(%rbx) C save carry out + mov 4(vp), R32(%rbp) + mov %r12, (rp) + ADDSUB %rax, %r11 C apply previous carry out + jc L(c0) C jump if ripple +L(rc0): +L(L01): mov 8(up), %r10 + shr $RSH, R32(%rbp) + ADDSUB %r9, %r8 + mov 8(vp), %r9 + lea (%rbp,%r9,M), %r9 + setc R8(%rax) + mov 12(vp), R32(%rbp) + mov %r11, 8(rp) + ADDSUB %rbx, %r8 + jc L(c1) +L(rc1): +L(L00): mov 16(up), %r12 + shr $RSH, R32(%rbp) + ADDSUB %r9, %r10 + mov 16(vp), %r9 + lea (%rbp,%r9,M), %r9 + setc R8(%rbx) + mov 20(vp), R32(%rbp) + mov %r8, 16(rp) + ADDSUB %rax, %r10 + jc L(c2) +L(rc2): +L(L11): mov 24(up), %r11 + shr $RSH, R32(%rbp) + ADDSUB %r9, %r12 + mov 24(vp), %r9 + lea (%rbp,%r9,M), %r9 + lea 32(up), up + lea 32(vp), vp + setc R8(%rax) + mov -4(vp), R32(%rbp) + mov %r10, 24(rp) + ADDSUB %rbx, %r12 + jc L(c3) +L(rc3): lea 32(rp), rp +L(L10): sub $4, n + ja L(top) + +L(end): + shr $RSH, R32(%rbp) + ADDSUB %r9, %r11 + setc R8(%rbx) + mov %r12, (rp) + ADDSUB %rax, %r11 + jnc L(1) + mov $1, R8(%rbx) +L(1): mov %r11, 8(rp) + lea (%rbx,%rbp), R32(%rax) + pop %rbp + pop %r12 + pop %rbx + FUNC_EXIT() + ret +L(c3): mov $1, R8(%rax) + jmp L(rc3) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/aorsmul_1.asm 
b/gmp-6.3.0/mpn/x86_64/pentium4/aorsmul_1.asm
new file mode 100644
index 0000000..e5dbb34
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/aorsmul_1.asm
@@ -0,0 +1,37 @@
+dnl X86-64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Nocona.
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+include_mpn(`x86_64/bd1/aorsmul_1.asm')
diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/pentium4/gmp-mparam.h
new file mode 100644
index 0000000..9c79310
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/pentium4/gmp-mparam.h
@@ -0,0 +1,257 @@
+/* Pentium 4-64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* These routines exist for all x86_64 chips, but they are slower on Pentium4
+ than separate add/sub and shift. Make sure they are not really used.
*/ +#undef HAVE_NATIVE_mpn_rsblsh1_n +#undef HAVE_NATIVE_mpn_rsblsh2_n +#undef HAVE_NATIVE_mpn_addlsh_n +#undef HAVE_NATIVE_mpn_rsblsh_n + +/* 3400 MHz Pentium4 Nocona / 1024 Kibyte L2 cache */ +/* FFT tuning limit = 107,095,964 */ +/* Generated by tuneup.c, 2019-11-09, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 32 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 2 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 12 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 20 + +#define DIV_1_VS_MUL_1_PERCENT 228 + +#define MUL_TOOM22_THRESHOLD 12 +#define MUL_TOOM33_THRESHOLD 81 +#define MUL_TOOM44_THRESHOLD 130 +#define MUL_TOOM6H_THRESHOLD 173 +#define MUL_TOOM8H_THRESHOLD 430 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 89 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 112 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 18 +#define SQR_TOOM3_THRESHOLD 113 +#define SQR_TOOM4_THRESHOLD 202 +#define SQR_TOOM6_THRESHOLD 238 +#define SQR_TOOM8_THRESHOLD 430 + +#define MULMID_TOOM42_THRESHOLD 20 + +#define MULMOD_BNM1_THRESHOLD 9 +#define SQRMOD_BNM1_THRESHOLD 11 + +#define MUL_FFT_MODF_THRESHOLD 236 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 236, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \ + { 9, 5}, { 19, 6}, { 17, 7}, { 9, 6}, \ + { 19, 7}, { 10, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 13, 8}, { 7, 7}, { 17, 8}, \ + { 9, 7}, { 21, 8}, { 11, 7}, { 23, 8}, \ + { 13, 9}, { 7, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27,10}, { 7, 9}, \ + { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \ + { 23, 8}, { 47, 9}, { 27,10}, { 15, 9}, \ + { 39,10}, { 23, 9}, { 51,11}, { 15,10}, \ + { 31, 9}, { 67,10}, { 39, 9}, { 83,10}, \ + { 47, 9}, { 95,10}, { 55,11}, { 31,10}, \ + { 63, 9}, { 127, 8}, { 255,10}, { 71, 9}, \ + { 143, 8}, { 287,10}, { 79,11}, { 47,10}, \ + { 95, 9}, { 191,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 143, 9}, { 287,11}, \ + { 79,10}, { 159, 9}, { 319,10}, { 175,11}, \ + { 95,10}, { 191, 9}, { 383,10}, { 223,12}, \ + { 63,11}, { 127,10}, { 255,11}, { 143,10}, \ + { 287,11}, { 159,10}, { 319,11}, { 175,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 223,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 287,10}, { 575,12}, { 159,11}, { 351,12}, \ + { 191,11}, { 383,12}, { 223,11}, { 447,13}, \ + { 127,12}, { 255,11}, { 511,12}, { 287,11}, \ + { 575,10}, { 1151,12}, { 351,13}, { 191,12}, \ + { 415,11}, { 831,10}, { 1663,12}, { 447,14}, \ + { 127,13}, { 255,12}, { 511,11}, { 1023,12}, \ + { 543,11}, { 1087,10}, { 2175,12}, { 575,11}, \ + { 1151,13}, { 319,12}, { 639,11}, { 1279,12}, \ + { 671,11}, { 1343,12}, { 703,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 831,11}, { 1663,13}, \ + { 447,14}, { 255,13}, { 511,12}, { 1023,11}, \ + { 2047,12}, { 1087,11}, { 2175,13}, { 575,12}, \ + { 1151,11}, { 2303,12}, { 1215,11}, { 2431,10}, \ + { 4863,13}, { 639,12}, { 1279,11}, { 2559,12}, \ + { 1343,13}, { 703,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1663,15}, { 255,14}, \ + { 511,13}, { 1023,12}, { 2047,13}, { 1087,12}, \ + { 2175,13}, { 1151,12}, { 2303,13}, { 
1215,12}, \ + { 2431,11}, { 4863,14}, { 639,13}, { 1279,12}, \ + { 2559,13}, { 1343,12}, { 2687,13}, { 1407,12}, \ + { 2815,13}, { 1471,14}, { 767,13}, { 1663,14}, \ + { 895,13}, { 1791,12}, { 3583,13}, { 1919,12}, \ + { 3839,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2303,12}, { 4607,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2815,15}, { 767,14}, { 1791,13}, { 3583,14}, \ + { 1919,13}, { 3839,16}, { 511,15}, { 1023,14}, \ + { 2175,13}, { 4351,14}, { 2303,13}, { 4607,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,15}, { 1535,14}, { 3199,15}, { 1791,14}, \ + { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \ + { 4351,15}, { 2303,14}, { 4863,15}, { 2815,14}, \ + { 5887,16}, { 1535,15}, { 3071,14}, { 6143,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 229 +#define MUL_FFT_THRESHOLD 2752 + +#define SQR_FFT_MODF_THRESHOLD 240 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 240, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \ + { 9, 5}, { 19, 6}, { 17, 7}, { 9, 6}, \ + { 23, 7}, { 12, 6}, { 25, 7}, { 13, 8}, \ + { 7, 7}, { 17, 8}, { 9, 7}, { 21, 8}, \ + { 11, 7}, { 24, 8}, { 13, 9}, { 7, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27,10}, { 7, 9}, { 15, 8}, { 33, 9}, \ + { 19, 8}, { 39, 9}, { 27,10}, { 15, 9}, \ + { 39,10}, { 23, 9}, { 47,11}, { 15,10}, \ + { 31, 9}, { 63,10}, { 39, 9}, { 79,10}, \ + { 55,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255,10}, { 71, 9}, { 143, 8}, { 287,10}, \ + { 79,11}, { 47,10}, { 95, 9}, { 191,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 143, 9}, { 287,11}, { 79,10}, { 159, 9}, \ + { 319,10}, { 175, 9}, { 351,11}, { 95,10}, \ + { 191, 9}, { 383,10}, { 207, 9}, { 415,10}, \ + { 223,12}, { 63,11}, { 127,10}, { 255,11}, \ + { 143,10}, { 287,11}, { 159,10}, { 319,11}, \ + { 175,10}, { 351,12}, { 95,11}, { 191,10}, \ + { 383,11}, { 207,10}, { 415,11}, { 223,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 287,10}, { 575,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 767,12}, { 223,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 511,12}, { 287,11}, { 575,10}, \ + { 1151,12}, { 319,11}, { 639,12}, { 351,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,12}, { 447,14}, { 127,13}, { 255,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \ + { 575,11}, { 1151,13}, { 319,12}, { 639,11}, \ + { 1279,12}, { 671,11}, { 1343,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 831,13}, { 447,14}, \ + { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \ + { 1087,13}, { 575,12}, { 1151,11}, { 2303,12}, \ + { 1215,11}, { 2431,10}, { 4863,13}, { 639,12}, \ + { 1279,11}, { 2559,12}, { 1343,11}, { 2687,14}, \ + { 383,13}, { 767,12}, { 1535,13}, { 831,12}, \ + { 1663,15}, { 255,14}, { 511,13}, { 1023,12}, \ + { 2047,13}, { 1087,12}, { 2175,13}, { 1151,12}, \ + { 2303,13}, { 1215,12}, { 2431,11}, { 4863,14}, \ + { 639,13}, { 1279,12}, { 2559,13}, { 1343,12}, \ + { 2687,13}, { 1407,12}, { 2815,13}, { 1471,14}, \ + { 767,13}, { 1663,14}, { 895,13}, { 1791,12}, \ + { 3583,13}, { 1919,12}, { 3839,15}, { 511,14}, \ + { 1023,13}, { 2175,14}, { 1151,13}, { 2303,12}, \ + { 4607,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2687,14}, { 1407,13}, { 2943,15}, { 767,14}, \ + { 1663,13}, { 3327,14}, { 1791,13}, { 3583,14}, \ + { 1919,13}, { 3839,16}, { 511,15}, { 1023,14}, \ + { 2175,13}, { 4351,14}, { 2303,13}, { 4607,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, 
{ 2815,13}, \ + { 5631,14}, { 2943,13}, { 5887,15}, { 1535,14}, \ + { 3327,15}, { 1791,14}, { 3839,13}, { 7679,16}, \ + { 1023,15}, { 2047,14}, { 4351,15}, { 2303,14}, \ + { 4863,15}, { 2815,14}, { 5887,16}, { 1535,15}, \ + { 3071,14}, { 6143,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 235 +#define SQR_FFT_THRESHOLD 2368 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 45 +#define MULLO_MUL_N_THRESHOLD 5397 +#define SQRLO_BASECASE_THRESHOLD 6 +#define SQRLO_DC_THRESHOLD 46 +#define SQRLO_SQR_THRESHOLD 4658 + +#define DC_DIV_QR_THRESHOLD 36 +#define DC_DIVAPPR_Q_THRESHOLD 95 +#define DC_BDIV_QR_THRESHOLD 35 +#define DC_BDIV_Q_THRESHOLD 47 + +#define INV_MULMOD_BNM1_THRESHOLD 22 +#define INV_NEWTON_THRESHOLD 178 +#define INV_APPR_THRESHOLD 116 + +#define BINV_NEWTON_THRESHOLD 206 +#define REDC_1_TO_REDC_2_THRESHOLD 24 +#define REDC_2_TO_REDC_N_THRESHOLD 50 + +#define MU_DIV_QR_THRESHOLD 979 +#define MU_DIVAPPR_Q_THRESHOLD 979 +#define MUPI_DIV_QR_THRESHOLD 97 +#define MU_BDIV_QR_THRESHOLD 762 +#define MU_BDIV_Q_THRESHOLD 942 + +#define POWM_SEC_TABLE 7,34,114,523,1486 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 25 +#define SET_STR_DC_THRESHOLD 381 +#define SET_STR_PRECOMPUTE_THRESHOLD 1659 + +#define FAC_DSC_THRESHOLD 969 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 29 +#define HGCD2_DIV1_METHOD 3 /* 2.03% faster than 5 */ +#define HGCD_THRESHOLD 92 +#define HGCD_APPR_THRESHOLD 95 +#define HGCD_REDUCE_THRESHOLD 1815 +#define GCD_DC_THRESHOLD 195 +#define GCDEXT_DC_THRESHOLD 233 +#define JACOBI_BASE_METHOD 4 /* 17.06% faster than 1 */ + +/* Tuneup completed successfully, took 297016 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/lshift.asm b/gmp-6.3.0/mpn/x86_64/pentium4/lshift.asm new file mode 100644 index 0000000..4037be4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/lshift.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshift optimised for Pentium 4. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
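
The MUL_FFT_TABLE3 and SQR_FFT_TABLE3 data in the gmp-mparam.h above are {n,k} pairs: scanning the table in order, each entry's n field shifted left by the previously selected k gives the operand size (in limbs) at which the FFT split parameter switches to that entry's k. A minimal C sketch of the lookup, patterned after mpn_fft_best_k in mpn/generic/mul_fft.c (the name fft_best_k_ref, the plain struct fields, and the explicit table length are our assumptions for illustration; GMP packs the fields into bitfields and terminates on sentinel entries instead of a length):

#include <stddef.h>

struct fft_table_nk { long n; int k; };

/* First entries of the Nocona MUL_FFT_TABLE3 above. */
static const struct fft_table_nk mul_fft_table3[] = {
  { 236, 5}, { 11, 6}, { 6, 5}, { 13, 6}, { 9, 5}, { 19, 6}, { 17, 7}
};

/* Return the FFT split parameter k for an n-limb product.  With the
   entries above, k switches 5 -> 6 at 11<<5 = 352 limbs, back to 5 at
   6<<6 = 384, to 6 again at 13<<5 = 416, and so on: the effective
   thresholds increase monotonically even though the raw n fields
   do not. */
static int
fft_best_k_ref (long n, const struct fft_table_nk *tab, size_t nents)
{
  int last_k = tab[0].k;
  for (size_t i = 1; i < nents; i++)
    {
      if (n <= (tab[i].n << last_k))
        break;
      last_k = tab[i].k;
    }
  return last_k;
}
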
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshift) +include_mpn(`x86_64/fastsse/lshift.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/pentium4/lshiftc.asm new file mode 100644 index 0000000..52856c1 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/lshiftc.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshiftc optimised for Pentium 4. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshiftc) +include_mpn(`x86_64/fastsse/lshiftc.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/mod_34lsub1.asm b/gmp-6.3.0/mpn/x86_64/pentium4/mod_34lsub1.asm new file mode 100644 index 0000000..f34b3f0 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/mod_34lsub1.asm @@ -0,0 +1,167 @@ +dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1. + +dnl Copyright 2000-2002, 2004, 2005, 2007, 2010-2012 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 1.0 +C AMD K10 1.12 +C Intel P4 3.25 +C Intel core2 1.5 +C Intel corei 1.5 +C Intel atom 2.5 +C VIA nano 1.75 + + +C INPUT PARAMETERS +define(`ap', %rdi) +define(`n', %rsi) + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n) + +C TODO +C * Review feed-in and wind-down code. In particular, try to avoid adc and +C sbb to placate Pentium4. 
+C * It seems possible to reach 2.67 c/l by using a cleaner 6-way unrolling, +C without the dual loop exits. + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mod_34lsub1) + FUNC_ENTRY(2) + + mov $0x0000FFFFFFFFFFFF, %r11 + + sub $2, %rsi + ja L(gt2) + + mov (ap), %rax + nop + jb L(1) + + mov 8(ap), %rsi + mov %rax, %rdx + shr $48, %rax C src[0] low + + and %r11, %rdx C src[0] high + add %rdx, %rax + mov R32(%rsi), R32(%rdx) + + shr $32, %rsi C src[1] high + add %rsi, %rax + + shl $16, %rdx C src[1] low + add %rdx, %rax + +L(1): FUNC_EXIT() + ret + + + ALIGN(16) +L(gt2): xor R32(%rax), R32(%rax) + xor R32(%rcx), R32(%rcx) + xor R32(%rdx), R32(%rdx) + xor %r8, %r8 + xor %r9, %r9 + xor %r10, %r10 + +L(top): add (ap), %rax + adc $0, %r10 + add 8(ap), %rcx + adc $0, %r8 + add 16(ap), %rdx + adc $0, %r9 + + sub $3, %rsi + jng L(end) + + add 24(ap), %rax + adc $0, %r10 + add 32(ap), %rcx + adc $0, %r8 + add 40(ap), %rdx + lea 48(ap), ap + adc $0, %r9 + + sub $3, %rsi + jg L(top) + + + add $-24, ap +L(end): add %r9, %rax + adc %r10, %rcx + adc %r8, %rdx + + inc %rsi + mov $0x1, R32(%r10) + js L(combine) + + mov $0x10000, R32(%r10) + adc 24(ap), %rax + dec %rsi + js L(combine) + + adc 32(ap), %rcx + mov $0x100000000, %r10 + +L(combine): + sbb %rsi, %rsi C carry + mov %rax, %rdi C 0mod3 + shr $48, %rax C 0mod3 high + + and %r10, %rsi C carry masked + and %r11, %rdi C 0mod3 low + mov R32(%rcx), R32(%r10) C 1mod3 + + add %rsi, %rax C apply carry + shr $32, %rcx C 1mod3 high + + add %rdi, %rax C apply 0mod3 low + movzwl %dx, R32(%rdi) C 2mod3 + shl $16, %r10 C 1mod3 low + + add %rcx, %rax C apply 1mod3 high + shr $16, %rdx C 2mod3 high + + add %r10, %rax C apply 1mod3 low + shl $32, %rdi C 2mod3 low + + add %rdx, %rax C apply 2mod3 high + add %rdi, %rax C apply 2mod3 low + + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/mul_1.asm b/gmp-6.3.0/mpn/x86_64/pentium4/mul_1.asm new file mode 100644 index 0000000..70de670 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/mul_1.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_mul_1 optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
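
The mpn_mod_34lsub1 routine above returns a 64-bit value congruent to {ap,n} modulo 2^48-1 (2 to three quarters of the limb size, minus 1, hence the name), which lets callers reduce a large operand without any division: since 2^64 = 2^16 * 2^48 == 2^16 (mod 2^48-1), whole limbs can be folded in with shifts and adds, and the asm spreads the work over three accumulator chains to hide latency. A portable reference sketch of the same congruence (mod_34lsub1_ref is our name, unsigned __int128 assumes GCC/Clang, and the loop is a plain Horner recurrence rather than GMP's three-way unrolling; GMP's C fallback is mpn/generic/mod_34lsub1.c):

#include <stdint.h>
#include <stddef.h>

/* Return r <= 2^48-1 with r congruent to {up,n} mod 2^48-1. */
uint64_t
mod_34lsub1_ref (const uint64_t *up, size_t n)
{
  const uint64_t m = 0xFFFFFFFFFFFFull;   /* 2^48 - 1 */
  unsigned __int128 r = 0;
  while (n-- > 0)                         /* most significant limb first */
    {
      r = (r << 16) + up[n];              /* r*2^64 + limb; 2^64 == 2^16 mod m */
      r = (r & m) + (r >> 48);            /* two partial folds keep r <= 2^48 */
      r = (r & m) + (r >> 48);
    }
  r = (r & m) + (r >> 48);                /* final fold: r <= 2^48-1 */
  return (uint64_t) r;
}
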
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mul_1 mpn_mul_1c) +include_mpn(`x86_64/bd1/mul_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/mul_2.asm b/gmp-6.3.0/mpn/x86_64/pentium4/mul_2.asm new file mode 100644 index 0000000..a0f7302 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/mul_2.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_mul_2 optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mul_2) +include_mpn(`x86_64/bd1/mul_2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/pentium4/mul_basecase.asm new file mode 100644 index 0000000..fb16029 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/mul_basecase.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_mul_basecase optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mul_basecase) +include_mpn(`x86_64/core2/mul_basecase.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/pentium4/mullo_basecase.asm new file mode 100644 index 0000000..b9e08a8 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/mullo_basecase.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_mullo_basecase optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mullo_basecase) +include_mpn(`x86_64/core2/mullo_basecase.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/popcount.asm b/gmp-6.3.0/mpn/x86_64/pentium4/popcount.asm new file mode 100644 index 0000000..7014b39 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/popcount.asm @@ -0,0 +1,35 @@ +dnl x86-64 mpn_popcount optimized for Pentium 4. + +dnl Copyright 2007 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_popcount) +include_mpn(`x86/pentium4/sse2/popcount.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/redc_1.asm b/gmp-6.3.0/mpn/x86_64/pentium4/redc_1.asm new file mode 100644 index 0000000..00e380d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/redc_1.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_redc_1 optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_redc_1) +include_mpn(`x86_64/bt1/redc_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/rsh1aors_n.asm b/gmp-6.3.0/mpn/x86_64/pentium4/rsh1aors_n.asm new file mode 100644 index 0000000..5528ce4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/rsh1aors_n.asm @@ -0,0 +1,334 @@ +dnl x86-64 mpn_rsh1add_n/mpn_rsh1sub_n optimized for Pentium 4. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 4.13 +C AMD K10 4.13 +C Intel P4 5.70 +C Intel core2 4.75 +C Intel corei 5 +C Intel atom 8.75 +C VIA nano 5.25 + +C TODO +C * Try to make this smaller, 746 bytes seem excessive for this 2nd class +C function. Less sw pipelining would help, and since we now probably +C pipeline somewhat too deeply, it might not affect performance too much. +C * A separate small-n loop might speed things as well as make things smaller. +C That loop should be selected before pushing registers. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cy', `%r8') + +ifdef(`OPERATION_rsh1add_n', ` + define(ADDSUB, add) + define(func, mpn_rsh1add_n) + define(func_nc, mpn_rsh1add_nc)') +ifdef(`OPERATION_rsh1sub_n', ` + define(ADDSUB, sub) + define(func, mpn_rsh1sub_n) + define(func_nc, mpn_rsh1sub_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc) + +ASM_START() + TEXT +PROLOGUE(func) + FUNC_ENTRY(4) + xor %r8, %r8 +IFDOS(` jmp L(ent) ') +EPILOGUE() +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +L(ent): push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + mov (vp), %r9 + mov (up), %r15 + + mov R32(n), R32(%rax) + and $3, R32(%rax) + jne L(n00) + + mov R32(%r8), R32(%rbx) C n = 0, 4, 8, ... 
+ mov 8(up), %r10 + ADDSUB %r9, %r15 + mov 8(vp), %r9 + setc R8(%rax) + ADDSUB %rbx, %r15 C return bit + jnc 1f + mov $1, R8(%rax) +1: mov 16(up), %r12 + ADDSUB %r9, %r10 + mov 16(vp), %r9 + setc R8(%rbx) + mov %r15, %r13 + ADDSUB %rax, %r10 + jnc 1f + mov $1, R8(%rbx) +1: mov 24(up), %r11 + ADDSUB %r9, %r12 + lea 32(up), up + mov 24(vp), %r9 + lea 32(vp), vp + setc R8(%rax) + mov %r10, %r14 + shl $63, %r10 + shr %r13 + jmp L(L00) + +L(n00): cmp $2, R32(%rax) + jnc L(n01) + xor R32(%rbx), R32(%rbx) C n = 1, 5, 9, ... + lea -24(rp), rp + mov R32(%r8), R32(%rax) + dec n + jnz L(gt1) + ADDSUB %r9, %r15 + setc R8(%rbx) + ADDSUB %rax, %r15 + jnc 1f + mov $1, R8(%rbx) +1: mov %r15, %r14 + shl $63, %rbx + shr %r14 + jmp L(cj1) +L(gt1): mov 8(up), %r8 + ADDSUB %r9, %r15 + mov 8(vp), %r9 + setc R8(%rbx) + ADDSUB %rax, %r15 + jnc 1f + mov $1, R8(%rbx) +1: mov 16(up), %r10 + ADDSUB %r9, %r8 + mov 16(vp), %r9 + setc R8(%rax) + mov %r15, %r14 + ADDSUB %rbx, %r8 + jnc 1f + mov $1, R8(%rax) +1: mov 24(up), %r12 + ADDSUB %r9, %r10 + mov 24(vp), %r9 + setc R8(%rbx) + mov %r8, %r13 + shl $63, %r8 + shr %r14 + lea 8(up), up + lea 8(vp), vp + jmp L(L01) + +L(n01): jne L(n10) + lea -16(rp), rp C n = 2, 6, 10, ... + mov R32(%r8), R32(%rbx) + mov 8(up), %r11 + ADDSUB %r9, %r15 + mov 8(vp), %r9 + setc R8(%rax) + ADDSUB %rbx, %r15 + jnc 1f + mov $1, R8(%rax) +1: sub $2, n + jnz L(gt2) + ADDSUB %r9, %r11 + setc R8(%rbx) + mov %r15, %r13 + ADDSUB %rax, %r11 + jnc 1f + mov $1, R8(%rbx) +1: mov %r11, %r14 + shl $63, %r11 + shr %r13 + jmp L(cj2) +L(gt2): mov 16(up), %r8 + ADDSUB %r9, %r11 + mov 16(vp), %r9 + setc R8(%rbx) + mov %r15, %r13 + ADDSUB %rax, %r11 + jnc 1f + mov $1, R8(%rbx) +1: mov 24(up), %r10 + ADDSUB %r9, %r8 + mov 24(vp), %r9 + setc R8(%rax) + mov %r11, %r14 + shl $63, %r11 + shr %r13 + lea 16(up), up + lea 16(vp), vp + jmp L(L10) + +L(n10): xor R32(%rbx), R32(%rbx) C n = 3, 7, 11, ... 
+ lea -8(rp), rp + mov R32(%r8), R32(%rax) + mov 8(up), %r12 + ADDSUB %r9, %r15 + mov 8(vp), %r9 + setc R8(%rbx) + ADDSUB %rax, %r15 + jnc 1f + mov $1, R8(%rbx) +1: mov 16(up), %r11 + ADDSUB %r9, %r12 + mov 16(vp), %r9 + setc R8(%rax) + mov %r15, %r14 + ADDSUB %rbx, %r12 + jnc 1f + mov $1, R8(%rax) +1: sub $3, n + jnz L(gt3) + ADDSUB %r9, %r11 + setc R8(%rbx) + mov %r12, %r13 + shl $63, %r12 + shr %r14 + jmp L(cj3) +L(gt3): mov 24(up), %r8 + ADDSUB %r9, %r11 + mov 24(vp), %r9 + setc R8(%rbx) + mov %r12, %r13 + shl $63, %r12 + shr %r14 + lea 24(up), up + lea 24(vp), vp + jmp L(L11) + +L(c0): mov $1, R8(%rbx) + jmp L(rc0) +L(c1): mov $1, R8(%rax) + jmp L(rc1) +L(c2): mov $1, R8(%rbx) + jmp L(rc2) + + ALIGN(16) +L(top): mov (up), %r8 C not on critical path + or %r13, %r10 + ADDSUB %r9, %r11 C not on critical path + mov (vp), %r9 C not on critical path + setc R8(%rbx) C save carry out + mov %r12, %r13 C new for later + shl $63, %r12 C shift new right + shr %r14 C shift old left + mov %r10, (rp) +L(L11): ADDSUB %rax, %r11 C apply previous carry out + jc L(c0) C jump if ripple +L(rc0): mov 8(up), %r10 + or %r14, %r12 + ADDSUB %r9, %r8 + mov 8(vp), %r9 + setc R8(%rax) + mov %r11, %r14 + shl $63, %r11 + shr %r13 + mov %r12, 8(rp) +L(L10): ADDSUB %rbx, %r8 + jc L(c1) +L(rc1): mov 16(up), %r12 + or %r13, %r11 + ADDSUB %r9, %r10 + mov 16(vp), %r9 + setc R8(%rbx) + mov %r8, %r13 + shl $63, %r8 + shr %r14 + mov %r11, 16(rp) +L(L01): ADDSUB %rax, %r10 + jc L(c2) +L(rc2): mov 24(up), %r11 + or %r14, %r8 + ADDSUB %r9, %r12 + lea 32(up), up + mov 24(vp), %r9 + lea 32(vp), vp + setc R8(%rax) + mov %r10, %r14 + shl $63, %r10 + shr %r13 + mov %r8, 24(rp) + lea 32(rp), rp +L(L00): ADDSUB %rbx, %r12 + jc L(c3) +L(rc3): sub $4, n + ja L(top) + +L(end): or %r13, %r10 + ADDSUB %r9, %r11 + setc R8(%rbx) + mov %r12, %r13 + shl $63, %r12 + shr %r14 + mov %r10, (rp) +L(cj3): ADDSUB %rax, %r11 + jnc 1f + mov $1, R8(%rbx) +1: or %r14, %r12 + mov %r11, %r14 + shl $63, %r11 + shr %r13 + mov %r12, 8(rp) +L(cj2): or %r13, %r11 + shl $63, %rbx + shr %r14 + mov %r11, 16(rp) +L(cj1): or %r14, %rbx + mov %rbx, 24(rp) + + mov R32(%r15), R32(%rax) + and $1, R32(%rax) + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + FUNC_EXIT() + ret +L(c3): mov $1, R8(%rax) + jmp L(rc3) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/rshift.asm b/gmp-6.3.0/mpn/x86_64/pentium4/rshift.asm new file mode 100644 index 0000000..b7c1ee2 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/rshift.asm @@ -0,0 +1,169 @@ +dnl x86-64 mpn_rshift optimized for Pentium 4. + +dnl Copyright 2003, 2005, 2007, 2008, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 2.5 +C AMD K10 ? +C Intel P4 3.29 +C Intel core2 2.1 (fluctuates, presumably cache related) +C Intel corei ? +C Intel atom 14.3 +C VIA nano ? + +C INPUT PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`n',`%rdx') +define(`cnt',`%cl') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_rshift) + FUNC_ENTRY(4) + mov (up), %rax + movd R32(%rcx), %mm4 + neg R32(%rcx) C put lsh count in cl + and $63, R32(%rcx) + movd R32(%rcx), %mm5 + + lea -8(up,n,8), up + lea -8(rp,n,8), rp + lea 1(n), R32(%r8) + neg n + + shl R8(%rcx), %rax C function return value + + and $3, R32(%r8) + je L(rol) C jump for n = 3, 7, 11, ... + + dec R32(%r8) + jne L(1) +C n = 4, 8, 12, ... + movq 8(up,n,8), %mm2 + psrlq %mm4, %mm2 + movq 16(up,n,8), %mm0 + psllq %mm5, %mm0 + por %mm0, %mm2 + movq %mm2, 8(rp,n,8) + inc n + jmp L(rol) + +L(1): dec R32(%r8) + je L(1x) C jump for n = 1, 5, 9, 13, ... +C n = 2, 6, 10, 16, ... + movq 8(up,n,8), %mm2 + psrlq %mm4, %mm2 + movq 16(up,n,8), %mm0 + psllq %mm5, %mm0 + por %mm0, %mm2 + movq %mm2, 8(rp,n,8) + inc n +L(1x): + cmp $-1, n + je L(ast) + movq 8(up,n,8), %mm2 + psrlq %mm4, %mm2 + movq 16(up,n,8), %mm3 + psrlq %mm4, %mm3 + movq 16(up,n,8), %mm0 + movq 24(up,n,8), %mm1 + psllq %mm5, %mm0 + por %mm0, %mm2 + psllq %mm5, %mm1 + por %mm1, %mm3 + movq %mm2, 8(rp,n,8) + movq %mm3, 16(rp,n,8) + add $2, n + +L(rol): movq 8(up,n,8), %mm2 + psrlq %mm4, %mm2 + movq 16(up,n,8), %mm3 + psrlq %mm4, %mm3 + + add $4, n C 4 + jb L(end) C 2 + ALIGN(32) +L(top): + C finish stuff from lsh block + movq -16(up,n,8), %mm0 + movq -8(up,n,8), %mm1 + psllq %mm5, %mm0 + por %mm0, %mm2 + psllq %mm5, %mm1 + movq (up,n,8), %mm0 + por %mm1, %mm3 + movq 8(up,n,8), %mm1 + movq %mm2, -24(rp,n,8) + movq %mm3, -16(rp,n,8) + C start two new rsh + psllq %mm5, %mm0 + psllq %mm5, %mm1 + + C finish stuff from rsh block + movq -8(up,n,8), %mm2 + movq (up,n,8), %mm3 + psrlq %mm4, %mm2 + por %mm2, %mm0 + psrlq %mm4, %mm3 + movq 8(up,n,8), %mm2 + por %mm3, %mm1 + movq 16(up,n,8), %mm3 + movq %mm0, -8(rp,n,8) + movq %mm1, (rp,n,8) + C start two new lsh + add $4, n + psrlq %mm4, %mm2 + psrlq %mm4, %mm3 + + jae L(top) C 2 +L(end): + movq -8(up), %mm0 + psllq %mm5, %mm0 + por %mm0, %mm2 + movq (up), %mm1 + psllq %mm5, %mm1 + por %mm1, %mm3 + movq %mm2, -16(rp) + movq %mm3, -8(rp) + +L(ast): movq (up), %mm2 + psrlq %mm4, %mm2 + movq %mm2, (rp) + emms + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/pentium4/sec_tabselect.asm new file mode 100644 index 0000000..e436034 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/sec_tabselect.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sec_tabselect. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sec_tabselect) +include_mpn(`x86_64/fastsse/sec_tabselect.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/pentium4/sqr_basecase.asm new file mode 100644 index 0000000..9725287 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/sqr_basecase.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sqr_basecase optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sqr_basecase) +include_mpn(`x86_64/core2/sqr_basecase.asm') diff --git a/gmp-6.3.0/mpn/x86_64/popham.asm b/gmp-6.3.0/mpn/x86_64/popham.asm new file mode 100644 index 0000000..3a29b2e --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/popham.asm @@ -0,0 +1,163 @@ +dnl AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance. + +dnl Copyright 2004, 2005, 2007, 2010-2012, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + + +C popcount hamdist +C cycles/limb cycles/limb +C AMD K8,K9 6 7 +C AMD K10 6 7 +C Intel P4 12 14.3 +C Intel core2 7 8 +C Intel corei ? 7.3 +C Intel atom 16.5 17.5 +C VIA nano 8.75 10.4 + +C TODO +C * Tune. It should be possible to reach 5 c/l for popcount and 6 c/l for +C hamdist for K8/K9. + + +ifdef(`OPERATION_popcount',` + define(`func',`mpn_popcount') + define(`up', `%rdi') + define(`n', `%rsi') + define(`h55555555', `%r10') + define(`h33333333', `%r11') + define(`h0f0f0f0f', `%rcx') + define(`h01010101', `%rdx') + define(`POP', `$1') + define(`HAM', `dnl') +') +ifdef(`OPERATION_hamdist',` + define(`func',`mpn_hamdist') + define(`up', `%rdi') + define(`vp', `%rsi') + define(`n', `%rdx') + define(`h55555555', `%r10') + define(`h33333333', `%r11') + define(`h0f0f0f0f', `%rcx') + define(`h01010101', `%r12') + define(`POP', `dnl') + define(`HAM', `$1') +') + + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(func) + POP(` FUNC_ENTRY(2) ') + HAM(` FUNC_ENTRY(3) ') + push %rbx + mov $0x5555555555555555, h55555555 + push %rbp + mov $0x3333333333333333, h33333333 + HAM(` push %r12 ') + lea (up,n,8), up + mov $0x0f0f0f0f0f0f0f0f, h0f0f0f0f + HAM(` lea (vp,n,8), vp ') + neg n + mov $0x0101010101010101, h01010101 + xor R32(%rax), R32(%rax) + test $1, R8(n) + jz L(top) + + mov (up,n,8), %r8 + HAM(` xor (vp,n,8), %r8 ') + + mov %r8, %r9 + shr %r8 + and h55555555, %r8 + sub %r8, %r9 + + mov %r9, %r8 + shr $2, %r9 + and h33333333, %r8 + and h33333333, %r9 + add %r8, %r9 C 16 4-bit fields (0..4) + + dec n + jmp L(mid) + + ALIGN(16) +L(top): mov (up,n,8), %r8 + mov 8(up,n,8), %rbx + HAM(` xor (vp,n,8), %r8 ') + HAM(` xor 8(vp,n,8), %rbx ') + + mov %r8, %r9 + mov %rbx, %rbp + shr %r8 + shr %rbx + and h55555555, %r8 + and h55555555, %rbx + sub %r8, %r9 + sub %rbx, %rbp + + mov %r9, %r8 + mov %rbp, %rbx + shr $2, %r9 + shr $2, %rbp + and h33333333, %r8 + and h33333333, %r9 + and h33333333, %rbx + and h33333333, %rbp + add %r8, %r9 C 16 4-bit fields (0..4) + add %rbx, %rbp C 16 4-bit fields (0..4) + + add %rbp, %r9 C 16 4-bit fields (0..8) +L(mid): mov %r9, %r8 + shr $4, %r9 + and h0f0f0f0f, %r8 + and h0f0f0f0f, %r9 + add %r8, %r9 C 8 8-bit fields (0..16) + + imul h01010101, %r9 C sum the 8 fields in high 8 bits + shr $56, %r9 + + add %r9, %rax C add to total + add $2, n + jnc L(top) + +L(end): + HAM(` pop %r12 ') + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/rsh1aors_n.asm b/gmp-6.3.0/mpn/x86_64/rsh1aors_n.asm new file mode 100644 index 0000000..a3e9cc5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/rsh1aors_n.asm @@ -0,0 +1,189 @@ +dnl AMD64 mpn_rsh1add_n -- rp[] = (up[] + vp[]) >> 1 +dnl AMD64 mpn_rsh1sub_n -- rp[] = (up[] - vp[]) >> 1 + +dnl Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.14 (mpn_add_n + mpn_rshift need 4.125) +C AMD K10 2.14 (mpn_add_n + mpn_rshift need 4.125) +C Intel P4 12.75 +C Intel core2 3.75 +C Intel NMH 4.4 +C Intel SBR ? +C Intel atom ? +C VIA nano 3.25 + +C TODO +C * Rewrite to use indexed addressing, like addlsh1.asm and sublsh1.asm. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n',` %rcx') + +ifdef(`OPERATION_rsh1add_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_rsh1add_n) + define(func_nc, mpn_rsh1add_nc)') +ifdef(`OPERATION_rsh1sub_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsh1sub_n) + define(func_nc, mpn_rsh1sub_nc)') + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + + xor R32(%rax), R32(%rax) + neg %r8 C set C flag from parameter + mov (up), %rbx + ADCSBB (vp), %rbx + jmp L(ent) +EPILOGUE() + + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + push %rbx + + xor R32(%rax), R32(%rax) + mov (up), %rbx + ADDSUB (vp), %rbx +L(ent): + rcr %rbx C rotate, save acy + adc R32(%rax), R32(%rax) C return value + + mov R32(n), R32(%r11) + and $3, R32(%r11) + + cmp $1, R32(%r11) + je L(do) C jump if n = 1 5 9 ... + +L(n1): cmp $2, R32(%r11) + jne L(n2) C jump unless n = 2 6 10 ... + add %rbx, %rbx C rotate carry limb, restore acy + mov 8(up), %r10 + ADCSBB 8(vp), %r10 + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp + rcr %r10 + rcr %rbx + mov %rbx, -8(rp) + jmp L(cj1) + +L(n2): cmp $3, R32(%r11) + jne L(n3) C jump unless n = 3 7 11 ... + add %rbx, %rbx C rotate carry limb, restore acy + mov 8(up), %r9 + mov 16(up), %r10 + ADCSBB 8(vp), %r9 + ADCSBB 16(vp), %r10 + lea 16(up), up + lea 16(vp), vp + lea 16(rp), rp + rcr %r10 + rcr %r9 + rcr %rbx + mov %rbx, -16(rp) + jmp L(cj2) + +L(n3): dec n C come here for n = 4 8 12 ... 
+ add %rbx, %rbx C rotate carry limb, restore acy + mov 8(up), %r8 + mov 16(up), %r9 + ADCSBB 8(vp), %r8 + ADCSBB 16(vp), %r9 + mov 24(up), %r10 + ADCSBB 24(vp), %r10 + lea 24(up), up + lea 24(vp), vp + lea 24(rp), rp + rcr %r10 + rcr %r9 + rcr %r8 + rcr %rbx + mov %rbx, -24(rp) + mov %r8, -16(rp) +L(cj2): mov %r9, -8(rp) +L(cj1): mov %r10, %rbx + +L(do): + shr $2, n C 4 + je L(end) C 2 + ALIGN(16) +L(top): add %rbx, %rbx C rotate carry limb, restore acy + + mov 8(up), %r8 + mov 16(up), %r9 + ADCSBB 8(vp), %r8 + ADCSBB 16(vp), %r9 + mov 24(up), %r10 + mov 32(up), %r11 + ADCSBB 24(vp), %r10 + ADCSBB 32(vp), %r11 + + lea 32(up), up + lea 32(vp), vp + + rcr %r11 C rotate, save acy + rcr %r10 + rcr %r9 + rcr %r8 + + rcr %rbx + mov %rbx, (rp) + mov %r8, 8(rp) + mov %r9, 16(rp) + mov %r10, 24(rp) + mov %r11, %rbx + + lea 32(rp), rp + dec n + jne L(top) + +L(end): mov %rbx, (rp) + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/rshift.asm b/gmp-6.3.0/mpn/x86_64/rshift.asm new file mode 100644 index 0000000..3f344f1 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/rshift.asm @@ -0,0 +1,176 @@ +dnl AMD64 mpn_rshift -- mpn right shift. + +dnl Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 2.375 +C AMD K10 2.375 +C Intel P4 8 +C Intel core2 2.11 +C Intel corei ? +C Intel atom 5.75 +C VIA nano 3.5 + + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_rshift) + FUNC_ENTRY(4) + neg R32(%rcx) C put rsh count in cl + mov (up), %rax + shl R8(%rcx), %rax C function return value + neg R32(%rcx) C put lsh count in cl + + lea 1(n), R32(%r8) + + lea -8(up,n,8), up + lea -8(rp,n,8), rp + neg n + + and $3, R32(%r8) + je L(rlx) C jump for n = 3, 7, 11, ... + + dec R32(%r8) + jne L(1) +C n = 4, 8, 12, ... + mov 8(up,n,8), %r10 + shr R8(%rcx), %r10 + neg R32(%rcx) C put rsh count in cl + mov 16(up,n,8), %r8 + shl R8(%rcx), %r8 + or %r8, %r10 + mov %r10, 8(rp,n,8) + inc n + jmp L(rll) + +L(1): dec R32(%r8) + je L(1x) C jump for n = 1, 5, 9, 13, ... +C n = 2, 6, 10, 16, ... 
+ mov 8(up,n,8), %r10 + shr R8(%rcx), %r10 + neg R32(%rcx) C put rsh count in cl + mov 16(up,n,8), %r8 + shl R8(%rcx), %r8 + or %r8, %r10 + mov %r10, 8(rp,n,8) + inc n + neg R32(%rcx) C put lsh count in cl +L(1x): + cmp $-1, n + je L(ast) + mov 8(up,n,8), %r10 + shr R8(%rcx), %r10 + mov 16(up,n,8), %r11 + shr R8(%rcx), %r11 + neg R32(%rcx) C put rsh count in cl + mov 16(up,n,8), %r8 + mov 24(up,n,8), %r9 + shl R8(%rcx), %r8 + or %r8, %r10 + shl R8(%rcx), %r9 + or %r9, %r11 + mov %r10, 8(rp,n,8) + mov %r11, 16(rp,n,8) + add $2, n + +L(rll): neg R32(%rcx) C put lsh count in cl +L(rlx): mov 8(up,n,8), %r10 + shr R8(%rcx), %r10 + mov 16(up,n,8), %r11 + shr R8(%rcx), %r11 + + add $4, n C 4 + jb L(end) C 2 + ALIGN(16) +L(top): + C finish stuff from lsh block + neg R32(%rcx) C put rsh count in cl + mov -16(up,n,8), %r8 + mov -8(up,n,8), %r9 + shl R8(%rcx), %r8 + or %r8, %r10 + shl R8(%rcx), %r9 + or %r9, %r11 + mov %r10, -24(rp,n,8) + mov %r11, -16(rp,n,8) + C start two new rsh + mov (up,n,8), %r8 + mov 8(up,n,8), %r9 + shl R8(%rcx), %r8 + shl R8(%rcx), %r9 + + C finish stuff from rsh block + neg R32(%rcx) C put lsh count in cl + mov -8(up,n,8), %r10 + mov 0(up,n,8), %r11 + shr R8(%rcx), %r10 + or %r10, %r8 + shr R8(%rcx), %r11 + or %r11, %r9 + mov %r8, -8(rp,n,8) + mov %r9, 0(rp,n,8) + C start two new lsh + mov 8(up,n,8), %r10 + mov 16(up,n,8), %r11 + shr R8(%rcx), %r10 + shr R8(%rcx), %r11 + + add $4, n + jae L(top) C 2 +L(end): + neg R32(%rcx) C put rsh count in cl + mov -8(up), %r8 + shl R8(%rcx), %r8 + or %r8, %r10 + mov (up), %r9 + shl R8(%rcx), %r9 + or %r9, %r11 + mov %r10, -16(rp) + mov %r11, -8(rp) + + neg R32(%rcx) C put lsh count in cl +L(ast): mov (up), %r10 + shr R8(%rcx), %r10 + mov %r10, (rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/sec_tabselect.asm new file mode 100644 index 0000000..e8aed26 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/sec_tabselect.asm @@ -0,0 +1,176 @@ +dnl AMD64 mpn_sec_tabselect. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb good for cpu +C AMD K8,K9 1.5 Y +C AMD K10 1.4 +C AMD bd1 2.64 +C AMD bobcat 2.15 Y +C Intel P4 4 +C Intel core2 1.38 +C Intel NHM 1.75 +C Intel SBR 1.25 +C Intel atom 2.5 Y +C VIA nano 1.75 Y + +C NOTES +C * This has not been tuned for any specific processor. Its speed should not +C be too bad, though. 
+C * Using SSE2/AVX2 could result in many-fold speedup. +C * WORKS FOR n mod 4 = 0 ONLY! + +C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which) +define(`rp', `%rdi') +define(`tp', `%rsi') +define(`n', `%rdx') +define(`nents', `%rcx') +define(`which', `%r8') + +define(`i', `%rbp') +define(`j', `%r9') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 +C nents n rp tab i which j * * * * * * + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sec_tabselect) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov n, j + add $-4, j + js L(outer_end) + +L(outer_top): + mov nents, i + push tp + xor R32(%r12), R32(%r12) + xor R32(%r13), R32(%r13) + xor R32(%r14), R32(%r14) + xor R32(%r15), R32(%r15) + mov which, %rbx + + ALIGN(16) +L(top): sub $1, %rbx + sbb %rax, %rax + mov 0(tp), %r10 + mov 8(tp), %r11 + and %rax, %r10 + and %rax, %r11 + or %r10, %r12 + or %r11, %r13 + mov 16(tp), %r10 + mov 24(tp), %r11 + and %rax, %r10 + and %rax, %r11 + or %r10, %r14 + or %r11, %r15 + lea (tp,n,8), tp + add $-1, i + jne L(top) + + mov %r12, 0(rp) + mov %r13, 8(rp) + mov %r14, 16(rp) + mov %r15, 24(rp) + pop tp + lea 32(tp), tp + lea 32(rp), rp + add $-4, j + jns L(outer_top) +L(outer_end): + + test $2, R8(n) + jz L(b0x) +L(b1x): mov nents, i + push tp + xor R32(%r12), R32(%r12) + xor R32(%r13), R32(%r13) + mov which, %rbx + ALIGN(16) +L(tp2): sub $1, %rbx + sbb %rax, %rax + mov 0(tp), %r10 + mov 8(tp), %r11 + and %rax, %r10 + and %rax, %r11 + or %r10, %r12 + or %r11, %r13 + lea (tp,n,8), tp + add $-1, i + jne L(tp2) + mov %r12, 0(rp) + mov %r13, 8(rp) + pop tp + lea 16(tp), tp + lea 16(rp), rp + +L(b0x): test $1, R8(n) + jz L(b00) +L(b01): mov nents, i + xor R32(%r12), R32(%r12) + mov which, %rbx + ALIGN(16) +L(tp1): sub $1, %rbx + sbb %rax, %rax + mov 0(tp), %r10 + and %rax, %r10 + or %r10, %r12 + lea (tp,n,8), tp + add $-1, i + jne L(tp1) + mov %r12, 0(rp) + +L(b00): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh1_n.asm new file mode 100644 index 0000000..98c26cf --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh1_n.asm @@ -0,0 +1,50 @@ +dnl X86-64 mpn_addlsh1_n/mpn_rsblsh1_n optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
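
mpn_sec_tabselect above implements constant-time table selection for side-channel-sensitive code: all nents entries are read, and every entry except entry `which` is masked to zero, so the sequence of memory accesses is independent of the secret index. The asm builds the mask with a sub $1 / sbb pair that turns the borrow out of a decremented counter into a full-width mask; the portable branch-free sketch below uses the (d | -d) >> 63 idiom instead (sec_tabselect_ref is our name for illustration; GMP's portable version lives in mpn/generic/sec_tabselect.c):

#include <stdint.h>
#include <stddef.h>

/* Copy entry `which` of the `nents` consecutive n-limb entries at tab
   to rp, touching every entry so the access pattern leaks nothing
   about `which`. */
void
sec_tabselect_ref (uint64_t *rp, const uint64_t *tab,
                   size_t n, size_t nents, size_t which)
{
  for (size_t i = 0; i < n; i++)
    rp[i] = 0;
  for (size_t e = 0; e < nents; e++)
    {
      uint64_t d = (uint64_t) (e ^ which);
      uint64_t mask = ((d | (0 - d)) >> 63) - 1;  /* all-ones iff e == which */
      for (size_t i = 0; i < n; i++)
        rp[i] |= tab[e * n + i] & mask;
    }
}
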
+ +include(`../config.m4') + +define(LSH, 1) +define(RSH, 63) + +ifdef(`OPERATION_addlsh1_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_addlsh1_n)') +ifdef(`OPERATION_rsblsh1_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_rsblsh1_n)') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/aorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh2_n.asm b/gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh2_n.asm new file mode 100644 index 0000000..2a83217 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/aorrlsh2_n.asm @@ -0,0 +1,50 @@ +dnl X86-64 mpn_addlsh2_n/mpn_rsblsh2_n optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 2) +define(RSH, 62) + +ifdef(`OPERATION_addlsh2_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_addlsh2_n)') +ifdef(`OPERATION_rsblsh2_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_rsblsh2_n)') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/aorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/aors_n.asm b/gmp-6.3.0/mpn/x86_64/silvermont/aors_n.asm new file mode 100644 index 0000000..dce3d75 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/aors_n.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_add_n, mpn_sub_n, optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) +include_mpn(`x86_64/coreisbr/aors_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/silvermont/aorsmul_1.asm new file mode 100644 index 0000000..ead0d76 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/aorsmul_1.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_addmul_1/mpn_submul_1 optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) +include_mpn(`x86_64/core2/aorsmul_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/silvermont/gmp-mparam.h new file mode 100644 index 0000000..f8cb0f4 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/gmp-mparam.h @@ -0,0 +1,252 @@ +/* Intel Silvermont gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. 
*/ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 2400 MHz Intel Atom C2758 Silvermont/Rangeley */ +/* FFT tuning limit = 468153400 */ +/* Generated by tuneup.c, 2019-10-19, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 55 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define DIV_1_VS_MUL_1_PERCENT 168 + +#define MUL_TOOM22_THRESHOLD 19 +#define MUL_TOOM33_THRESHOLD 66 +#define MUL_TOOM44_THRESHOLD 152 +#define MUL_TOOM6H_THRESHOLD 222 +#define MUL_TOOM8H_THRESHOLD 333 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 105 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 88 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 24 +#define SQR_TOOM3_THRESHOLD 97 +#define SQR_TOOM4_THRESHOLD 232 +#define SQR_TOOM6_THRESHOLD 286 +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 24 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 15 + +#define MUL_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 340, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 17, 7}, { 9, 6}, { 20, 7}, { 11, 6}, \ + { 23, 7}, { 17, 8}, { 9, 7}, { 21, 8}, \ + { 11, 7}, { 23, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 135,11}, { 79, 9}, { 319,11}, { 95,10}, \ + { 191, 9}, { 383,10}, { 207, 9}, { 415,11}, \ + { 111,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271,11}, { 143,10}, { 287, 9}, \ + { 575,10}, { 303,11}, { 159,10}, { 319,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 319,10}, { 639,11}, { 351,10}, \ + { 703, 9}, { 1407,12}, { 191,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 479,13}, { 127,12}, \ + { 255,11}, { 543,12}, { 287,11}, { 575,10}, \ + { 1151,12}, { 319,11}, { 639,12}, { 351,11}, \ + { 703,10}, { 1407,13}, { 191,12}, { 415,11}, \ + { 831,10}, { 1663,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 543,11}, { 1087,10}, { 2175,12}, \ + { 575,11}, { 1151,13}, { 319,12}, { 639,11}, \ + { 1279,12}, { 703,11}, { 1407,13}, { 383,12}, \ + { 831,11}, { 1663,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,11}, { 2175,13}, \ + { 575,12}, { 1215,11}, { 2431,10}, { 4863,13}, \ + { 639,12}, { 1279,13}, { 703,12}, { 1407,14}, \ + { 383,13}, { 831,12}, { 1663,13}, { 959,15}, \ + { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \ + { 1215,12}, { 2431,11}, { 4863,14}, { 639,13}, \ + { 1407,12}, { 2815,13}, { 1471,12}, { 2943,11}, \ + { 5887,14}, { 767,13}, { 1663,14}, { 895,13}, \ + { 1919,15}, { 
511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2559,14}, { 1407,13}, { 2943,12}, { 5887,15}, \ + { 767,14}, { 1663,13}, { 3455,12}, { 6911,14}, \ + { 1919,16}, { 511,15}, { 1023,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,12}, \ + { 11775,15}, { 1535,14}, { 3455,13}, { 6911,15}, \ + { 1791,14}, { 3839,13}, { 7679,16}, { 1023,15}, \ + { 2047,14}, { 4223,15}, { 2303,14}, { 4863,15}, \ + { 2815,14}, { 5887,13}, { 11775,16}, { 1535,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \ + { 4607,15}, { 9215,16}, { 5631,15}, { 11775,17}, \ + { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 225 +#define MUL_FFT_THRESHOLD 3712 + +#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 308, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 17, 8}, \ + { 9, 7}, { 21, 8}, { 11, 7}, { 23, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \ + { 63,10}, { 39, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 79,11}, { 47,10}, \ + { 95,12}, { 31,11}, { 63,10}, { 127, 9}, \ + { 255, 8}, { 511, 9}, { 271, 8}, { 543,11}, \ + { 79,10}, { 159, 9}, { 319, 8}, { 639,10}, \ + { 175,11}, { 95,10}, { 191, 9}, { 383,10}, \ + { 207, 9}, { 415,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \ + { 319, 9}, { 639,11}, { 175,10}, { 351,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 351,10}, { 703,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,10}, { 831,12}, \ + { 223,11}, { 479,12}, { 255,11}, { 543,12}, \ + { 287,11}, { 575,12}, { 319,11}, { 639,12}, \ + { 351,11}, { 703,10}, { 1407,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 479,13}, { 255,12}, { 543,11}, { 1087,10}, \ + { 2175,12}, { 575,11}, { 1151,12}, { 607,13}, \ + { 319,12}, { 639,11}, { 1279,12}, { 703,11}, \ + { 1407,13}, { 383,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1279,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 831,12}, { 1663,13}, \ + { 959,15}, { 255,14}, { 511,13}, { 1087,12}, \ + { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \ + { 1471,12}, { 2943,14}, { 767,13}, { 1663,14}, \ + { 895,13}, { 1791,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2943,15}, \ + { 767,14}, { 1663,13}, { 3455,12}, { 6911,14}, \ + { 1791,13}, { 3583,16}, { 511,15}, { 1023,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,12}, { 11775,15}, { 1535,14}, { 3455,13}, \ + { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \ + { 1023,15}, { 2047,14}, { 4223,15}, { 2303,14}, \ + { 4863,15}, { 2815,14}, { 5887,13}, { 11775,16}, \ + { 1535,15}, { 3071,14}, { 6143,15}, { 3327,14}, \ + { 6911,15}, { 3839,14}, { 7679,17}, { 
1023,16}, \ + { 2047,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7679,14}, { 15359,17}, { 2047,16}, { 4607,15}, \ + { 9983,16}, { 5631,15}, { 11775,17}, { 3071,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 232 +#define SQR_FFT_THRESHOLD 2752 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 55 +#define MULLO_MUL_N_THRESHOLD 6633 +#define SQRLO_BASECASE_THRESHOLD 9 +#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */ +#define SQRLO_SQR_THRESHOLD 5397 + +#define DC_DIV_QR_THRESHOLD 33 +#define DC_DIVAPPR_Q_THRESHOLD 222 +#define DC_BDIV_QR_THRESHOLD 31 +#define DC_BDIV_Q_THRESHOLD 147 + +#define INV_MULMOD_BNM1_THRESHOLD 37 +#define INV_NEWTON_THRESHOLD 222 +#define INV_APPR_THRESHOLD 222 + +#define BINV_NEWTON_THRESHOLD 212 +#define REDC_1_TO_REDC_2_THRESHOLD 55 +#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ + +#define MU_DIV_QR_THRESHOLD 1142 +#define MU_DIVAPPR_Q_THRESHOLD 1142 +#define MUPI_DIV_QR_THRESHOLD 81 +#define MU_BDIV_QR_THRESHOLD 942 +#define MU_BDIV_Q_THRESHOLD 1043 + +#define POWM_SEC_TABLE 1,34,102,588,1730 + +#define GET_STR_DC_THRESHOLD 17 +#define GET_STR_PRECOMPUTE_THRESHOLD 30 +#define SET_STR_DC_THRESHOLD 381 +#define SET_STR_PRECOMPUTE_THRESHOLD 1659 + +#define FAC_DSC_THRESHOLD 351 +#define FAC_ODD_THRESHOLD 27 + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD2_DIV1_METHOD 3 /* 3.06% faster than 1 */ +#define HGCD_THRESHOLD 120 +#define HGCD_APPR_THRESHOLD 153 +#define HGCD_REDUCE_THRESHOLD 2121 +#define GCD_DC_THRESHOLD 416 +#define GCDEXT_DC_THRESHOLD 309 +#define JACOBI_BASE_METHOD 1 /* 2.28% faster than 3 */ + +/* Tuneup completed successfully, took 938046 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/hamdist.asm b/gmp-6.3.0/mpn/x86_64/silvermont/hamdist.asm new file mode 100644 index 0000000..848ed01 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/hamdist.asm @@ -0,0 +1,38 @@ +dnl x86-64 mpn_hamdist. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_hamdist) +include_mpn(`x86_64/coreinhm/hamdist.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/lshift.asm b/gmp-6.3.0/mpn/x86_64/silvermont/lshift.asm new file mode 100644 index 0000000..acd3180 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/lshift.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshift optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshift) +include_mpn(`x86_64/fastsse/lshift-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/silvermont/lshiftc.asm new file mode 100644 index 0000000..3a68bb5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/lshiftc.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshiftc optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshiftc) +include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/mul_1.asm b/gmp-6.3.0/mpn/x86_64/silvermont/mul_1.asm new file mode 100644 index 0000000..c1e1c94 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/mul_1.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_mul_1 optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mul_1 mpn_mul_1c) +include_mpn(`x86_64/bd1/mul_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/silvermont/mul_basecase.asm new file mode 100644 index 0000000..6228c48 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/mul_basecase.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_mul_basecase optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mul_basecase) +include_mpn(`x86_64/k8/mul_basecase.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/silvermont/mullo_basecase.asm new file mode 100644 index 0000000..0244f8a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/mullo_basecase.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_mullo_basecase optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mullo_basecase) +include_mpn(`x86_64/k8/mullo_basecase.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/popcount.asm b/gmp-6.3.0/mpn/x86_64/silvermont/popcount.asm new file mode 100644 index 0000000..73eb7b5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/popcount.asm @@ -0,0 +1,38 @@ +dnl x86-64 mpn_popcount. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_popcount) +include_mpn(`x86_64/coreinhm/popcount.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/rshift.asm b/gmp-6.3.0/mpn/x86_64/silvermont/rshift.asm new file mode 100644 index 0000000..b84371c --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/rshift.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_rshift optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_rshift) +include_mpn(`x86_64/fastsse/rshift-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/silvermont/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/silvermont/sqr_basecase.asm new file mode 100644 index 0000000..afccf93 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/silvermont/sqr_basecase.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sqr_basecase optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sqr_basecase) +include_mpn(`x86_64/k8/sqr_basecase.asm') diff --git a/gmp-6.3.0/mpn/x86_64/skylake/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/skylake/gmp-mparam.h new file mode 100644 index 0000000..a899ea1 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/skylake/gmp-mparam.h @@ -0,0 +1,246 @@ +/* Skylake gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. 
*/ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 3600-4000 MHz Intel Xeon E3-1270v5 Skylake */ +/* FFT tuning limit = 465,990,371 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 32 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 41 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 20 + +#define DIV_1_VS_MUL_1_PERCENT 473 + +#define MUL_TOOM22_THRESHOLD 26 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 208 +#define MUL_TOOM6H_THRESHOLD 300 +#define MUL_TOOM8H_THRESHOLD 406 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 153 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 137 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 151 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 117 +#define SQR_TOOM4_THRESHOLD 336 +#define SQR_TOOM6_THRESHOLD 426 +#define SQR_TOOM8_THRESHOLD 547 + +#define MULMID_TOOM42_THRESHOLD 46 + +#define MULMOD_BNM1_THRESHOLD 15 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 404 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 404, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 28, 7}, { 15, 6}, \ + { 31, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \ + { 15, 9}, { 39, 8}, { 79, 9}, { 43,10}, \ + { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 99,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 103,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 167,11}, { 95,10}, \ + { 191, 9}, { 383,10}, { 199,11}, { 111,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,11}, { 143,10}, { 287, 9}, \ + { 575,11}, { 159,12}, { 95,11}, { 191,10}, \ + { 383,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 335,10}, { 671,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,12}, { 223,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 543,12}, { 287,11}, { 607,12}, \ + { 319,11}, { 671,12}, { 351,11}, { 703,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,12}, { 479,14}, { 127,13}, { 255,12}, \ + { 543,11}, { 1087,12}, { 607,13}, { 319,12}, \ + { 671,11}, { 1343,12}, { 703,13}, { 383,12}, \ + { 831,13}, { 447,12}, { 959,13}, { 511,12}, \ + { 1087,13}, { 575,12}, { 1151,13}, { 639,12}, \ + { 1343,13}, { 703,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1727,13}, { 959,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1151,14}, \ + { 639,13}, { 1343,12}, { 2687,13}, { 1407,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1663,14}, \ + { 895,13}, { 1791,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \ + { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \ + { 
3455,12}, { 6911,14}, { 1791,16}, { 511,15}, \ + { 1023,14}, { 2175,13}, { 4351,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \ + { 4223,15}, { 2303,14}, { 4863,15}, { 2559,14}, \ + { 5247,15}, { 2815,14}, { 5887,16}, { 1535,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4351,14}, { 8703,15}, \ + { 4863,16}, { 2559,15}, { 5887,14}, { 11775,16}, \ + { 3071,15}, { 6911,16}, { 3583,15}, { 7679,14}, \ + { 15359,15}, { 7935,17}, { 2047,16}, { 4095,15}, \ + { 8703,16}, { 4607,15}, { 9983,14}, { 19967,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 227 +#define MUL_FFT_THRESHOLD 6272 + +#define SQR_FFT_MODF_THRESHOLD 400 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 400, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 29, 7}, { 15, 6}, { 31, 7}, { 28, 8}, \ + { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \ + { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 79,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 135,11}, { 79,10}, \ + { 159, 9}, { 319,11}, { 95,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319,12}, { 95,10}, { 383,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 303,10}, { 607,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 335,10}, { 671,11}, { 351,10}, \ + { 703,11}, { 367,10}, { 735,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,12}, { 223,11}, \ + { 479,13}, { 127,12}, { 255,11}, { 543,12}, \ + { 287,11}, { 607,12}, { 319,11}, { 671,12}, \ + { 351,11}, { 735,12}, { 383,11}, { 799,12}, \ + { 415,11}, { 831,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 511,11}, { 1023,12}, { 607,13}, \ + { 319,12}, { 735,13}, { 383,12}, { 831,13}, \ + { 447,12}, { 959,13}, { 511,12}, { 1023,13}, \ + { 575,12}, { 1151,13}, { 639,12}, { 1279,13}, \ + { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1727,13}, { 895,12}, \ + { 1791,13}, { 959,14}, { 511,13}, { 1087,12}, \ + { 2175,13}, { 1151,14}, { 639,13}, { 1343,12}, \ + { 2687,13}, { 1407,14}, { 767,13}, { 1599,12}, \ + { 3199,13}, { 1663,14}, { 895,13}, { 1791,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2431,12}, { 4863,14}, { 1279,13}, { 2687,14}, \ + { 1407,15}, { 767,14}, { 1535,13}, { 3199,14}, \ + { 1663,13}, { 3455,14}, { 1791,16}, { 511,15}, \ + { 1023,14}, { 2431,13}, { 4863,15}, { 1279,14}, \ + { 2943,13}, { 5887,15}, { 1535,14}, { 3455,15}, \ + { 1791,14}, { 3839,16}, { 1023,15}, { 2047,14}, \ + { 4223,15}, { 2303,14}, { 4863,15}, { 2559,14}, \ + { 5119,15}, { 2815,14}, { 5887,16}, { 1535,15}, \ + { 3071,14}, { 6143,15}, { 3327,14}, { 6911,15}, \ + { 3839,17}, { 1023,16}, { 2047,15}, { 4863,16}, \ + { 2559,15}, { 5887,14}, { 11775,16}, { 3071,15}, \ + { 6911,16}, { 3583,15}, { 7679,14}, { 15359,17}, \ + { 2047,16}, { 4095,15}, { 8191,16}, { 4607,15}, \ + { 9983,14}, { 19967,16}, { 5631,15}, { 11775,17}, \ + { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 205 +#define 
SQR_FFT_THRESHOLD 4224 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 79 +#define MULLO_MUL_N_THRESHOLD 11278 +#define SQRLO_BASECASE_THRESHOLD 10 +#define SQRLO_DC_THRESHOLD 109 +#define SQRLO_SQR_THRESHOLD 8207 + +#define DC_DIV_QR_THRESHOLD 55 +#define DC_DIVAPPR_Q_THRESHOLD 179 +#define DC_BDIV_QR_THRESHOLD 82 +#define DC_BDIV_Q_THRESHOLD 166 + +#define INV_MULMOD_BNM1_THRESHOLD 50 +#define INV_NEWTON_THRESHOLD 170 +#define INV_APPR_THRESHOLD 171 + +#define BINV_NEWTON_THRESHOLD 294 +#define REDC_1_TO_REDC_2_THRESHOLD 33 +#define REDC_2_TO_REDC_N_THRESHOLD 59 + +#define MU_DIV_QR_THRESHOLD 1528 +#define MU_DIVAPPR_Q_THRESHOLD 1589 +#define MUPI_DIV_QR_THRESHOLD 62 +#define MU_BDIV_QR_THRESHOLD 1470 +#define MU_BDIV_Q_THRESHOLD 1597 + +#define POWM_SEC_TABLE 2,8,191,452,904 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 19 +#define SET_STR_DC_THRESHOLD 898 +#define SET_STR_PRECOMPUTE_THRESHOLD 1670 + +#define FAC_DSC_THRESHOLD 474 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD2_DIV1_METHOD 5 /* 3.85% faster than 3 */ +#define HGCD_THRESHOLD 64 +#define HGCD_APPR_THRESHOLD 60 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 618 +#define GCDEXT_DC_THRESHOLD 321 +#define JACOBI_BASE_METHOD 1 /* 12.01% faster than 4 */ + +/* Tuneup completed successfully, took 213784 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/sqr_diag_addlsh1.asm b/gmp-6.3.0/mpn/x86_64/sqr_diag_addlsh1.asm new file mode 100644 index 0000000..f486125 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/sqr_diag_addlsh1.asm @@ -0,0 +1,116 @@ +dnl AMD64 mpn_sqr_diag_addlsh1 + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.5 +C AMD K10 2.5 +C AMD bull 3.6 +C AMD pile 3.6 +C AMD steam ? +C AMD bobcat 4 +C AMD jaguar ? +C Intel P4 11.5 +C Intel core 4 +C Intel NHM 3.6 +C Intel SBR 3.15 +C Intel IBR 3.0 +C Intel HWL 2.6 +C Intel BWL ? +C Intel atom 14 +C VIA nano 3.5 + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. 
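+C For example, with the default define(`I',`$1') just below, the wind-down
+C store
+C	mov	%r8, I(-8(rp),-8(rp,n,8))
+C assembles as `mov %r8, -8(rp)', while redefining I to `$2' selects the
+C conservative indexed form `mov %r8, -8(rp,n,8)' instead.
+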
+define(`I',`$1') + +define(`rp', `%rdi') +define(`tp', `%rsi') +define(`up_arg', `%rdx') +define(`n', `%rcx') + +define(`up', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_sqr_diag_addlsh1) + FUNC_ENTRY(4) + push %rbx + + dec n + shl n + + mov (up_arg), %rax + + lea (rp,n,8), rp + lea (tp,n,8), tp + lea (up_arg,n,4), up + neg n + + mul %rax + mov %rax, (rp,n,8) + + xor R32(%rbx), R32(%rbx) + jmp L(mid) + + ALIGN(16) +L(top): add %r10, %r8 + adc %rax, %r9 + mov %r8, -8(rp,n,8) + mov %r9, (rp,n,8) +L(mid): mov 8(up,n,4), %rax + mov (tp,n,8), %r8 + mov 8(tp,n,8), %r9 + adc %r8, %r8 + adc %r9, %r9 + lea (%rdx,%rbx), %r10 + setc R8(%rbx) + mul %rax + add $2, n + js L(top) + +L(end): add %r10, %r8 + adc %rax, %r9 + mov %r8, I(-8(rp),-8(rp,n,8)) + mov %r9, I((rp),(rp,n,8)) + adc %rbx, %rdx + mov %rdx, I(8(rp),8(rp,n,8)) + + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/sublsh1_n.asm b/gmp-6.3.0/mpn/x86_64/sublsh1_n.asm new file mode 100644 index 0000000..c6d829f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/sublsh1_n.asm @@ -0,0 +1,160 @@ +dnl AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1) + +dnl Copyright 2003, 2005-2007, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 2.2 +C AMD K10 2.2 +C Intel P4 12.75 +C Intel core2 3.45 +C Intel corei ? +C Intel atom ? +C VIA nano 3.25 + +C Sometimes speed degenerates, supposedly related to that some operand +C alignments cause cache conflicts. + +C The speed is limited by decoding/issue bandwidth. There are 26 instructions +C in the loop, which corresponds to 26/3/4 = 2.167 c/l. 
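+
+C In C terms the routine computes {rp,n} = {up,n} - 2*{vp,n} and returns the
+C borrow, 0..2: the bit shifted out of vp's top limb plus the borrow from the
+C subtraction.  A limb-by-limb sketch of those semantics, assuming 64-bit
+C limbs (an illustration, not a transcription of the loop below):
+C
+C	mp_limb_t
+C	sublsh1_ref (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
+C		     mp_size_t n)
+C	{
+C	  mp_limb_t scy = 0, acy = 0;	/* shift-out bit, subtract borrow */
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      mp_limb_t s = (vp[i] << 1) | scy;	/* doubled limb */
+C	      scy = vp[i] >> 63;		/* bit shifted out */
+C	      mp_limb_t t = up[i] - s;
+C	      mp_limb_t b = up[i] < s;
+C	      rp[i] = t - acy;
+C	      acy = b | (t < acy);	/* borrow out of this limb */
+C	    }
+C	  return scy + acy;
+C	}
+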
+ +C INPUT PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`vp',`%rdx') +define(`n', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sublsh1_n) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (vp), %r8 + mov R32(n), R32(%rax) + lea (rp,n,8), rp + lea (up,n,8), up + lea (vp,n,8), vp + neg n + xor R32(%rbp), R32(%rbp) + and $3, R32(%rax) + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): add %r8, %r8 + mov 8(vp,n,8), %r9 + adc %r9, %r9 + mov 16(vp,n,8), %r10 + adc %r10, %r10 + sbb R32(%rax), R32(%rax) C save scy + mov (up,n,8), %rbp + mov 8(up,n,8), %rbx + sub %r8, %rbp + sbb %r9, %rbx + mov %rbp, (rp,n,8) + mov %rbx, 8(rp,n,8) + mov 16(up,n,8), %rbp + sbb %r10, %rbp + mov %rbp, 16(rp,n,8) + sbb R32(%rbp), R32(%rbp) C save acy + add $3, n + jmp L(ent) + +L(b10): add %r8, %r8 + mov 8(vp,n,8), %r9 + adc %r9, %r9 + sbb R32(%rax), R32(%rax) C save scy + mov (up,n,8), %rbp + mov 8(up,n,8), %rbx + sub %r8, %rbp + sbb %r9, %rbx + mov %rbp, (rp,n,8) + mov %rbx, 8(rp,n,8) + sbb R32(%rbp), R32(%rbp) C save acy + add $2, n + jmp L(ent) + +L(b01): add %r8, %r8 + sbb R32(%rax), R32(%rax) C save scy + mov (up,n,8), %rbp + sub %r8, %rbp + mov %rbp, (rp,n,8) + sbb R32(%rbp), R32(%rbp) C save acy + inc n +L(ent): jns L(end) + + ALIGN(16) +L(top): add R32(%rax), R32(%rax) C restore scy + + mov (vp,n,8), %r8 +L(b00): adc %r8, %r8 + mov 8(vp,n,8), %r9 + adc %r9, %r9 + mov 16(vp,n,8), %r10 + adc %r10, %r10 + mov 24(vp,n,8), %r11 + adc %r11, %r11 + + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + + mov (up,n,8), %rbp + mov 8(up,n,8), %rbx + sbb %r8, %rbp + sbb %r9, %rbx + mov %rbp, (rp,n,8) + mov %rbx, 8(rp,n,8) + mov 16(up,n,8), %rbp + mov 24(up,n,8), %rbx + sbb %r10, %rbp + sbb %r11, %rbx + mov %rbp, 16(rp,n,8) + mov %rbx, 24(rp,n,8) + + sbb R32(%rbp), R32(%rbp) C save acy + add $4, n + js L(top) + +L(end): add R32(%rbp), R32(%rax) + neg R32(%rax) + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/x86_64-defs.m4 b/gmp-6.3.0/mpn/x86_64/x86_64-defs.m4 new file mode 100644 index 0000000..4e08f2a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/x86_64-defs.m4 @@ -0,0 +1,493 @@ +divert(-1) + +dnl m4 macros for amd64 assembler. + +dnl Copyright 1999-2005, 2008, 2009, 2011-2013, 2017 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ + +dnl Usage: CPUVEC_FUNCS_LIST +dnl +dnl A list of the functions from gmp-impl.h x86 struct cpuvec_t, in the +dnl order they appear in that structure. + +define(CPUVEC_FUNCS_LIST, +``add_n', +`addlsh1_n', +`addlsh2_n', +`addmul_1', +`addmul_2', +`bdiv_dbm1c', +`cnd_add_n', +`cnd_sub_n', +`com', +`copyd', +`copyi', +`divexact_1', +`divrem_1', +`gcd_11', +`lshift', +`lshiftc', +`mod_1', +`mod_1_1p', +`mod_1_1p_cps', +`mod_1s_2p', +`mod_1s_2p_cps', +`mod_1s_4p', +`mod_1s_4p_cps', +`mod_34lsub1', +`modexact_1c_odd', +`mul_1', +`mul_basecase', +`mullo_basecase', +`preinv_divrem_1', +`preinv_mod_1', +`redc_1', +`redc_2', +`rshift', +`sqr_basecase', +`sub_n', +`sublsh1_n', +`submul_1'') + + +dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo) +dnl +dnl In the amd64 code we use explicit TEXT and ALIGN() calls in the code, +dnl since different alignments are wanted in various circumstances. So for +dnl instance, +dnl +dnl TEXT +dnl ALIGN(16) +dnl PROLOGUE(mpn_add_n) +dnl ... +dnl EPILOGUE() + +define(`PROLOGUE_cpu', +m4_assert_numargs(1) +` GLOBL $1 + TYPE($1,`function') + COFF_TYPE($1) +$1: +') + + +dnl Usage: COFF_TYPE(GSYM_PREFIX`'foo) +dnl +dnl Emit COFF style ".def ... .endef" type information for a function, when +dnl supported. The argument should include any GSYM_PREFIX. +dnl +dnl See autoconf macro GMP_ASM_COFF_TYPE for HAVE_COFF_TYPE. + +define(COFF_TYPE, +m4_assert_numargs(1) +m4_assert_defined(`HAVE_COFF_TYPE') +`ifelse(HAVE_COFF_TYPE,yes, + `.def $1 + .scl 2 + .type 32 + .endef')') + + +dnl Usage: ASSERT([cond][,instructions]) +dnl +dnl If WANT_ASSERT is 1, output the given instructions and expect the given +dnl flags condition to then be satisfied. For example, +dnl +dnl ASSERT(ne, `cmpq %rax, %rbx') +dnl +dnl The instructions can be omitted to just assert a flags condition with +dnl no extra calculation. For example, +dnl +dnl ASSERT(nc) +dnl +dnl When `instructions' is not empty, a pushfq/popfq is added for +dnl convenience to preserve the flags, but the instructions themselves must +dnl preserve any registers that matter. +dnl +dnl The condition can be omitted to just output the given instructions when +dnl assertion checking is wanted. In this case the pushf/popf is omitted. +dnl For example, +dnl +dnl ASSERT(, `movq %rax, VAR_KEEPVAL') + +define(ASSERT, +m4_assert_numargs_range(1,2) +m4_assert_defined(`WANT_ASSERT') +`ifelse(WANT_ASSERT,1, +`ifelse(`$1',, +` $2', +`ifelse(`$2',,, +` pushfq') + $2 + `j$1' L(ASSERT_ok`'ASSERT_counter) + ud2 C assertion failed +L(ASSERT_ok`'ASSERT_counter): +ifelse(`$2',,,` popfq') +define(`ASSERT_counter',incr(ASSERT_counter))')')') + +define(ASSERT_counter,1) + +dnl LEA - load effective address +dnl +dnl FIXME: We should never create a GOT entry and therefore use the simpler 2nd +dnl variant always. We need to understand what happens for not-yet-hidden +dnl symbols first. 
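+dnl
+dnl For instance, LEA(sym,%rax) (sym standing in for any symbol) expands to
+dnl	mov	sym@GOTPCREL(%rip), %rax
+dnl under PIC, fetching the address from the GOT, and to the simpler
+dnl	lea	sym(%rip), %rax
+dnl otherwise.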
+dnl
+define(`LEA',`dnl
+ifdef(`PIC',
+	`mov	$1@GOTPCREL(%rip), $2'
+,
+	`lea	$1(%rip), $2')
+')
+
+
+define(`DEF_OBJECT',
+m4_assert_numargs_range(2,3)
+`	ifelse($#,3,`$3',`RODATA')
+	ALIGN($2)
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1)
+`	SIZE(`$1',.-`$1')')
+
+
+define(`R32',
+	`ifelse($1,`%rax',`%eax',
+	$1,`%rbx',`%ebx',
+	$1,`%rcx',`%ecx',
+	$1,`%rdx',`%edx',
+	$1,`%rsi',`%esi',
+	$1,`%rdi',`%edi',
+	$1,`%rbp',`%ebp',
+	$1,`%r8',`%r8d',
+	$1,`%r9',`%r9d',
+	$1,`%r10',`%r10d',
+	$1,`%r11',`%r11d',
+	$1,`%r12',`%r12d',
+	$1,`%r13',`%r13d',
+	$1,`%r14',`%r14d',
+	$1,`%r15',`%r15d')')
+define(`R8',
+	`ifelse($1,`%rax',`%al',
+	$1,`%rbx',`%bl',
+	$1,`%rcx',`%cl',
+	$1,`%rdx',`%dl',
+	$1,`%rsi',`%sil',
+	$1,`%rdi',`%dil',
+	$1,`%rbp',`%bpl',
+	$1,`%r8',`%r8b',
+	$1,`%r9',`%r9b',
+	$1,`%r10',`%r10b',
+	$1,`%r11',`%r11b',
+	$1,`%r12',`%r12b',
+	$1,`%r13',`%r13b',
+	$1,`%r14',`%r14b',
+	$1,`%r15',`%r15b')')
+
+
+dnl Usage: CALL(funcname)
+dnl
+
+define(`CALL',`dnl
+ifdef(`PIC',
+	`call	GSYM_PREFIX`'$1@PLT'
+,
+	`call	GSYM_PREFIX`'$1'
+)')
+
+define(`TCALL',`dnl
+ifdef(`PIC',
+	`jmp	GSYM_PREFIX`'$1@PLT'
+,
+	`jmp	GSYM_PREFIX`'$1'
+)')
+
+
+define(`JUMPTABSECT', `.section	.data.rel.ro.local,"a",@progbits')
+
+
+dnl Usage: JMPENT(targlabel,tablabel)
+
+define(`JMPENT',`dnl
+ifdef(`PIC',
+	`.long	$1-$2'dnl
+,
+	`.quad	$1'dnl
+)')
+
+
+dnl These macros are defined just for DOS64, where they provide calling
+dnl sequence glue code.
+
+define(`FUNC_ENTRY',`')
+define(`FUNC_EXIT',`')
+
+
+dnl Target ABI macros.
+
+define(`IFDOS', `')
+define(`IFSTD', `$1')
+define(`IFELF', `$1')
+
+
+dnl Usage: PROTECT(symbol)
+dnl
+dnl Used for private GMP symbols that should never be overridden by users.
+dnl This can save reloc entries and improve shlib sharing as well as
+dnl application startup times.
+
+define(`PROTECT', `.hidden $1')
+
+
+dnl Usage: x86_lookup(target, key,value, key,value, ...)
+dnl
+dnl Look for `target' among the `key' parameters.
+dnl
+dnl x86_lookup expands to the corresponding `value', or generates an error
+dnl if `target' isn't found.
+
+define(x86_lookup,
+m4_assert_numargs_range(1,999)
+`ifelse(eval($#<3),1,
+`m4_error(`unrecognised part of x86 instruction: $1
+')',
+`ifelse(`$1',`$2', `$3',
+`x86_lookup(`$1',shift(shift(shift($@))))')')')
+
+
+dnl Usage: x86_opcode_regxmm(reg)
+dnl
+dnl Validate the given xmm register, and return its number, 0 to 15.
+
+define(x86_opcode_regxmm,
+m4_assert_numargs(1)
+`x86_lookup(`$1',x86_opcode_regxmm_list)')
+
+define(x86_opcode_regxmm_list,
+``%xmm0',0,
+`%xmm1',1,
+`%xmm2',2,
+`%xmm3',3,
+`%xmm4',4,
+`%xmm5',5,
+`%xmm6',6,
+`%xmm7',7,
+`%xmm8',8,
+`%xmm9',9,
+`%xmm10',10,
+`%xmm11',11,
+`%xmm12',12,
+`%xmm13',13,
+`%xmm14',14,
+`%xmm15',15')
+
+dnl Usage: palignr($imm,%srcreg,%dstreg)
+dnl
+dnl Emit a palignr instruction, using a .byte sequence, since obsolete but
+dnl still distributed versions of gas don't know SSSE3 instructions.
+
+define(`palignr',
+m4_assert_numargs(3)
+`.byte	0x66,dnl
+ifelse(eval(x86_opcode_regxmm($3) >= 8 || x86_opcode_regxmm($2) >= 8),1,
+	`eval(0x40+x86_opcode_regxmm($3)/8*4+x86_opcode_regxmm($2)/8),')dnl
+0x0f,0x3a,0x0f,dnl
+eval(0xc0+x86_opcode_regxmm($3)%8*8+x86_opcode_regxmm($2)%8),dnl
+substr($1,1)')
+
+
+dnl Usage
+dnl
+dnl	regnum(op)	raw operand index (so slightly misnamed)
+dnl	regnumh(op)	high bit of register operand number
+dnl	ix(op)		0 for reg operand, 1 for plain pointer operand.
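+dnl
+dnl For example, per the oplist table below, regnum(`%rax') is 0 while
+dnl regnum(`(%rax)') is 16, so ix() gives 0 for the register form and 1 for
+dnl the pointer form; regnumh(`%r9') is 1, the high (REX) bit of register
+dnl number 9.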
+dnl + +define(`regnum',`x86_lookup(`$1',oplist)') +define(`regnumh',`eval(regnum($1)/8 & 1)') +define(`ix',`eval(regnum($1)/16)') +define(`oplist', +``%rax', 0, `%rcx', 1, `%rdx', 2, `%rbx', 3, + `%rsp', 4, `%rbp', 5, `%rsi', 6, `%rdi', 7, + `%r8', 8, `%r9', 9, `%r10', 10, `%r11', 11, + `%r12', 12, `%r13', 13, `%r14', 14, `%r15', 15, + `(%rax)',16, `(%rcx)',17, `(%rdx)',18, `(%rbx)',19, + `(%rsp)',20, `(%rbp)',21, `(%rsi)',22, `(%rdi)',23, + `(%r8)', 24, `(%r9)', 25, `(%r10)',26, `(%r11)',27, + `(%r12)',28, `(%r13)',29, `(%r14)',30, `(%r15)',31') + +dnl Usage (by mulx, shlx, shrx) +dnl +dnl reg1,reg2,reg3,opc1,opc2 +dnl +dnl or +dnl +dnl (reg1),reg2,reg3,opc1,opc2 +dnl +dnl where reg1 is any register but rsp,rbp,r12,r13, or +dnl +dnl or +dnl +dnl off,(reg1),reg2,reg3,opc1,opc2 +dnl +dnl where reg1 is any register but rsp,r12. +dnl +dnl The exceptions are due to special coding needed for some registers; rsp +dnl and r12 need an extra byte 0x24 at the end while rbp and r13 lack the +dnl offset-less form. +dnl +dnl Other addressing forms are not handled. Invalid forms are not properly +dnl detected. Offsets that don't fit one byte are not handled correctly. + +define(`c4_helper',`dnl +.byte 0xc4`'dnl +ifelse(`$#',5,`dnl +,eval(0xe2^32*regnumh($1)^128*regnumh($3))`'dnl +,eval(0x$4-8*regnum($2))`'dnl +,0x$5`'dnl +,eval(0xc0+(7 & regnum($1))+8*(7 & regnum($3))-0xc0*ix($1))`'dnl +',`$#',6,`dnl +,eval(0xe2^32*regnumh($2)^128*regnumh($4))`'dnl +,eval(0x$5-8*regnum($3))`'dnl +,0x$6`'dnl +,eval(0x40+(7 & regnum($2))+8*(7 & regnum($4)))`'dnl +,eval(($1 + 256) % 256)`'dnl +')') + + +dnl Usage +dnl +dnl mulx(reg1,reg2,reg3) +dnl +dnl or +dnl +dnl mulx((reg1),reg2,reg3) +dnl +dnl where reg1 is any register but rsp,rbp,r12,r13, or +dnl +dnl mulx(off,(reg1),reg2,reg3) +dnl +dnl where reg1 is any register but rsp,r12. + +define(`mulx',`dnl +ifelse(`$#',3,`dnl +c4_helper($1,$2,$3,fb,f6)',`dnl format 1,2 +c4_helper($1,$2,$3,$4,fb,f6)'dnl format 3 +)') + + +dnl Usage +dnl +dnl shlx(reg1,reg2,reg3) +dnl shrx(reg1,reg2,reg3) +dnl +dnl or +dnl +dnl shlx(reg1,(reg2),reg3) +dnl shrx(reg1,(reg2),reg3) +dnl +dnl where reg2 is any register but rsp,rbp,r12,r13, or +dnl +dnl shlx(reg1,off,(reg2),reg3) +dnl shrx(reg1,off,(reg2),reg3) +dnl +dnl where reg2 is any register but rsp,r12. + +define(`shlx',`dnl +ifelse(`$#',3,`dnl +c4_helper($2,$1,$3,f9,f7)',`dnl format 1,2 +c4_helper($1,$3,$2,$4,f9,f7)'dnl format 3 +)') + +define(`shrx',`dnl +ifelse(`$#',3,`dnl +c4_helper($2,$1,$3,fb,f7)',`dnl format 1,2 +c4_helper($1,$3,$2,$4,fb,f7)'dnl format 3 +)') + +define(`sarx',`dnl +ifelse(`$#',3,`dnl +c4_helper($2,$1,$3,fa,f7)',`dnl format 1,2 +c4_helper($1,$3,$2,$4,fa,f7)'dnl format 3 +)') + + +dnl Usage +dnl +dnl adcx(reg1,reg2) +dnl adox(reg1,reg2) +dnl +dnl or +dnl +dnl adcx((reg1),reg2) +dnl adox((reg1),reg2) +dnl +dnl where reg1 is any register but rsp,rbp,r12,r13, or +dnl +dnl adcx(off,(reg1),reg2) +dnl adox(off,(reg1),reg2) +dnl +dnl where reg1 is any register but rsp,r12. +dnl +dnl The exceptions are due to special coding needed for some registers; rsp +dnl and r12 need an extra byte 0x24 at the end while rbp and r13 lack the +dnl offset-less form. +dnl +dnl Other addressing forms are not handled. Invalid forms are not properly +dnl detected. Offsets that don't fit one byte are not handled correctly. 
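+dnl
+dnl Semantically (in AT&T operand order) adcx(src,dst) performs
+dnl dst := dst + src + CF, reading and writing only the carry flag, while
+dnl adox(src,dst) does the same with the overflow flag, which lets two
+dnl carry chains run interleaved; e.g. adcx( %r8, %r9) emits the byte
+dnl sequence for `adcx %r8, %r9'.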
+ +define(`adx_helper',`dnl +,eval(0x48+regnumh($1)+4*regnumh($2))`'dnl +,0x0f`'dnl +,0x38`'dnl +,0xf6`'dnl +') + +define(`adx',`dnl +ifelse(`$#',2,`dnl +adx_helper($1,$2)dnl +,eval(0xc0+(7 & regnum($1))+8*(7 & regnum($2))-0xc0*ix($1))`'dnl +',`$#',3,`dnl +adx_helper($2,$3)dnl +,eval(0x40+(7 & regnum($2))+8*(7 & regnum($3)))`'dnl +,eval(($1 + 256) % 256)`'dnl +')') + +define(`adcx',`dnl +.byte 0x66`'dnl +adx($@)') + +define(`adox',`dnl +.byte 0xf3`'dnl +adx($@)') + +divert`'dnl diff --git a/gmp-6.3.0/mpn/x86_64/zen/aorrlsh1_n.asm b/gmp-6.3.0/mpn/x86_64/zen/aorrlsh1_n.asm new file mode 100644 index 0000000..803fa30 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/aorrlsh1_n.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_addlsh1_n, mpn_addlsh1_nc, mpn_rsblsh1_n, mpn_rsblsh1_nc. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc) +include_mpn(`x86_64/atom/aorrlsh1_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/zen/aorrlsh_n.asm new file mode 100644 index 0000000..417dd0a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/aorrlsh_n.asm @@ -0,0 +1,227 @@ +dnl AMD64 mpn_addlsh_n, mpn_rsblsh_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
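+
+dnl mpn_addlsh_n computes {rp,n} = {up,n} + ({vp,n} << cnt) and returns the
+dnl carry limb, while mpn_rsblsh_n computes {rp,n} = ({vp,n} << cnt) - {up,n}
+dnl and returns the high limb, i.e. the bits shifted out of vp less the
+dnl borrow.  A rough C model of the addlsh case, assuming 64-bit limbs and
+dnl 0 < cnt < 64 (a sketch of the semantics, not of this file's code):
+dnl
+dnl	mp_limb_t
+dnl	addlsh_ref (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
+dnl		    mp_size_t n, unsigned cnt)
+dnl	{
+dnl	  mp_limb_t hi = 0, cy = 0;	/* shifted-out bits, add carry */
+dnl	  for (mp_size_t i = 0; i < n; i++)
+dnl	    {
+dnl	      mp_limb_t s = (vp[i] << cnt) | hi;
+dnl	      hi = vp[i] >> (64 - cnt);
+dnl	      mp_limb_t t = up[i] + s;
+dnl	      mp_limb_t c = t < s;
+dnl	      rp[i] = t + cy;
+dnl	      cy = c | (rp[i] < cy);	/* carry out of this limb */
+dnl	    }
+dnl	  return hi + cy;
+dnl	}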
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 n/a +C AMD bd2 n/a +C AMD bd3 n/a +C AMD bd4 2.31 +C AMD zn1 1.69 +C AMD zn2 1.55 +C AMD zn3 1.36 +C AMD bt1 n/a +C AMD bt2 n/a +C Intel P4 n/a +C Intel PNR n/a +C Intel NHM n/a +C Intel SBR n/a +C Intel IBR n/a +C Intel HWL 2.08 +C Intel BWL 1.78 +C Intel SKL 1.78 +C Intel atom n/a +C Intel SLM n/a +C VIA nano n/a + +C TODO +C * Perhaps avoid using jrcxz by using dec n + jnz. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cnt', `%r8') + +define(`tnc', `%r9') + +ifdef(`OPERATION_addlsh_n',` + define(ADCSBB, `adc') + define(func, mpn_addlsh_n) +') +ifdef(`OPERATION_rsblsh_n',` + define(ADCSBB, `sbb') + define(func, mpn_rsblsh_n) +') + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + + mov (vp), %r10 + + mov R32(n), R32(%rax) + shr $3, n + xor R32(tnc), R32(tnc) + sub cnt, tnc + and $7, R32(%rax) + + lea L(tab)(%rip), %r11 +ifdef(`PIC',` + movslq (%r11,%rax,4), %rax + add %r11, %rax + jmp *%rax +',` + jmp *(%r11,%rax,8) +') + +L(0): lea 32(up), up + lea 32(vp), vp + lea 32(rp), rp + xor R32(%r11), R32(%r11) + jmp L(e0) + +L(7): mov %r10, %r11 + lea 24(up), up + lea 24(vp), vp + lea 24(rp), rp + xor R32(%r10), R32(%r10) + jmp L(e7) + +L(6): lea 16(up), up + lea 16(vp), vp + lea 16(rp), rp + xor R32(%r11), R32(%r11) + jmp L(e6) + +L(5): mov %r10, %r11 + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp + xor R32(%r10), R32(%r10) + jmp L(e5) + +L(end): ADCSBB 24(up), %rax + mov %rax, -40(rp) + shrx( tnc, %r11, %rax) + ADCSBB n, %rax + FUNC_EXIT() + ret + + ALIGN(32) +L(top): jrcxz L(end) + mov -32(vp), %r10 + ADCSBB 24(up), %rax + lea 64(up), up + shrx( tnc, %r11, %r11) + mov %rax, -40(rp) +L(e0): dec n + shlx( cnt, %r10, %rax) + lea (%r11,%rax), %rax + mov -24(vp), %r11 + ADCSBB -32(up), %rax + shrx( tnc, %r10, %r10) + mov %rax, -32(rp) +L(e7): shlx( cnt, %r11, %rax) + lea (%r10,%rax), %rax + mov -16(vp), %r10 + ADCSBB -24(up), %rax + shrx( tnc, %r11, %r11) + mov %rax, -24(rp) +L(e6): shlx( cnt, %r10, %rax) + lea (%r11,%rax), %rax + mov -8(vp), %r11 + ADCSBB -16(up), %rax + shrx( tnc, %r10, %r10) + mov %rax, -16(rp) +L(e5): shlx( cnt, %r11, %rax) + lea (%r10,%rax), %rax + mov (vp), %r10 + ADCSBB -8(up), %rax + shrx( tnc, %r11, %r11) + mov %rax, -8(rp) +L(e4): shlx( cnt, %r10, %rax) + lea (%r11,%rax), %rax + mov 8(vp), %r11 + ADCSBB (up), %rax + shrx( tnc, %r10, %r10) + mov %rax, (rp) +L(e3): shlx( cnt, %r11, %rax) + lea (%r10,%rax), %rax + mov 16(vp), %r10 + ADCSBB 8(up), %rax + shrx( tnc, %r11, %r11) + mov %rax, 8(rp) +L(e2): shlx( cnt, %r10, %rax) + lea (%r11,%rax), %rax + mov 24(vp), %r11 + ADCSBB 16(up), %rax + lea 64(vp), vp + shrx( tnc, %r10, %r10) + mov %rax, 16(rp) + lea 64(rp), rp +L(e1): shlx( cnt, %r11, %rax) + lea (%r10,%rax), %rax + jmp L(top) + +L(4): xor R32(%r11), R32(%r11) + jmp L(e4) + +L(3): mov %r10, %r11 + lea -8(up), up + lea -8(vp), vp + lea -8(rp), rp + xor R32(%r10), R32(%r10) + jmp L(e3) + +L(2): lea -16(up), up + lea -16(vp), vp + lea -16(rp), rp + xor R32(%r11), R32(%r11) + jmp L(e2) + +L(1): mov %r10, %r11 + lea -24(up), up + lea 40(vp), vp + lea 40(rp), rp + xor R32(%r10), R32(%r10) + jmp L(e1) +EPILOGUE() + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(1), L(tab)) + JMPENT( L(2), L(tab)) + JMPENT( L(3), L(tab)) + JMPENT( L(4), L(tab)) + JMPENT( L(5), L(tab)) + JMPENT( L(6), 
L(tab)) + JMPENT( L(7), L(tab)) diff --git a/gmp-6.3.0/mpn/x86_64/zen/aorsmul_1.asm b/gmp-6.3.0/mpn/x86_64/zen/aorsmul_1.asm new file mode 100644 index 0000000..89795e3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/aorsmul_1.asm @@ -0,0 +1,165 @@ +dnl AMD64 mpn_addmul_1 and mpn_submul_1 for CPUs with mulx. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bd1 - +C AMD bd2 - +C AMD bd3 - +C AMD bd4 4.3 +C AMD zen 2 +C AMD bt1 - +C AMD bt2 - +C Intel P4 - +C Intel PNR - +C Intel NHM - +C Intel SBR - +C Intel IBR - +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom - +C Intel SLM - +C VIA nano - + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0_param',`%rcx') C r9 + +define(`n', `%rcx') +define(`v0', `%rdx') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`ADCSBB', `adc') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`ADCSBB', `sbb') + define(`func', `mpn_submul_1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + mov (up), %r8 + + push %rbx + push %r12 + push %r13 + + lea (up,n_param,8), up + lea -32(rp,n_param,8), rp + mov R32(n_param), R32(%rax) + xchg v0_param, v0 C FIXME: is this insn fast? 
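+C mulx takes one multiplicand implicitly from %rdx, so v0 must live
+C there; the xchg simultaneously moves the count out of %rdx into
+C %rcx, where the loop below wants it.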
+ + neg n + + and $3, R8(%rax) + jz L(b0) + cmp $2, R8(%rax) + jz L(b2) + jg L(b3) + +L(b1): mulx( %r8, %rbx, %rax) + sub $-1, n + jz L(wd1) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + test R32(%rax), R32(%rax) C clear cy + jmp L(lo1) + +L(b0): mulx( %r8, %r9, %r8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + xor R32(%rax), R32(%rax) + jmp L(lo0) + +L(b3): mulx( %r8, %r11, %r10) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x08 C mulx 8(up,n,8), %r13, %r12 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x10 C mulx 16(up,n,8), %rbx, %rax + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + sub $-3, n + jz L(wd3) + test R32(%rax), R32(%rax) C clear cy + jmp L(lo3) + +L(b2): mulx( %r8, %r13, %r12) + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x08 C mulx 8(up,n,8), %rbx, %rax + add %r12, %rbx + adc $0, %rax + sub $-2, n + jz L(wd2) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + test R32(%rax), R32(%rax) C clear cy + jmp L(lo2) + +L(top): ADDSUB %r9, (rp,n,8) +L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + ADCSBB %r11, 8(rp,n,8) +L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + ADCSBB %r13, 16(rp,n,8) +L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + ADCSBB %rbx, 24(rp,n,8) + adc %rax, %r9 +L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax C rax = carry limb + add $4, n + js L(top) + +L(end): ADDSUB %r9, (rp) +L(wd3): ADCSBB %r11, 8(rp) +L(wd2): ADCSBB %r13, 16(rp) +L(wd1): ADCSBB %rbx, 24(rp) + adc n, %rax + pop %r13 + pop %r12 + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/zen/com.asm b/gmp-6.3.0/mpn/x86_64/zen/com.asm new file mode 100644 index 0000000..b34f841 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/com.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_com optimised for AMD Zen. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
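+
+dnl mpn_com sets {rp,n} to the one's complement of {up,n}, i.e.
+dnl rp[i] = ~up[i].  No Zen-specific code is needed; this file merely
+dnl selects the generic fast-SSE implementation included below.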
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_com) +include_mpn(`x86_64/fastsse/com.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/copyd.asm b/gmp-6.3.0/mpn/x86_64/zen/copyd.asm new file mode 100644 index 0000000..63ed237 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/copyd.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyd optimised for AMD Zen. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyd) +include_mpn(`x86_64/fastsse/copyd.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/copyi.asm b/gmp-6.3.0/mpn/x86_64/zen/copyi.asm new file mode 100644 index 0000000..1aafaaa --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/copyi.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyi optimised for AMD Zen. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyi) +include_mpn(`x86_64/fastsse/copyi.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/gcd_11.asm b/gmp-6.3.0/mpn/x86_64/zen/gcd_11.asm new file mode 100644 index 0000000..0ffb6ca --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/gcd_11.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_11. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_11) +include_mpn(`x86_64/bd2/gcd_11.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/gcd_22.asm b/gmp-6.3.0/mpn/x86_64/zen/gcd_22.asm new file mode 100644 index 0000000..5dfd9e3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/gcd_22.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_22. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_22) +include_mpn(`x86_64/coreihwl/gcd_22.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/zen/gmp-mparam.h new file mode 100644 index 0000000..05a12b3 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/gmp-mparam.h @@ -0,0 +1,280 @@ +/* AMD Zen gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. */ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 3700-4300 MHz Pinnacle Ridge */ +/* FFT tuning limit = 468,514,360 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 32 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 22 + +#define DIV_1_VS_MUL_1_PERCENT 338 + +#define MUL_TOOM22_THRESHOLD 16 +#define MUL_TOOM33_THRESHOLD 107 +#define MUL_TOOM44_THRESHOLD 190 +#define MUL_TOOM6H_THRESHOLD 230 +#define MUL_TOOM8H_THRESHOLD 272 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 110 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 106 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 117 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 136 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 114 +#define SQR_TOOM4_THRESHOLD 422 +#define SQR_TOOM6_THRESHOLD 0 /* always */ +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 40 + +#define MULMOD_BNM1_THRESHOLD 12 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 540 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 540, 5}, { 22, 6}, { 12, 5}, { 25, 6}, \ + { 25, 7}, { 13, 6}, { 29, 7}, { 15, 6}, \ + { 31, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 29, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 7}, { 43, 9}, { 11, 8}, { 29, 9}, \ + { 15, 8}, { 35, 9}, { 19, 8}, { 43, 9}, \ + { 23, 8}, { 49, 9}, { 27,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 43,10}, { 23, 9}, \ + { 55,11}, { 15,10}, { 31, 9}, { 67,10}, \ + { 39, 9}, { 83,10}, { 47, 9}, { 99,10}, \ + { 55,11}, { 31,10}, { 79,11}, { 47,10}, \ + { 103,12}, { 31,11}, { 63,10}, { 135,11}, \ + { 79,10}, { 167,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 159,12}, { 95,11}, { 191,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 335,10}, { 671, 9}, \ + { 1343,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,12}, { 223,11}, \ + { 447,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 639,10}, \ + { 1279,11}, { 671,10}, { 1343, 9}, { 2687,12}, \ + { 351,11}, { 703,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 415,11}, { 831,10}, { 1663,12}, \ + { 447,14}, { 127,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \ + { 1151,12}, { 607,11}, { 1215,13}, { 319,12}, \ + { 639,11}, { 
1279,12}, { 671,11}, { 1343,10}, \ + { 2687,12}, { 703,11}, { 1407,13}, { 383,12}, \ + { 799,11}, { 1599,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 895,11}, { 1791,12}, { 927,11}, \ + { 1855,12}, { 959,11}, { 1919,10}, { 3839,13}, \ + { 511,12}, { 1087,11}, { 2175,13}, { 575,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1343,11}, \ + { 2687,13}, { 703,12}, { 1407,14}, { 383,13}, \ + { 767,12}, { 1599,13}, { 831,12}, { 1727,11}, \ + { 3455,13}, { 895,12}, { 1855,13}, { 959,12}, \ + { 1919,11}, { 3839,14}, { 511,13}, { 1087,12}, \ + { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,12}, { 2943,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \ + { 3455,14}, { 895,13}, { 1855,12}, { 3711,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2687,14}, { 1407,13}, { 2815,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \ + { 6911,14}, { 1791,13}, { 3583,14}, { 1919,16}, \ + { 511,15}, { 1023,14}, { 2175,13}, { 4479,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2687,13}, \ + { 5375,14}, { 2943,13}, { 5887,15}, { 1535,14}, \ + { 3455,13}, { 6911,15}, { 1791,14}, { 3839,13}, \ + { 7679,16}, { 1023,15}, { 2047,14}, { 4479,15}, \ + { 2303,14}, { 4991,15}, { 2559,14}, { 5247,15}, \ + { 2815,14}, { 5887,16}, { 1535,15}, { 3327,14}, \ + { 6911,15}, { 3839,14}, { 7679,17}, { 1023,16}, \ + { 2047,15}, { 4095,14}, { 8191,15}, { 4351,14}, \ + { 8959,15}, { 4863,16}, { 2559,15}, { 5375,14}, \ + { 11007,15}, { 5887,14}, { 11775,16}, { 3071,15}, \ + { 6911,16}, { 3583,15}, { 7167,14}, { 14335,15}, \ + { 7679,14}, { 15359,15}, { 7935,14}, { 15871,17}, \ + { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \ + { 9215,14}, { 18431,15}, { 9727,14}, { 19455,15}, \ + { 9983,14}, { 19967,16}, { 5119,15}, { 11007,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 271 +#define MUL_FFT_THRESHOLD 6272 + +#define SQR_FFT_MODF_THRESHOLD 404 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 404, 5}, { 13, 4}, { 27, 5}, { 21, 6}, \ + { 11, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 14, 5}, { 29, 6}, { 29, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 25, 8}, \ + { 13, 7}, { 29, 8}, { 15, 7}, { 33, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 29, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 43,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 159,11}, { 95,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \ + { 143,10}, { 287, 9}, { 575,11}, { 159,12}, \ + { 95,11}, { 191,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543,11}, \ + { 287,10}, { 575,11}, { 303,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 335,10}, { 671, 9}, \ + { 1343,11}, { 351,10}, { 703,11}, { 367,10}, \ + { 735,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 399,10}, { 799,11}, { 415,10}, { 831,12}, \ + { 223,11}, { 447,10}, { 895,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 639,10}, \ + { 1279,11}, { 671,10}, { 1343,12}, { 351,11}, \ + { 703,10}, { 1407,11}, { 735,10}, { 1471,13}, \ + { 191,12}, { 383,11}, { 767,10}, { 1535,11}, \ + { 799,12}, { 
415,11}, { 831,10}, { 1663,12}, \ + { 447,11}, { 895,14}, { 127,13}, { 255,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \ + { 575,11}, { 1151,12}, { 607,11}, { 1215,13}, \ + { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \ + { 1343,12}, { 703,11}, { 1407,12}, { 735,11}, \ + { 1471,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 799,11}, { 1599,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 895,11}, { 1791,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \ + { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1343,13}, { 703,12}, \ + { 1471,11}, { 2943,14}, { 383,13}, { 767,12}, \ + { 1599,13}, { 831,12}, { 1727,11}, { 3455,13}, \ + { 895,12}, { 1855,13}, { 959,15}, { 255,14}, \ + { 511,13}, { 1023,12}, { 2047,13}, { 1087,12}, \ + { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,12}, { 2943,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \ + { 3455,14}, { 895,13}, { 1855,12}, { 3711,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2687,14}, { 1407,13}, { 2943,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \ + { 6911,14}, { 1791,13}, { 3583,14}, { 1919,16}, \ + { 511,15}, { 1023,14}, { 2047,13}, { 4095,14}, \ + { 2175,13}, { 4479,12}, { 8959,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,12}, \ + { 11775,15}, { 1535,14}, { 3455,13}, { 6911,15}, \ + { 1791,14}, { 3839,13}, { 7679,14}, { 3967,16}, \ + { 1023,15}, { 2047,14}, { 4479,15}, { 2303,14}, \ + { 4991,15}, { 2559,14}, { 5247,15}, { 2815,14}, \ + { 5887,13}, { 11775,16}, { 1535,15}, { 3071,14}, \ + { 6143,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,17}, { 1023,16}, { 2047,15}, { 4095,14}, \ + { 8191,15}, { 4351,14}, { 8959,15}, { 4863,14}, \ + { 9727,16}, { 2559,15}, { 5887,14}, { 11775,16}, \ + { 3071,15}, { 6911,16}, { 3583,15}, { 7167,14}, \ + { 14335,15}, { 7679,14}, { 15359,15}, { 7935,14}, \ + { 15871,17}, { 2047,16}, { 4095,15}, { 8959,16}, \ + { 4607,15}, { 9215,14}, { 18431,15}, { 9727,14}, \ + { 19455,15}, { 9983,14}, { 19967,16}, { 5119,15}, \ + { 10239,16}, { 5631,15}, { 11775,17}, { 3071,16}, \ + { 6655,15}, { 13311,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 302 +#define SQR_FFT_THRESHOLD 4224 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 69 +#define MULLO_MUL_N_THRESHOLD 11278 +#define SQRLO_BASECASE_THRESHOLD 12 +#define SQRLO_DC_THRESHOLD 82 +#define SQRLO_SQR_THRESHOLD 8207 + +#define DC_DIV_QR_THRESHOLD 76 +#define DC_DIVAPPR_Q_THRESHOLD 232 +#define DC_BDIV_QR_THRESHOLD 76 +#define DC_BDIV_Q_THRESHOLD 104 + +#define INV_MULMOD_BNM1_THRESHOLD 37 +#define INV_NEWTON_THRESHOLD 274 +#define INV_APPR_THRESHOLD 230 + +#define BINV_NEWTON_THRESHOLD 372 +#define REDC_1_TO_REDC_N_THRESHOLD 68 + +#define MU_DIV_QR_THRESHOLD 1499 +#define MU_DIVAPPR_Q_THRESHOLD 1718 +#define MUPI_DIV_QR_THRESHOLD 108 +#define MU_BDIV_QR_THRESHOLD 1470 +#define MU_BDIV_Q_THRESHOLD 1787 + +#define POWM_SEC_TABLE 3,22,81,494 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 20 +#define SET_STR_DC_THRESHOLD 486 +#define SET_STR_PRECOMPUTE_THRESHOLD 1264 + +#define FAC_DSC_THRESHOLD 187 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 23 +#define HGCD2_DIV1_METHOD 1 /* 9.20% faster than 3 */ +#define HGCD_THRESHOLD 109 +#define HGCD_APPR_THRESHOLD 104 +#define 
HGCD_REDUCE_THRESHOLD 3014 +#define GCD_DC_THRESHOLD 566 +#define GCDEXT_DC_THRESHOLD 382 +#define JACOBI_BASE_METHOD 1 /* 15.55% faster than 3 */ + +/* Tuneup completed successfully, took 281243 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/zen/hamdist.asm b/gmp-6.3.0/mpn/x86_64/zen/hamdist.asm new file mode 100644 index 0000000..48dcf61 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/hamdist.asm @@ -0,0 +1,38 @@ +dnl AMD64 mpn_hamdist -- hamming distance. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_hamdist) +include_mpn(`x86_64/coreinhm/hamdist.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/lshift.asm b/gmp-6.3.0/mpn/x86_64/zen/lshift.asm new file mode 100644 index 0000000..4dce319 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/lshift.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshift optimised for AMD Zen. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshift) +include_mpn(`x86_64/fastsse/lshift-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/lshiftc.asm b/gmp-6.3.0/mpn/x86_64/zen/lshiftc.asm new file mode 100644 index 0000000..d52b194 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/lshiftc.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_lshiftc optimised for AMD Zen. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshiftc) +include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/mul_1.asm b/gmp-6.3.0/mpn/x86_64/zen/mul_1.asm new file mode 100644 index 0000000..6a083ac --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/mul_1.asm @@ -0,0 +1,161 @@ +dnl AMD64 mpn_mul_1 for CPUs with mulx. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bd1 - +C AMD bd2 - +C AMD bd3 - +C AMD bd4 4.4 +C AMD zen 2 +C AMD bobcat - +C AMD jaguar - +C Intel P4 - +C Intel PNR - +C Intel NHM - +C Intel SBR - +C Intel IBR - +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom - +C Intel SLM - +C VIA nano - + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0_param',`%rcx') C r9 + +define(`n', `%rcx') +define(`v0', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_1c) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + jmp L(ent) +EPILOGUE() + ALIGN(16) +PROLOGUE(mpn_mul_1) + FUNC_ENTRY(4) + xor R32(%r8), R32(%r8) C carry-in limb +L(ent): mov (up), %r9 + + push %rbx + push %r12 + push %r13 + + lea (up,n_param,8), up + lea -32(rp,n_param,8), rp + mov R32(n_param), R32(%rax) + xchg v0_param, v0 C FIXME: is this insn fast? 
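+C As in aorsmul_1: mulx multiplies implicitly by %rdx, so the xchg
+C moves v0 into %rdx and frees %rcx for the count.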
+ + neg n + + and $3, R8(%rax) + jz L(b0) + cmp $2, R8(%rax) + jz L(b2) + jg L(b3) + +L(b1): mov %r8, %r12 + mulx( %r9, %rbx, %rax) + sub $-1, n + jz L(wd1) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + add %r12, %rbx + jmp L(lo1) + +L(b3): mulx( %r9, %r11, %r10) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x08 C mulx 8(up,n,8), %r13, %r12 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x10 C mulx 16(up,n,8), %rbx, %rax + sub $-3, n + jz L(wd3) + add %r8, %r11 + jmp L(lo3) + +L(b2): mov %r8, %r10 C carry-in limb + mulx( %r9, %r13, %r12) + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x08 C mulx 8(up,n,8), %rbx, %rax + sub $-2, n + jz L(wd2) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + add %r10, %r13 + jmp L(lo2) + +L(b0): mov %r8, %rax C carry-in limb + mulx( %r9, %r9, %r8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + add %rax, %r9 + jmp L(lo0) + +L(top): jrcxz L(end) + adc %r8, %r11 + mov %r9, (rp,n,8) +L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r10, %r13 + mov %r11, 8(rp,n,8) +L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r12, %rbx + mov %r13, 16(rp,n,8) +L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rax, %r9 + mov %rbx, 24(rp,n,8) +L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + lea 4(n), n + jmp L(top) + +L(end): mov %r9, (rp) +L(wd3): adc %r8, %r11 + mov %r11, 8(rp) +L(wd2): adc %r10, %r13 + mov %r13, 16(rp) +L(wd1): adc %r12, %rbx + adc $0, %rax + mov %rbx, 24(rp) + + pop %r13 + pop %r12 + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/zen/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/zen/mul_basecase.asm new file mode 100644 index 0000000..affa3b6 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/mul_basecase.asm @@ -0,0 +1,455 @@ +dnl AMD64 mpn_mul_basecase optimised for AMD Zen. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO +C * Try 2x unrolling instead of current 4x, at least for mul_1. Else consider +C shallower sw pipelining of mul_1/addmul_1 loops, allowing 4 or 6 instead +C of 8 product registers. +C * Split up mul_1 into 4 loops in order to fall into the addmul_1 loops +C without branch tree. +C * Improve the overlapped software pipelining. 
The mulx in the osp block now +C suffers from write/read conflicts, in particular the 1 mod 4 case. Also, +C mul_1 could osp into addmul_1. +C * Let vn_param be vn to save a copy. +C * Re-allocate to benefit more from 32-bit encoding. +C * Poor performance for e.g. n = 12,16. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param', `%rdx') +define(`vp_param', `%rcx') +define(`vn_param', `%r8') + +define(`un', `%r14') +define(`vp', `%rbp') +define(`v0', `%rdx') +define(`n', `%rcx') +define(`vn', `%r15') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + + cmp $2, un_param + ja L(gen) + mov (vp_param), %rdx + mulx( (up), %rax, %r9) C 0 1 + je L(s2x) + +L(s11): mov %rax, (rp) + mov %r9, 8(rp) + FUNC_EXIT() + ret + +L(s2x): cmp $2, vn_param + mulx( 8,(up), %r8, %r10) C 1 2 + je L(s22) + +L(s21): add %r8, %r9 + adc $0, %r10 + mov %rax, (rp) + mov %r9, 8(rp) + mov %r10, 16(rp) + FUNC_EXIT() + ret + +L(s22): add %r8, %r9 C 1 + adc $0, %r10 C 2 + mov 8(vp_param), %rdx + mov %rax, (rp) + mulx( (up), %r8, %r11) C 1 2 + mulx( 8,(up), %rax, %rdx) C 2 3 + add %r11, %rax C 2 + adc $0, %rdx C 3 + add %r8, %r9 C 1 + adc %rax, %r10 C 2 + adc $0, %rdx C 3 + mov %r9, 8(rp) + mov %r10, 16(rp) + mov %rdx, 24(rp) + FUNC_EXIT() + ret + + +L(gen): push %r15 + push %r14 + push %r13 + push %r12 + push %rbp + push %rbx + + mov un_param, un + mov vp_param, vp + mov vn_param, vn + + mov (up), %r9 + mov (vp), v0 + + lea (up,un,8), up + lea -32(rp,un,8), rp + + neg un + mov un, n + test $1, R8(un) + jz L(mx0) +L(mx1): test $2, R8(un) + jz L(mb3) + +L(mb1): mulx( %r9, %rbx, %rax) + inc n + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %r9, %r8 + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(up,un,8), %r11, %r10 + jmp L(mlo1) + +L(mb3): mulx( %r9, %r11, %r10) + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(up,un,8), %r13, %r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %rbx, %rax + sub $-3, n + jz L(mwd3) + test R32(%rdx), R32(%rdx) + jmp L(mlo3) + +L(mx0): test $2, R8(un) + jz L(mb0) + +L(mb2): mulx( %r9, %r13, %r12) + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %rbx, %rax + lea 2(n), n + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %r9, %r8 + jmp L(mlo2) + +L(mb0): mulx( %r9, %r9, %r8) + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 8(up,un,8), %r11, %r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 16(up,un,8), %r13, %r12 + jmp L(mlo0) + +L(mtop):jrcxz L(mend) + adc %r8, %r11 + mov %r9, (rp,n,8) +L(mlo3):.byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r10, %r13 + mov %r11, 8(rp,n,8) +L(mlo2):.byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r12, %rbx + mov %r13, 16(rp,n,8) +L(mlo1):.byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rax, %r9 + mov %rbx, 24(rp,n,8) +L(mlo0):.byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + lea 4(n), n + jmp L(mtop) + +L(mend):mov %r9, (rp) + adc %r8, %r11 +L(mwd3):mov %r11, 8(rp) + adc %r10, %r13 + mov %r13, 16(rp) + adc %r12, %rbx + adc $0, %rax + mov %rbx, 24(rp) + mov %rax, 32(rp) + add $8, vp + dec vn + jz L(end) + +C The rest of the file are 4 osp loops around addmul_1 + + test $1, R8(un) + jnz L(0x1) + +L(0x0): test $2, R8(un) + jnz L(oloop2_entry) + +L(oloop0_entry): + C initial feed-in block + mov (vp), %rdx + add $8, vp + mov un, n + add $8, rp + .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6 C mulx 
(up,un,8), %r9, %r8 + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 8(up,un,8), %r11, %r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 16(up,un,8), %r13, %r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x18 C mulx 24(up,un,8), %rbx, %rax + add %r8, %r11 + jmp L(lo0) + +L(oloop0): + C overlapped software pipelining block + mov (vp), %rdx C new + add $8, vp + add %r9, (rp) C prev + .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6 C mulx (%rsi,%r14,8),%r9,%r8 + adc %r11, 8(rp) C prev + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 0x8(%rsi,%r14,8),%r11,%r10 + adc %r13, 16(rp) C prev + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 0x10(%rsi,%r14,8),%r13,%r12 + adc %rbx, 24(rp) C prev + mov un, n + adc $0, %rax C prev + mov %rax, 32(rp) C prev + add $8, rp + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x18 C mulx 0x18(%rsi,%r14,8),%rbx,%rax + add %r8, %r11 C new + jmp L(lo0) + + ALIGN(16) +L(tp0): add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 +L(lo0): adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(tp0) + + dec vn + jne L(oloop0) + + jmp L(final_wind_down) + +L(oloop2_entry): + mov (vp), %rdx + add $8, vp + lea 2(un), n + add $8, rp + .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6 C mulx (up,un,8), %r13, %r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %rbx, %rax + add %r12, %rbx + adc $0, %rax + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %r9, %r8 + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + add %r13, 16(rp,n,8) + jmp L(lo2) + +L(oloop2): + mov (vp), %rdx + add $8, vp + add %r9, (rp) + adc %r11, 8(rp) + adc %r13, 16(rp) + .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6 C mulx (up,un,8), %r13, %r12 + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %rbx, %rax + lea 2(un), n + add $8, rp + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %r9, %r8 + add %r12, %rbx + adc $0, %rax + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x18 C mulx 0x18(%rsi,%r14,8),%r11,%r10 + add %r13, 16(rp,n,8) + jmp L(lo2) + + ALIGN(16) +L(tp2): add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) +L(lo2): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(tp2) + + dec vn + jne L(oloop2) + + jmp L(final_wind_down) + +L(0x1): test $2, R8(un) + jz L(oloop3_entry) + +L(oloop1_entry): + mov (vp), %rdx + add $8, vp + lea 1(un), n + add $8, rp + .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6 C mulx (up,un,8), %rbx, %rax + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %r9, %r8 + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(up,un,8), %r11, %r10 + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + add %rbx, 24(rp,n,8) + jmp L(lo1) + +L(oloop1): + mov (vp), %rdx + add $8, vp + add %r9, (rp) + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %r9, %r8 + adc %r11, 8(rp) + .byte 
0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(up,un,8), %r11, %r10 + adc %r13, 16(rp) + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x18 C mulx 0x18(%rsi,%r14,8),%r13,%r12 + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6 C mulx (up,un,8), %rbx, %rax + lea 1(un), n + add $8, rp + add %rbx, 24(rp,n,8) + jmp L(lo1) + + ALIGN(16) +L(tp1): add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) +L(lo1): adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(tp1) + + dec vn + jne L(oloop1) + + jmp L(final_wind_down) + +L(oloop3_entry): + mov (vp), %rdx + add $8, vp + lea 3(un), n + add $8, rp + .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6 C mulx (up,un,8), %r11, %r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(up,un,8), %r13, %r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %rbx, %rax + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + test n, n + jz L(wd3) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + add %r11, 8(rp,n,8) + jmp L(lo3) + +L(oloop3): + mov (vp), %rdx + add $8, vp + add %r9, (rp) + adc %r11, 8(rp) + .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6 C mulx (up,un,8), %r11, %r10 + adc %r13, 16(rp) + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(up,un,8), %r13, %r12 + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %rbx, %rax + lea 3(un), n + add $8, rp + add %r10, %r13 + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r12, %rbx + adc $0, %rax + add %r11, 8(rp,n,8) + jmp L(lo3) + + ALIGN(16) +L(tp3): add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) +L(lo3): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(tp3) + + dec vn + jne L(oloop3) + +L(final_wind_down): + add %r9, (rp) + adc %r11, 8(rp) + adc %r13, 16(rp) + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + +L(end): pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + pop %r15 + FUNC_EXIT() + ret + +L(3): mov (vp), %rdx + add $8, vp + add $8, rp + .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6 C mulx (up,un,8), %r11, %r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(up,un,8), %r13, %r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %rbx, %rax + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax +L(wd3): adc %r11, 8(rp) + adc %r13, 16(rp) + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + dec vn + jne L(3) + jmp L(end) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/zen/mullo_basecase.asm b/gmp-6.3.0/mpn/x86_64/zen/mullo_basecase.asm new file mode 100644 index 0000000..2ae729a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/mullo_basecase.asm @@ -0,0 +1,299 @@ +dnl X64-64 mpn_mullo_basecase optimised for AMD Zen. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2017 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp_param', `%rdx') +define(`n', `%rcx') + +define(`vp', `%r11') +define(`nn', `%rbp') + +C TODO +C * Rearrange feed-in jumps for short branch forms. +C * Roll out the heavy artillery and 4-way unroll outer loop. Since feed-in +C code implodes, the blow-up will not be more than perhaps 2.5x. +C * Micro-optimise critical lead-in code blocks. +C * Clean up register use, e.g. r15 vs vp, disuse of nn, etc. +C * Write n < 4 code specifically for Zen (current code is for Haswell). + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mullo_basecase) + FUNC_ENTRY(4) + cmp $4, R32(n) + jae L(big) + + mov vp_param, vp + mov (up), %rdx + + cmp $2, R32(n) + jae L(gt1) +L(n1): imul (vp), %rdx + mov %rdx, (rp) + FUNC_EXIT() + ret +L(gt1): ja L(gt2) +L(n2): mov (vp), %r9 + mulx( %r9, %rax, %rdx) + mov %rax, (rp) + mov 8(up), %rax + imul %r9, %rax + add %rax, %rdx + mov 8(vp), %r9 + mov (up), %rcx + imul %r9, %rcx + add %rcx, %rdx + mov %rdx, 8(rp) + FUNC_EXIT() + ret +L(gt2): +L(n3): mov (vp), %r9 + mulx( %r9, %rax, %r10) C u0 x v0 + mov %rax, (rp) + mov 8(up), %rdx + mulx( %r9, %rax, %rdx) C u1 x v0 + imul 16(up), %r9 C u2 x v0 + add %rax, %r10 + adc %rdx, %r9 + mov 8(vp), %r8 + mov (up), %rdx + mulx( %r8, %rax, %rdx) C u0 x v1 + add %rax, %r10 + adc %rdx, %r9 + imul 8(up), %r8 C u1 x v1 + add %r8, %r9 + mov %r10, 8(rp) + mov 16(vp), %r10 + mov (up), %rax + imul %rax, %r10 C u0 x v2 + add %r10, %r9 + mov %r9, 16(rp) + FUNC_EXIT() + ret + + ALIGN(16) +L(big): push %r15 + push %r14 + push %r13 + push %r12 + push %rbp + push %rbx + + mov (up), %r9 + lea -8(up,n,8), up + lea -40(rp,n,8), rp + + mov $4, R32(%r14) + sub n, %r14 + mov -8(vp_param,n,8), %rbp + imul %r9, %rbp + lea 8(vp_param), %r15 + mov (vp_param), %rdx + + test $1, R8(%r14) + jnz L(mx0) +L(mx1): test $2, R8(%r14) + jz L(mb3) + +L(mb1): mulx( %r9, %rbx, %rax) + lea -2(%r14), n + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0xf0 C mulx -0x10(%rsi,%r14,8),%r9,%r8 + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r11,%r10 + jmp L(mlo1) + +L(mb3): mulx( %r9, %r11, %r10) + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf0 C mulx -0x10(%rsi,%r14,8),%r13,%r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%rbx,%rax + lea (%r14), n + jrcxz L(x) 
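+C jrcxz tests the count without clobbering the flags, and mulx and lea
+C likewise leave them alone, so the carry flag cleared by the test
+C above is still clear when the adc chain starts at L(mlo3).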
+ jmp L(mlo3) +L(x): jmp L(mcor) + +L(mb2): mulx( %r9, %r13, %r12) + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf0 C mulx -0x10(%rsi,%r14,8),%rbx,%rax + lea -1(%r14), n + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r9,%r8 + jmp L(mlo2) + +L(mx0): test $2, R8(%r14) + jz L(mb2) + +L(mb0): mulx( %r9, %r9, %r8) + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf0 C mulx -0x10(%rsi,%r14,8),%r11,%r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r13,%r12 + lea -3(%r14), n + jmp L(mlo0) + + ALIGN(16) +L(mtop):jrcxz L(mend) + adc %r8, %r11 + mov %r9, (rp,n,8) +L(mlo3):.byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r10, %r13 + mov %r11, 8(rp,n,8) +L(mlo2):.byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r12, %rbx + mov %r13, 16(rp,n,8) +L(mlo1):.byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rax, %r9 + mov %rbx, 24(rp,n,8) +L(mlo0):.byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + lea 4(n), n + jmp L(mtop) + +L(mend):mov %r9, (rp) + adc %r8, %r11 + mov %r11, 8(rp) + adc %r10, %r13 + mov %r13, 16(rp) + adc %r12, %rbx + mov %rbx, 24(rp) + +L(outer): + mulx( (up), %r10, %r8) C FIXME r8 unused (use imul?) + adc %rax, %rbp + add %r10, %rbp + mov (%r15), %rdx + add $8, %r15 + mov -24(up,%r14,8), %r8 + lea -8(up), up + + test $1, R8(%r14) + jz L(x0) +L(x1): test $2, R8(%r14) + jnz L(b3) + +L(b1): mulx( %r8, %rbx, %rax) + lea -1(%r14), n + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (%rsi,%rcx,8),%r9,%r8 + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 0x8(%rsi,%rcx,8),%r11,%r10 + jmp L(lo1) + +L(x0): test $2, R8(%r14) + jz L(b2) + +L(b0): mulx( %r8, %r9, %r8) + lea -2(%r14), n + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r11,%r10 + .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6 C mulx (%rsi,%r14,8),%r13,%r12 + jmp L(lo0) + +L(b3): mulx( %r8, %r11, %r10) + lea 1(%r14), n + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r13,%r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6 C mulx (%rsi,%r14,8),%rbx,%rax + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + jrcxz L(cor) + jmp L(lo3) + +L(cor): add 8(rp), %r11 + mov 16(rp), %r10 + mov 24(rp), %r12 +L(mcor):mov %r11, 8(rp) + adc %r10, %r13 + adc %r12, %rbx + mulx( (up), %r10, %r8) C FIXME r8 unused (use imul?) + adc %rax, %rbp + add %r10, %rbp + mov (%r15), %rdx + mov -24(up), %r8 + mulx( %r8, %r9, %r12) + mulx( -16,(up), %r14, %rax) + add %r12, %r14 + adc $0, %rax + adc %r9, %r13 + mov %r13, 16(rp) + adc %r14, %rbx + mulx( -8,(up), %r10, %r8) C FIXME r8 unused (use imul?) + adc %rax, %rbp + add %r10, %rbp + mov 8(%r15), %rdx + mulx( -24,(up), %r14, %rax) + add %r14, %rbx + mov %rbx, 24(rp) + mulx( -16,(up), %r10, %r8) C FIXME r8 unused (use imul?) 
+ adc %rax, %rbp + add %r10, %rbp + mov %rbp, 32(rp) + pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + pop %r15 + FUNC_EXIT() + ret + +L(b2): mulx( %r8, %r13, %r12) + lea (%r14), n + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%rbx,%rax + add %r12, %rbx + adc $0, %rax + .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6 C mulx (%rsi,%r14,8),%r9,%r8 + jmp L(lo2) + + ALIGN(16) +L(top): add %r9, (rp,n,8) +L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) +L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) +L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 +L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + js L(top) + + add %r9, (rp) + adc %r11, 8(rp) + adc %r13, 16(rp) + adc %rbx, 24(rp) + inc %r14 + jmp L(outer) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/zen/popcount.asm b/gmp-6.3.0/mpn/x86_64/zen/popcount.asm new file mode 100644 index 0000000..be1613b --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/popcount.asm @@ -0,0 +1,38 @@ +dnl AMD64 mpn_popcount -- population count. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_popcount) +include_mpn(`x86_64/coreinhm/popcount.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/rshift.asm b/gmp-6.3.0/mpn/x86_64/zen/rshift.asm new file mode 100644 index 0000000..0196870 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/rshift.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_rshift optimised for AMD Zen. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_rshift) +include_mpn(`x86_64/fastsse/rshift-movdqu2.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen/sbpi1_bdiv_r.asm b/gmp-6.3.0/mpn/x86_64/zen/sbpi1_bdiv_r.asm new file mode 100644 index 0000000..0c24de5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/sbpi1_bdiv_r.asm @@ -0,0 +1,507 @@ +dnl AMD64 mpn_sbpi1_bdiv_r optimised for AMD Zen + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(`up', `%rdi') +define(`un_param', `%rsi') +define(`dp_param', `%rdx') +define(`dn_param', `%rcx') +define(`dinv', `%r8') + +define(`i', `%rcx') +define(`dn', `%r14') + +define(`dp', `%rsi') +define(`un', `%r15') + +C TODO +C * The o1...o8 loops for special dn counts were naively hand-optimised by +C folding the generic loops. They can probably be tuned. The speculative +C quotient limb generation might not be in the optimal spot. +C * Perhaps avoid late-in-loop jumps, e.g., lo0. +C * Improve regalloc wrt dn_param/dn and un_param/un to save some moves. 
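+
+C Operation (a sketch for orientation; the exact contract is the one in
+C gmp-impl.h): this is Hensel (2-adic) schoolbook reduction.  Each outer
+C iteration forms a speculative quotient limb q = dinv * u0 mod 2^64,
+C where dinv is the precomputed inverse matching dp[0], then accumulates
+C q*D over U so that the current low limb of U is cancelled mod 2^64.
+C The high carries collect in %rbp, which becomes the return value.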
+ +C ABI_SUPPORT(DOS64) +C ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sbpi1_bdiv_r) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), dinv ') + push %r15 + push %r14 + push %r13 + push %r12 + push %rbp + push %rbx + + sub dn_param, un_param C outer loop count + mov dn_param, dn C FIXME: Suppress by reg re-alloc + push dinv C keep dinv on stack + mov un_param, un C FIXME: Suppress by reg re-alloc + xor R32(%rbp), R32(%rbp) + + lea (dp_param,dn_param,8), dp + + mov (up), %rdx + imul dinv, %rdx C first quotient limb + + neg dn + lea -32(up,dn_param,8), up + + test $1, R8(dn_param) + jnz L(cx1) + +L(cx0): test $2, R8(dn_param) + jnz L(b2) + + +C ============================================================================= +L(b0): cmp $-4, dn + jnz L(gt4) + +L(o4): mulx( -32,(dp), %r9, %r14) + mulx( -24,(dp), %r11, %r10) + mulx( -16,(dp), %r13, %r12) + mulx( -8,(dp), %rbx, %rax) + add %r14, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add (up), %r9 + adc 8(up), %r11 + mov %r8, %rdx C dinv + mov %r11, 8(up) + mulx( %r11, %rdx, %r12) C next quotient + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(o4) + jmp L(ret) + +L(gt4): cmp $-8, dn + jnz L(out0) + +L(o8): mulx( -64,(dp), %r9, %r14) + mulx( -56,(dp), %rcx, %r10) + mulx( -48,(dp), %r13, %r12) + mulx( -40,(dp), %rbx, %rax) + add %r14, %rcx + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add -32(up), %r9 + mulx( -32,(dp), %r9, %r14) + adc -24(up), %rcx + mov %rcx, -24(up) + mulx( -24,(dp), %r11, %r10) + adc %r13, -16(up) + mulx( -16,(dp), %r13, %r12) + adc %rbx, -8(up) + adc %rax, %r9 + mulx( -8,(dp), %rbx, %rax) + adc %r14, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov %r8, %rdx C dinv + mulx( %rcx, %rdx, %r12) C next quotient + add %r9, (up) + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(o8) + jmp L(ret) + +L(out0):mov dn, i + .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6 C mulx (dp,dn,8),%r9,%r8 + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 8(dp,dn,8),%r11,%r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 16(dp,dn,8),%r13,%r12 + clc + jmp L(lo0) + + ALIGN(16) +L(top0):add %r9, (up,i,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (dp,i,8), %r9, %r8 + adc %r11, 8(up,i,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10 + adc %r13, 16(up,i,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12 + adc %rbx, 24(up,i,8) + adc %rax, %r9 +L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, i + js L(top0) + + mov (%rsp), %rdx C dinv + .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(%rdi,%r14,8),%rdx,%r12 + add %r9, (up) + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(out0) + jmp L(ret) + +L(cx1): test $2, R8(dn_param) + jnz L(b3) + +C ============================================================================= +L(b1): cmp $-1, dn + jnz L(gt1) + + mov 24(up), %r9 +L(o1): mulx( -8,(dp), %rbx, %rdx) + add %r9, %rbx + adc %rbp, %rdx + add 32(up), %rdx + setc R8(%rbp) + mov %rdx, %r9 + mulx( %r8, %rdx, %r12) C next quotient + lea 8(up), up + dec un + jne L(o1) + mov %r9, 24(up) + jmp L(ret) + +L(gt1): cmp $-5, dn + jnz L(out1) + +L(o5): mulx( -40,(dp), 
%rbx, %rax) + mulx( -32,(dp), %r9, %r14) + mulx( -24,(dp), %r11, %r10) + mulx( -16,(dp), %r13, %r12) + add -8(up), %rbx + adc %rax, %r9 + mulx( -8,(dp), %rbx, %rax) + adc %r14, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add (up), %r9 + mov %r9, (up) + mov %r8, %rdx C dinv + mulx( %r9, %rdx, %r12) C next quotient + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(o5) + jmp L(ret) + +L(out1):lea 1(dn), i + .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6 C mulx (dp,dn,8),%rbx,%rax + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(dp,dn,8),%r9,%r8 + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(dp,dn,8),%r11,%r10 + clc + jmp L(lo1) + + ALIGN(16) +L(top1):add %r9, (up,i,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (dp,i,8), %r9, %r8 + adc %r11, 8(up,i,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10 + adc %r13, 16(up,i,8) +L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12 + adc %rbx, 24(up,i,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, i + js L(top1) + + mov (%rsp), %rdx C dinv + .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(up,dn,8), %rdx, %r12 + add %r9, (up) + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(out1) + jmp L(ret) + +C ============================================================================= +L(b2): cmp $-2, dn + jnz L(gt2) + + mov 16(up), %r10 + mov 24(up), %r9 +L(o2): mulx( -16,(dp), %r13, %r12) + mulx( -8,(dp), %rbx, %rax) + add %r12, %rbx + adc $0, %rax + add %r10, %r13 C 0 add just to produce carry + mov %r9, %r10 C 1 + adc %rbx, %r10 C 1 + mov %r8, %rdx + mulx( %r10, %rdx, %r12) C next quotient + adc %rbp, %rax C 2 + setc R8(%rbp) C 3 + mov 32(up), %r9 C 2 + add %rax, %r9 C 2 + adc $0, R32(%rbp) C 3 + lea 8(up), up + dec un + jne L(o2) + mov %r10, 16(up) + mov %r9, 24(up) + jmp L(ret) + +L(gt2): cmp $-6, dn + jnz L(out2) + +L(o6): mulx( -48,(dp), %r13, %r12) + mulx( -40,(dp), %rcx, %rax) + add %r12, %rcx + adc $0, %rax + mulx( -32,(dp), %r9, %r14) + mulx( -24,(dp), %r11, %r10) + add -16(up), %r13 + mulx( -16,(dp), %r13, %r12) + adc -8(up), %rcx + mov %rcx, -8(up) + adc %rax, %r9 + mulx( -8,(dp), %rbx, %rax) + adc %r14, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov %r8, %rdx C dinv + mulx( %rcx, %rdx, %r12) C next quotient + add %r9, (up) + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(o6) + jmp L(ret) + +L(out2):lea 2(dn), i + .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6 C mulx (dp,dn,8),%r13,%r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(dp,dn,8),%rbx,%rax + add %r12, %rbx + adc $0, %rax + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(dp,dn,8),%r9,%r8 + jmp L(lo2) + + ALIGN(16) +L(top2):add %r9, (up,i,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (dp,i,8), %r9, %r8 + adc %r11, 8(up,i,8) +L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10 + adc %r13, 16(up,i,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12 + adc %rbx, 24(up,i,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, 
%rax + add $4, i + js L(top2) + + mov (%rsp), %rdx C dinv + .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(up,dn,8), %rdx, %r12 + add %r9, (up) + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(out2) + jmp L(ret) + +C ============================================================================= +L(b3): cmp $-3, dn + jnz L(gt3) + + mov 8(up), %r14 + mov 16(up), %r9 + mov 24(up), %rcx +L(o3): mulx( -24,(dp), %r11, %r10) + mulx( -16,(dp), %r13, %r12) + mulx( -8,(dp), %rbx, %rax) + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add %r14, %r11 + mov %r9, %r14 + adc %r13, %r14 + mov %rcx, %r9 + mov %r8, %rdx C dinv + mulx( %r14, %rdx, %r12) C next quotient + adc %rbx, %r9 + adc %rbp, %rax + setc R8(%rbp) + mov 32(up), %rcx + add %rax, %rcx + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(o3) + mov %r14, 8(up) + mov %r9, 16(up) + mov %rcx, 24(up) + jmp L(ret) + +L(gt3): cmp $-7, dn + jnz L(out3) + +L(o7): mulx( -56,(dp), %r11, %r10) + mulx( -48,(dp), %rcx, %r12) + mulx( -40,(dp), %rbx, %rax) + add %r10, %rcx + adc %r12, %rbx + adc $0, %rax + mulx( -32,(dp), %r9, %r14) + add -24(up), %r11 + mulx( -24,(dp), %r11, %r10) + adc -16(up), %rcx + mov %rcx, -16(up) + mulx( -16,(dp), %r13, %r12) + adc %rbx, -8(up) + adc %rax, %r9 + mulx( -8,(dp), %rbx, %rax) + adc %r14, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov %r8, %rdx C dinv + mulx( %rcx, %rdx, %r12) C next quotient + add %r9, (up) + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(o7) + jmp L(ret) + +L(out3):lea 3(dn), i + .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6 C mulx (dp,dn,8),%r11,%r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(dp,dn,8),%r13,%r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(dp,dn,8),%rbx,%rax + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + jmp L(lo3) + + ALIGN(16) +L(top3):add %r9, (up,i,8) +L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (dp,i,8), %r9, %r8 + adc %r11, 8(up,i,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10 + adc %r13, 16(up,i,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12 + adc %rbx, 24(up,i,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, i + js L(top3) + + mov (%rsp), %rdx C dinv + .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(up,dn,8), %rdx, %r12 + add %r9, (up) + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(out3) + +L(ret): mov %rbp, %rax + pop %rsi C dummy dealloc + pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + pop %r15 + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/zen/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/zen/sqr_basecase.asm new file mode 100644 index 0000000..a7c6127 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/sqr_basecase.asm @@ -0,0 +1,482 @@ +dnl AMD64 mpn_sqr_basecase optimised for AMD Zen. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO +C * Do overlapped software pipelining. This should close the remaining gap to +C mul_basecase. +C +C * Update un just once in the outer loop. +C +C * Perhaps keep un and n pre-multiplied by 8, thus suppressing ",8" from +C loads and stores. At least in some cases, the non-scaled form is faster. +C +C * Optimise xit3 code, e.g., using shrx and sarx like in the main loop. +C +C * The mul_1 feed-in code has gotten little attention and could probably be +C improved. Perhaps even expand it to 4 separate loops to allow straight +C fall-through into the 4 addmul_1 loops. +C +C * Clean up ad-hoc scratch register usage in the addmul_1 feed-in code blocks. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') + +define(`un', `%rbp') +define(`n', `%rcx') + +C these are used just for the small op code +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sqr_basecase) + FUNC_ENTRY(3) + + cmp $2, R32(un_param) + jae L(gt1) + + mov (up), %rdx + mulx( %rdx, %rax, %rdx) + mov %rax, (rp) + mov %rdx, 8(rp) + FUNC_EXIT() + ret + +L(gt1): jne L(gt2) + + mov (up), %rdx + mov 8(up), %rcx + mulx( %rcx, %r9, %r10) C v0 * v1 W 1 2 + mulx( %rdx, %rax, %r8) C v0 * v0 W 0 1 + mov %rcx, %rdx + mulx( %rdx, %r11, %rdx) C v1 * v1 W 2 3 + add %r9, %r9 C W 1 + adc %r10, %r10 C W 2 + adc $0, %rdx C W 3 + add %r9, %r8 C W 1 + adc %r11, %r10 C W 2 + adc $0, %rdx C W 3 + mov %rax, (rp) + mov %r8, 8(rp) + mov %r10, 16(rp) + mov %rdx, 24(rp) + FUNC_EXIT() + ret + +L(gt2): cmp $4, R32(un_param) + jae L(gt3) + + push %rbx + mov (up), %rdx + mulx( 8,(up), w2, w3) + mulx( 16,(up), w0, w1) + add w3, w0 + mov 8(up), %rdx + mulx( 16,(up), %rax, w3) + adc %rax, w1 + adc $0, w3 + test R32(%rbx), R32(%rbx) + mov (up), %rdx + mulx( %rdx, %rbx, %rcx) + mov %rbx, (rp) + mov 8(up), %rdx + mulx( %rdx, %rax, %rbx) + mov 16(up), %rdx + mulx( %rdx, %rsi, %rdx) + adcx( w2, w2) + adcx( w0, w0) + adcx( w1, w1) + adcx( w3, w3) + adox( w2, %rcx) + adox( w0, %rax) + adox( w1, %rbx) + adox( w3, %rsi) + mov $0, R32(%r8) + adox( %r8, %rdx) + adcx( %r8, %rdx) + mov %rcx, 8(rp) + mov %rax, 16(rp) + mov %rbx, 24(rp) + mov %rsi, 32(rp) + mov %rdx, 40(rp) + pop %rbx + FUNC_EXIT() + ret + +L(gt3): push %r15 +C push %r14 + push %r13 + push %r12 + push %rbp + push %rbx + mov R32(un_param), R32(un) + + mov (up), %rdx C up[0] + mov 8(up), %r9 C up[1] + + mulx( %rdx, %rax, %r15) C up[0]^2 + mov %rax, (rp) + shl %rdx + + 
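+C [Editorial annotation, not part of the upstream source]  Here %rax/%r15
+C hold the low/high halves of up[0]^2 and the shl leaves rdx = 2*up[0] mod B,
+C so the first pass below is a plain mul_1 of {up+1, un-1} by the doubled
+C limb, producing the doubled cross products directly.  The bit shifted out
+C of up[0] is folded back in later by the sarx/shrx corrections in the
+C addmul feed-in blocks.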
lea (up,un,8), up + lea -32(rp,un,8), rp + + neg un + lea 4(un), n + and $-4, n + + test $1, R8(un) + jnz L(mx0) +L(mx1): test $2, R8(un) + jz L(mb3) + +L(mb1): mulx( %r9, %rbx, %rax) + .byte 0xc4,0x62,0xb3,0xf6,0x44,0xee,0x10 C mulx 16(up,un,8), %r9, %r8 + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xee,0x18 C mulx 24(up,un,8), %r11, %r10 + add %r15, %rbx + jmp L(mlo1) + +L(mb3): mulx( %r9, %r11, %r10) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xee,0x10 C mulx 16(up,un,8), %r13, %r12 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x18 C mulx 24(up,un,8), %rbx, %rax + add %r15, %r11 + jrcxz L(n4) + jmp L(mlo3) +L(n4): mov %r11, 8(rp) + adc %r10, %r13 + adc %r12, %rbx + jmp L(m) + +L(mx0): test $2, R8(un) + jnz L(mb0) + +L(mb2): mulx( %r9, %r13, %r12) + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x10 C mulx 16(up,un,8), %rbx, %rax + .byte 0xc4,0x62,0xb3,0xf6,0x44,0xee,0x18 C mulx 24(up,un,8), %r9, %r8 + add %r15, %r13 + jmp L(mlo2) + +L(mb0): mulx( %r9, %r9, %r8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xee,0x10 C mulx 16(up,un,8), %r11, %r10 + .byte 0xc4,0x62,0x93,0xf6,0x64,0xee,0x18 C mulx 24(up,un,8), %r13, %r12 + add %r15, %r9 + jmp L(mlo0) + + ALIGN(16) +L(mtop):jrcxz L(mend) + adc %r8, %r11 + mov %r9, (rp,n,8) +L(mlo3):.byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r10, %r13 + mov %r11, 8(rp,n,8) +L(mlo2):.byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r12, %rbx + mov %r13, 16(rp,n,8) +L(mlo1):.byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rax, %r9 + mov %rbx, 24(rp,n,8) +L(mlo0):.byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + lea 4(n), n + jmp L(mtop) + +L(mend):mov %r9, (rp) + adc %r8, %r11 + mov %r11, 8(rp) + adc %r10, %r13 + mov %r13, 16(rp) + adc %r12, %rbx + adc $0, %rax + mov %rbx, 24(rp) + mov %rax, 32(rp) + + lea 2(un), un + + mov $63, R32(%r15) C keep at 63 for shrx/sarx. 
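+C [Editorial annotation, not part of the upstream source]  The rest of the
+C function runs one addmul_1-style pass per remaining diagonal limb,
+C dispatched through L(f0)..L(f3) on the low bits of un.  Each feed-in forms
+C the doubled limb 2*u0 + (top bit of the limb below u0) with shrx/lea, adds
+C the diagonal u0^2 with mulx, and adds the correction ci = u0 & (sign mask
+C of the limb below) from sarx/and, so cross products and diagonal terms are
+C accumulated in a single sweep without a separate doubling pass.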
+ test $1, R8(un) + jz L(x0) +L(x1): test $2, R8(un) + jz L(f3) + jmp L(f1) +L(x0): test $2, R8(un) + jz L(f0) +C jmp L(f2) + +L(f2): mov -8(up,un,8), %rdx C up[0] + lea 2(un), n + lea 8(rp), rp + .byte 0xc4,0x62,0x82,0xf7,0x5c,0xee,0xf0 C sarx %r15, -16(up,un,8), %r11 + .byte 0xc4,0x62,0x83,0xf7,0x6c,0xee,0xf0 C shrx %r15, -16(up,un,8), %r13 + and %rdx, %r11 C "ci" in C code + mulx( %rdx, %rax, %r10) C up[0]^2 + lea (%r13,%rdx,2), %rdx C "u0" arg in C code + add %rax, %r11 + + .byte 0xc4,0x62,0x93,0xf6,0x24,0xee C mulx (up,un,8), %r13, %r12 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x08 C mulx 8(up,un,8), %rbx, %rax + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + jmp L(b2) + + ALIGN(16) +L(top2):add %r9, (rp,n,8) +L(b2): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(top2) + + inc un + add %r9, (rp) + adc %r11, 8(rp) + adc %r13, 16(rp) + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + +L(f1): mov -8(up,un,8), %rdx C up[0] + lea 1(un), n + lea 8(rp), rp + .byte 0xc4,0x62,0x82,0xf7,0x6c,0xee,0xf0 C sarx %r15, -16(up,un,8), %r13 + .byte 0xc4,0xe2,0x83,0xf7,0x5c,0xee,0xf0 C shrx %r15, -16(up,un,8), %rbx + and %rdx, %r13 C "ci" in C code + mulx( %rdx, %rax, %r12) C up[0]^2 + lea (%rbx,%rdx,2), %rdx C "u0" arg in C code + add %rax, %r13 + + .byte 0xc4,0xe2,0xe3,0xf6,0x04,0xee C mulx (up,un,8), %rbx, %rax + adc %r12, %rbx + adc $0, %rax + .byte 0xc4,0x62,0xb3,0xf6,0x44,0xee,0x08 C mulx 8(up,un,8), %r9, %r8 + jmp L(b1) + + ALIGN(16) +L(top1):add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) +L(b1): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(top1) + + inc un + add %r9, (rp) + adc %r11, 8(rp) + adc %r13, 16(rp) + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + +L(f0): mov -8(up,un,8), %rdx C up[0] + lea (un), n + lea 8(rp), rp + .byte 0xc4,0xe2,0x82,0xf7,0x5c,0xee,0xf0 C sarx %r15, -16(up,un,8), %rbx + .byte 0xc4,0x62,0x83,0xf7,0x4c,0xee,0xf0 C shrx %r15, -16(up,un,8), %r9 + and %rdx, %rbx C "ci" in C code + mulx( %rdx, %r10, %rax) C up[0]^2 + lea (%r9,%rdx,2), %rdx C "u0" arg in C code + add %r10, %rbx + adc $0, %rax C "cin" in C code + + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,un,8), %r9, %r8 + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xee,0x08 C mulx 8(up,un,8), %r11, %r10 + jmp L(b0) + + ALIGN(16) +L(top0):add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) +L(b0): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(top0) + + inc un + add %r9, (rp) + adc %r11, 8(rp) + adc 
%r13, 16(rp) + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + +L(f3): mov -8(up,un,8), %rdx C up[0] + lea 3(un), n + lea 8(rp), rp + .byte 0xc4,0x62,0x82,0xf7,0x4c,0xee,0xf0 C sarx %r15, -16(up,un,8), %r9 + .byte 0xc4,0x62,0x83,0xf7,0x5c,0xee,0xf0 C shrx %r15, -16(up,un,8), %r11 + and %rdx, %r9 C "ci" in C code + mulx( %rdx, %rax, %r8) C up[0]^2 + lea (%r11,%rdx,2), %rdx C "u0" arg in C code + add %rax, %r9 + + .byte 0xc4,0x62,0xa3,0xf6,0x14,0xee C mulx (%rsi,%rbp,8),%r11,%r10 + .byte 0xc4,0x62,0x93,0xf6,0x64,0xee,0x08 C mulx 0x8(%rsi,%rbp,8),%r13,%r12 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x10 C mulx 0x10(%rsi,%rbp,8),%rbx,%rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + jrcxz L(xit3) + jmp L(top3) C FIXME perhaps fall through + + ALIGN(16) +L(top3):add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(top3) + + inc un + add %r9, (rp) + adc %r11, 8(rp) + adc %r13, 16(rp) + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + jmp L(f2) + + +L(xit3):add %r9, (rp) + adc %r11, 8(rp) + adc 16(rp), %r13 + adc 24(rp), %rbx +L(m): adc $0, %rax + mov %rax, 32(rp) + mov -24(up), %rdx C FIXME: CSE + mov -32(up), %r9 C FIXME: CSE + sar $63, %r9 + and %rdx, %r9 + add %r13, %r9 + mulx( %rdx, %rax, %r10) + mov -16(up), %r8 C FIXME: CSE + adc $0, %r10 + add %rax, %r9 + adc $0, %r10 + mov %r9, 16(rp) + mov -32(up), %rax + shl %rax + adc %rdx, %rdx + mulx( %r8, %r13, %r12) + mulx( -8,(up), %r11, %rax) C FIXME: CSE + add %r10, %r13 + adc %r12, %r11 + adc $0, %rax + add %rbx, %r13 + mov %r13, 24(rp) + adc 32(rp), %r11 + adc $0, %rax + mov -16(up), %rdx C FIXME: CSE + mov -8(up), %r8 C FIXME: CSE + mov -24(up), %r9 + sar $63, %r9 + and %rdx, %r9 + add %r11, %r9 + mulx( %rdx, %rbp, %r10) + adc $0, %r10 + add %rbp, %r9 + adc $0, %r10 + mov %r9, 32(rp) + mov -24(up), %rbp + shl %rbp + adc %rdx, %rdx + mulx( %r8, %rbx, %rbp) + add %r10, %rbx + adc $0, %rbp + adc %rbx, %rax + mov %rax, 40(rp) + adc $0, %rbp + mov -8(up), %rdx C FIXME: CSE + mov -16(up), %r9 C FIXME: CSE + sar $63, %r9 + and %rdx, %r9 + add %rbp, %r9 + mulx( %rdx, %rbp, %r10) + adc $0, %r10 + add %rbp, %r9 + adc $0, %r10 + mov %r9, 48(rp) + mov %r10, 56(rp) + + pop %rbx + pop %rbp + pop %r12 + pop %r13 +C pop %r14 + pop %r15 + + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gmp-6.3.0/mpn/x86_64/zen/sublsh1_n.asm b/gmp-6.3.0/mpn/x86_64/zen/sublsh1_n.asm new file mode 100644 index 0000000..00f6dc9 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen/sublsh1_n.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sublsh1_n, mpn_sublsh1_nc. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sublsh1_n mpn_sublsh1_nc) +include_mpn(`x86_64/atom/sublsh1_n.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen2/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/zen2/gmp-mparam.h new file mode 100644 index 0000000..3748c5f --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen2/gmp-mparam.h @@ -0,0 +1,276 @@ +/* AMD Zen2 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. 
*/ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 3600-4400 MHz Matisse */ +/* FFT tuning limit = 703,392,483 */ +/* Generated by tuneup.c, 2019-10-19, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 27 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 1 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 13 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 22 + +#define DIV_1_VS_MUL_1_PERCENT 385 + +#define MUL_TOOM22_THRESHOLD 19 +#define MUL_TOOM33_THRESHOLD 125 +#define MUL_TOOM44_THRESHOLD 196 +#define MUL_TOOM6H_THRESHOLD 276 +#define MUL_TOOM8H_THRESHOLD 369 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 121 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 129 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 132 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 185 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 117 +#define SQR_TOOM4_THRESHOLD 315 +#define SQR_TOOM6_THRESHOLD 446 +#define SQR_TOOM8_THRESHOLD 527 + +#define MULMID_TOOM42_THRESHOLD 38 + +#define MULMOD_BNM1_THRESHOLD 14 +#define SQRMOD_BNM1_THRESHOLD 20 + +#define MUL_FFT_MODF_THRESHOLD 436 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 436, 5}, { 25, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 159,11}, { 95,10}, \ + { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,11}, { 143,10}, { 287, 9}, { 575,11}, \ + { 159,12}, { 95,11}, { 191,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543, 9}, { 1087,11}, { 287,10}, { 575,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 335,10}, \ + { 671,11}, { 351,10}, { 703,11}, { 367,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,12}, { 287,11}, \ + { 575,10}, { 1151,11}, { 607,10}, { 1215,12}, \ + { 319,11}, { 639,10}, { 1279,11}, { 671,10}, \ + { 1343,12}, { 351,11}, { 703,10}, { 1407,11}, \ + { 735,13}, { 191,12}, { 383,11}, { 767,10}, \ + { 1535,11}, { 799,12}, { 415,11}, { 831,10}, \ + { 1663,12}, { 447,11}, { 895,12}, { 479,14}, \ + { 127,13}, { 255,12}, { 543,11}, { 1087,10}, \ + { 2175,12}, { 575,11}, { 1151,12}, { 607,11}, \ + { 1215,10}, { 2431,13}, { 319,12}, { 639,11}, \ + { 1279,12}, { 671,11}, { 1343,10}, { 2687,12}, \ + { 703,11}, { 1471,10}, { 2943,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \ + { 831,11}, { 1663,13}, { 447,12}, { 959,11}, \ + { 1919,10}, { 3839,14}, { 255,13}, { 511,12}, \ + { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1343,11}, { 2687,13}, \ + { 703,12}, { 1471,11}, { 2943,14}, { 383,13}, \ + { 767,12}, { 1599,11}, { 3199,13}, { 831,12}, \ + { 1727,13}, { 895,12}, { 1791,13}, { 959,12}, \ + { 
1919,11}, { 3839,14}, { 511,13}, { 1087,12}, \ + { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,12}, { 2943,11}, \ + { 5887,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1727,12}, { 3455,14}, { 895,13}, { 1919,12}, \ + { 3839,11}, { 7679,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2943,12}, \ + { 5887,15}, { 767,14}, { 1535,13}, { 3199,14}, \ + { 1663,13}, { 3455,12}, { 6911,14}, { 1919,13}, \ + { 3839,16}, { 511,15}, { 1023,14}, { 2175,13}, \ + { 4479,12}, { 8959,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \ + { 1535,14}, { 3455,15}, { 1791,14}, { 3839,13}, \ + { 7679,14}, { 3967,16}, { 1023,15}, { 2047,14}, \ + { 4479,15}, { 2303,14}, { 4863,15}, { 2559,14}, \ + { 5247,15}, { 2815,14}, { 5887,16}, { 1535,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,13}, \ + { 15359,17}, { 1023,16}, { 2047,15}, { 4351,14}, \ + { 8959,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7679,14}, { 15359,15}, { 7935,17}, { 2047,16}, \ + { 4095,15}, { 8959,16}, { 4607,15}, { 9983,14}, \ + { 19967,16}, { 5631,15}, { 11775,17}, { 3071,16}, \ + { 7679,15}, { 15871,18}, { 2047,17}, { 4095,16}, \ + { 9727,15}, { 19967,17}, { 5119,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 275 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 396 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 396, 5}, { 25, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 159,11}, { 95,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,12}, { 95,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543,11}, \ + { 287,10}, { 575,11}, { 303,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 335,10}, { 671, 9}, \ + { 1343,11}, { 351,10}, { 703,11}, { 367,10}, \ + { 735,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 639,10}, \ + { 1279,11}, { 671,10}, { 1343,12}, { 351,11}, \ + { 703,10}, { 1407,11}, { 735,10}, { 1471,12}, \ + { 383,11}, { 767,10}, { 1535,11}, { 799,12}, \ + { 415,11}, { 831,10}, { 1663,12}, { 447,11}, \ + { 895,12}, { 479,11}, { 959,14}, { 127,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,10}, \ + { 2175,12}, { 575,11}, { 1151,12}, { 607,11}, \ + { 1215,10}, { 2431,12}, { 639,11}, { 1279,12}, \ + { 671,11}, { 1343,10}, { 2687,12}, { 703,11}, \ + { 1407,12}, { 735,11}, { 1471,10}, { 2943,13}, \ + { 383,12}, { 767,11}, { 1535,12}, { 799,11}, \ + { 1599,12}, { 831,11}, { 1663,13}, { 447,12}, \ + { 959,11}, { 1919,10}, { 3839,13}, { 511,12}, \ + { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1343,11}, { 2687,13}, \ + { 703,12}, { 1471,11}, { 2943,14}, { 383,13}, \ + { 767,12}, { 1599,13}, { 831,12}, { 1727,11}, \ + { 3455,13}, { 959,12}, { 1919,11}, { 
3839,14}, \ + { 511,13}, { 1023,12}, { 2047,13}, { 1087,12}, \ + { 2175,13}, { 1215,12}, { 2431,11}, { 4863,14}, \ + { 639,13}, { 1343,12}, { 2687,13}, { 1471,12}, \ + { 2943,11}, { 5887,14}, { 767,13}, { 1599,12}, \ + { 3199,13}, { 1727,12}, { 3455,14}, { 895,13}, \ + { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2943,12}, \ + { 5887,15}, { 767,14}, { 1535,13}, { 3199,14}, \ + { 1663,13}, { 3455,12}, { 6911,14}, { 1919,13}, \ + { 3839,12}, { 7679,16}, { 511,15}, { 1023,14}, \ + { 2175,13}, { 4479,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,14}, { 3967,16}, { 1023,15}, \ + { 2047,14}, { 4479,15}, { 2303,14}, { 4863,15}, \ + { 2559,14}, { 5247,15}, { 2815,14}, { 5887,13}, \ + { 11775,16}, { 1535,15}, { 3071,14}, { 6143,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4095,14}, { 8191,15}, \ + { 4351,14}, { 8959,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,15}, { 7935,17}, \ + { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \ + { 9983,14}, { 19967,16}, { 5119,15}, { 10239,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 7679,15}, \ + { 15359,18}, { 2047,17}, { 4095,16}, { 9727,15}, \ + { 19967,17}, { 5119,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 282 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 57 +#define MULLO_MUL_N_THRESHOLD 8907 +#define SQRLO_BASECASE_THRESHOLD 8 +#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */ +#define SQRLO_SQR_THRESHOLD 6440 + +#define DC_DIV_QR_THRESHOLD 43 +#define DC_DIVAPPR_Q_THRESHOLD 154 +#define DC_BDIV_QR_THRESHOLD 46 +#define DC_BDIV_Q_THRESHOLD 93 + +#define INV_MULMOD_BNM1_THRESHOLD 36 +#define INV_NEWTON_THRESHOLD 141 +#define INV_APPR_THRESHOLD 149 + +#define BINV_NEWTON_THRESHOLD 264 +#define REDC_1_TO_REDC_N_THRESHOLD 47 + +#define MU_DIV_QR_THRESHOLD 1470 +#define MU_DIVAPPR_Q_THRESHOLD 1528 +#define MUPI_DIV_QR_THRESHOLD 47 +#define MU_BDIV_QR_THRESHOLD 1187 +#define MU_BDIV_Q_THRESHOLD 1589 + +#define POWM_SEC_TABLE 3,22,194,579 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 19 +#define SET_STR_DC_THRESHOLD 195 +#define SET_STR_PRECOMPUTE_THRESHOLD 1752 + +#define FAC_DSC_THRESHOLD 345 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 24 +#define HGCD2_DIV1_METHOD 1 /* 11.29% faster than 3 */ +#define HGCD_THRESHOLD 89 +#define HGCD_APPR_THRESHOLD 96 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 465 +#define GCDEXT_DC_THRESHOLD 233 +#define JACOBI_BASE_METHOD 1 /* 25.56% faster than 4 */ + +/* Tuneup completed successfully, took 294200 seconds */ diff --git a/gmp-6.3.0/mpn/x86_64/zen3/addmul_1.asm b/gmp-6.3.0/mpn/x86_64/zen3/addmul_1.asm new file mode 100644 index 0000000..7c1ecd0 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen3/addmul_1.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_addmul_1. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1) +include_mpn(`x86_64/coreibwl/addmul_1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen3/gmp-mparam.h b/gmp-6.3.0/mpn/x86_64/zen3/gmp-mparam.h new file mode 100644 index 0000000..ffba1c5 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen3/gmp-mparam.h @@ -0,0 +1,222 @@ +/* AMD Zen3 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. 
*/ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 3800-4700 MHz Vermeer */ +/* FFT tuning limit = 10,000,000 */ +/* Generated by tuneup.c, 2021-01-01, gcc 9.3 */ + +#define MOD_1_NORM_THRESHOLD 64 +#define MOD_1_UNNORM_THRESHOLD 85 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 35 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 9 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 15 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 18 + +#define DIV_1_VS_MUL_1_PERCENT 398 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 89 +#define MUL_TOOM44_THRESHOLD 130 +#define MUL_TOOM6H_THRESHOLD 303 +#define MUL_TOOM8H_THRESHOLD 418 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 87 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 94 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 109 +#define SQR_TOOM4_THRESHOLD 336 +#define SQR_TOOM6_THRESHOLD 414 +#define SQR_TOOM8_THRESHOLD 592 + +#define MULMID_TOOM42_THRESHOLD 38 + +#define MULMOD_BNM1_THRESHOLD 14 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 332 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 332, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \ + { 25, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 35, 8}, \ + { 73, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 79,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \ + { 159, 9}, { 319, 8}, { 639,10}, { 175, 9}, \ + { 351,11}, { 95,10}, { 191, 9}, { 383,10}, \ + { 207,11}, { 111,10}, { 223,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,11}, { 143,10}, { 287, 9}, { 575,10}, \ + { 303, 9}, { 607,11}, { 159,10}, { 319, 9}, \ + { 639,11}, { 175,10}, { 351,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207,10}, { 415,11}, \ + { 223,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,10}, { 607,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 415,12}, { 223,11}, { 479,13}, \ + { 127,12}, { 255,11}, { 543,10}, { 1087,12}, \ + { 287,11}, { 575,10}, { 1151,11}, { 607,12}, \ + { 319,11}, { 639,12}, { 351,11}, { 703,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,10}, { 1663,12}, { 447,11}, { 895,12}, \ + { 479,14}, { 127,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \ + { 1151,12}, { 607,13}, { 319,12}, { 639,11}, \ + { 1279,12}, { 671,11}, { 1343,10}, { 2687,12}, \ + { 703,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 831,11}, { 1663,13}, { 447,12}, { 895,11}, \ + { 1791,12}, { 959,11}, { 1919,10}, { 3839,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1151,11}, { 2303,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1279, 8}, { 24063,10}, { 6399,11}, \ + { 3327,13}, { 895,12}, { 1791,13}, { 959,12}, \ + { 
1919,11}, { 3839,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1215,12}, { 2431,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 186 +#define MUL_FFT_THRESHOLD 3264 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 43, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 43,10}, \ + { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271,11}, { 143,10}, { 287, 9}, \ + { 575,10}, { 303, 9}, { 607,11}, { 159,10}, \ + { 319, 9}, { 639,12}, { 95,11}, { 191,10}, \ + { 383,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,10}, { 607,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 335,10}, { 671,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,12}, { 223,11}, { 447,13}, \ + { 127,12}, { 255,11}, { 543,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 671,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 511,11}, { 1023,12}, { 607,11}, \ + { 1215,13}, { 319,12}, { 671,11}, { 1343,12}, \ + { 735,13}, { 383,12}, { 799,11}, { 1599,10}, \ + { 3199,12}, { 831,13}, { 447,12}, { 895,11}, \ + { 1791,12}, { 959,11}, { 1919,13}, { 511,12}, \ + { 1087,13}, { 575,12}, { 1215,13}, { 639,12}, \ + { 1343,13}, { 703,12}, { 1407,14}, { 383,13}, \ + { 767,12}, { 1599,13}, { 831,12}, { 1663,13}, \ + { 895,12}, { 1791,13}, { 959,12}, { 1919,15}, \ + { 255,14}, { 511,13}, { 1023, 9}, { 17919,10}, \ + { 9727,12}, { 4096,13}, { 8192,14}, { 16384,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 161 +#define SQR_FFT_THRESHOLD 2624 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 31 +#define MULLO_MUL_N_THRESHOLD 6440 +#define SQRLO_BASECASE_THRESHOLD 9 +#define SQRLO_DC_THRESHOLD 129 +#define SQRLO_SQR_THRESHOLD 5103 + +#define DC_DIV_QR_THRESHOLD 19 +#define DC_DIVAPPR_Q_THRESHOLD 123 +#define DC_BDIV_QR_THRESHOLD 79 +#define DC_BDIV_Q_THRESHOLD 154 + +#define INV_MULMOD_BNM1_THRESHOLD 42 +#define INV_NEWTON_THRESHOLD 107 +#define INV_APPR_THRESHOLD 107 + +#define BINV_NEWTON_THRESHOLD 312 +#define REDC_1_TO_REDC_N_THRESHOLD 77 + +#define MU_DIV_QR_THRESHOLD 1142 +#define MU_DIVAPPR_Q_THRESHOLD 1258 +#define MUPI_DIV_QR_THRESHOLD 30 +#define MU_BDIV_QR_THRESHOLD 1120 +#define MU_BDIV_Q_THRESHOLD 1394 + +#define POWM_SEC_TABLE 6,19,203,579,2245 + +#define GET_STR_DC_THRESHOLD 10 +#define GET_STR_PRECOMPUTE_THRESHOLD 18 +#define SET_STR_DC_THRESHOLD 115 +#define SET_STR_PRECOMPUTE_THRESHOLD 1941 + +#define FAC_DSC_THRESHOLD 182 +#define FAC_ODD_THRESHOLD 44 + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD2_DIV1_METHOD 1 /* 13.04% faster than 3 */ +#define HGCD_THRESHOLD 65 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 2121 
+#define GCD_DC_THRESHOLD 293 +#define GCDEXT_DC_THRESHOLD 186 +#define JACOBI_BASE_METHOD 1 /* 12.79% faster than 3 */ diff --git a/gmp-6.3.0/mpn/x86_64/zen3/mul_1.asm b/gmp-6.3.0/mpn/x86_64/zen3/mul_1.asm new file mode 100644 index 0000000..6f1e286 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen3/mul_1.asm @@ -0,0 +1,208 @@ +dnl AMD64 mpn_mul_1 optimised for Intel Broadwell. + +dnl Copyright 2015, 2017, 2020 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 n/a +C AMD bd2 n/a +C AMD bd3 n/a +C AMD bd4 ? +C AMD zn1 ? +C AMD zn2 1.6 +C AMD zn3 1.5 +C AMD bt1 n/a +C AMD bt2 n/a +C Intel P4 n/a +C Intel PNR n/a +C Intel NHM n/a +C Intel WSM n/a +C Intel SBR n/a +C Intel IBR n/a +C Intel HWL n/a +C Intel BWL ? +C Intel SKL ? +C Intel atom n/a +C Intel SLM n/a +C Intel GLM n/a +C VIA nano n/a + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Put an initial mulx before switching, targeting some free registers. +C * Tune feed-in code. 
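+C
+C [Editorial annotation, not part of the upstream source]  Functionally,
+C mpn_mul_1 computes {rp,n} = {up,n} * v0 and returns the high limb, and
+C mpn_mul_1c additionally takes a carry-in ci, roughly as in this sketch
+C (assuming 64-bit limbs; unsigned __int128 used only for exposition):
+C
+C	mp_limb_t
+C	mul_1_model (mp_ptr rp, mp_srcptr up, mp_size_t n,
+C		     mp_limb_t v0, mp_limb_t ci)
+C	{
+C	  mp_size_t i;
+C	  for (i = 0; i < n; i++)
+C	    {
+C	      unsigned __int128 t = (unsigned __int128) up[i] * v0 + ci;
+C	      rp[i] = (mp_limb_t) t;
+C	      ci = (mp_limb_t) (t >> 64);
+C	    }
+C	  return ci;
+C	}
+C
+C The code below dispatches on n mod 8 into an 8-way unrolled mulx/adcx loop.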
+ +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0_param',`%rcx') C r9 +define(`ci', `%r8') C stack + +define(`n', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +dnl IFDOS(` define(`up', ``%rsi'') ') dnl +dnl IFDOS(` define(`rp', ``%rcx'') ') dnl +dnl IFDOS(` define(`vl', ``%r9'') ') dnl +dnl IFDOS(` define(`r9', ``rdi'') ') dnl +dnl IFDOS(` define(`n', ``%r8'') ') dnl +dnl IFDOS(` define(`r8', ``r11'') ') dnl + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_1c) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r11 ') +IFSTD(` mov %r8, %r11 ') + jmp L(com) +EPILOGUE() + +PROLOGUE(mpn_mul_1) + FUNC_ENTRY(4) + xor R32(%r11), R32(%r11) +L(com): + mov v0_param, %r10 + mov n_param, n + mov R32(n_param), R32(%rax) + shr $3, n + and $7, R32(%rax) C clear OF, CF as side-effect + mov %r10, %rdx + lea L(tab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%rax,4), %rax + lea (%rax, %r10), %r10 + jmp *%r10 +',` + jmp *(%r10,%rax,8) +') + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(f0), L(tab)) + JMPENT( L(f1), L(tab)) + JMPENT( L(f2), L(tab)) + JMPENT( L(f3), L(tab)) + JMPENT( L(f4), L(tab)) + JMPENT( L(f5), L(tab)) + JMPENT( L(f6), L(tab)) + JMPENT( L(f7), L(tab)) + TEXT + +L(f0): mulx( (up), %r10, %r8) + lea -8(up), up + lea -8(rp), rp + lea -1(n), n + adc %r11, %r10 + jmp L(b0) + +L(f3): mulx( (up), %r9, %rax) + lea 16(up), up + lea -48(rp), rp + adc %r11, %r9 + jmp L(b3) + +L(f4): mulx( (up), %r10, %r8) + lea 24(up), up + lea -40(rp), rp + adc %r11, %r10 + jmp L(b4) + +L(f5): mulx( (up), %r9, %rax) + lea 32(up), up + lea -32(rp), rp + adc %r11, %r9 + jmp L(b5) + +L(f6): mulx( (up), %r10, %r8) + lea 40(up), up + lea -24(rp), rp + adc %r11, %r10 + jmp L(b6) + +L(f1): mulx( (up), %r9, %rax) + adc %r11, %r9 + jrcxz L(end) + jmp L(b1) + +L(end): mov %r9, (rp) + adc %rcx, %rax C relies on rcx = 0 + FUNC_EXIT() + ret + +L(f2): mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + mulx( (up), %r9, %rax) + adc %r11, %r10 + + ALIGN(32) +L(top): adcx( %r8, %r9) + mov %r10, -8(rp) + jrcxz L(end) +L(b1): mulx( 8,(up), %r10, %r8) + lea -1(n), n + mov %r9, (rp) + adcx( %rax, %r10) +L(b0): mulx( 16,(up), %r9, %rax) + adcx( %r8, %r9) + mov %r10, 8(rp) +L(b7): mulx( 24,(up), %r10, %r8) + lea 64(up), up + adcx( %rax, %r10) + mov %r9, 16(rp) +L(b6): mulx( -32,(up), %r9, %rax) + adcx( %r8, %r9) + mov %r10, 24(rp) +L(b5): mulx( -24,(up), %r10, %r8) + adcx( %rax, %r10) + mov %r9, 32(rp) +L(b4): mulx( -16,(up), %r9, %rax) + adcx( %r8, %r9) + mov %r10, 40(rp) +L(b3): mulx( -8,(up), %r10, %r8) + mov %r9, 48(rp) + lea 64(rp), rp + adcx( %rax, %r10) + mulx( (up), %r9, %rax) + jmp L(top) + +L(f7): mulx( (up), %r9, %rax) + lea -16(up), up + lea -16(rp), rp + adc %r11, %r9 + jmp L(b7) +EPILOGUE() +ASM_END() diff --git a/gmp-6.3.0/mpn/x86_64/zen3/mul_basecase.asm b/gmp-6.3.0/mpn/x86_64/zen3/mul_basecase.asm new file mode 100644 index 0000000..f8c1b60 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen3/mul_basecase.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_mul_basecase. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mul_basecase) +include_mpn(`x86_64/coreibwl/mul_basecase.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen3/sbpi1_bdiv_r.asm b/gmp-6.3.0/mpn/x86_64/zen3/sbpi1_bdiv_r.asm new file mode 100644 index 0000000..c9c3487 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen3/sbpi1_bdiv_r.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sbpi1_bdiv_r. + +dnl Copyright 2021 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sbpi1_bdiv_r) +include_mpn(`x86_64/coreibwl/sbpi1_bdiv_r.asm') diff --git a/gmp-6.3.0/mpn/x86_64/zen3/sqr_basecase.asm b/gmp-6.3.0/mpn/x86_64/zen3/sqr_basecase.asm new file mode 100644 index 0000000..9c4f65d --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/zen3/sqr_basecase.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sqr_basecase. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sqr_basecase) +include_mpn(`x86_64/coreibwl/sqr_basecase.asm') diff --git a/gmp-6.3.0/mpn/xnor_n.c b/gmp-6.3.0/mpn/xnor_n.c new file mode 120000 index 0000000..0a553d9 --- /dev/null +++ b/gmp-6.3.0/mpn/xnor_n.c @@ -0,0 +1 @@ +../mpn/generic/logops_n.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/xor_n.c b/gmp-6.3.0/mpn/xor_n.c new file mode 120000 index 0000000..0a553d9 --- /dev/null +++ b/gmp-6.3.0/mpn/xor_n.c @@ -0,0 +1 @@ +../mpn/generic/logops_n.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/zero.c b/gmp-6.3.0/mpn/zero.c new file mode 120000 index 0000000..e0ba5ec --- /dev/null +++ b/gmp-6.3.0/mpn/zero.c @@ -0,0 +1 @@ +../mpn/generic/zero.c \ No newline at end of file diff --git a/gmp-6.3.0/mpn/zero_p.c b/gmp-6.3.0/mpn/zero_p.c new file mode 120000 index 0000000..3468aef --- /dev/null +++ b/gmp-6.3.0/mpn/zero_p.c @@ -0,0 +1 @@ +../mpn/generic/zero_p.c \ No newline at end of file -- cgit v1.2.3